/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.33 - (hide annotations) (download)
Sat Sep 5 10:41:07 2009 UTC (15 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.32: +52 -17 lines
++ whatpm/t/ChangeLog	5 Sep 2009 10:40:03 -0000
	* tokenizer-test-1.test: Updated test results on unclosed start
	and end tags (HTML5 revision 2990).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	5 Sep 2009 10:40:48 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* attlists-1.dat, attrs-1.dat: Updated test results on unclosed
	tags and attlist declarations (cf. HTML5 revision 2990).

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 10:39:09 -0000
	* Tokenizer.pm.src: Discard unclosed tags (HTML5 revision 2990).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
        ## |$VERSION| is derived from the CVS |Revision| keyword: every run
        ## of digits in the keyword string is collected into |@r| and
        ## formatted as "%d." followed by one "%02d" per remaining component
        ## (the |x| repetition binds tighter than the |.| concatenation).
3 wakaba 1.33 our $VERSION=do{my @r=(q$Revision: 1.32 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
        ## Exporter is configured inside |BEGIN| so that |@ISA|, |@EXPORT_OK|
        ## and |%EXPORT_TAGS| are already populated at compile time; the
        ## |Whatpm::HTML| package below calls |import (':token')| from its
        ## own |BEGIN| block.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
        ## Token type constants that may be imported by name on request.
        ## Nothing is exported by default (no |@EXPORT| is set here).
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
        ## The |:token| tag imports all token type constants at once.  Its
        ## list repeats the names in |@EXPORT_OK|; keep the two lists in
        ## sync when a token type is added.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
        ## Each token type is a constant sub (empty prototype) returning a
        ## distinct small integer, 1 through 14.  The value is stored in the
        ## |type| field of a token hash, e.g. |{type => CHARACTER_TOKEN, ...}|
        ## as built in |_get_next_token|.  The same names are listed in
        ## |@EXPORT_OK| and in the |:token| export tag above.
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose |$token->{tag_name}| is set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
        ## The three |CM_*| constants are single-bit flags.  Each content
        ## model below is a bitwise OR of them (or zero), so the tokenizer
        ## can test a capability with |&|, e.g.
        ## |$self->{content_model} & CM_ENTITY|.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # 0b000: no flag set
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # 0b010
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # 0b011
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # 0b101
87    
88     ## Tokenizer states
89    
        ## Each state is a constant sub returning a distinct integer;
        ## |_get_next_token| dispatches by comparing |$self->{state}| against
        ## these constants.  The values 1 and 12 are unassigned: the states
        ## that carried them are commented out below.
        ## |COMMENT_END_BANG_STATE| (102) and |COMMENT_END_SPACE_STATE| (103)
        ## are numbered out of textual order; 103 is the highest value in this
        ## list (presumably what the "LAST" marker records -- confirm before
        ## allocating a number for a new state).
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108 wakaba 1.32 sub COMMENT_END_BANG_STATE () { 102 }
109     sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110 wakaba 1.1 sub COMMENT_END_DASH_STATE () { 18 }
111     sub BOGUS_COMMENT_STATE () { 19 }
112     sub DOCTYPE_STATE () { 20 }
113     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
114     sub DOCTYPE_NAME_STATE () { 22 }
115     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
116     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
117     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
118     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
119     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
120     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
121     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
122     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
123     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
124     sub BOGUS_DOCTYPE_STATE () { 32 }
125     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
126     sub SELF_CLOSING_START_TAG_STATE () { 34 }
127     sub CDATA_SECTION_STATE () { 35 }
128     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
129     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
130     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
131     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
132     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
133     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
134     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
135     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
136     ## NOTE: "Entity data state", "entity in attribute value state", and
137     ## "consume a character reference" algorithm are jointly implemented
138     ## using the following six states:
139     sub ENTITY_STATE () { 44 }
140     sub ENTITY_HASH_STATE () { 45 }
141     sub NCR_NUM_STATE () { 46 }
142     sub HEXREF_X_STATE () { 47 }
143     sub HEXREF_HEX_STATE () { 48 }
144     sub ENTITY_NAME_STATE () { 49 }
145     sub PCDATA_STATE () { 50 } # "data state" in the spec
146    
147 wakaba 1.12 ## XML-only states
148 wakaba 1.8 sub PI_STATE () { 51 }
149     sub PI_TARGET_STATE () { 52 }
150     sub PI_TARGET_AFTER_STATE () { 53 }
151     sub PI_DATA_STATE () { 54 }
152     sub PI_AFTER_STATE () { 55 }
153     sub PI_DATA_AFTER_STATE () { 56 }
154 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
155     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
156 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
157     sub DOCTYPE_TAG_STATE () { 60 }
158     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
159     sub MD_ATTLIST_STATE () { 62 }
160     sub MD_E_STATE () { 63 }
161     sub MD_ELEMENT_STATE () { 64 }
162     sub MD_ENTITY_STATE () { 65 }
163     sub MD_NOTATION_STATE () { 66 }
164     sub DOCTYPE_MD_STATE () { 67 }
165     sub BEFORE_MD_NAME_STATE () { 68 }
166     sub MD_NAME_STATE () { 69 }
167     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
168     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
169 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
171     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
172     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
173     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
174     sub ALLOWED_TOKEN_STATE () { 77 }
175     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
176     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
177     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
179     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
180     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
181     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
182 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
183     sub NDATA_STATE () { 86 }
184     sub AFTER_NDATA_STATE () { 87 }
185     sub BEFORE_NOTATION_NAME_STATE () { 88 }
186     sub NOTATION_NAME_STATE () { 89 }
187 wakaba 1.20 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
188     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
189     sub ENTITY_VALUE_ENTITY_STATE () { 92 }
190     sub AFTER_ELEMENT_NAME_STATE () { 93 }
191     sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
192     sub CONTENT_KEYWORD_STATE () { 95 }
193     sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
194     sub CM_ELEMENT_NAME_STATE () { 97 }
195     sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
196     sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
197     sub AFTER_MD_DEF_STATE () { 100 }
198     sub BOGUS_MD_STATE () { 101 }
199 wakaba 1.8
200 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
201     ## list and descriptions)
202    
        ## Both literals below denote the same value, 2048 (only bit 11
        ## set); the underscore in the second one is a digit separator.
        ## NOTE(review): neither constant is used in the part of the file
        ## visible here; whether the shared bit is intentional has to be
        ## confirmed against Whatpm::HTML.
203     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
204     sub FOREIGN_EL () { 0b1_00000000000 }
205    
206     ## Character reference mappings
207    
        ## |$charref_map| maps a code point to the code point that replaces
        ## it.  NOTE(review): the code that consults the table is not in the
        ## visible part of the file; presumably it is applied to the number
        ## of a numeric character reference -- confirm at the lookup site.
        ##
        ## Explicit entries: 0x0D (CR) is replaced by 0x0A (LF), and each
        ## code point in 0x80..0x9F is replaced either by a code point
        ## outside that range or by 0xFFFD (the mapping looks like
        ## Windows-1252 -- confirm).
208     my $charref_map = {
209     0x0D => 0x000A,
210     0x80 => 0x20AC,
211     0x81 => 0xFFFD,
212     0x82 => 0x201A,
213     0x83 => 0x0192,
214     0x84 => 0x201E,
215     0x85 => 0x2026,
216     0x86 => 0x2020,
217     0x87 => 0x2021,
218     0x88 => 0x02C6,
219     0x89 => 0x2030,
220     0x8A => 0x0160,
221     0x8B => 0x2039,
222     0x8C => 0x0152,
223     0x8D => 0xFFFD,
224     0x8E => 0x017D,
225     0x8F => 0xFFFD,
226     0x90 => 0xFFFD,
227     0x91 => 0x2018,
228     0x92 => 0x2019,
229     0x93 => 0x201C,
230     0x94 => 0x201D,
231     0x95 => 0x2022,
232     0x96 => 0x2013,
233     0x97 => 0x2014,
234     0x98 => 0x02DC,
235     0x99 => 0x2122,
236     0x9A => 0x0161,
237     0x9B => 0x203A,
238     0x9C => 0x0153,
239     0x9D => 0xFFFD,
240     0x9E => 0x017E,
241     0x9F => 0x0178,
242     }; # $charref_map
        ## Every code point listed in the statement below is replaced by
        ## 0xFFFD: 0x00..0x08, 0x0B, 0x0E..0x1F, 0x7F, the surrogate range
        ## 0xD800..0xDFFF, 0xFDD0..0xFDDF, and the code points ending in
        ## FFFE/FFFF of every plane from 0x0000 through 0x10.
        ## NOTE(review): the 0xFDD0 range stops at 0xFDDF, so 0xFDE0..0xFDEF
        ## are not remapped although Unicode's noncharacter range runs through
        ## U+FDEF; the existing ISSUE marker appears to refer to this --
        ## confirm against the spec revision being implemented before
        ## widening the range.
243     $charref_map->{$_} = 0xFFFD
244     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
245     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
246     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
247     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
248     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
249     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
250     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
251    
252     ## Implementations MUST act as if they used the state machine in the spec.
253    
254     sub _initialize_tokenizer ($) {
        ## Resets the tokenizer-related fields of the parser object to
        ## their initial values and loads the first input character.
        ##
        ## Argument: the parser object (a hash reference), taken from |@_|.
        ## There is no explicit |return|; callers should not rely on the
        ## value of the last statement.
255     my $self = shift;
256    
257     ## NOTE: Fields set by |new| constructor:
258     #$self->{level}
259     #$self->{set_nc}
260     #$self->{parse_error}
261 wakaba 1.3 #$self->{is_xml} (if XML)
262 wakaba 1.1
263     $self->{state} = DATA_STATE; # MUST
264 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
265     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
266 wakaba 1.1 #$self->{entity__value}; # initialized when used
267     #$self->{entity__match}; # initialized when used
268     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
269     undef $self->{ct}; # current token
270     undef $self->{ca}; # current attribute
271     undef $self->{last_stag_name}; # last emitted start tag name
272     #$self->{prev_state}; # initialized when used
273     delete $self->{self_closing};
274     $self->{char_buffer} = '';
275     $self->{char_buffer_pos} = 0;
276     $self->{nc} = -1; # next input character
277     #$self->{next_nc}
278    
        ## Read the first input character.  This is the same "next input
        ## character" sequence that is repeated throughout
        ## |_get_next_token|.  Since |char_buffer| was just set to the empty
        ## string and |char_buffer_pos| to 0, the condition is false at this
        ## point and the |else| branch runs: the |set_nc| callback supplied
        ## by the constructor is invoked with the parser object (presumably
        ## it stores the next character in |$self->{nc}| -- confirm in the
        ## code that installs |set_nc|).
279     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
280     $self->{line_prev} = $self->{line};
281     $self->{column_prev} = $self->{column};
282     $self->{column}++;
283     $self->{nc}
284     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
285     } else {
286     $self->{set_nc}->($self);
287     }
288    
        ## Queue of pending tokens; |_get_next_token| returns (shifts) the
        ## first element when the array is not empty.
289     $self->{token} = [];
290     # $self->{escape}
291     } # _initialize_tokenizer
292    
293     ## A token has:
294     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
295 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
296 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
297     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
298 wakaba 1.11 ## ->{target} (PI_TOKEN)
299 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
300     ## ->{sysid} (DOCTYPE_TOKEN)
301     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
302     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
303     ## ->{name}
304     ## ->{value}
305     ## ->{has_reference} == 1 or 0
306 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
307     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
308 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
309 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
310 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
311    
312 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
313     ## A token's |->{self_closing}| is used to save the value of
314     ## |$self->{self_closing}| while the token is pushed back (queued in |$self->{token}|).
315    
316     ## Emitted token MUST immediately be handled by the tree construction state.
317    
318     ## Before each step, UA MAY check to see if either one of the scripts in
319     ## "list of scripts that will execute as soon as possible" or the first
320     ## script in the "list of scripts that will execute asynchronously",
321     ## has completed loading. If one has, then it MUST be executed
322     ## and removed from the list.
323    
324     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
325     ## (This requirement was dropped from HTML5 spec, unfortunately.)
326    
327     my $is_space = {
        ## Lookup table keyed by code point; a true value marks a space
        ## character.  It is consulted as |$is_space->{$self->{nc}}| in
        ## |_get_next_token|.  The entries for VT (0x000B) and CR (0x000D)
        ## are commented out, so those code points are not treated as
        ## spaces by this table.
328     0x0009 => 1, # CHARACTER TABULATION (HT)
329     0x000A => 1, # LINE FEED (LF)
330     #0x000B => 0, # LINE TABULATION (VT)
331 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
332 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
333     0x0020 => 1, # SPACE (SP)
334     };
335    
336     sub _get_next_token ($) {
337     my $self = shift;
338    
339     if ($self->{self_closing}) {
340     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
341     ## NOTE: The |self_closing| flag is only set by start tag token.
342     ## In addition, when a start tag token is emitted, it is always set to
343     ## |ct|.
344     delete $self->{self_closing};
345     }
346    
347     if (@{$self->{token}}) {
348     $self->{self_closing} = $self->{token}->[0]->{self_closing};
349     return shift @{$self->{token}};
350     }
351    
352     A: {
353     if ($self->{state} == PCDATA_STATE) {
354     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
355    
356     if ($self->{nc} == 0x0026) { # &
357    
358     ## NOTE: In the spec, the tokenizer is switched to the
359     ## "entity data state". In this implementation, the tokenizer
360     ## is switched to the |ENTITY_STATE|, which is an implementation
361     ## of the "consume a character reference" algorithm.
362     $self->{entity_add} = -1;
363     $self->{prev_state} = DATA_STATE;
364     $self->{state} = ENTITY_STATE;
365    
366     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
367     $self->{line_prev} = $self->{line};
368     $self->{column_prev} = $self->{column};
369     $self->{column}++;
370     $self->{nc}
371     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
372     } else {
373     $self->{set_nc}->($self);
374     }
375    
376     redo A;
377     } elsif ($self->{nc} == 0x003C) { # <
378    
379     $self->{state} = TAG_OPEN_STATE;
380    
381     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
382     $self->{line_prev} = $self->{line};
383     $self->{column_prev} = $self->{column};
384     $self->{column}++;
385     $self->{nc}
386     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
387     } else {
388     $self->{set_nc}->($self);
389     }
390    
391     redo A;
392     } elsif ($self->{nc} == -1) {
393    
394     return ({type => END_OF_FILE_TOKEN,
395     line => $self->{line}, column => $self->{column}});
396     last A; ## TODO: ok?
397     } else {
398    
399     #
400     }
401    
402     # Anything else
403     my $token = {type => CHARACTER_TOKEN,
404     data => chr $self->{nc},
405     line => $self->{line}, column => $self->{column},
406     };
407     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
408    
409     ## Stay in the state.
410    
411     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
412     $self->{line_prev} = $self->{line};
413     $self->{column_prev} = $self->{column};
414     $self->{column}++;
415     $self->{nc}
416     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
417     } else {
418     $self->{set_nc}->($self);
419     }
420    
421     return ($token);
422     redo A;
423     } elsif ($self->{state} == DATA_STATE) {
424     $self->{s_kwd} = '' unless defined $self->{s_kwd};
425     if ($self->{nc} == 0x0026) { # &
426     $self->{s_kwd} = '';
427     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
428     not $self->{escape}) {
429    
430     ## NOTE: In the spec, the tokenizer is switched to the
431     ## "entity data state". In this implementation, the tokenizer
432     ## is switched to the |ENTITY_STATE|, which is an implementation
433     ## of the "consume a character reference" algorithm.
434     $self->{entity_add} = -1;
435     $self->{prev_state} = DATA_STATE;
436     $self->{state} = ENTITY_STATE;
437    
438     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
439     $self->{line_prev} = $self->{line};
440     $self->{column_prev} = $self->{column};
441     $self->{column}++;
442     $self->{nc}
443     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
444     } else {
445     $self->{set_nc}->($self);
446     }
447    
448     redo A;
449     } else {
450    
451     #
452     }
453     } elsif ($self->{nc} == 0x002D) { # -
454     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
455 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
456 wakaba 1.1
457     $self->{escape} = 1; # unless $self->{escape};
458     $self->{s_kwd} = '--';
459     #
460 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
461 wakaba 1.1
462     $self->{s_kwd} = '--';
463     #
464 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
465    
466     $self->{s_kwd} .= '-';
467     #
468 wakaba 1.1 } else {
469    
470 wakaba 1.5 $self->{s_kwd} = '-';
471 wakaba 1.1 #
472     }
473     }
474    
475     #
476     } elsif ($self->{nc} == 0x0021) { # !
477     if (length $self->{s_kwd}) {
478    
479     $self->{s_kwd} .= '!';
480     #
481     } else {
482    
483     #$self->{s_kwd} = '';
484     #
485     }
486     #
487     } elsif ($self->{nc} == 0x003C) { # <
488     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
489     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
490     not $self->{escape})) {
491    
492     $self->{state} = TAG_OPEN_STATE;
493    
494     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
495     $self->{line_prev} = $self->{line};
496     $self->{column_prev} = $self->{column};
497     $self->{column}++;
498     $self->{nc}
499     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
500     } else {
501     $self->{set_nc}->($self);
502     }
503    
504     redo A;
505     } else {
506    
507     $self->{s_kwd} = '';
508     #
509     }
510     } elsif ($self->{nc} == 0x003E) { # >
511     if ($self->{escape} and
512     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
513     if ($self->{s_kwd} eq '--') {
514    
515     delete $self->{escape};
516 wakaba 1.5 #
517 wakaba 1.1 } else {
518    
519 wakaba 1.5 #
520 wakaba 1.1 }
521 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
522    
523     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
524     line => $self->{line_prev},
525     column => $self->{column_prev} - 1);
526     #
527 wakaba 1.1 } else {
528    
529 wakaba 1.5 #
530 wakaba 1.1 }
531    
532     $self->{s_kwd} = '';
533     #
534 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
535     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
536    
537     $self->{s_kwd} .= ']';
538     } elsif ($self->{s_kwd} eq ']]') {
539    
540     #
541     } else {
542    
543     $self->{s_kwd} = '';
544     }
545     #
546 wakaba 1.1 } elsif ($self->{nc} == -1) {
547    
548     $self->{s_kwd} = '';
549     return ({type => END_OF_FILE_TOKEN,
550     line => $self->{line}, column => $self->{column}});
551     last A; ## TODO: ok?
552     } else {
553    
554     $self->{s_kwd} = '';
555     #
556     }
557    
558     # Anything else
559     my $token = {type => CHARACTER_TOKEN,
560     data => chr $self->{nc},
561     line => $self->{line}, column => $self->{column},
562     };
563 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
564 wakaba 1.1 length $token->{data})) {
565     $self->{s_kwd} = '';
566     }
567    
568     ## Stay in the data state.
569 wakaba 1.5 if (not $self->{is_xml} and
570     $self->{content_model} == PCDATA_CONTENT_MODEL) {
571 wakaba 1.1
572     $self->{state} = PCDATA_STATE;
573     } else {
574    
575     ## Stay in the state.
576     }
577    
578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
579     $self->{line_prev} = $self->{line};
580     $self->{column_prev} = $self->{column};
581     $self->{column}++;
582     $self->{nc}
583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
584     } else {
585     $self->{set_nc}->($self);
586     }
587    
588     return ($token);
589     redo A;
590     } elsif ($self->{state} == TAG_OPEN_STATE) {
591 wakaba 1.10 ## XML5: "tag state".
592    
593 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
594     if ($self->{nc} == 0x002F) { # /
595    
596    
597     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
598     $self->{line_prev} = $self->{line};
599     $self->{column_prev} = $self->{column};
600     $self->{column}++;
601     $self->{nc}
602     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
603     } else {
604     $self->{set_nc}->($self);
605     }
606    
607     $self->{state} = CLOSE_TAG_OPEN_STATE;
608     redo A;
609     } elsif ($self->{nc} == 0x0021) { # !
610    
611 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
612 wakaba 1.1 #
613     } else {
614    
615 wakaba 1.12 $self->{s_kwd} = '';
616 wakaba 1.1 #
617     }
618    
619     ## reconsume
620     $self->{state} = DATA_STATE;
621     return ({type => CHARACTER_TOKEN, data => '<',
622     line => $self->{line_prev},
623     column => $self->{column_prev},
624     });
625     redo A;
626     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
627     if ($self->{nc} == 0x0021) { # !
628    
629     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
630    
631     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
632     $self->{line_prev} = $self->{line};
633     $self->{column_prev} = $self->{column};
634     $self->{column}++;
635     $self->{nc}
636     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
637     } else {
638     $self->{set_nc}->($self);
639     }
640    
641     redo A;
642     } elsif ($self->{nc} == 0x002F) { # /
643    
644     $self->{state} = CLOSE_TAG_OPEN_STATE;
645    
646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
647     $self->{line_prev} = $self->{line};
648     $self->{column_prev} = $self->{column};
649     $self->{column}++;
650     $self->{nc}
651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
652     } else {
653     $self->{set_nc}->($self);
654     }
655    
656     redo A;
657     } elsif (0x0041 <= $self->{nc} and
658     $self->{nc} <= 0x005A) { # A..Z
659    
660     $self->{ct}
661     = {type => START_TAG_TOKEN,
662 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
663 wakaba 1.1 line => $self->{line_prev},
664     column => $self->{column_prev}};
665     $self->{state} = TAG_NAME_STATE;
666    
667     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
668     $self->{line_prev} = $self->{line};
669     $self->{column_prev} = $self->{column};
670     $self->{column}++;
671     $self->{nc}
672     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
673     } else {
674     $self->{set_nc}->($self);
675     }
676    
677     redo A;
678     } elsif (0x0061 <= $self->{nc} and
679     $self->{nc} <= 0x007A) { # a..z
680    
681     $self->{ct} = {type => START_TAG_TOKEN,
682     tag_name => chr ($self->{nc}),
683     line => $self->{line_prev},
684     column => $self->{column_prev}};
685     $self->{state} = TAG_NAME_STATE;
686    
687     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
688     $self->{line_prev} = $self->{line};
689     $self->{column_prev} = $self->{column};
690     $self->{column}++;
691     $self->{nc}
692     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
693     } else {
694     $self->{set_nc}->($self);
695     }
696    
697     redo A;
698     } elsif ($self->{nc} == 0x003E) { # >
699    
700     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
701     line => $self->{line_prev},
702     column => $self->{column_prev});
703     $self->{state} = DATA_STATE;
704 wakaba 1.5 $self->{s_kwd} = '';
705 wakaba 1.1
706     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
707     $self->{line_prev} = $self->{line};
708     $self->{column_prev} = $self->{column};
709     $self->{column}++;
710     $self->{nc}
711     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
712     } else {
713     $self->{set_nc}->($self);
714     }
715    
716    
717     return ({type => CHARACTER_TOKEN, data => '<>',
718     line => $self->{line_prev},
719     column => $self->{column_prev},
720     });
721    
722     redo A;
723     } elsif ($self->{nc} == 0x003F) { # ?
724 wakaba 1.8 if ($self->{is_xml}) {
725    
726     $self->{state} = PI_STATE;
727    
728     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
729     $self->{line_prev} = $self->{line};
730     $self->{column_prev} = $self->{column};
731     $self->{column}++;
732     $self->{nc}
733     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
734     } else {
735     $self->{set_nc}->($self);
736     }
737    
738     redo A;
739     } else {
740    
741     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
742     line => $self->{line_prev},
743     column => $self->{column_prev});
744     $self->{state} = BOGUS_COMMENT_STATE;
745     $self->{ct} = {type => COMMENT_TOKEN, data => '',
746     line => $self->{line_prev},
747     column => $self->{column_prev},
748     };
749     ## $self->{nc} is intentionally left as is
750     redo A;
751     }
752 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
753 wakaba 1.1
754     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
755     line => $self->{line_prev},
756     column => $self->{column_prev});
757     $self->{state} = DATA_STATE;
758 wakaba 1.5 $self->{s_kwd} = '';
759 wakaba 1.1 ## reconsume
760    
761     return ({type => CHARACTER_TOKEN, data => '<',
762     line => $self->{line_prev},
763     column => $self->{column_prev},
764     });
765    
766     redo A;
767 wakaba 1.9 } else {
768     ## XML5: "<:" is a parse error.
769    
770     $self->{ct} = {type => START_TAG_TOKEN,
771     tag_name => chr ($self->{nc}),
772     line => $self->{line_prev},
773     column => $self->{column_prev}};
774     $self->{state} = TAG_NAME_STATE;
775    
776     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
777     $self->{line_prev} = $self->{line};
778     $self->{column_prev} = $self->{column};
779     $self->{column}++;
780     $self->{nc}
781     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
782     } else {
783     $self->{set_nc}->($self);
784     }
785    
786     redo A;
787 wakaba 1.1 }
788     } else {
789     die "$0: $self->{content_model} in tag open";
790     }
791     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
792     ## NOTE: The "close tag open state" in the spec is implemented as
793     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
794    
795 wakaba 1.10 ## XML5: "end tag state".
796    
797 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
798     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
799     if (defined $self->{last_stag_name}) {
800     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
801 wakaba 1.12 $self->{kwd} = '';
802 wakaba 1.1 ## Reconsume.
803     redo A;
804     } else {
805     ## No start tag token has ever been emitted
806     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
807    
808     $self->{state} = DATA_STATE;
809 wakaba 1.5 $self->{s_kwd} = '';
810 wakaba 1.1 ## Reconsume.
811     return ({type => CHARACTER_TOKEN, data => '</',
812     line => $l, column => $c,
813     });
814     redo A;
815     }
816     }
817    
818     if (0x0041 <= $self->{nc} and
819     $self->{nc} <= 0x005A) { # A..Z
820    
821     $self->{ct}
822     = {type => END_TAG_TOKEN,
823 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
824 wakaba 1.1 line => $l, column => $c};
825     $self->{state} = TAG_NAME_STATE;
826    
827     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
828     $self->{line_prev} = $self->{line};
829     $self->{column_prev} = $self->{column};
830     $self->{column}++;
831     $self->{nc}
832     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
833     } else {
834     $self->{set_nc}->($self);
835     }
836    
837     redo A;
838     } elsif (0x0061 <= $self->{nc} and
839     $self->{nc} <= 0x007A) { # a..z
840    
841     $self->{ct} = {type => END_TAG_TOKEN,
842     tag_name => chr ($self->{nc}),
843     line => $l, column => $c};
844     $self->{state} = TAG_NAME_STATE;
845    
846     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
847     $self->{line_prev} = $self->{line};
848     $self->{column_prev} = $self->{column};
849     $self->{column}++;
850     $self->{nc}
851     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
852     } else {
853     $self->{set_nc}->($self);
854     }
855    
856     redo A;
857     } elsif ($self->{nc} == 0x003E) { # >
858     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
859     line => $self->{line_prev}, ## "<" in "</>"
860     column => $self->{column_prev} - 1);
861     $self->{state} = DATA_STATE;
862 wakaba 1.5 $self->{s_kwd} = '';
863 wakaba 1.10 if ($self->{is_xml}) {
864    
865     ## XML5: No parse error.
866    
867     ## NOTE: This parser raises a parse error, since it supports
868     ## XML1, not XML5.
869    
870     ## NOTE: A short end tag token.
871     my $ct = {type => END_TAG_TOKEN,
872     tag_name => '',
873     line => $self->{line_prev},
874     column => $self->{column_prev} - 1,
875     };
876    
877     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
878     $self->{line_prev} = $self->{line};
879     $self->{column_prev} = $self->{column};
880     $self->{column}++;
881     $self->{nc}
882     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
883     } else {
884     $self->{set_nc}->($self);
885     }
886    
887     return ($ct);
888     } else {
889    
890    
891 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
892     $self->{line_prev} = $self->{line};
893     $self->{column_prev} = $self->{column};
894     $self->{column}++;
895     $self->{nc}
896     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
897     } else {
898     $self->{set_nc}->($self);
899     }
900    
901 wakaba 1.10 }
902 wakaba 1.1 redo A;
903     } elsif ($self->{nc} == -1) {
904    
905     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
906 wakaba 1.5 $self->{s_kwd} = '';
907 wakaba 1.1 $self->{state} = DATA_STATE;
908     # reconsume
909    
910     return ({type => CHARACTER_TOKEN, data => '</',
911     line => $l, column => $c,
912     });
913    
914     redo A;
915 wakaba 1.10 } elsif (not $self->{is_xml} or
916     $is_space->{$self->{nc}}) {
917 wakaba 1.1
918 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
919     line => $self->{line_prev}, # "<" of "</"
920     column => $self->{column_prev} - 1);
921 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
922     $self->{ct} = {type => COMMENT_TOKEN, data => '',
923     line => $self->{line_prev}, # "<" of "</"
924     column => $self->{column_prev} - 1,
925     };
926     ## NOTE: $self->{nc} is intentionally left as is.
927     ## Although the "anything else" case of the spec does not explicitly
928     ## state that the next input character is to be reconsumed,
929     ## it will be included in the |data| of the comment token
930     ## generated from the bogus end tag, as defined in the
931     ## "bogus comment state" entry.
932     redo A;
933 wakaba 1.10 } else {
934     ## XML5: "</:" is a parse error.
935    
936     $self->{ct} = {type => END_TAG_TOKEN,
937     tag_name => chr ($self->{nc}),
938     line => $l, column => $c};
939     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
940    
941     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
942     $self->{line_prev} = $self->{line};
943     $self->{column_prev} = $self->{column};
944     $self->{column}++;
945     $self->{nc}
946     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
947     } else {
948     $self->{set_nc}->($self);
949     }
950    
951     redo A;
952 wakaba 1.1 }
953     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
954 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
955 wakaba 1.1 if (length $ch) {
956     my $CH = $ch;
957     $ch =~ tr/a-z/A-Z/;
958     my $nch = chr $self->{nc};
959     if ($nch eq $ch or $nch eq $CH) {
960    
961     ## Stay in the state.
962 wakaba 1.12 $self->{kwd} .= $nch;
963 wakaba 1.1
964     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
965     $self->{line_prev} = $self->{line};
966     $self->{column_prev} = $self->{column};
967     $self->{column}++;
968     $self->{nc}
969     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
970     } else {
971     $self->{set_nc}->($self);
972     }
973    
974     redo A;
975     } else {
976    
977     $self->{state} = DATA_STATE;
978 wakaba 1.5 $self->{s_kwd} = '';
979 wakaba 1.1 ## Reconsume.
980     return ({type => CHARACTER_TOKEN,
981 wakaba 1.12 data => '</' . $self->{kwd},
982 wakaba 1.1 line => $self->{line_prev},
983 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
984 wakaba 1.1 });
985     redo A;
986     }
987     } else { # after "</{tag-name}"
988     unless ($is_space->{$self->{nc}} or
989     {
990     0x003E => 1, # >
991     0x002F => 1, # /
992     -1 => 1, # EOF
993     }->{$self->{nc}}) {
994    
995     ## Reconsume.
996     $self->{state} = DATA_STATE;
997 wakaba 1.5 $self->{s_kwd} = '';
998 wakaba 1.1 return ({type => CHARACTER_TOKEN,
999 wakaba 1.12 data => '</' . $self->{kwd},
1000 wakaba 1.1 line => $self->{line_prev},
1001 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1002 wakaba 1.1 });
1003     redo A;
1004     } else {
1005    
1006     $self->{ct}
1007     = {type => END_TAG_TOKEN,
1008     tag_name => $self->{last_stag_name},
1009     line => $self->{line_prev},
1010 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1011 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1012     ## Reconsume.
1013     redo A;
1014     }
1015     }
1016     } elsif ($self->{state} == TAG_NAME_STATE) {
1017     if ($is_space->{$self->{nc}}) {
1018    
1019     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1020    
1021     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1022     $self->{line_prev} = $self->{line};
1023     $self->{column_prev} = $self->{column};
1024     $self->{column}++;
1025     $self->{nc}
1026     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1027     } else {
1028     $self->{set_nc}->($self);
1029     }
1030    
1031     redo A;
1032     } elsif ($self->{nc} == 0x003E) { # >
1033     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1034    
1035     $self->{last_stag_name} = $self->{ct}->{tag_name};
1036     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1037     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1038     #if ($self->{ct}->{attributes}) {
1039     # ## NOTE: This should never be reached.
1040     # !!! cp (36);
1041     # !!! parse-error (type => 'end tag attribute');
1042     #} else {
1043    
1044     #}
1045     } else {
1046     die "$0: $self->{ct}->{type}: Unknown token type";
1047     }
1048     $self->{state} = DATA_STATE;
1049 wakaba 1.5 $self->{s_kwd} = '';
1050 wakaba 1.1
1051     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1052     $self->{line_prev} = $self->{line};
1053     $self->{column_prev} = $self->{column};
1054     $self->{column}++;
1055     $self->{nc}
1056     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1057     } else {
1058     $self->{set_nc}->($self);
1059     }
1060    
1061    
1062     return ($self->{ct}); # start tag or end tag
1063    
1064     redo A;
1065     } elsif (0x0041 <= $self->{nc} and
1066     $self->{nc} <= 0x005A) { # A..Z
1067    
1068 wakaba 1.4 $self->{ct}->{tag_name}
1069     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1070 wakaba 1.1 # start tag or end tag
1071     ## Stay in this state
1072    
1073     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1074     $self->{line_prev} = $self->{line};
1075     $self->{column_prev} = $self->{column};
1076     $self->{column}++;
1077     $self->{nc}
1078     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1079     } else {
1080     $self->{set_nc}->($self);
1081     }
1082    
1083     redo A;
1084     } elsif ($self->{nc} == -1) {
1085     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1086     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1087    
1088     $self->{last_stag_name} = $self->{ct}->{tag_name};
1089     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1090     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1091     #if ($self->{ct}->{attributes}) {
1092     # ## NOTE: This state should never be reached.
1093     # !!! cp (40);
1094     # !!! parse-error (type => 'end tag attribute');
1095     #} else {
1096    
1097     #}
1098     } else {
1099     die "$0: $self->{ct}->{type}: Unknown token type";
1100     }
1101     $self->{state} = DATA_STATE;
1102 wakaba 1.5 $self->{s_kwd} = '';
1103 wakaba 1.1 # reconsume
1104    
1105 wakaba 1.33 ## Discard the token.
1106     #return ($self->{ct}); # start tag or end tag
1107 wakaba 1.1
1108     redo A;
1109     } elsif ($self->{nc} == 0x002F) { # /
1110    
1111     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1112    
1113     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1114     $self->{line_prev} = $self->{line};
1115     $self->{column_prev} = $self->{column};
1116     $self->{column}++;
1117     $self->{nc}
1118     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1119     } else {
1120     $self->{set_nc}->($self);
1121     }
1122    
1123     redo A;
1124     } else {
1125    
1126     $self->{ct}->{tag_name} .= chr $self->{nc};
1127     # start tag or end tag
1128     ## Stay in the state
1129    
1130     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1131     $self->{line_prev} = $self->{line};
1132     $self->{column_prev} = $self->{column};
1133     $self->{column}++;
1134     $self->{nc}
1135     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1136     } else {
1137     $self->{set_nc}->($self);
1138     }
1139    
1140     redo A;
1141     }
1142     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1143 wakaba 1.11 ## XML5: "Tag attribute name before state".
1144    
1145 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1146    
1147     ## Stay in the state
1148    
1149     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1150     $self->{line_prev} = $self->{line};
1151     $self->{column_prev} = $self->{column};
1152     $self->{column}++;
1153     $self->{nc}
1154     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1155     } else {
1156     $self->{set_nc}->($self);
1157     }
1158    
1159     redo A;
1160     } elsif ($self->{nc} == 0x003E) { # >
1161     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1162    
1163     $self->{last_stag_name} = $self->{ct}->{tag_name};
1164     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1165     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1166     if ($self->{ct}->{attributes}) {
1167    
1168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1169     } else {
1170    
1171     }
1172     } else {
1173     die "$0: $self->{ct}->{type}: Unknown token type";
1174     }
1175     $self->{state} = DATA_STATE;
1176 wakaba 1.5 $self->{s_kwd} = '';
1177 wakaba 1.1
1178     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1179     $self->{line_prev} = $self->{line};
1180     $self->{column_prev} = $self->{column};
1181     $self->{column}++;
1182     $self->{nc}
1183     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1184     } else {
1185     $self->{set_nc}->($self);
1186     }
1187    
1188    
1189     return ($self->{ct}); # start tag or end tag
1190    
1191     redo A;
1192     } elsif (0x0041 <= $self->{nc} and
1193     $self->{nc} <= 0x005A) { # A..Z
1194    
1195     $self->{ca}
1196 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1197 wakaba 1.1 value => '',
1198     line => $self->{line}, column => $self->{column}};
1199     $self->{state} = ATTRIBUTE_NAME_STATE;
1200    
1201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1202     $self->{line_prev} = $self->{line};
1203     $self->{column_prev} = $self->{column};
1204     $self->{column}++;
1205     $self->{nc}
1206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1207     } else {
1208     $self->{set_nc}->($self);
1209     }
1210    
1211     redo A;
1212     } elsif ($self->{nc} == 0x002F) { # /
1213    
1214     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1215    
1216     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1217     $self->{line_prev} = $self->{line};
1218     $self->{column_prev} = $self->{column};
1219     $self->{column}++;
1220     $self->{nc}
1221     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1222     } else {
1223     $self->{set_nc}->($self);
1224     }
1225    
1226     redo A;
1227     } elsif ($self->{nc} == -1) {
1228     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1229     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1230    
1231     $self->{last_stag_name} = $self->{ct}->{tag_name};
1232     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1233     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1234     if ($self->{ct}->{attributes}) {
1235    
1236     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1237     } else {
1238    
1239     }
1240     } else {
1241     die "$0: $self->{ct}->{type}: Unknown token type";
1242     }
1243     $self->{state} = DATA_STATE;
1244 wakaba 1.5 $self->{s_kwd} = '';
1245 wakaba 1.1 # reconsume
1246    
1247 wakaba 1.33 ## Discard the token.
1248     #return ($self->{ct}); # start tag or end tag
1249 wakaba 1.1
1250     redo A;
1251     } else {
1252     if ({
1253     0x0022 => 1, # "
1254     0x0027 => 1, # '
1255 wakaba 1.30 0x003C => 1, # <
1256 wakaba 1.1 0x003D => 1, # =
1257     }->{$self->{nc}}) {
1258    
1259 wakaba 1.11 ## XML5: Not a parse error.
1260 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1261     } else {
1262    
1263 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1264 wakaba 1.1 }
1265     $self->{ca}
1266     = {name => chr ($self->{nc}),
1267     value => '',
1268     line => $self->{line}, column => $self->{column}};
1269     $self->{state} = ATTRIBUTE_NAME_STATE;
1270    
1271     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1272     $self->{line_prev} = $self->{line};
1273     $self->{column_prev} = $self->{column};
1274     $self->{column}++;
1275     $self->{nc}
1276     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1277     } else {
1278     $self->{set_nc}->($self);
1279     }
1280    
1281     redo A;
1282     }
1283     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1284 wakaba 1.11 ## XML5: "Tag attribute name state".
1285    
1286 wakaba 1.1 my $before_leave = sub {
1287     if (exists $self->{ct}->{attributes} # start tag or end tag
1288     ->{$self->{ca}->{name}}) { # MUST
1289    
1290     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1291     ## Discard $self->{ca} # MUST
1292     } else {
1293    
1294     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1295     = $self->{ca};
1296 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1297 wakaba 1.1 }
1298     }; # $before_leave
1299    
1300     if ($is_space->{$self->{nc}}) {
1301    
1302     $before_leave->();
1303     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1304    
1305     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1306     $self->{line_prev} = $self->{line};
1307     $self->{column_prev} = $self->{column};
1308     $self->{column}++;
1309     $self->{nc}
1310     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1311     } else {
1312     $self->{set_nc}->($self);
1313     }
1314    
1315     redo A;
1316     } elsif ($self->{nc} == 0x003D) { # =
1317    
1318     $before_leave->();
1319     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1320    
1321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1322     $self->{line_prev} = $self->{line};
1323     $self->{column_prev} = $self->{column};
1324     $self->{column}++;
1325     $self->{nc}
1326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1327     } else {
1328     $self->{set_nc}->($self);
1329     }
1330    
1331     redo A;
1332     } elsif ($self->{nc} == 0x003E) { # >
1333 wakaba 1.11 if ($self->{is_xml}) {
1334    
1335     ## XML5: Not a parse error.
1336     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1337     } else {
1338    
1339     }
1340    
1341 wakaba 1.1 $before_leave->();
1342     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1343    
1344     $self->{last_stag_name} = $self->{ct}->{tag_name};
1345     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1346    
1347     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1348     if ($self->{ct}->{attributes}) {
1349     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1350     }
1351     } else {
1352     die "$0: $self->{ct}->{type}: Unknown token type";
1353     }
1354     $self->{state} = DATA_STATE;
1355 wakaba 1.5 $self->{s_kwd} = '';
1356 wakaba 1.1
1357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1358     $self->{line_prev} = $self->{line};
1359     $self->{column_prev} = $self->{column};
1360     $self->{column}++;
1361     $self->{nc}
1362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1363     } else {
1364     $self->{set_nc}->($self);
1365     }
1366    
1367    
1368     return ($self->{ct}); # start tag or end tag
1369    
1370     redo A;
1371     } elsif (0x0041 <= $self->{nc} and
1372     $self->{nc} <= 0x005A) { # A..Z
1373    
1374 wakaba 1.4 $self->{ca}->{name}
1375     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1376 wakaba 1.1 ## Stay in the state
1377    
1378     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1379     $self->{line_prev} = $self->{line};
1380     $self->{column_prev} = $self->{column};
1381     $self->{column}++;
1382     $self->{nc}
1383     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1384     } else {
1385     $self->{set_nc}->($self);
1386     }
1387    
1388     redo A;
1389     } elsif ($self->{nc} == 0x002F) { # /
1390 wakaba 1.11 if ($self->{is_xml}) {
1391    
1392     ## XML5: Not a parse error.
1393     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1394     } else {
1395    
1396     }
1397 wakaba 1.1
1398     $before_leave->();
1399     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1400    
1401     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1402     $self->{line_prev} = $self->{line};
1403     $self->{column_prev} = $self->{column};
1404     $self->{column}++;
1405     $self->{nc}
1406     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1407     } else {
1408     $self->{set_nc}->($self);
1409     }
1410    
1411     redo A;
1412     } elsif ($self->{nc} == -1) {
1413     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1414     $before_leave->();
1415     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1416    
1417     $self->{last_stag_name} = $self->{ct}->{tag_name};
1418     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1419     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1420     if ($self->{ct}->{attributes}) {
1421    
1422     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1423     } else {
1424     ## NOTE: This state should never be reached.
1425    
1426     }
1427     } else {
1428     die "$0: $self->{ct}->{type}: Unknown token type";
1429     }
1430     $self->{state} = DATA_STATE;
1431 wakaba 1.5 $self->{s_kwd} = '';
1432 wakaba 1.1 # reconsume
1433    
1434 wakaba 1.33 ## Discard the token.
1435     #return ($self->{ct}); # start tag or end tag
1436 wakaba 1.1
1437     redo A;
1438     } else {
1439 wakaba 1.30 if ({
1440     0x0022 => 1, # "
1441     0x0027 => 1, # '
1442     0x003C => 1, # <
1443     }->{$self->{nc}}) {
1444 wakaba 1.1
1445 wakaba 1.11 ## XML5: Not a parse error.
1446 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1447     } else {
1448    
1449     }
1450     $self->{ca}->{name} .= chr ($self->{nc});
1451     ## Stay in the state
1452    
1453     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1454     $self->{line_prev} = $self->{line};
1455     $self->{column_prev} = $self->{column};
1456     $self->{column}++;
1457     $self->{nc}
1458     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1459     } else {
1460     $self->{set_nc}->($self);
1461     }
1462    
1463     redo A;
1464     }
1465     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1466 wakaba 1.11 ## XML5: "Tag attribute name after state".
1467    
1468 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1469    
1470     ## Stay in the state
1471    
1472     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1473     $self->{line_prev} = $self->{line};
1474     $self->{column_prev} = $self->{column};
1475     $self->{column}++;
1476     $self->{nc}
1477     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1478     } else {
1479     $self->{set_nc}->($self);
1480     }
1481    
1482     redo A;
1483     } elsif ($self->{nc} == 0x003D) { # =
1484    
1485     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1486    
1487     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1488     $self->{line_prev} = $self->{line};
1489     $self->{column_prev} = $self->{column};
1490     $self->{column}++;
1491     $self->{nc}
1492     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1493     } else {
1494     $self->{set_nc}->($self);
1495     }
1496    
1497     redo A;
1498     } elsif ($self->{nc} == 0x003E) { # >
1499 wakaba 1.11 if ($self->{is_xml}) {
1500    
1501     ## XML5: Not a parse error.
1502     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1503     } else {
1504    
1505     }
1506    
1507 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1508    
1509     $self->{last_stag_name} = $self->{ct}->{tag_name};
1510     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1511     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1512     if ($self->{ct}->{attributes}) {
1513    
1514     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1515     } else {
1516     ## NOTE: This state should never be reached.
1517    
1518     }
1519     } else {
1520     die "$0: $self->{ct}->{type}: Unknown token type";
1521     }
1522     $self->{state} = DATA_STATE;
1523 wakaba 1.5 $self->{s_kwd} = '';
1524 wakaba 1.1
1525     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1526     $self->{line_prev} = $self->{line};
1527     $self->{column_prev} = $self->{column};
1528     $self->{column}++;
1529     $self->{nc}
1530     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1531     } else {
1532     $self->{set_nc}->($self);
1533     }
1534    
1535    
1536     return ($self->{ct}); # start tag or end tag
1537    
1538     redo A;
1539     } elsif (0x0041 <= $self->{nc} and
1540     $self->{nc} <= 0x005A) { # A..Z
1541    
1542     $self->{ca}
1543 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1544 wakaba 1.1 value => '',
1545     line => $self->{line}, column => $self->{column}};
1546     $self->{state} = ATTRIBUTE_NAME_STATE;
1547    
1548     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1549     $self->{line_prev} = $self->{line};
1550     $self->{column_prev} = $self->{column};
1551     $self->{column}++;
1552     $self->{nc}
1553     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1554     } else {
1555     $self->{set_nc}->($self);
1556     }
1557    
1558     redo A;
1559     } elsif ($self->{nc} == 0x002F) { # /
1560 wakaba 1.11 if ($self->{is_xml}) {
1561    
1562     ## XML5: Not a parse error.
1563     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1564     } else {
1565    
1566     }
1567 wakaba 1.1
1568     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1569    
1570     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1571     $self->{line_prev} = $self->{line};
1572     $self->{column_prev} = $self->{column};
1573     $self->{column}++;
1574     $self->{nc}
1575     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1576     } else {
1577     $self->{set_nc}->($self);
1578     }
1579    
1580     redo A;
1581     } elsif ($self->{nc} == -1) {
1582     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1583     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1584    
1585     $self->{last_stag_name} = $self->{ct}->{tag_name};
1586     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1587     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1588     if ($self->{ct}->{attributes}) {
1589    
1590     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1591     } else {
1592     ## NOTE: This state should never be reached.
1593    
1594     }
1595     } else {
1596     die "$0: $self->{ct}->{type}: Unknown token type";
1597     }
1598 wakaba 1.5 $self->{s_kwd} = '';
1599 wakaba 1.1 $self->{state} = DATA_STATE;
1600     # reconsume
1601    
1602 wakaba 1.33 ## Discard the token.
1603     #return ($self->{ct}); # start tag or end tag
1604 wakaba 1.1
1605     redo A;
1606     } else {
1607 wakaba 1.11 if ($self->{is_xml}) {
1608    
1609     ## XML5: Not a parse error.
1610     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1611     } else {
1612    
1613     }
1614    
1615 wakaba 1.30 if ({
1616     0x0022 => 1, # "
1617     0x0027 => 1, # '
1618     0x003C => 1, # <
1619     }->{$self->{nc}}) {
1620 wakaba 1.1
1621 wakaba 1.11 ## XML5: Not a parse error.
1622 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1623     } else {
1624    
1625     }
1626     $self->{ca}
1627     = {name => chr ($self->{nc}),
1628     value => '',
1629     line => $self->{line}, column => $self->{column}};
1630     $self->{state} = ATTRIBUTE_NAME_STATE;
1631    
1632     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1633     $self->{line_prev} = $self->{line};
1634     $self->{column_prev} = $self->{column};
1635     $self->{column}++;
1636     $self->{nc}
1637     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1638     } else {
1639     $self->{set_nc}->($self);
1640     }
1641    
1642     redo A;
1643     }
1644     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1645 wakaba 1.11 ## XML5: "Tag attribute value before state".
1646    
1647 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1648    
1649     ## Stay in the state
1650    
1651     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1652     $self->{line_prev} = $self->{line};
1653     $self->{column_prev} = $self->{column};
1654     $self->{column}++;
1655     $self->{nc}
1656     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1657     } else {
1658     $self->{set_nc}->($self);
1659     }
1660    
1661     redo A;
1662     } elsif ($self->{nc} == 0x0022) { # "
1663    
1664     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1665    
1666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1667     $self->{line_prev} = $self->{line};
1668     $self->{column_prev} = $self->{column};
1669     $self->{column}++;
1670     $self->{nc}
1671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1672     } else {
1673     $self->{set_nc}->($self);
1674     }
1675    
1676     redo A;
1677     } elsif ($self->{nc} == 0x0026) { # &
1678    
1679     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1680     ## reconsume
1681     redo A;
1682     } elsif ($self->{nc} == 0x0027) { # '
1683    
1684     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1685    
1686     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1687     $self->{line_prev} = $self->{line};
1688     $self->{column_prev} = $self->{column};
1689     $self->{column}++;
1690     $self->{nc}
1691     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1692     } else {
1693     $self->{set_nc}->($self);
1694     }
1695    
1696     redo A;
1697     } elsif ($self->{nc} == 0x003E) { # >
1698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1699     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1700    
1701     $self->{last_stag_name} = $self->{ct}->{tag_name};
1702     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1703     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1704     if ($self->{ct}->{attributes}) {
1705    
1706     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1707     } else {
1708     ## NOTE: This state should never be reached.
1709    
1710     }
1711     } else {
1712     die "$0: $self->{ct}->{type}: Unknown token type";
1713     }
1714     $self->{state} = DATA_STATE;
1715 wakaba 1.5 $self->{s_kwd} = '';
1716 wakaba 1.1
1717     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1718     $self->{line_prev} = $self->{line};
1719     $self->{column_prev} = $self->{column};
1720     $self->{column}++;
1721     $self->{nc}
1722     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1723     } else {
1724     $self->{set_nc}->($self);
1725     }
1726    
1727    
1728     return ($self->{ct}); # start tag or end tag
1729    
1730     redo A;
1731     } elsif ($self->{nc} == -1) {
1732     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1733     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1734    
1735     $self->{last_stag_name} = $self->{ct}->{tag_name};
1736     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1737     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1738     if ($self->{ct}->{attributes}) {
1739    
1740     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1741     } else {
1742     ## NOTE: This state should never be reached.
1743    
1744     }
1745     } else {
1746     die "$0: $self->{ct}->{type}: Unknown token type";
1747     }
1748     $self->{state} = DATA_STATE;
1749 wakaba 1.5 $self->{s_kwd} = '';
1750 wakaba 1.1 ## reconsume
1751    
1752 wakaba 1.33 ## Discard the token.
1753     #return ($self->{ct}); # start tag or end tag
1754 wakaba 1.1
1755     redo A;
1756     } else {
1757 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1758 wakaba 1.1
1759 wakaba 1.11 ## XML5: Not a parse error.
1760 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1761 wakaba 1.11 } elsif ($self->{is_xml}) {
1762    
1763     ## XML5: No parse error.
1764     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1765 wakaba 1.1 } else {
1766    
1767     }
1768     $self->{ca}->{value} .= chr ($self->{nc});
1769     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1770    
1771     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1772     $self->{line_prev} = $self->{line};
1773     $self->{column_prev} = $self->{column};
1774     $self->{column}++;
1775     $self->{nc}
1776     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1777     } else {
1778     $self->{set_nc}->($self);
1779     }
1780    
1781     redo A;
1782     }
1783     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1784 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1785     ## ATTLIST attribute value double quoted state".
1786 wakaba 1.11
1787 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1788 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1789    
1790     ## XML5: "DOCTYPE ATTLIST name after state".
1791     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1792     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1793     } else {
1794    
1795     ## XML5: "Tag attribute name before state".
1796     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1797     }
1798 wakaba 1.1
1799     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1800     $self->{line_prev} = $self->{line};
1801     $self->{column_prev} = $self->{column};
1802     $self->{column}++;
1803     $self->{nc}
1804     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1805     } else {
1806     $self->{set_nc}->($self);
1807     }
1808    
1809     redo A;
1810     } elsif ($self->{nc} == 0x0026) { # &
1811    
1812 wakaba 1.11 ## XML5: Not defined yet.
1813    
1814 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1815     ## "entity in attribute value state". In this implementation, the
1816     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1817     ## implementation of the "consume a character reference" algorithm.
1818     $self->{prev_state} = $self->{state};
1819     $self->{entity_add} = 0x0022; # "
1820     $self->{state} = ENTITY_STATE;
1821    
1822     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1823     $self->{line_prev} = $self->{line};
1824     $self->{column_prev} = $self->{column};
1825     $self->{column}++;
1826     $self->{nc}
1827     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1828     } else {
1829     $self->{set_nc}->($self);
1830     }
1831    
1832     redo A;
1833 wakaba 1.25 } elsif ($self->{is_xml} and
1834     $is_space->{$self->{nc}}) {
1835    
1836     $self->{ca}->{value} .= ' ';
1837     ## Stay in the state.
1838    
1839     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1840     $self->{line_prev} = $self->{line};
1841     $self->{column_prev} = $self->{column};
1842     $self->{column}++;
1843     $self->{nc}
1844     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1845     } else {
1846     $self->{set_nc}->($self);
1847     }
1848    
1849     redo A;
1850 wakaba 1.1 } elsif ($self->{nc} == -1) {
1851     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1852     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1853    
1854     $self->{last_stag_name} = $self->{ct}->{tag_name};
1855 wakaba 1.15
1856     $self->{state} = DATA_STATE;
1857     $self->{s_kwd} = '';
1858     ## reconsume
1859     ## Discard the token: an unclosed start tag at EOF is not emitted (HTML5 revision 2990), as in the single-quoted and unquoted states.
1860     redo A;
1861 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1862     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1863     if ($self->{ct}->{attributes}) {
1864    
1865     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1866     } else {
1867     ## NOTE: This state should never be reached.
1868    
1869     }
1870 wakaba 1.15
1871     $self->{state} = DATA_STATE;
1872     $self->{s_kwd} = '';
1873     ## reconsume
1874 wakaba 1.33
1875     ## Discard the token.
1876     #return ($self->{ct}); # end tag
1877    
1878 wakaba 1.15 redo A;
1879     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1880     ## XML5: No parse error above; not defined yet.
1881     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1882     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1883     ## Reconsume.
1884 wakaba 1.33
1885     ## Discard the token.
1886     #return ($self->{ct}); # ATTLIST
1887    
1888 wakaba 1.15 redo A;
1889 wakaba 1.1 } else {
1890     die "$0: $self->{ct}->{type}: Unknown token type";
1891     }
1892     } else {
1893 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1894 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1895    
1896     ## XML5: Not a parse error.
1897     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1898     } else {
1899    
1900     }
1901 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1902     $self->{read_until}->($self->{ca}->{value},
1903 wakaba 1.25 qq["&<\x09\x0C\x20],
1904 wakaba 1.1 length $self->{ca}->{value});
1905    
1906     ## Stay in the state
1907    
1908     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1909     $self->{line_prev} = $self->{line};
1910     $self->{column_prev} = $self->{column};
1911     $self->{column}++;
1912     $self->{nc}
1913     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1914     } else {
1915     $self->{set_nc}->($self);
1916     }
1917    
1918     redo A;
1919     }
1920     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1921 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1922     ## ATTLIST attribute value single quoted state".
1923 wakaba 1.11
1924 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1925 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1926    
1927     ## XML5: "DOCTYPE ATTLIST name after state".
1928     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1929     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1930     } else {
1931    
1932     ## XML5: "Before attribute name state" (sic).
1933     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1934     }
1935 wakaba 1.1
1936     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1937     $self->{line_prev} = $self->{line};
1938     $self->{column_prev} = $self->{column};
1939     $self->{column}++;
1940     $self->{nc}
1941     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1942     } else {
1943     $self->{set_nc}->($self);
1944     }
1945    
1946     redo A;
1947     } elsif ($self->{nc} == 0x0026) { # &
1948    
1949 wakaba 1.11 ## XML5: Not defined yet.
1950    
1951 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1952     ## "entity in attribute value state". In this implementation, the
1953     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1954     ## implementation of the "consume a character reference" algorithm.
1955     $self->{entity_add} = 0x0027; # '
1956     $self->{prev_state} = $self->{state};
1957     $self->{state} = ENTITY_STATE;
1958    
1959     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1960     $self->{line_prev} = $self->{line};
1961     $self->{column_prev} = $self->{column};
1962     $self->{column}++;
1963     $self->{nc}
1964     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1965     } else {
1966     $self->{set_nc}->($self);
1967     }
1968    
1969     redo A;
1970 wakaba 1.25 } elsif ($self->{is_xml} and
1971     $is_space->{$self->{nc}}) {
1972    
1973     $self->{ca}->{value} .= ' ';
1974     ## Stay in the state.
1975    
1976     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1977     $self->{line_prev} = $self->{line};
1978     $self->{column_prev} = $self->{column};
1979     $self->{column}++;
1980     $self->{nc}
1981     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1982     } else {
1983     $self->{set_nc}->($self);
1984     }
1985    
1986     redo A;
1987 wakaba 1.1 } elsif ($self->{nc} == -1) {
1988     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1989     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1990    
1991     $self->{last_stag_name} = $self->{ct}->{tag_name};
1992 wakaba 1.15
1993     $self->{state} = DATA_STATE;
1994     $self->{s_kwd} = '';
1995     ## reconsume
1996 wakaba 1.33
1997     ## Discard the token.
1998     #return ($self->{ct}); # start tag
1999    
2000 wakaba 1.15 redo A;
2001 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2002     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2003     if ($self->{ct}->{attributes}) {
2004    
2005     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2006     } else {
2007     ## NOTE: This state should never be reached.
2008    
2009     }
2010 wakaba 1.15
2011     $self->{state} = DATA_STATE;
2012     $self->{s_kwd} = '';
2013     ## reconsume
2014 wakaba 1.33
2015     ## Discard the token.
2016     #return ($self->{ct}); # end tag
2017    
2018 wakaba 1.15 redo A;
2019     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2020     ## XML5: No parse error above; not defined yet.
2021     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2022     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2023     ## Reconsume.
2024 wakaba 1.33
2025     ## Discard the token.
2026     #return ($self->{ct}); # ATTLIST
2027    
2028 wakaba 1.15 redo A;
2029 wakaba 1.1 } else {
2030     die "$0: $self->{ct}->{type}: Unknown token type";
2031     }
2032     } else {
2033 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
2034 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2035    
2036     ## XML5: Not a parse error.
2037     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2038     } else {
2039    
2040     }
2041 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
2042     $self->{read_until}->($self->{ca}->{value},
2043 wakaba 1.25 qq['&<\x09\x0C\x20],
2044 wakaba 1.1 length $self->{ca}->{value});
2045    
2046     ## Stay in the state
2047    
2048     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2049     $self->{line_prev} = $self->{line};
2050     $self->{column_prev} = $self->{column};
2051     $self->{column}++;
2052     $self->{nc}
2053     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2054     } else {
2055     $self->{set_nc}->($self);
2056     }
2057    
2058     redo A;
2059     }
2060     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2061 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
2062    
2063 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2064 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2065    
2066     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2067     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2068     } else {
2069    
2070     ## XML5: "Tag attribute name before state".
2071     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2072     }
2073 wakaba 1.1
2074     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2075     $self->{line_prev} = $self->{line};
2076     $self->{column_prev} = $self->{column};
2077     $self->{column}++;
2078     $self->{nc}
2079     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2080     } else {
2081     $self->{set_nc}->($self);
2082     }
2083    
2084     redo A;
2085     } elsif ($self->{nc} == 0x0026) { # &
2086    
2087 wakaba 1.11
2088     ## XML5: Not defined yet.
2089    
2090 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2091     ## "entity in attribute value state". In this implementation, the
2092     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2093     ## implementation of the "consume a character reference" algorithm.
2094     $self->{entity_add} = -1;
2095     $self->{prev_state} = $self->{state};
2096     $self->{state} = ENTITY_STATE;
2097    
2098     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2099     $self->{line_prev} = $self->{line};
2100     $self->{column_prev} = $self->{column};
2101     $self->{column}++;
2102     $self->{nc}
2103     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2104     } else {
2105     $self->{set_nc}->($self);
2106     }
2107    
2108     redo A;
2109     } elsif ($self->{nc} == 0x003E) { # >
2110     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2111    
2112     $self->{last_stag_name} = $self->{ct}->{tag_name};
2113 wakaba 1.15
2114     $self->{state} = DATA_STATE;
2115     $self->{s_kwd} = '';
2116    
2117     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2118     $self->{line_prev} = $self->{line};
2119     $self->{column_prev} = $self->{column};
2120     $self->{column}++;
2121     $self->{nc}
2122     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2123     } else {
2124     $self->{set_nc}->($self);
2125     }
2126    
2127     return ($self->{ct}); # start tag
2128     redo A;
2129 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2130     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2131     if ($self->{ct}->{attributes}) {
2132    
2133     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2134     } else {
2135     ## NOTE: This state should never be reached.
2136    
2137     }
2138 wakaba 1.15
2139     $self->{state} = DATA_STATE;
2140     $self->{s_kwd} = '';
2141    
2142     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2143     $self->{line_prev} = $self->{line};
2144     $self->{column_prev} = $self->{column};
2145     $self->{column}++;
2146     $self->{nc}
2147     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2148     } else {
2149     $self->{set_nc}->($self);
2150     }
2151    
2152     return ($self->{ct}); # end tag
2153     redo A;
2154     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2155     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2156     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2157    
2158 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2159     $self->{line_prev} = $self->{line};
2160     $self->{column_prev} = $self->{column};
2161     $self->{column}++;
2162     $self->{nc}
2163     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2164     } else {
2165     $self->{set_nc}->($self);
2166     }
2167    
2168 wakaba 1.15 return ($self->{ct}); # ATTLIST
2169     redo A;
2170     } else {
2171     die "$0: $self->{ct}->{type}: Unknown token type";
2172     }
2173 wakaba 1.1 } elsif ($self->{nc} == -1) {
2174     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2175    
2176 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2177 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2178 wakaba 1.15
2179     $self->{state} = DATA_STATE;
2180     $self->{s_kwd} = '';
2181     ## reconsume
2182 wakaba 1.33
2183     ## Discard the token.
2184     #return ($self->{ct}); # start tag
2185    
2186 wakaba 1.15 redo A;
2187 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2188 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2189 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2190     if ($self->{ct}->{attributes}) {
2191    
2192     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2193     } else {
2194     ## NOTE: This state should never be reached.
2195    
2196     }
2197 wakaba 1.15
2198     $self->{state} = DATA_STATE;
2199     $self->{s_kwd} = '';
2200     ## reconsume
2201 wakaba 1.33
2202     ## Discard the token.
2203     #return ($self->{ct}); # end tag
2204    
2205 wakaba 1.15 redo A;
2206     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2207     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2208     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2209     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2210     ## Reconsume.
2211 wakaba 1.33
2212     ## Discard the token.
2213     #return ($self->{ct}); # ATTLIST
2214    
2215 wakaba 1.15 redo A;
2216 wakaba 1.1 } else {
2217     die "$0: $self->{ct}->{type}: Unknown token type";
2218     }
2219     } else {
2220     if ({
2221     0x0022 => 1, # "
2222     0x0027 => 1, # '
2223     0x003D => 1, # =
2224 wakaba 1.26 0x003C => 1, # <
2225 wakaba 1.1 }->{$self->{nc}}) {
2226    
2227 wakaba 1.11 ## XML5: Not a parse error.
2228 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2229     } else {
2230    
2231     }
2232     $self->{ca}->{value} .= chr ($self->{nc});
2233     $self->{read_until}->($self->{ca}->{value},
2234 wakaba 1.25 qq["'=& \x09\x0C>],
2235 wakaba 1.1 length $self->{ca}->{value});
2236    
2237     ## Stay in the state
2238    
2239     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2240     $self->{line_prev} = $self->{line};
2241     $self->{column_prev} = $self->{column};
2242     $self->{column}++;
2243     $self->{nc}
2244     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2245     } else {
2246     $self->{set_nc}->($self);
2247     }
2248    
2249     redo A;
2250     }
2251     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2252     if ($is_space->{$self->{nc}}) {
2253    
2254     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2255    
2256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2257     $self->{line_prev} = $self->{line};
2258     $self->{column_prev} = $self->{column};
2259     $self->{column}++;
2260     $self->{nc}
2261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2262     } else {
2263     $self->{set_nc}->($self);
2264     }
2265    
2266     redo A;
2267     } elsif ($self->{nc} == 0x003E) { # >
2268     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2269    
2270     $self->{last_stag_name} = $self->{ct}->{tag_name};
2271     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2272     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2273     if ($self->{ct}->{attributes}) {
2274    
2275     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2276     } else {
2277     ## NOTE: This state should never be reached.
2278    
2279     }
2280     } else {
2281     die "$0: $self->{ct}->{type}: Unknown token type";
2282     }
2283     $self->{state} = DATA_STATE;
2284 wakaba 1.5 $self->{s_kwd} = '';
2285 wakaba 1.1
2286     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2287     $self->{line_prev} = $self->{line};
2288     $self->{column_prev} = $self->{column};
2289     $self->{column}++;
2290     $self->{nc}
2291     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2292     } else {
2293     $self->{set_nc}->($self);
2294     }
2295    
2296    
2297     return ($self->{ct}); # start tag or end tag
2298    
2299     redo A;
2300     } elsif ($self->{nc} == 0x002F) { # /
2301    
2302     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2303    
2304     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2305     $self->{line_prev} = $self->{line};
2306     $self->{column_prev} = $self->{column};
2307     $self->{column}++;
2308     $self->{nc}
2309     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2310     } else {
2311     $self->{set_nc}->($self);
2312     }
2313    
2314     redo A;
2315     } elsif ($self->{nc} == -1) {
2316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2317     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2318    
2319     $self->{last_stag_name} = $self->{ct}->{tag_name};
2320     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2321     if ($self->{ct}->{attributes}) {
2322    
2323     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2324     } else {
2325     ## NOTE: This state should never be reached.
2326    
2327     }
2328     } else {
2329     die "$0: $self->{ct}->{type}: Unknown token type";
2330     }
2331     $self->{state} = DATA_STATE;
2332 wakaba 1.5 $self->{s_kwd} = '';
2333 wakaba 1.1 ## Reconsume.
2334 wakaba 1.33
2335     ## Discard the token.
2336     #return ($self->{ct}); # start tag or end tag
2337    
2338 wakaba 1.1 redo A;
2339     } else {
2340    
2341     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2342     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2343     ## reconsume
2344     redo A;
2345     }
2346     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2347 wakaba 1.11 ## XML5: "Empty tag state".
2348    
2349 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2350     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2351    
2352     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2353     ## TODO: Different type than slash in start tag
2354     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2355     if ($self->{ct}->{attributes}) {
2356    
2357     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2358     } else {
2359    
2360     }
2361     ## TODO: Test |<title></title/>|
2362     } else {
2363    
2364     $self->{self_closing} = 1;
2365     }
2366    
2367     $self->{state} = DATA_STATE;
2368 wakaba 1.5 $self->{s_kwd} = '';
2369 wakaba 1.1
2370     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2371     $self->{line_prev} = $self->{line};
2372     $self->{column_prev} = $self->{column};
2373     $self->{column}++;
2374     $self->{nc}
2375     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2376     } else {
2377     $self->{set_nc}->($self);
2378     }
2379    
2380    
2381     return ($self->{ct}); # start tag or end tag
2382    
2383     redo A;
2384     } elsif ($self->{nc} == -1) {
2385     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2386     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2387    
2388     $self->{last_stag_name} = $self->{ct}->{tag_name};
2389     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2390     if ($self->{ct}->{attributes}) {
2391    
2392     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2393     } else {
2394     ## NOTE: This state should never be reached.
2395    
2396     }
2397     } else {
2398     die "$0: $self->{ct}->{type}: Unknown token type";
2399     }
2400 wakaba 1.11 ## XML5: "Tag attribute name before state".
2401 wakaba 1.1 $self->{state} = DATA_STATE;
2402 wakaba 1.5 $self->{s_kwd} = '';
2403 wakaba 1.1 ## Reconsume.
2404 wakaba 1.33
2405     ## Discard the token.
2406     #return ($self->{ct}); # start tag or end tag
2407    
2408 wakaba 1.1 redo A;
2409     } else {
2410    
2411     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2412     ## TODO: This error type is wrong.
2413     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2414     ## Reconsume.
2415     redo A;
2416     }
2417     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2418 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2419    
2420 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
2421     ## consumes characters one-by-one basis.
2422    
2423     if ($self->{nc} == 0x003E) { # >
2424 wakaba 1.13 if ($self->{in_subset}) {
2425    
2426     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2427     } else {
2428    
2429     $self->{state} = DATA_STATE;
2430     $self->{s_kwd} = '';
2431     }
2432 wakaba 1.1
2433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2434     $self->{line_prev} = $self->{line};
2435     $self->{column_prev} = $self->{column};
2436     $self->{column}++;
2437     $self->{nc}
2438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2439     } else {
2440     $self->{set_nc}->($self);
2441     }
2442    
2443    
2444     return ($self->{ct}); # comment
2445     redo A;
2446     } elsif ($self->{nc} == -1) {
2447 wakaba 1.13 if ($self->{in_subset}) {
2448    
2449     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2450     } else {
2451    
2452     $self->{state} = DATA_STATE;
2453     $self->{s_kwd} = '';
2454     }
2455 wakaba 1.1 ## reconsume
2456    
2457     return ($self->{ct}); # comment
2458     redo A;
2459     } else {
2460    
2461     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2462     $self->{read_until}->($self->{ct}->{data},
2463     q[>],
2464     length $self->{ct}->{data});
2465    
2466     ## Stay in the state.
2467    
2468     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2469     $self->{line_prev} = $self->{line};
2470     $self->{column_prev} = $self->{column};
2471     $self->{column}++;
2472     $self->{nc}
2473     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2474     } else {
2475     $self->{set_nc}->($self);
2476     }
2477    
2478     redo A;
2479     }
2480     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2481 wakaba 1.14 ## XML5: "Markup declaration state".
2482 wakaba 1.1
2483     if ($self->{nc} == 0x002D) { # -
2484    
2485     $self->{state} = MD_HYPHEN_STATE;
2486    
2487     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2488     $self->{line_prev} = $self->{line};
2489     $self->{column_prev} = $self->{column};
2490     $self->{column}++;
2491     $self->{nc}
2492     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2493     } else {
2494     $self->{set_nc}->($self);
2495     }
2496    
2497     redo A;
2498     } elsif ($self->{nc} == 0x0044 or # D
2499     $self->{nc} == 0x0064) { # d
2500     ## ASCII case-insensitive.
2501    
2502     $self->{state} = MD_DOCTYPE_STATE;
2503 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2504 wakaba 1.1
2505     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2506     $self->{line_prev} = $self->{line};
2507     $self->{column_prev} = $self->{column};
2508     $self->{column}++;
2509     $self->{nc}
2510     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2511     } else {
2512     $self->{set_nc}->($self);
2513     }
2514    
2515     redo A;
2516 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2517     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2518     $self->{is_xml}) and
2519 wakaba 1.1 $self->{nc} == 0x005B) { # [
2520    
2521     $self->{state} = MD_CDATA_STATE;
2522 wakaba 1.12 $self->{kwd} = '[';
2523 wakaba 1.1
2524     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2525     $self->{line_prev} = $self->{line};
2526     $self->{column_prev} = $self->{column};
2527     $self->{column}++;
2528     $self->{nc}
2529     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2530     } else {
2531     $self->{set_nc}->($self);
2532     }
2533    
2534     redo A;
2535     } else {
2536    
2537     }
2538    
2539     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2540     line => $self->{line_prev},
2541     column => $self->{column_prev} - 1);
2542     ## Reconsume.
2543     $self->{state} = BOGUS_COMMENT_STATE;
2544     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2545     line => $self->{line_prev},
2546     column => $self->{column_prev} - 1,
2547     };
2548     redo A;
2549     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2550     if ($self->{nc} == 0x002D) { # -
2551    
2552     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2553     line => $self->{line_prev},
2554     column => $self->{column_prev} - 2,
2555     };
2556 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2557 wakaba 1.1
2558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559     $self->{line_prev} = $self->{line};
2560     $self->{column_prev} = $self->{column};
2561     $self->{column}++;
2562     $self->{nc}
2563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564     } else {
2565     $self->{set_nc}->($self);
2566     }
2567    
2568     redo A;
2569     } else {
2570    
2571     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2572     line => $self->{line_prev},
2573     column => $self->{column_prev} - 2);
2574     $self->{state} = BOGUS_COMMENT_STATE;
2575     ## Reconsume.
2576     $self->{ct} = {type => COMMENT_TOKEN,
2577     data => '-',
2578     line => $self->{line_prev},
2579     column => $self->{column_prev} - 2,
2580     };
2581     redo A;
2582     }
2583     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2584     ## ASCII case-insensitive.
2585     if ($self->{nc} == [
2586     undef,
2587     0x004F, # O
2588     0x0043, # C
2589     0x0054, # T
2590     0x0059, # Y
2591     0x0050, # P
2592 wakaba 1.12 ]->[length $self->{kwd}] or
2593 wakaba 1.1 $self->{nc} == [
2594     undef,
2595     0x006F, # o
2596     0x0063, # c
2597     0x0074, # t
2598     0x0079, # y
2599     0x0070, # p
2600 wakaba 1.12 ]->[length $self->{kwd}]) {
2601 wakaba 1.1
2602     ## Stay in the state.
2603 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2604 wakaba 1.1
2605     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2606     $self->{line_prev} = $self->{line};
2607     $self->{column_prev} = $self->{column};
2608     $self->{column}++;
2609     $self->{nc}
2610     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2611     } else {
2612     $self->{set_nc}->($self);
2613     }
2614    
2615     redo A;
2616 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2617 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2618     $self->{nc} == 0x0065)) { # e
2619 wakaba 1.12 if ($self->{is_xml} and
2620     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2621 wakaba 1.10
2622     ## XML5: case-sensitive.
2623     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2624     text => 'DOCTYPE',
2625     line => $self->{line_prev},
2626     column => $self->{column_prev} - 5);
2627     } else {
2628    
2629     }
2630 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2631     $self->{ct} = {type => DOCTYPE_TOKEN,
2632     quirks => 1,
2633     line => $self->{line_prev},
2634     column => $self->{column_prev} - 7,
2635     };
2636    
2637     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2638     $self->{line_prev} = $self->{line};
2639     $self->{column_prev} = $self->{column};
2640     $self->{column}++;
2641     $self->{nc}
2642     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2643     } else {
2644     $self->{set_nc}->($self);
2645     }
2646    
2647     redo A;
2648     } else {
2649    
2650     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2651     line => $self->{line_prev},
2652 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2653 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2654     ## Reconsume.
2655     $self->{ct} = {type => COMMENT_TOKEN,
2656 wakaba 1.12 data => $self->{kwd},
2657 wakaba 1.1 line => $self->{line_prev},
2658 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2659 wakaba 1.1 };
2660     redo A;
2661     }
2662     } elsif ($self->{state} == MD_CDATA_STATE) {
2663     if ($self->{nc} == {
2664     '[' => 0x0043, # C
2665     '[C' => 0x0044, # D
2666     '[CD' => 0x0041, # A
2667     '[CDA' => 0x0054, # T
2668     '[CDAT' => 0x0041, # A
2669 wakaba 1.12 }->{$self->{kwd}}) {
2670 wakaba 1.1
2671     ## Stay in the state.
2672 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2673 wakaba 1.1
2674     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2675     $self->{line_prev} = $self->{line};
2676     $self->{column_prev} = $self->{column};
2677     $self->{column}++;
2678     $self->{nc}
2679     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2680     } else {
2681     $self->{set_nc}->($self);
2682     }
2683    
2684     redo A;
2685 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2686 wakaba 1.1 $self->{nc} == 0x005B) { # [
2687 wakaba 1.6 if ($self->{is_xml} and
2688     not $self->{tainted} and
2689     @{$self->{open_elements} or []} == 0) {
2690 wakaba 1.8
2691 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2692     line => $self->{line_prev},
2693     column => $self->{column_prev} - 7);
2694     $self->{tainted} = 1;
2695 wakaba 1.8 } else {
2696    
2697 wakaba 1.6 }
2698    
2699 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2700     data => '',
2701     line => $self->{line_prev},
2702     column => $self->{column_prev} - 7};
2703     $self->{state} = CDATA_SECTION_STATE;
2704    
2705     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2706     $self->{line_prev} = $self->{line};
2707     $self->{column_prev} = $self->{column};
2708     $self->{column}++;
2709     $self->{nc}
2710     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2711     } else {
2712     $self->{set_nc}->($self);
2713     }
2714    
2715     redo A;
2716     } else {
2717    
2718     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2719     line => $self->{line_prev},
2720 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2721 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2722     ## Reconsume.
2723     $self->{ct} = {type => COMMENT_TOKEN,
2724 wakaba 1.12 data => $self->{kwd},
2725 wakaba 1.1 line => $self->{line_prev},
2726 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2727 wakaba 1.1 };
2728     redo A;
2729     }
2730     } elsif ($self->{state} == COMMENT_START_STATE) {
2731     if ($self->{nc} == 0x002D) { # -
2732    
2733     $self->{state} = COMMENT_START_DASH_STATE;
2734    
2735     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2736     $self->{line_prev} = $self->{line};
2737     $self->{column_prev} = $self->{column};
2738     $self->{column}++;
2739     $self->{nc}
2740     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2741     } else {
2742     $self->{set_nc}->($self);
2743     }
2744    
2745     redo A;
2746     } elsif ($self->{nc} == 0x003E) { # >
2747     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2748 wakaba 1.13 if ($self->{in_subset}) {
2749    
2750     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2751     } else {
2752    
2753     $self->{state} = DATA_STATE;
2754     $self->{s_kwd} = '';
2755     }
2756 wakaba 1.1
2757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2758     $self->{line_prev} = $self->{line};
2759     $self->{column_prev} = $self->{column};
2760     $self->{column}++;
2761     $self->{nc}
2762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2763     } else {
2764     $self->{set_nc}->($self);
2765     }
2766    
2767    
2768     return ($self->{ct}); # comment
2769    
2770     redo A;
2771     } elsif ($self->{nc} == -1) {
2772     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2773 wakaba 1.13 if ($self->{in_subset}) {
2774    
2775     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2776     } else {
2777    
2778     $self->{state} = DATA_STATE;
2779     $self->{s_kwd} = '';
2780     }
2781 wakaba 1.1 ## reconsume
2782    
2783     return ($self->{ct}); # comment
2784    
2785     redo A;
2786     } else {
2787    
2788     $self->{ct}->{data} # comment
2789     .= chr ($self->{nc});
2790     $self->{state} = COMMENT_STATE;
2791    
2792     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2793     $self->{line_prev} = $self->{line};
2794     $self->{column_prev} = $self->{column};
2795     $self->{column}++;
2796     $self->{nc}
2797     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2798     } else {
2799     $self->{set_nc}->($self);
2800     }
2801    
2802     redo A;
2803     }
2804     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2805     if ($self->{nc} == 0x002D) { # -
2806    
2807     $self->{state} = COMMENT_END_STATE;
2808    
2809     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2810     $self->{line_prev} = $self->{line};
2811     $self->{column_prev} = $self->{column};
2812     $self->{column}++;
2813     $self->{nc}
2814     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2815     } else {
2816     $self->{set_nc}->($self);
2817     }
2818    
2819     redo A;
2820     } elsif ($self->{nc} == 0x003E) { # >
2821     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2822 wakaba 1.13 if ($self->{in_subset}) {
2823    
2824     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2825     } else {
2826    
2827     $self->{state} = DATA_STATE;
2828     $self->{s_kwd} = '';
2829     }
2830 wakaba 1.1
2831     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2832     $self->{line_prev} = $self->{line};
2833     $self->{column_prev} = $self->{column};
2834     $self->{column}++;
2835     $self->{nc}
2836     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2837     } else {
2838     $self->{set_nc}->($self);
2839     }
2840    
2841    
2842     return ($self->{ct}); # comment
2843    
2844     redo A;
2845     } elsif ($self->{nc} == -1) {
2846     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2847 wakaba 1.13 if ($self->{in_subset}) {
2848    
2849     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2850     } else {
2851    
2852     $self->{state} = DATA_STATE;
2853     $self->{s_kwd} = '';
2854     }
2855 wakaba 1.1 ## reconsume
2856    
2857     return ($self->{ct}); # comment
2858    
2859     redo A;
2860     } else {
2861    
2862     $self->{ct}->{data} # comment
2863     .= '-' . chr ($self->{nc});
2864     $self->{state} = COMMENT_STATE;
2865    
2866     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2867     $self->{line_prev} = $self->{line};
2868     $self->{column_prev} = $self->{column};
2869     $self->{column}++;
2870     $self->{nc}
2871     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2872     } else {
2873     $self->{set_nc}->($self);
2874     }
2875    
2876     redo A;
2877     }
2878     } elsif ($self->{state} == COMMENT_STATE) {
2879 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2880    
2881 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2882    
2883     $self->{state} = COMMENT_END_DASH_STATE;
2884    
2885     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2886     $self->{line_prev} = $self->{line};
2887     $self->{column_prev} = $self->{column};
2888     $self->{column}++;
2889     $self->{nc}
2890     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2891     } else {
2892     $self->{set_nc}->($self);
2893     }
2894    
2895     redo A;
2896     } elsif ($self->{nc} == -1) {
2897     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2898 wakaba 1.13 if ($self->{in_subset}) {
2899    
2900     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2901     } else {
2902    
2903     $self->{state} = DATA_STATE;
2904     $self->{s_kwd} = '';
2905     }
2906 wakaba 1.1 ## reconsume
2907    
2908     return ($self->{ct}); # comment
2909    
2910     redo A;
2911     } else {
2912    
2913     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2914     $self->{read_until}->($self->{ct}->{data},
2915     q[-],
2916     length $self->{ct}->{data});
2917    
2918     ## Stay in the state
2919    
2920     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2921     $self->{line_prev} = $self->{line};
2922     $self->{column_prev} = $self->{column};
2923     $self->{column}++;
2924     $self->{nc}
2925     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2926     } else {
2927     $self->{set_nc}->($self);
2928     }
2929    
2930     redo A;
2931     }
2932     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2933 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2934 wakaba 1.10
2935 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2936    
2937     $self->{state} = COMMENT_END_STATE;
2938    
2939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2940     $self->{line_prev} = $self->{line};
2941     $self->{column_prev} = $self->{column};
2942     $self->{column}++;
2943     $self->{nc}
2944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2945     } else {
2946     $self->{set_nc}->($self);
2947     }
2948    
2949     redo A;
2950     } elsif ($self->{nc} == -1) {
2951     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2952 wakaba 1.13 if ($self->{in_subset}) {
2953    
2954     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955     } else {
2956    
2957     $self->{state} = DATA_STATE;
2958     $self->{s_kwd} = '';
2959     }
2960 wakaba 1.1 ## reconsume
2961    
2962     return ($self->{ct}); # comment
2963    
2964     redo A;
2965     } else {
2966    
2967     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2968     $self->{state} = COMMENT_STATE;
2969    
2970     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2971     $self->{line_prev} = $self->{line};
2972     $self->{column_prev} = $self->{column};
2973     $self->{column}++;
2974     $self->{nc}
2975     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2976     } else {
2977     $self->{set_nc}->($self);
2978     }
2979    
2980     redo A;
2981     }
2982 wakaba 1.31 } elsif ($self->{state} == COMMENT_END_STATE or
2983     $self->{state} == COMMENT_END_BANG_STATE) {
2984 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2985 wakaba 1.31 ## (No comment end bang state.)
2986 wakaba 1.14
2987 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2988 wakaba 1.13 if ($self->{in_subset}) {
2989    
2990     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2991     } else {
2992    
2993     $self->{state} = DATA_STATE;
2994     $self->{s_kwd} = '';
2995     }
2996 wakaba 1.1
2997     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2998     $self->{line_prev} = $self->{line};
2999     $self->{column_prev} = $self->{column};
3000     $self->{column}++;
3001     $self->{nc}
3002     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3003     } else {
3004     $self->{set_nc}->($self);
3005     }
3006    
3007    
3008     return ($self->{ct}); # comment
3009    
3010     redo A;
3011     } elsif ($self->{nc} == 0x002D) { # -
3012 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
3013    
3014     $self->{ct}->{data} .= '--!'; # comment
3015     $self->{state} = COMMENT_END_DASH_STATE;
3016     } else {
3017    
3018     ## XML5: Not a parse error.
3019     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3020     line => $self->{line_prev},
3021     column => $self->{column_prev});
3022     $self->{ct}->{data} .= '-'; # comment
3023     ## Stay in the state
3024     }
3025 wakaba 1.1
3026 wakaba 1.31 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3027     $self->{line_prev} = $self->{line};
3028     $self->{column_prev} = $self->{column};
3029     $self->{column}++;
3030     $self->{nc}
3031     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3032     } else {
3033     $self->{set_nc}->($self);
3034     }
3035    
3036     redo A;
3037 wakaba 1.32 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3038     $is_space->{$self->{nc}}) {
3039    
3040     $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3041     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3042     $self->{state} = COMMENT_END_SPACE_STATE;
3043    
3044     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3045     $self->{line_prev} = $self->{line};
3046     $self->{column_prev} = $self->{column};
3047     $self->{column}++;
3048     $self->{nc}
3049     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3050     } else {
3051     $self->{set_nc}->($self);
3052     }
3053    
3054     redo A;
3055     } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3056     $self->{nc} == 0x0021) { # !
3057    
3058 wakaba 1.31 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3059     $self->{state} = COMMENT_END_BANG_STATE;
3060 wakaba 1.1
3061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3062     $self->{line_prev} = $self->{line};
3063     $self->{column_prev} = $self->{column};
3064     $self->{column}++;
3065     $self->{nc}
3066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3067     } else {
3068     $self->{set_nc}->($self);
3069     }
3070    
3071     redo A;
3072     } elsif ($self->{nc} == -1) {
3073     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3074 wakaba 1.13 if ($self->{in_subset}) {
3075    
3076     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3077     } else {
3078    
3079     $self->{state} = DATA_STATE;
3080     $self->{s_kwd} = '';
3081     }
3082 wakaba 1.31 ## Reconsume.
3083 wakaba 1.1
3084     return ($self->{ct}); # comment
3085    
3086     redo A;
3087     } else {
3088    
3089 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
3090     $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3091     } else {
3092     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3093     }
3094 wakaba 1.1 $self->{state} = COMMENT_STATE;
3095    
3096     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3097     $self->{line_prev} = $self->{line};
3098     $self->{column_prev} = $self->{column};
3099     $self->{column}++;
3100     $self->{nc}
3101     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3102     } else {
3103     $self->{set_nc}->($self);
3104     }
3105    
3106     redo A;
3107     }
3108 wakaba 1.32 } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3109     ## XML5: This state does not exist.
3110    
3111     if ($self->{nc} == 0x003E) { # >
3112     if ($self->{in_subset}) {
3113    
3114     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3115     } else {
3116    
3117     $self->{state} = DATA_STATE;
3118     $self->{s_kwd} = '';
3119     }
3120    
3121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122     $self->{line_prev} = $self->{line};
3123     $self->{column_prev} = $self->{column};
3124     $self->{column}++;
3125     $self->{nc}
3126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127     } else {
3128     $self->{set_nc}->($self);
3129     }
3130    
3131    
3132     return ($self->{ct}); # comment
3133    
3134     redo A;
3135     } elsif ($is_space->{$self->{nc}}) {
3136    
3137     $self->{ct}->{data} .= chr ($self->{nc}); # comment
3138     ## Stay in the state.
3139    
3140     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3141     $self->{line_prev} = $self->{line};
3142     $self->{column_prev} = $self->{column};
3143     $self->{column}++;
3144     $self->{nc}
3145     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3146     } else {
3147     $self->{set_nc}->($self);
3148     }
3149    
3150     redo A;
3151     } elsif ($self->{nc} == -1) {
3152     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3153     if ($self->{in_subset}) {
3154    
3155     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3156     } else {
3157    
3158     $self->{state} = DATA_STATE;
3159     $self->{s_kwd} = '';
3160     }
3161     ## Reconsume.
3162    
3163     return ($self->{ct}); # comment
3164    
3165     redo A;
3166     } else {
3167    
3168     $self->{ct}->{data} .= chr ($self->{nc}); # comment
3169     $self->{state} = COMMENT_STATE;
3170    
3171     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3172     $self->{line_prev} = $self->{line};
3173     $self->{column_prev} = $self->{column};
3174     $self->{column}++;
3175     $self->{nc}
3176     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3177     } else {
3178     $self->{set_nc}->($self);
3179     }
3180    
3181     redo A;
3182     }
3183 wakaba 1.1 } elsif ($self->{state} == DOCTYPE_STATE) {
3184     if ($is_space->{$self->{nc}}) {
3185    
3186     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3187    
3188     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3189     $self->{line_prev} = $self->{line};
3190     $self->{column_prev} = $self->{column};
3191     $self->{column}++;
3192     $self->{nc}
3193     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3194     } else {
3195     $self->{set_nc}->($self);
3196     }
3197    
3198     redo A;
3199 wakaba 1.28 } elsif ($self->{nc} == -1) {
3200    
3201     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3202     $self->{ct}->{quirks} = 1;
3203    
3204     $self->{state} = DATA_STATE;
3205     ## Reconsume.
3206     return ($self->{ct}); # DOCTYPE (quirks)
3207    
3208     redo A;
3209 wakaba 1.1 } else {
3210    
3211 wakaba 1.28 ## XML5: Switch to the bogus comment state.
3212 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3213     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3214     ## reconsume
3215     redo A;
3216     }
3217     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3218 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3219    
3220 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3221    
3222     ## Stay in the state
3223    
3224     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3225     $self->{line_prev} = $self->{line};
3226     $self->{column_prev} = $self->{column};
3227     $self->{column}++;
3228     $self->{nc}
3229     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3230     } else {
3231     $self->{set_nc}->($self);
3232     }
3233    
3234     redo A;
3235     } elsif ($self->{nc} == 0x003E) { # >
3236    
3237 wakaba 1.12 ## XML5: No parse error.
3238 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3239     $self->{state} = DATA_STATE;
3240 wakaba 1.5 $self->{s_kwd} = '';
3241 wakaba 1.1
3242     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3243     $self->{line_prev} = $self->{line};
3244     $self->{column_prev} = $self->{column};
3245     $self->{column}++;
3246     $self->{nc}
3247     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3248     } else {
3249     $self->{set_nc}->($self);
3250     }
3251    
3252    
3253     return ($self->{ct}); # DOCTYPE (quirks)
3254    
3255     redo A;
3256 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3257    
3258     $self->{ct}->{name} # DOCTYPE
3259     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3260     delete $self->{ct}->{quirks};
3261     $self->{state} = DOCTYPE_NAME_STATE;
3262    
3263     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3264     $self->{line_prev} = $self->{line};
3265     $self->{column_prev} = $self->{column};
3266     $self->{column}++;
3267     $self->{nc}
3268     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3269     } else {
3270     $self->{set_nc}->($self);
3271     }
3272    
3273     redo A;
3274 wakaba 1.1 } elsif ($self->{nc} == -1) {
3275    
3276     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3277     $self->{state} = DATA_STATE;
3278 wakaba 1.5 $self->{s_kwd} = '';
3279 wakaba 1.1 ## reconsume
3280    
3281     return ($self->{ct}); # DOCTYPE (quirks)
3282    
3283     redo A;
3284 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3285    
3286     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3287     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3288 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3289     $self->{in_subset} = 1;
3290 wakaba 1.12
3291     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3292     $self->{line_prev} = $self->{line};
3293     $self->{column_prev} = $self->{column};
3294     $self->{column}++;
3295     $self->{nc}
3296     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3297     } else {
3298     $self->{set_nc}->($self);
3299     }
3300    
3301 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3302 wakaba 1.12 redo A;
3303 wakaba 1.1 } else {
3304    
3305     $self->{ct}->{name} = chr $self->{nc};
3306     delete $self->{ct}->{quirks};
3307     $self->{state} = DOCTYPE_NAME_STATE;
3308    
3309     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3310     $self->{line_prev} = $self->{line};
3311     $self->{column_prev} = $self->{column};
3312     $self->{column}++;
3313     $self->{nc}
3314     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3315     } else {
3316     $self->{set_nc}->($self);
3317     }
3318    
3319     redo A;
3320     }
3321     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3322 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3323    
3324     ## ISSUE: Redundant "First," in the spec.
3325    
3326 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3327    
3328     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3329    
3330     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3331     $self->{line_prev} = $self->{line};
3332     $self->{column_prev} = $self->{column};
3333     $self->{column}++;
3334     $self->{nc}
3335     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3336     } else {
3337     $self->{set_nc}->($self);
3338     }
3339    
3340     redo A;
3341     } elsif ($self->{nc} == 0x003E) { # >
3342    
3343     $self->{state} = DATA_STATE;
3344 wakaba 1.5 $self->{s_kwd} = '';
3345 wakaba 1.1
3346     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3347     $self->{line_prev} = $self->{line};
3348     $self->{column_prev} = $self->{column};
3349     $self->{column}++;
3350     $self->{nc}
3351     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3352     } else {
3353     $self->{set_nc}->($self);
3354     }
3355    
3356    
3357     return ($self->{ct}); # DOCTYPE
3358    
3359     redo A;
3360 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3361    
3362     $self->{ct}->{name} # DOCTYPE
3363     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3364     delete $self->{ct}->{quirks};
3365     ## Stay in the state.
3366    
3367     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3368     $self->{line_prev} = $self->{line};
3369     $self->{column_prev} = $self->{column};
3370     $self->{column}++;
3371     $self->{nc}
3372     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3373     } else {
3374     $self->{set_nc}->($self);
3375     }
3376    
3377     redo A;
3378 wakaba 1.1 } elsif ($self->{nc} == -1) {
3379    
3380     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3381     $self->{state} = DATA_STATE;
3382 wakaba 1.5 $self->{s_kwd} = '';
3383 wakaba 1.1 ## reconsume
3384    
3385     $self->{ct}->{quirks} = 1;
3386     return ($self->{ct}); # DOCTYPE
3387    
3388     redo A;
3389 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3390    
3391     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3392 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3393     $self->{in_subset} = 1;
3394 wakaba 1.12
3395     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3396     $self->{line_prev} = $self->{line};
3397     $self->{column_prev} = $self->{column};
3398     $self->{column}++;
3399     $self->{nc}
3400     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3401     } else {
3402     $self->{set_nc}->($self);
3403     }
3404    
3405 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3406 wakaba 1.12 redo A;
3407 wakaba 1.1 } else {
3408    
3409 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3410     ## Stay in the state.
3411 wakaba 1.1
3412     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3413     $self->{line_prev} = $self->{line};
3414     $self->{column_prev} = $self->{column};
3415     $self->{column}++;
3416     $self->{nc}
3417     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3418     } else {
3419     $self->{set_nc}->($self);
3420     }
3421    
3422     redo A;
3423     }
3424     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3425 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3426     ## state", but implemented differently.
3427    
3428 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3429    
3430     ## Stay in the state
3431    
3432     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3433     $self->{line_prev} = $self->{line};
3434     $self->{column_prev} = $self->{column};
3435     $self->{column}++;
3436     $self->{nc}
3437     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3438     } else {
3439     $self->{set_nc}->($self);
3440     }
3441    
3442     redo A;
3443     } elsif ($self->{nc} == 0x003E) { # >
3444 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3445    
3446     $self->{state} = DATA_STATE;
3447     $self->{s_kwd} = '';
3448     } else {
3449    
3450     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3451     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3452     }
3453 wakaba 1.1
3454    
3455     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3456     $self->{line_prev} = $self->{line};
3457     $self->{column_prev} = $self->{column};
3458     $self->{column}++;
3459     $self->{nc}
3460     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3461     } else {
3462     $self->{set_nc}->($self);
3463     }
3464    
3465 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3466 wakaba 1.1 redo A;
3467     } elsif ($self->{nc} == -1) {
3468 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3469    
3470     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3471     $self->{state} = DATA_STATE;
3472     $self->{s_kwd} = '';
3473     $self->{ct}->{quirks} = 1;
3474     } else {
3475    
3476     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3477     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3478     }
3479 wakaba 1.1
3480 wakaba 1.16 ## Reconsume.
3481     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3482 wakaba 1.1 redo A;
3483     } elsif ($self->{nc} == 0x0050 or # P
3484     $self->{nc} == 0x0070) { # p
3485 wakaba 1.12
3486 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3487 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3488 wakaba 1.1
3489     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3490     $self->{line_prev} = $self->{line};
3491     $self->{column_prev} = $self->{column};
3492     $self->{column}++;
3493     $self->{nc}
3494     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3495     } else {
3496     $self->{set_nc}->($self);
3497     }
3498    
3499     redo A;
3500     } elsif ($self->{nc} == 0x0053 or # S
3501     $self->{nc} == 0x0073) { # s
3502 wakaba 1.12
3503 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3504 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3505    
3506     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3507     $self->{line_prev} = $self->{line};
3508     $self->{column_prev} = $self->{column};
3509     $self->{column}++;
3510     $self->{nc}
3511     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3512     } else {
3513     $self->{set_nc}->($self);
3514     }
3515    
3516     redo A;
3517 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3518     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3519     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3520    
3521     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3522     $self->{ct}->{value} = ''; # ENTITY
3523    
3524     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3525     $self->{line_prev} = $self->{line};
3526     $self->{column_prev} = $self->{column};
3527     $self->{column}++;
3528     $self->{nc}
3529     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3530     } else {
3531     $self->{set_nc}->($self);
3532     }
3533    
3534     redo A;
3535     } elsif ($self->{nc} == 0x0027 and # '
3536     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3537     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3538    
3539     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3540     $self->{ct}->{value} = ''; # ENTITY
3541    
3542     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3543     $self->{line_prev} = $self->{line};
3544     $self->{column_prev} = $self->{column};
3545     $self->{column}++;
3546     $self->{nc}
3547     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3548     } else {
3549     $self->{set_nc}->($self);
3550     }
3551    
3552     redo A;
3553 wakaba 1.16 } elsif ($self->{is_xml} and
3554     $self->{ct}->{type} == DOCTYPE_TOKEN and
3555     $self->{nc} == 0x005B) { # [
3556 wakaba 1.12
3557     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3558     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3559 wakaba 1.13 $self->{in_subset} = 1;
3560 wakaba 1.1
3561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3562     $self->{line_prev} = $self->{line};
3563     $self->{column_prev} = $self->{column};
3564     $self->{column}++;
3565     $self->{nc}
3566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3567     } else {
3568     $self->{set_nc}->($self);
3569     }
3570    
3571 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3572 wakaba 1.1 redo A;
3573     } else {
3574 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3575    
3576     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3577    
3578     $self->{ct}->{quirks} = 1;
3579     $self->{state} = BOGUS_DOCTYPE_STATE;
3580     } else {
3581    
3582     $self->{state} = BOGUS_MD_STATE;
3583     }
3584 wakaba 1.1
3585    
3586     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3587     $self->{line_prev} = $self->{line};
3588     $self->{column_prev} = $self->{column};
3589     $self->{column}++;
3590     $self->{nc}
3591     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3592     } else {
3593     $self->{set_nc}->($self);
3594     }
3595    
3596     redo A;
3597     }
3598     } elsif ($self->{state} == PUBLIC_STATE) {
3599     ## ASCII case-insensitive
3600     if ($self->{nc} == [
3601     undef,
3602     0x0055, # U
3603     0x0042, # B
3604     0x004C, # L
3605     0x0049, # I
3606 wakaba 1.12 ]->[length $self->{kwd}] or
3607 wakaba 1.1 $self->{nc} == [
3608     undef,
3609     0x0075, # u
3610     0x0062, # b
3611     0x006C, # l
3612     0x0069, # i
3613 wakaba 1.12 ]->[length $self->{kwd}]) {
3614 wakaba 1.1
3615     ## Stay in the state.
3616 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3617 wakaba 1.1
3618     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3619     $self->{line_prev} = $self->{line};
3620     $self->{column_prev} = $self->{column};
3621     $self->{column}++;
3622     $self->{nc}
3623     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3624     } else {
3625     $self->{set_nc}->($self);
3626     }
3627    
3628     redo A;
3629 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3630 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3631     $self->{nc} == 0x0063)) { # c
3632 wakaba 1.12 if ($self->{is_xml} and
3633     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3634    
3635     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3636     text => 'PUBLIC',
3637     line => $self->{line_prev},
3638     column => $self->{column_prev} - 4);
3639     } else {
3640    
3641     }
3642 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3643    
3644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3645     $self->{line_prev} = $self->{line};
3646     $self->{column_prev} = $self->{column};
3647     $self->{column}++;
3648     $self->{nc}
3649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3650     } else {
3651     $self->{set_nc}->($self);
3652     }
3653    
3654     redo A;
3655     } else {
3656 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3657 wakaba 1.1 line => $self->{line_prev},
3658 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3659 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3660    
3661     $self->{ct}->{quirks} = 1;
3662     $self->{state} = BOGUS_DOCTYPE_STATE;
3663     } else {
3664    
3665     $self->{state} = BOGUS_MD_STATE;
3666     }
3667 wakaba 1.1 ## Reconsume.
3668     redo A;
3669     }
3670     } elsif ($self->{state} == SYSTEM_STATE) {
3671     ## ASCII case-insensitive
3672     if ($self->{nc} == [
3673     undef,
3674     0x0059, # Y
3675     0x0053, # S
3676     0x0054, # T
3677     0x0045, # E
3678 wakaba 1.12 ]->[length $self->{kwd}] or
3679 wakaba 1.1 $self->{nc} == [
3680     undef,
3681     0x0079, # y
3682     0x0073, # s
3683     0x0074, # t
3684     0x0065, # e
3685 wakaba 1.12 ]->[length $self->{kwd}]) {
3686 wakaba 1.1
3687     ## Stay in the state.
3688 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3689 wakaba 1.1
3690     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3691     $self->{line_prev} = $self->{line};
3692     $self->{column_prev} = $self->{column};
3693     $self->{column}++;
3694     $self->{nc}
3695     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3696     } else {
3697     $self->{set_nc}->($self);
3698     }
3699    
3700     redo A;
3701 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3702 wakaba 1.1 ($self->{nc} == 0x004D or # M
3703     $self->{nc} == 0x006D)) { # m
3704 wakaba 1.12 if ($self->{is_xml} and
3705     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3706    
3707     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3708     text => 'SYSTEM',
3709     line => $self->{line_prev},
3710     column => $self->{column_prev} - 4);
3711     } else {
3712    
3713     }
3714 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3715    
3716     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3717     $self->{line_prev} = $self->{line};
3718     $self->{column_prev} = $self->{column};
3719     $self->{column}++;
3720     $self->{nc}
3721     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3722     } else {
3723     $self->{set_nc}->($self);
3724     }
3725    
3726     redo A;
3727     } else {
3728 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3729 wakaba 1.1 line => $self->{line_prev},
3730 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3731 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3732    
3733     $self->{ct}->{quirks} = 1;
3734     $self->{state} = BOGUS_DOCTYPE_STATE;
3735     } else {
3736    
3737     $self->{state} = BOGUS_MD_STATE;
3738     }
3739 wakaba 1.1 ## Reconsume.
3740     redo A;
3741     }
3742     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3743     if ($is_space->{$self->{nc}}) {
3744    
3745     ## Stay in the state
3746    
3747     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3748     $self->{line_prev} = $self->{line};
3749     $self->{column_prev} = $self->{column};
3750     $self->{column}++;
3751     $self->{nc}
3752     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3753     } else {
3754     $self->{set_nc}->($self);
3755     }
3756    
3757     redo A;
3758     } elsif ($self->{nc} eq 0x0022) { # "
3759    
3760     $self->{ct}->{pubid} = ''; # DOCTYPE
3761     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3762    
3763     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3764     $self->{line_prev} = $self->{line};
3765     $self->{column_prev} = $self->{column};
3766     $self->{column}++;
3767     $self->{nc}
3768     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3769     } else {
3770     $self->{set_nc}->($self);
3771     }
3772    
3773     redo A;
3774     } elsif ($self->{nc} eq 0x0027) { # '
3775    
3776     $self->{ct}->{pubid} = ''; # DOCTYPE
3777     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3778    
3779     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3780     $self->{line_prev} = $self->{line};
3781     $self->{column_prev} = $self->{column};
3782     $self->{column}++;
3783     $self->{nc}
3784     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3785     } else {
3786     $self->{set_nc}->($self);
3787     }
3788    
3789     redo A;
3790     } elsif ($self->{nc} eq 0x003E) { # >
3791 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3792    
3793     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3794    
3795     $self->{state} = DATA_STATE;
3796     $self->{s_kwd} = '';
3797     $self->{ct}->{quirks} = 1;
3798     } else {
3799    
3800     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3801     }
3802 wakaba 1.1
3803    
3804     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3805     $self->{line_prev} = $self->{line};
3806     $self->{column_prev} = $self->{column};
3807     $self->{column}++;
3808     $self->{nc}
3809     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3810     } else {
3811     $self->{set_nc}->($self);
3812     }
3813    
3814 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3815 wakaba 1.1 redo A;
3816     } elsif ($self->{nc} == -1) {
3817 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3818    
3819     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3820     $self->{state} = DATA_STATE;
3821     $self->{s_kwd} = '';
3822     $self->{ct}->{quirks} = 1;
3823     } else {
3824    
3825     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3826     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3827     }
3828 wakaba 1.1
3829     ## reconsume
3830     return ($self->{ct}); # DOCTYPE
3831     redo A;
3832 wakaba 1.16 } elsif ($self->{is_xml} and
3833     $self->{ct}->{type} == DOCTYPE_TOKEN and
3834     $self->{nc} == 0x005B) { # [
3835 wakaba 1.12
3836     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3837     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3838     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3839 wakaba 1.13 $self->{in_subset} = 1;
3840 wakaba 1.12
3841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3842     $self->{line_prev} = $self->{line};
3843     $self->{column_prev} = $self->{column};
3844     $self->{column}++;
3845     $self->{nc}
3846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3847     } else {
3848     $self->{set_nc}->($self);
3849     }
3850    
3851 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3852 wakaba 1.12 redo A;
3853 wakaba 1.1 } else {
3854     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3855    
3856 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3857    
3858     $self->{ct}->{quirks} = 1;
3859     $self->{state} = BOGUS_DOCTYPE_STATE;
3860     } else {
3861    
3862     $self->{state} = BOGUS_MD_STATE;
3863     }
3864    
3865 wakaba 1.1
3866     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3867     $self->{line_prev} = $self->{line};
3868     $self->{column_prev} = $self->{column};
3869     $self->{column}++;
3870     $self->{nc}
3871     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3872     } else {
3873     $self->{set_nc}->($self);
3874     }
3875    
3876     redo A;
3877     }
3878     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3879     if ($self->{nc} == 0x0022) { # "
3880    
3881     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3882    
3883     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3884     $self->{line_prev} = $self->{line};
3885     $self->{column_prev} = $self->{column};
3886     $self->{column}++;
3887     $self->{nc}
3888     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3889     } else {
3890     $self->{set_nc}->($self);
3891     }
3892    
3893     redo A;
3894     } elsif ($self->{nc} == 0x003E) { # >
3895     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3896    
3897 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3898    
3899     $self->{state} = DATA_STATE;
3900     $self->{s_kwd} = '';
3901     $self->{ct}->{quirks} = 1;
3902     } else {
3903    
3904     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3905     }
3906    
3907 wakaba 1.1
3908     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3909     $self->{line_prev} = $self->{line};
3910     $self->{column_prev} = $self->{column};
3911     $self->{column}++;
3912     $self->{nc}
3913     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3914     } else {
3915     $self->{set_nc}->($self);
3916     }
3917    
3918 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3919 wakaba 1.1 redo A;
3920     } elsif ($self->{nc} == -1) {
3921     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3922    
3923 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3924    
3925     $self->{state} = DATA_STATE;
3926     $self->{s_kwd} = '';
3927     $self->{ct}->{quirks} = 1;
3928     } else {
3929    
3930     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3931     }
3932    
3933     ## Reconsume.
3934 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3935     redo A;
3936     } else {
3937    
3938 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3939 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3940     length $self->{ct}->{pubid});
3941    
3942     ## Stay in the state
3943    
3944     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3945     $self->{line_prev} = $self->{line};
3946     $self->{column_prev} = $self->{column};
3947     $self->{column}++;
3948     $self->{nc}
3949     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3950     } else {
3951     $self->{set_nc}->($self);
3952     }
3953    
3954     redo A;
3955     }
3956     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3957     if ($self->{nc} == 0x0027) { # '
3958    
3959     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3960    
3961     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3962     $self->{line_prev} = $self->{line};
3963     $self->{column_prev} = $self->{column};
3964     $self->{column}++;
3965     $self->{nc}
3966     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3967     } else {
3968     $self->{set_nc}->($self);
3969     }
3970    
3971     redo A;
3972     } elsif ($self->{nc} == 0x003E) { # >
3973     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3974    
3975 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3976    
3977     $self->{state} = DATA_STATE;
3978     $self->{s_kwd} = '';
3979     $self->{ct}->{quirks} = 1;
3980     } else {
3981    
3982     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3983     }
3984    
3985 wakaba 1.1
3986     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3987     $self->{line_prev} = $self->{line};
3988     $self->{column_prev} = $self->{column};
3989     $self->{column}++;
3990     $self->{nc}
3991     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3992     } else {
3993     $self->{set_nc}->($self);
3994     }
3995    
3996 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3997 wakaba 1.1 redo A;
3998     } elsif ($self->{nc} == -1) {
3999     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
4000    
4001 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4002    
4003     $self->{state} = DATA_STATE;
4004     $self->{s_kwd} = '';
4005     $self->{ct}->{quirks} = 1;
4006     } else {
4007    
4008     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4009     }
4010    
4011 wakaba 1.1 ## reconsume
4012 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4013 wakaba 1.1 redo A;
4014     } else {
4015    
4016 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4017 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
4018     length $self->{ct}->{pubid});
4019    
4020     ## Stay in the state
4021    
4022     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4023     $self->{line_prev} = $self->{line};
4024     $self->{column_prev} = $self->{column};
4025     $self->{column}++;
4026     $self->{nc}
4027     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4028     } else {
4029     $self->{set_nc}->($self);
4030     }
4031    
4032     redo A;
4033     }
4034     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
4035     if ($is_space->{$self->{nc}}) {
4036    
4037     ## Stay in the state
4038    
4039     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4040     $self->{line_prev} = $self->{line};
4041     $self->{column_prev} = $self->{column};
4042     $self->{column}++;
4043     $self->{nc}
4044     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4045     } else {
4046     $self->{set_nc}->($self);
4047     }
4048    
4049     redo A;
4050     } elsif ($self->{nc} == 0x0022) { # "
4051    
4052 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4053 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4054    
4055     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4056     $self->{line_prev} = $self->{line};
4057     $self->{column_prev} = $self->{column};
4058     $self->{column}++;
4059     $self->{nc}
4060     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4061     } else {
4062     $self->{set_nc}->($self);
4063     }
4064    
4065     redo A;
4066     } elsif ($self->{nc} == 0x0027) { # '
4067    
4068 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4069 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4070    
4071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4072     $self->{line_prev} = $self->{line};
4073     $self->{column_prev} = $self->{column};
4074     $self->{column}++;
4075     $self->{nc}
4076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4077     } else {
4078     $self->{set_nc}->($self);
4079     }
4080    
4081     redo A;
4082     } elsif ($self->{nc} == 0x003E) { # >
4083 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4084     if ($self->{is_xml}) {
4085    
4086     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4087     } else {
4088    
4089     }
4090     $self->{state} = DATA_STATE;
4091     $self->{s_kwd} = '';
4092 wakaba 1.12 } else {
4093 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
4094    
4095     } else {
4096    
4097     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4098     }
4099     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4100 wakaba 1.12 }
4101 wakaba 1.16
4102 wakaba 1.1
4103     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4104     $self->{line_prev} = $self->{line};
4105     $self->{column_prev} = $self->{column};
4106     $self->{column}++;
4107     $self->{nc}
4108     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4109     } else {
4110     $self->{set_nc}->($self);
4111     }
4112    
4113 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4114 wakaba 1.1 redo A;
4115     } elsif ($self->{nc} == -1) {
4116 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4117    
4118     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4119    
4120     $self->{state} = DATA_STATE;
4121     $self->{s_kwd} = '';
4122     $self->{ct}->{quirks} = 1;
4123     } else {
4124     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4125     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4126     }
4127 wakaba 1.1
4128     ## reconsume
4129 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4130 wakaba 1.1 redo A;
4131 wakaba 1.16 } elsif ($self->{is_xml} and
4132     $self->{ct}->{type} == DOCTYPE_TOKEN and
4133     $self->{nc} == 0x005B) { # [
4134 wakaba 1.12
4135     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4136     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4137     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4138 wakaba 1.13 $self->{in_subset} = 1;
4139 wakaba 1.12
4140     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4141     $self->{line_prev} = $self->{line};
4142     $self->{column_prev} = $self->{column};
4143     $self->{column}++;
4144     $self->{nc}
4145     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4146     } else {
4147     $self->{set_nc}->($self);
4148     }
4149    
4150 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4151 wakaba 1.12 redo A;
4152 wakaba 1.1 } else {
4153     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
4154    
4155 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4156    
4157     $self->{ct}->{quirks} = 1;
4158     $self->{state} = BOGUS_DOCTYPE_STATE;
4159     } else {
4160    
4161     $self->{state} = BOGUS_MD_STATE;
4162     }
4163    
4164 wakaba 1.1
4165     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4166     $self->{line_prev} = $self->{line};
4167     $self->{column_prev} = $self->{column};
4168     $self->{column}++;
4169     $self->{nc}
4170     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4171     } else {
4172     $self->{set_nc}->($self);
4173     }
4174    
4175     redo A;
4176     }
4177     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4178     if ($is_space->{$self->{nc}}) {
4179    
4180     ## Stay in the state
4181    
4182     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4183     $self->{line_prev} = $self->{line};
4184     $self->{column_prev} = $self->{column};
4185     $self->{column}++;
4186     $self->{nc}
4187     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4188     } else {
4189     $self->{set_nc}->($self);
4190     }
4191    
4192     redo A;
4193     } elsif ($self->{nc} == 0x0022) { # "
4194    
4195     $self->{ct}->{sysid} = ''; # DOCTYPE
4196     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4197    
4198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4199     $self->{line_prev} = $self->{line};
4200     $self->{column_prev} = $self->{column};
4201     $self->{column}++;
4202     $self->{nc}
4203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4204     } else {
4205     $self->{set_nc}->($self);
4206     }
4207    
4208     redo A;
4209     } elsif ($self->{nc} == 0x0027) { # '
4210    
4211     $self->{ct}->{sysid} = ''; # DOCTYPE
4212     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4213    
4214     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4215     $self->{line_prev} = $self->{line};
4216     $self->{column_prev} = $self->{column};
4217     $self->{column}++;
4218     $self->{nc}
4219     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4220     } else {
4221     $self->{set_nc}->($self);
4222     }
4223    
4224     redo A;
4225     } elsif ($self->{nc} == 0x003E) { # >
4226     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4227    
4228     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4229     $self->{line_prev} = $self->{line};
4230     $self->{column_prev} = $self->{column};
4231     $self->{column}++;
4232     $self->{nc}
4233     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4234     } else {
4235     $self->{set_nc}->($self);
4236     }
4237    
4238    
4239 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4240    
4241     $self->{state} = DATA_STATE;
4242     $self->{s_kwd} = '';
4243     $self->{ct}->{quirks} = 1;
4244     } else {
4245    
4246     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4247     }
4248 wakaba 1.1
4249 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4250 wakaba 1.1 redo A;
4251     } elsif ($self->{nc} == -1) {
4252 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4253    
4254     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4255     $self->{state} = DATA_STATE;
4256     $self->{s_kwd} = '';
4257     $self->{ct}->{quirks} = 1;
4258     } else {
4259    
4260     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4261     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4262     }
4263 wakaba 1.1
4264     ## reconsume
4265 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4266 wakaba 1.1 redo A;
4267 wakaba 1.16 } elsif ($self->{is_xml} and
4268     $self->{ct}->{type} == DOCTYPE_TOKEN and
4269     $self->{nc} == 0x005B) { # [
4270 wakaba 1.12
4271     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4272    
4273     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4274     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4275 wakaba 1.13 $self->{in_subset} = 1;
4276 wakaba 1.12
4277     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4278     $self->{line_prev} = $self->{line};
4279     $self->{column_prev} = $self->{column};
4280     $self->{column}++;
4281     $self->{nc}
4282     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4283     } else {
4284     $self->{set_nc}->($self);
4285     }
4286    
4287 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4288 wakaba 1.12 redo A;
4289 wakaba 1.1 } else {
4290     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4291    
4292 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4293    
4294     $self->{ct}->{quirks} = 1;
4295     $self->{state} = BOGUS_DOCTYPE_STATE;
4296     } else {
4297    
4298     $self->{state} = BOGUS_MD_STATE;
4299     }
4300    
4301 wakaba 1.1
4302     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4303     $self->{line_prev} = $self->{line};
4304     $self->{column_prev} = $self->{column};
4305     $self->{column}++;
4306     $self->{nc}
4307     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4308     } else {
4309     $self->{set_nc}->($self);
4310     }
4311    
4312     redo A;
4313     }
4314     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4315     if ($self->{nc} == 0x0022) { # "
4316    
4317     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4318    
4319     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4320     $self->{line_prev} = $self->{line};
4321     $self->{column_prev} = $self->{column};
4322     $self->{column}++;
4323     $self->{nc}
4324     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4325     } else {
4326     $self->{set_nc}->($self);
4327     }
4328    
4329     redo A;
4330 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4331 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4332    
4333 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4334    
4335     $self->{state} = DATA_STATE;
4336     $self->{s_kwd} = '';
4337     $self->{ct}->{quirks} = 1;
4338     } else {
4339    
4340     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4341     }
4342    
4343 wakaba 1.1
4344     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4345     $self->{line_prev} = $self->{line};
4346     $self->{column_prev} = $self->{column};
4347     $self->{column}++;
4348     $self->{nc}
4349     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4350     } else {
4351     $self->{set_nc}->($self);
4352     }
4353    
4354 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4355 wakaba 1.1 redo A;
4356     } elsif ($self->{nc} == -1) {
4357     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4358    
4359 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4360    
4361     $self->{state} = DATA_STATE;
4362     $self->{s_kwd} = '';
4363     $self->{ct}->{quirks} = 1;
4364     } else {
4365    
4366     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367     }
4368    
4369 wakaba 1.1 ## reconsume
4370 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4371 wakaba 1.1 redo A;
4372     } else {
4373    
4374 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4375 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4376     length $self->{ct}->{sysid});
4377    
4378     ## Stay in the state
4379    
4380     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4381     $self->{line_prev} = $self->{line};
4382     $self->{column_prev} = $self->{column};
4383     $self->{column}++;
4384     $self->{nc}
4385     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4386     } else {
4387     $self->{set_nc}->($self);
4388     }
4389    
4390     redo A;
4391     }
4392     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4393     if ($self->{nc} == 0x0027) { # '
4394    
4395     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4396    
4397     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4398     $self->{line_prev} = $self->{line};
4399     $self->{column_prev} = $self->{column};
4400     $self->{column}++;
4401     $self->{nc}
4402     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4403     } else {
4404     $self->{set_nc}->($self);
4405     }
4406    
4407     redo A;
4408 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4409 wakaba 1.1
4410     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4411    
4412     $self->{state} = DATA_STATE;
4413 wakaba 1.5 $self->{s_kwd} = '';
4414 wakaba 1.1
4415     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4416     $self->{line_prev} = $self->{line};
4417     $self->{column_prev} = $self->{column};
4418     $self->{column}++;
4419     $self->{nc}
4420     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4421     } else {
4422     $self->{set_nc}->($self);
4423     }
4424    
4425    
4426     $self->{ct}->{quirks} = 1;
4427     return ($self->{ct}); # DOCTYPE
4428    
4429     redo A;
4430     } elsif ($self->{nc} == -1) {
4431     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4432    
4433 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4434    
4435     $self->{state} = DATA_STATE;
4436     $self->{s_kwd} = '';
4437     $self->{ct}->{quirks} = 1;
4438     } else {
4439    
4440     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4441     }
4442    
4443 wakaba 1.1 ## reconsume
4444 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4445 wakaba 1.1 redo A;
4446     } else {
4447    
4448 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4449 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4450     length $self->{ct}->{sysid});
4451    
4452     ## Stay in the state
4453    
4454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4455     $self->{line_prev} = $self->{line};
4456     $self->{column_prev} = $self->{column};
4457     $self->{column}++;
4458     $self->{nc}
4459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4460     } else {
4461     $self->{set_nc}->($self);
4462     }
4463    
4464     redo A;
4465     }
4466     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4467     if ($is_space->{$self->{nc}}) {
4468 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4469    
4470     $self->{state} = BEFORE_NDATA_STATE;
4471     } else {
4472    
4473     ## Stay in the state
4474     }
4475 wakaba 1.1
4476     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4477     $self->{line_prev} = $self->{line};
4478     $self->{column_prev} = $self->{column};
4479     $self->{column}++;
4480     $self->{nc}
4481     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4482     } else {
4483     $self->{set_nc}->($self);
4484     }
4485    
4486     redo A;
4487     } elsif ($self->{nc} == 0x003E) { # >
4488 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4489    
4490     $self->{state} = DATA_STATE;
4491     $self->{s_kwd} = '';
4492     } else {
4493    
4494     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4495     }
4496    
4497 wakaba 1.1
4498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499     $self->{line_prev} = $self->{line};
4500     $self->{column_prev} = $self->{column};
4501     $self->{column}++;
4502     $self->{nc}
4503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504     } else {
4505     $self->{set_nc}->($self);
4506     }
4507    
4508 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4509 wakaba 1.1 redo A;
4510 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4511     ($self->{nc} == 0x004E or # N
4512     $self->{nc} == 0x006E)) { # n
4513    
4514     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4515     $self->{state} = NDATA_STATE;
4516     $self->{kwd} = chr $self->{nc};
4517    
4518     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4519     $self->{line_prev} = $self->{line};
4520     $self->{column_prev} = $self->{column};
4521     $self->{column}++;
4522     $self->{nc}
4523     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4524     } else {
4525     $self->{set_nc}->($self);
4526     }
4527    
4528     redo A;
4529 wakaba 1.1 } elsif ($self->{nc} == -1) {
4530 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4531    
4532     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4533     $self->{state} = DATA_STATE;
4534     $self->{s_kwd} = '';
4535     $self->{ct}->{quirks} = 1;
4536     } else {
4537    
4538     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4539     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4540     }
4541    
4542 wakaba 1.1 ## reconsume
4543 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4544 wakaba 1.1 redo A;
4545 wakaba 1.16 } elsif ($self->{is_xml} and
4546     $self->{ct}->{type} == DOCTYPE_TOKEN and
4547     $self->{nc} == 0x005B) { # [
4548 wakaba 1.12
4549     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4550     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4551 wakaba 1.13 $self->{in_subset} = 1;
4552 wakaba 1.12
4553     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4554     $self->{line_prev} = $self->{line};
4555     $self->{column_prev} = $self->{column};
4556     $self->{column}++;
4557     $self->{nc}
4558     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4559     } else {
4560     $self->{set_nc}->($self);
4561     }
4562    
4563 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4564 wakaba 1.12 redo A;
4565 wakaba 1.1 } else {
4566     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4567    
4568 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4569    
4570     #$self->{ct}->{quirks} = 1;
4571     $self->{state} = BOGUS_DOCTYPE_STATE;
4572     } else {
4573    
4574     $self->{state} = BOGUS_MD_STATE;
4575     }
4576    
4577 wakaba 1.1
4578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4579     $self->{line_prev} = $self->{line};
4580     $self->{column_prev} = $self->{column};
4581     $self->{column}++;
4582     $self->{nc}
4583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4584     } else {
4585     $self->{set_nc}->($self);
4586     }
4587    
4588     redo A;
4589     }
4590 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4591     if ($is_space->{$self->{nc}}) {
4592    
4593     ## Stay in the state.
4594    
4595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4596     $self->{line_prev} = $self->{line};
4597     $self->{column_prev} = $self->{column};
4598     $self->{column}++;
4599     $self->{nc}
4600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4601     } else {
4602     $self->{set_nc}->($self);
4603     }
4604    
4605     redo A;
4606     } elsif ($self->{nc} == 0x003E) { # >
4607    
4608     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4609    
4610     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4611     $self->{line_prev} = $self->{line};
4612     $self->{column_prev} = $self->{column};
4613     $self->{column}++;
4614     $self->{nc}
4615     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4616     } else {
4617     $self->{set_nc}->($self);
4618     }
4619    
4620     return ($self->{ct}); # ENTITY
4621     redo A;
4622     } elsif ($self->{nc} == 0x004E or # N
4623     $self->{nc} == 0x006E) { # n
4624    
4625     $self->{state} = NDATA_STATE;
4626     $self->{kwd} = chr $self->{nc};
4627    
4628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4629     $self->{line_prev} = $self->{line};
4630     $self->{column_prev} = $self->{column};
4631     $self->{column}++;
4632     $self->{nc}
4633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4634     } else {
4635     $self->{set_nc}->($self);
4636     }
4637    
4638     redo A;
4639     } elsif ($self->{nc} == -1) {
4640    
4641     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4642     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4643     ## reconsume
4644     return ($self->{ct}); # ENTITY
4645     redo A;
4646     } else {
4647    
4648     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4649     $self->{state} = BOGUS_MD_STATE;
4650    
4651     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4652     $self->{line_prev} = $self->{line};
4653     $self->{column_prev} = $self->{column};
4654     $self->{column}++;
4655     $self->{nc}
4656     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4657     } else {
4658     $self->{set_nc}->($self);
4659     }
4660    
4661     redo A;
4662     }
4663 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4664     if ($self->{nc} == 0x003E) { # >
4665    
4666     $self->{state} = DATA_STATE;
4667 wakaba 1.5 $self->{s_kwd} = '';
4668 wakaba 1.1
4669     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4670     $self->{line_prev} = $self->{line};
4671     $self->{column_prev} = $self->{column};
4672     $self->{column}++;
4673     $self->{nc}
4674     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4675     } else {
4676     $self->{set_nc}->($self);
4677     }
4678    
4679    
4680     return ($self->{ct}); # DOCTYPE
4681    
4682     redo A;
4683 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4684 wakaba 1.13
4685     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4686     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4687     $self->{in_subset} = 1;
4688    
4689 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4690     $self->{line_prev} = $self->{line};
4691     $self->{column_prev} = $self->{column};
4692     $self->{column}++;
4693     $self->{nc}
4694     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4695     } else {
4696     $self->{set_nc}->($self);
4697     }
4698    
4699 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4700     redo A;
4701 wakaba 1.1 } elsif ($self->{nc} == -1) {
4702    
4703     $self->{state} = DATA_STATE;
4704 wakaba 1.5 $self->{s_kwd} = '';
4705 wakaba 1.1 ## reconsume
4706    
4707     return ($self->{ct}); # DOCTYPE
4708    
4709     redo A;
4710     } else {
4711    
4712     my $s = '';
4713 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4714 wakaba 1.1
4715     ## Stay in the state
4716    
4717     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4718     $self->{line_prev} = $self->{line};
4719     $self->{column_prev} = $self->{column};
4720     $self->{column}++;
4721     $self->{nc}
4722     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4723     } else {
4724     $self->{set_nc}->($self);
4725     }
4726    
4727     redo A;
4728     }
4729     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4730     ## NOTE: "CDATA section state" in the spec is jointly implemented
4731     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4732     ## and |CDATA_SECTION_MSE2_STATE|.
4733 wakaba 1.10
4734     ## XML5: "CDATA state".
4735 wakaba 1.1
4736     if ($self->{nc} == 0x005D) { # ]
4737    
4738     $self->{state} = CDATA_SECTION_MSE1_STATE;
4739    
4740     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4741     $self->{line_prev} = $self->{line};
4742     $self->{column_prev} = $self->{column};
4743     $self->{column}++;
4744     $self->{nc}
4745     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4746     } else {
4747     $self->{set_nc}->($self);
4748     }
4749    
4750     redo A;
4751     } elsif ($self->{nc} == -1) {
4752 wakaba 1.6 if ($self->{is_xml}) {
4753 wakaba 1.8
4754 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4755 wakaba 1.8 } else {
4756    
4757 wakaba 1.6 }
4758    
4759 wakaba 1.1 $self->{state} = DATA_STATE;
4760 wakaba 1.5 $self->{s_kwd} = '';
4761 wakaba 1.10 ## Reconsume.
4762 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4763    
4764     return ($self->{ct}); # character
4765     } else {
4766    
4767     ## No token to emit. $self->{ct} is discarded.
4768     }
4769     redo A;
4770     } else {
4771    
4772     $self->{ct}->{data} .= chr $self->{nc};
4773     $self->{read_until}->($self->{ct}->{data},
4774     q<]>,
4775     length $self->{ct}->{data});
4776    
4777     ## Stay in the state.
4778    
4779     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4780     $self->{line_prev} = $self->{line};
4781     $self->{column_prev} = $self->{column};
4782     $self->{column}++;
4783     $self->{nc}
4784     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4785     } else {
4786     $self->{set_nc}->($self);
4787     }
4788    
4789     redo A;
4790     }
4791    
4792     ## ISSUE: "text tokens" in spec.
4793     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4794 wakaba 1.10 ## XML5: "CDATA bracket state".
4795    
4796 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4797    
4798     $self->{state} = CDATA_SECTION_MSE2_STATE;
4799    
4800     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4801     $self->{line_prev} = $self->{line};
4802     $self->{column_prev} = $self->{column};
4803     $self->{column}++;
4804     $self->{nc}
4805     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4806     } else {
4807     $self->{set_nc}->($self);
4808     }
4809    
4810     redo A;
4811     } else {
4812    
4813 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4814 wakaba 1.1 $self->{ct}->{data} .= ']';
4815 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4816 wakaba 1.1 ## Reconsume.
4817     redo A;
4818     }
4819     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4820 wakaba 1.10 ## XML5: "CDATA end state".
4821    
4822 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4823     $self->{state} = DATA_STATE;
4824 wakaba 1.5 $self->{s_kwd} = '';
4825 wakaba 1.1
4826     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4827     $self->{line_prev} = $self->{line};
4828     $self->{column_prev} = $self->{column};
4829     $self->{column}++;
4830     $self->{nc}
4831     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4832     } else {
4833     $self->{set_nc}->($self);
4834     }
4835    
4836     if (length $self->{ct}->{data}) { # character
4837    
4838     return ($self->{ct}); # character
4839     } else {
4840    
4841     ## No token to emit. $self->{ct} is discarded.
4842     }
4843     redo A;
4844     } elsif ($self->{nc} == 0x005D) { # ]
4845     # character
4846     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4847     ## Stay in the state.
4848    
4849     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4850     $self->{line_prev} = $self->{line};
4851     $self->{column_prev} = $self->{column};
4852     $self->{column}++;
4853     $self->{nc}
4854     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4855     } else {
4856     $self->{set_nc}->($self);
4857     }
4858    
4859     redo A;
4860     } else {
4861    
4862     $self->{ct}->{data} .= ']]'; # character
4863     $self->{state} = CDATA_SECTION_STATE;
4864 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4865 wakaba 1.1 redo A;
4866     }
4867     } elsif ($self->{state} == ENTITY_STATE) {
4868     if ($is_space->{$self->{nc}} or
4869     {
4870     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4871     $self->{entity_add} => 1,
4872     }->{$self->{nc}}) {
4873 wakaba 1.22 if ($self->{is_xml}) {
4874    
4875     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4876     line => $self->{line_prev},
4877     column => $self->{column_prev}
4878     + ($self->{nc} == -1 ? 1 : 0));
4879     } else {
4880    
4881     ## No error
4882     }
4883 wakaba 1.1 ## Don't consume
4884     ## Return nothing.
4885     #
4886     } elsif ($self->{nc} == 0x0023) { # #
4887    
4888     $self->{state} = ENTITY_HASH_STATE;
4889 wakaba 1.12 $self->{kwd} = '#';
4890 wakaba 1.1
4891     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4892     $self->{line_prev} = $self->{line};
4893     $self->{column_prev} = $self->{column};
4894     $self->{column}++;
4895     $self->{nc}
4896     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4897     } else {
4898     $self->{set_nc}->($self);
4899     }
4900    
4901     redo A;
4902 wakaba 1.22 } elsif ($self->{is_xml} or
4903     (0x0041 <= $self->{nc} and
4904 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4905     (0x0061 <= $self->{nc} and
4906     $self->{nc} <= 0x007A)) { # a..z
4907    
4908     require Whatpm::_NamedEntityList;
4909     $self->{state} = ENTITY_NAME_STATE;
4910 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4911     $self->{entity__value} = $self->{kwd};
4912 wakaba 1.1 $self->{entity__match} = 0;
4913    
4914     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4915     $self->{line_prev} = $self->{line};
4916     $self->{column_prev} = $self->{column};
4917     $self->{column}++;
4918     $self->{nc}
4919     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4920     } else {
4921     $self->{set_nc}->($self);
4922     }
4923    
4924     redo A;
4925     } else {
4926    
4927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4928     ## Return nothing.
4929     #
4930     }
4931    
4932     ## NOTE: No character is consumed by the "consume a character
4933     ## reference" algorithm. In other words, there is an "&" character
4934     ## that does not introduce a character reference, which would be
4935     ## appended to the parent element or the attribute value in later
4936     ## process of the tokenizer.
4937    
4938     if ($self->{prev_state} == DATA_STATE) {
4939    
4940     $self->{state} = $self->{prev_state};
4941 wakaba 1.5 $self->{s_kwd} = '';
4942 wakaba 1.1 ## Reconsume.
4943     return ({type => CHARACTER_TOKEN, data => '&',
4944     line => $self->{line_prev},
4945     column => $self->{column_prev},
4946     });
4947     redo A;
4948     } else {
4949    
4950     $self->{ca}->{value} .= '&';
4951     $self->{state} = $self->{prev_state};
4952 wakaba 1.5 $self->{s_kwd} = '';
4953 wakaba 1.1 ## Reconsume.
4954     redo A;
4955     }
4956     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4957 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4958 wakaba 1.1
4959     $self->{state} = HEXREF_X_STATE;
4960 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4961 wakaba 1.1
4962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4963     $self->{line_prev} = $self->{line};
4964     $self->{column_prev} = $self->{column};
4965     $self->{column}++;
4966     $self->{nc}
4967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4968     } else {
4969     $self->{set_nc}->($self);
4970     }
4971    
4972     redo A;
4973 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4974    
4975     if ($self->{is_xml}) {
4976     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4977     }
4978     $self->{state} = HEXREF_X_STATE;
4979     $self->{kwd} .= chr $self->{nc};
4980    
4981     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4982     $self->{line_prev} = $self->{line};
4983     $self->{column_prev} = $self->{column};
4984     $self->{column}++;
4985     $self->{nc}
4986     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4987     } else {
4988     $self->{set_nc}->($self);
4989     }
4990    
4991     redo A;
4992 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4993     $self->{nc} <= 0x0039) { # 0..9
4994    
4995     $self->{state} = NCR_NUM_STATE;
4996 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4997 wakaba 1.1
4998     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4999     $self->{line_prev} = $self->{line};
5000     $self->{column_prev} = $self->{column};
5001     $self->{column}++;
5002     $self->{nc}
5003     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5004     } else {
5005     $self->{set_nc}->($self);
5006     }
5007    
5008     redo A;
5009     } else {
5010     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
5011     line => $self->{line_prev},
5012     column => $self->{column_prev} - 1);
5013    
5014     ## NOTE: According to the spec algorithm, nothing is returned,
5015     ## and then "&#" is appended to the parent element or the attribute
5016     ## value in the later processing.
5017    
5018     if ($self->{prev_state} == DATA_STATE) {
5019    
5020     $self->{state} = $self->{prev_state};
5021 wakaba 1.5 $self->{s_kwd} = '';
5022 wakaba 1.1 ## Reconsume.
5023     return ({type => CHARACTER_TOKEN,
5024     data => '&#',
5025     line => $self->{line_prev},
5026     column => $self->{column_prev} - 1,
5027     });
5028     redo A;
5029     } else {
5030    
5031     $self->{ca}->{value} .= '&#';
5032     $self->{state} = $self->{prev_state};
5033 wakaba 1.5 $self->{s_kwd} = '';
5034 wakaba 1.1 ## Reconsume.
5035     redo A;
5036     }
5037     }
5038     } elsif ($self->{state} == NCR_NUM_STATE) {
5039     if (0x0030 <= $self->{nc} and
5040     $self->{nc} <= 0x0039) { # 0..9
5041    
5042 wakaba 1.12 $self->{kwd} *= 10;
5043     $self->{kwd} += $self->{nc} - 0x0030;
5044 wakaba 1.1
5045     ## Stay in the state.
5046    
5047     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5048     $self->{line_prev} = $self->{line};
5049     $self->{column_prev} = $self->{column};
5050     $self->{column}++;
5051     $self->{nc}
5052     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5053     } else {
5054     $self->{set_nc}->($self);
5055     }
5056    
5057     redo A;
5058     } elsif ($self->{nc} == 0x003B) { # ;
5059    
5060    
5061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5062     $self->{line_prev} = $self->{line};
5063     $self->{column_prev} = $self->{column};
5064     $self->{column}++;
5065     $self->{nc}
5066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5067     } else {
5068     $self->{set_nc}->($self);
5069     }
5070    
5071     #
5072     } else {
5073    
5074     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5075     ## Reconsume.
5076     #
5077     }
5078    
5079 wakaba 1.12 my $code = $self->{kwd};
5080 wakaba 1.1 my $l = $self->{line_prev};
5081     my $c = $self->{column_prev};
5082 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5083     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5084     ($self->{is_xml} and $code == 0x0000)) {
5085 wakaba 1.1
5086     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5087     text => (sprintf 'U+%04X', $code),
5088     line => $l, column => $c);
5089     $code = $charref_map->{$code};
5090     } elsif ($code > 0x10FFFF) {
5091    
5092     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5093     text => (sprintf 'U-%08X', $code),
5094     line => $l, column => $c);
5095     $code = 0xFFFD;
5096     }
5097    
5098     if ($self->{prev_state} == DATA_STATE) {
5099    
5100     $self->{state} = $self->{prev_state};
5101 wakaba 1.5 $self->{s_kwd} = '';
5102 wakaba 1.1 ## Reconsume.
5103     return ({type => CHARACTER_TOKEN, data => chr $code,
5104 wakaba 1.7 has_reference => 1,
5105 wakaba 1.1 line => $l, column => $c,
5106     });
5107     redo A;
5108     } else {
5109    
5110     $self->{ca}->{value} .= chr $code;
5111     $self->{ca}->{has_reference} = 1;
5112     $self->{state} = $self->{prev_state};
5113 wakaba 1.5 $self->{s_kwd} = '';
5114 wakaba 1.1 ## Reconsume.
5115     redo A;
5116     }
5117     } elsif ($self->{state} == HEXREF_X_STATE) {
5118     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
5119     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
5120     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
5121     # 0..9, A..F, a..f
5122    
5123     $self->{state} = HEXREF_HEX_STATE;
5124 wakaba 1.12 $self->{kwd} = 0;
5125 wakaba 1.1 ## Reconsume.
5126     redo A;
5127     } else {
5128     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
5129     line => $self->{line_prev},
5130     column => $self->{column_prev} - 2);
5131    
5132     ## NOTE: According to the spec algorithm, nothing is returned,
5133     ## and then "&#" followed by "X" or "x" is appended to the parent
5134     ## element or the attribute value in the later processing.
5135    
5136     if ($self->{prev_state} == DATA_STATE) {
5137    
5138     $self->{state} = $self->{prev_state};
5139 wakaba 1.5 $self->{s_kwd} = '';
5140 wakaba 1.1 ## Reconsume.
5141     return ({type => CHARACTER_TOKEN,
5142 wakaba 1.12 data => '&' . $self->{kwd},
5143 wakaba 1.1 line => $self->{line_prev},
5144 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
5145 wakaba 1.1 });
5146     redo A;
5147     } else {
5148    
5149 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
5150 wakaba 1.1 $self->{state} = $self->{prev_state};
5151 wakaba 1.5 $self->{s_kwd} = '';
5152 wakaba 1.1 ## Reconsume.
5153     redo A;
5154     }
5155     }
5156     } elsif ($self->{state} == HEXREF_HEX_STATE) {
5157     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
5158     # 0..9
5159    
5160 wakaba 1.12 $self->{kwd} *= 0x10;
5161     $self->{kwd} += $self->{nc} - 0x0030;
5162 wakaba 1.1 ## Stay in the state.
5163    
5164     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5165     $self->{line_prev} = $self->{line};
5166     $self->{column_prev} = $self->{column};
5167     $self->{column}++;
5168     $self->{nc}
5169     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5170     } else {
5171     $self->{set_nc}->($self);
5172     }
5173    
5174     redo A;
5175     } elsif (0x0061 <= $self->{nc} and
5176     $self->{nc} <= 0x0066) { # a..f
5177    
5178 wakaba 1.12 $self->{kwd} *= 0x10;
5179     $self->{kwd} += $self->{nc} - 0x0060 + 9;
5180 wakaba 1.1 ## Stay in the state.
5181    
5182     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5183     $self->{line_prev} = $self->{line};
5184     $self->{column_prev} = $self->{column};
5185     $self->{column}++;
5186     $self->{nc}
5187     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5188     } else {
5189     $self->{set_nc}->($self);
5190     }
5191    
5192     redo A;
5193     } elsif (0x0041 <= $self->{nc} and
5194     $self->{nc} <= 0x0046) { # A..F
5195    
5196 wakaba 1.12 $self->{kwd} *= 0x10;
5197     $self->{kwd} += $self->{nc} - 0x0040 + 9;
5198 wakaba 1.1 ## Stay in the state.
5199    
5200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5201     $self->{line_prev} = $self->{line};
5202     $self->{column_prev} = $self->{column};
5203     $self->{column}++;
5204     $self->{nc}
5205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5206     } else {
5207     $self->{set_nc}->($self);
5208     }
5209    
5210     redo A;
5211     } elsif ($self->{nc} == 0x003B) { # ;
5212    
5213    
5214     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5215     $self->{line_prev} = $self->{line};
5216     $self->{column_prev} = $self->{column};
5217     $self->{column}++;
5218     $self->{nc}
5219     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5220     } else {
5221     $self->{set_nc}->($self);
5222     }
5223    
5224     #
5225     } else {
5226    
5227     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5228     line => $self->{line},
5229     column => $self->{column});
5230     ## Reconsume.
5231     #
5232     }
5233    
5234 wakaba 1.12 my $code = $self->{kwd};
5235 wakaba 1.1 my $l = $self->{line_prev};
5236     my $c = $self->{column_prev};
5237 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5238     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5239     ($self->{is_xml} and $code == 0x0000)) {
5240 wakaba 1.1
5241     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5242     text => (sprintf 'U+%04X', $code),
5243     line => $l, column => $c);
5244     $code = $charref_map->{$code};
5245     } elsif ($code > 0x10FFFF) {
5246    
5247     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5248     text => (sprintf 'U-%08X', $code),
5249     line => $l, column => $c);
5250     $code = 0xFFFD;
5251     }
5252    
5253     if ($self->{prev_state} == DATA_STATE) {
5254    
5255     $self->{state} = $self->{prev_state};
5256 wakaba 1.5 $self->{s_kwd} = '';
5257 wakaba 1.1 ## Reconsume.
5258     return ({type => CHARACTER_TOKEN, data => chr $code,
5259 wakaba 1.7 has_reference => 1,
5260 wakaba 1.1 line => $l, column => $c,
5261     });
5262     redo A;
5263     } else {
5264    
5265     $self->{ca}->{value} .= chr $code;
5266     $self->{ca}->{has_reference} = 1;
5267     $self->{state} = $self->{prev_state};
5268 wakaba 1.5 $self->{s_kwd} = '';
5269 wakaba 1.1 ## Reconsume.
5270     redo A;
5271     }
5272     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5273 wakaba 1.21 if ((0x0041 <= $self->{nc} and # a
5274     $self->{nc} <= 0x005A) or # x
5275     (0x0061 <= $self->{nc} and # a
5276     $self->{nc} <= 0x007A) or # z
5277     (0x0030 <= $self->{nc} and # 0
5278     $self->{nc} <= 0x0039) or # 9
5279 wakaba 1.22 $self->{nc} == 0x003B or # ;
5280     ($self->{is_xml} and
5281     not ($is_space->{$self->{nc}} or
5282     {
5283     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5284     $self->{entity_add} => 1,
5285     }->{$self->{nc}}))) {
5286 wakaba 1.1 our $EntityChar;
5287 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5288 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5289     $self->{ge}->{$self->{kwd}}) {
5290 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5291 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5292     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5293    
5294     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5295     } else {
5296     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5297    
5298     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5299     value => $self->{kwd});
5300     } else {
5301    
5302     }
5303     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5304     }
5305     } else {
5306     if ($self->{is_xml}) {
5307    
5308     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5309     value => $self->{kwd},
5310     level => {
5311     'amp;' => $self->{level}->{warn},
5312     'quot;' => $self->{level}->{warn},
5313     'lt;' => $self->{level}->{warn},
5314     'gt;' => $self->{level}->{warn},
5315     'apos;' => $self->{level}->{warn},
5316     }->{$self->{kwd}} ||
5317     $self->{level}->{must});
5318     } else {
5319    
5320     }
5321     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5322     }
5323 wakaba 1.1 $self->{entity__match} = 1;
5324    
5325     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5326     $self->{line_prev} = $self->{line};
5327     $self->{column_prev} = $self->{column};
5328     $self->{column}++;
5329     $self->{nc}
5330     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5331     } else {
5332     $self->{set_nc}->($self);
5333     }
5334    
5335     #
5336     } else {
5337    
5338 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5339 wakaba 1.1 $self->{entity__match} = -1;
5340     ## Stay in the state.
5341    
5342     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5343     $self->{line_prev} = $self->{line};
5344     $self->{column_prev} = $self->{column};
5345     $self->{column}++;
5346     $self->{nc}
5347     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5348     } else {
5349     $self->{set_nc}->($self);
5350     }
5351    
5352     redo A;
5353     }
5354     } else {
5355    
5356     $self->{entity__value} .= chr $self->{nc};
5357     $self->{entity__match} *= 2;
5358     ## Stay in the state.
5359    
5360     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5361     $self->{line_prev} = $self->{line};
5362     $self->{column_prev} = $self->{column};
5363     $self->{column}++;
5364     $self->{nc}
5365     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5366     } else {
5367     $self->{set_nc}->($self);
5368     }
5369    
5370     redo A;
5371     }
5372     }
5373    
5374     my $data;
5375     my $has_ref;
5376     if ($self->{entity__match} > 0) {
5377    
5378     $data = $self->{entity__value};
5379     $has_ref = 1;
5380     #
5381     } elsif ($self->{entity__match} < 0) {
5382     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5383     if ($self->{prev_state} != DATA_STATE and # in attribute
5384     $self->{entity__match} < -1) {
5385    
5386 wakaba 1.12 $data = '&' . $self->{kwd};
5387 wakaba 1.1 #
5388     } else {
5389    
5390     $data = $self->{entity__value};
5391     $has_ref = 1;
5392     #
5393     }
5394     } else {
5395    
5396     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5397     line => $self->{line_prev},
5398 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5399     $data = '&' . $self->{kwd};
5400 wakaba 1.1 #
5401     }
5402    
5403     ## NOTE: In these cases, when a character reference is found,
5404     ## it is consumed and a character token is returned, or, otherwise,
5405     ## nothing is consumed and returned, according to the spec algorithm.
5406     ## In this implementation, anything that has been examined by the
5407     ## tokenizer is appended to the parent element or the attribute value
5408     ## as string, either literal string when no character reference or
5409     ## entity-replaced string otherwise, in this stage, since any characters
5410     ## that would not be consumed are appended in the data state or in an
5411     ## appropriate attribute value state anyway.
5412    
5413     if ($self->{prev_state} == DATA_STATE) {
5414    
5415     $self->{state} = $self->{prev_state};
5416 wakaba 1.5 $self->{s_kwd} = '';
5417 wakaba 1.1 ## Reconsume.
5418     return ({type => CHARACTER_TOKEN,
5419     data => $data,
5420 wakaba 1.7 has_reference => $has_ref,
5421 wakaba 1.1 line => $self->{line_prev},
5422 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5423 wakaba 1.1 });
5424     redo A;
5425     } else {
5426    
5427     $self->{ca}->{value} .= $data;
5428     $self->{ca}->{has_reference} = 1 if $has_ref;
5429     $self->{state} = $self->{prev_state};
5430 wakaba 1.5 $self->{s_kwd} = '';
5431 wakaba 1.1 ## Reconsume.
5432     redo A;
5433     }
5434 wakaba 1.8
5435     ## XML-only states
5436    
5437     } elsif ($self->{state} == PI_STATE) {
5438 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5439    
5440 wakaba 1.8 if ($is_space->{$self->{nc}} or
5441 wakaba 1.14 $self->{nc} == 0x003F or # ?
5442 wakaba 1.8 $self->{nc} == -1) {
5443 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5444     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5445     ## "DOCTYPE pi state": Parse error, switch to the "data
5446     ## state".
5447 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5448     line => $self->{line_prev},
5449     column => $self->{column_prev}
5450     - 1 * ($self->{nc} != -1));
5451     $self->{state} = BOGUS_COMMENT_STATE;
5452     ## Reconsume.
5453     $self->{ct} = {type => COMMENT_TOKEN,
5454     data => '?',
5455     line => $self->{line_prev},
5456     column => $self->{column_prev}
5457     - 1 * ($self->{nc} != -1),
5458     };
5459     redo A;
5460     } else {
5461 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5462 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5463     target => chr $self->{nc},
5464     data => '',
5465     line => $self->{line_prev},
5466     column => $self->{column_prev} - 1,
5467     };
5468     $self->{state} = PI_TARGET_STATE;
5469    
5470     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5471     $self->{line_prev} = $self->{line};
5472     $self->{column_prev} = $self->{column};
5473     $self->{column}++;
5474     $self->{nc}
5475     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5476     } else {
5477     $self->{set_nc}->($self);
5478     }
5479    
5480     redo A;
5481     }
5482     } elsif ($self->{state} == PI_TARGET_STATE) {
5483     if ($is_space->{$self->{nc}}) {
5484     $self->{state} = PI_TARGET_AFTER_STATE;
5485    
5486     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5487     $self->{line_prev} = $self->{line};
5488     $self->{column_prev} = $self->{column};
5489     $self->{column}++;
5490     $self->{nc}
5491     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5492     } else {
5493     $self->{set_nc}->($self);
5494     }
5495    
5496     redo A;
5497     } elsif ($self->{nc} == -1) {
5498     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5499 wakaba 1.13 if ($self->{in_subset}) {
5500     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5501     } else {
5502     $self->{state} = DATA_STATE;
5503     $self->{s_kwd} = '';
5504     }
5505 wakaba 1.8 ## Reconsume.
5506     return ($self->{ct}); # pi
5507     redo A;
5508     } elsif ($self->{nc} == 0x003F) { # ?
5509     $self->{state} = PI_AFTER_STATE;
5510    
5511     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5512     $self->{line_prev} = $self->{line};
5513     $self->{column_prev} = $self->{column};
5514     $self->{column}++;
5515     $self->{nc}
5516     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5517     } else {
5518     $self->{set_nc}->($self);
5519     }
5520    
5521     redo A;
5522     } else {
5523     ## XML5: typo ("tag name" -> "target")
5524     $self->{ct}->{target} .= chr $self->{nc}; # pi
5525    
5526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5527     $self->{line_prev} = $self->{line};
5528     $self->{column_prev} = $self->{column};
5529     $self->{column}++;
5530     $self->{nc}
5531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5532     } else {
5533     $self->{set_nc}->($self);
5534     }
5535    
5536     redo A;
5537     }
5538     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5539     if ($is_space->{$self->{nc}}) {
5540     ## Stay in the state.
5541    
5542     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5543     $self->{line_prev} = $self->{line};
5544     $self->{column_prev} = $self->{column};
5545     $self->{column}++;
5546     $self->{nc}
5547     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5548     } else {
5549     $self->{set_nc}->($self);
5550     }
5551    
5552     redo A;
5553     } else {
5554     $self->{state} = PI_DATA_STATE;
5555     ## Reprocess.
5556     redo A;
5557     }
5558     } elsif ($self->{state} == PI_DATA_STATE) {
5559     if ($self->{nc} == 0x003F) { # ?
5560     $self->{state} = PI_DATA_AFTER_STATE;
5561    
5562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5563     $self->{line_prev} = $self->{line};
5564     $self->{column_prev} = $self->{column};
5565     $self->{column}++;
5566     $self->{nc}
5567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5568     } else {
5569     $self->{set_nc}->($self);
5570     }
5571    
5572     redo A;
5573     } elsif ($self->{nc} == -1) {
5574     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5575 wakaba 1.13 if ($self->{in_subset}) {
5576 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5577 wakaba 1.13 } else {
5578     $self->{state} = DATA_STATE;
5579     $self->{s_kwd} = '';
5580     }
5581 wakaba 1.8 ## Reprocess.
5582     return ($self->{ct}); # pi
5583     redo A;
5584     } else {
5585     $self->{ct}->{data} .= chr $self->{nc}; # pi
5586     $self->{read_until}->($self->{ct}->{data}, q[?],
5587     length $self->{ct}->{data});
5588     ## Stay in the state.
5589    
5590     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5591     $self->{line_prev} = $self->{line};
5592     $self->{column_prev} = $self->{column};
5593     $self->{column}++;
5594     $self->{nc}
5595     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5596     } else {
5597     $self->{set_nc}->($self);
5598     }
5599    
5600     ## Reprocess.
5601     redo A;
5602     }
5603     } elsif ($self->{state} == PI_AFTER_STATE) {
5604 wakaba 1.14 ## XML5: Part of "Pi after state".
5605    
5606 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5607 wakaba 1.13 if ($self->{in_subset}) {
5608     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5609     } else {
5610     $self->{state} = DATA_STATE;
5611     $self->{s_kwd} = '';
5612     }
5613 wakaba 1.8
5614     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5615     $self->{line_prev} = $self->{line};
5616     $self->{column_prev} = $self->{column};
5617     $self->{column}++;
5618     $self->{nc}
5619     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5620     } else {
5621     $self->{set_nc}->($self);
5622     }
5623    
5624     return ($self->{ct}); # pi
5625     redo A;
5626     } elsif ($self->{nc} == 0x003F) { # ?
5627     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5628     line => $self->{line_prev},
5629     column => $self->{column_prev}); ## XML5: no error
5630     $self->{ct}->{data} .= '?';
5631     $self->{state} = PI_DATA_AFTER_STATE;
5632    
5633     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5634     $self->{line_prev} = $self->{line};
5635     $self->{column_prev} = $self->{column};
5636     $self->{column}++;
5637     $self->{nc}
5638     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5639     } else {
5640     $self->{set_nc}->($self);
5641     }
5642    
5643     redo A;
5644     } else {
5645     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5646     line => $self->{line_prev},
5647     column => $self->{column_prev}
5648     + 1 * ($self->{nc} == -1)); ## XML5: no error
5649     $self->{ct}->{data} .= '?'; ## XML5: not appended
5650     $self->{state} = PI_DATA_STATE;
5651     ## Reprocess.
5652     redo A;
5653     }
5654     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5655 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5656    
5657 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5658 wakaba 1.13 if ($self->{in_subset}) {
5659     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5660     } else {
5661     $self->{state} = DATA_STATE;
5662     $self->{s_kwd} = '';
5663     }
5664 wakaba 1.8
5665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5666     $self->{line_prev} = $self->{line};
5667     $self->{column_prev} = $self->{column};
5668     $self->{column}++;
5669     $self->{nc}
5670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5671     } else {
5672     $self->{set_nc}->($self);
5673     }
5674    
5675     return ($self->{ct}); # pi
5676     redo A;
5677     } elsif ($self->{nc} == 0x003F) { # ?
5678     $self->{ct}->{data} .= '?';
5679     ## Stay in the state.
5680    
5681     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5682     $self->{line_prev} = $self->{line};
5683     $self->{column_prev} = $self->{column};
5684     $self->{column}++;
5685     $self->{nc}
5686     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5687     } else {
5688     $self->{set_nc}->($self);
5689     }
5690    
5691     redo A;
5692     } else {
5693     $self->{ct}->{data} .= '?'; ## XML5: not appended
5694     $self->{state} = PI_DATA_STATE;
5695     ## Reprocess.
5696     redo A;
5697     }
5698 wakaba 1.12
5699     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5700     if ($self->{nc} == 0x003C) { # <
5701 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5702 wakaba 1.12
5703     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5704     $self->{line_prev} = $self->{line};
5705     $self->{column_prev} = $self->{column};
5706     $self->{column}++;
5707     $self->{nc}
5708     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5709     } else {
5710     $self->{set_nc}->($self);
5711     }
5712    
5713     redo A;
5714     } elsif ($self->{nc} == 0x0025) { # %
5715     ## XML5: Not defined yet.
5716    
5717     ## TODO:
5718 wakaba 1.24
5719     if (not $self->{stop_processing} and
5720     not $self->{document}->xml_standalone) {
5721     $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5722     level => $self->{level}->{info});
5723     $self->{stop_processing} = 1;
5724     }
5725    
5726 wakaba 1.12
5727     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5728     $self->{line_prev} = $self->{line};
5729     $self->{column_prev} = $self->{column};
5730     $self->{column}++;
5731     $self->{nc}
5732     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5733     } else {
5734     $self->{set_nc}->($self);
5735     }
5736    
5737     redo A;
5738     } elsif ($self->{nc} == 0x005D) { # ]
5739 wakaba 1.13 delete $self->{in_subset};
5740 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5741    
5742     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5743     $self->{line_prev} = $self->{line};
5744     $self->{column_prev} = $self->{column};
5745     $self->{column}++;
5746     $self->{nc}
5747     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5748     } else {
5749     $self->{set_nc}->($self);
5750     }
5751    
5752     redo A;
5753     } elsif ($is_space->{$self->{nc}}) {
5754     ## Stay in the state.
5755    
5756     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5757     $self->{line_prev} = $self->{line};
5758     $self->{column_prev} = $self->{column};
5759     $self->{column}++;
5760     $self->{nc}
5761     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5762     } else {
5763     $self->{set_nc}->($self);
5764     }
5765    
5766     redo A;
5767     } elsif ($self->{nc} == -1) {
5768     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5769 wakaba 1.13 delete $self->{in_subset};
5770 wakaba 1.12 $self->{state} = DATA_STATE;
5771     $self->{s_kwd} = '';
5772     ## Reconsume.
5773 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5774 wakaba 1.12 redo A;
5775     } else {
5776     unless ($self->{internal_subset_tainted}) {
5777     ## XML5: No parse error.
5778     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5779     $self->{internal_subset_tainted} = 1;
5780     }
5781     ## Stay in the state.
5782    
5783     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5784     $self->{line_prev} = $self->{line};
5785     $self->{column_prev} = $self->{column};
5786     $self->{column}++;
5787     $self->{nc}
5788     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5789     } else {
5790     $self->{set_nc}->($self);
5791     }
5792    
5793     redo A;
5794     }
5795     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5796     if ($self->{nc} == 0x003E) { # >
5797     $self->{state} = DATA_STATE;
5798     $self->{s_kwd} = '';
5799    
5800     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5801     $self->{line_prev} = $self->{line};
5802     $self->{column_prev} = $self->{column};
5803     $self->{column}++;
5804     $self->{nc}
5805     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5806     } else {
5807     $self->{set_nc}->($self);
5808     }
5809    
5810 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5811 wakaba 1.12 redo A;
5812     } elsif ($self->{nc} == -1) {
5813     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5814     $self->{state} = DATA_STATE;
5815     $self->{s_kwd} = '';
5816     ## Reconsume.
5817 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5818 wakaba 1.12 redo A;
5819     } else {
5820     ## XML5: No parse error and stay in the state.
5821     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5822    
5823 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5824    
5825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5826     $self->{line_prev} = $self->{line};
5827     $self->{column_prev} = $self->{column};
5828     $self->{column}++;
5829     $self->{nc}
5830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5831     } else {
5832     $self->{set_nc}->($self);
5833     }
5834    
5835     redo A;
5836     }
5837     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5838     if ($self->{nc} == 0x003E) { # >
5839     $self->{state} = DATA_STATE;
5840     $self->{s_kwd} = '';
5841    
5842     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5843     $self->{line_prev} = $self->{line};
5844     $self->{column_prev} = $self->{column};
5845     $self->{column}++;
5846     $self->{nc}
5847     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5848     } else {
5849     $self->{set_nc}->($self);
5850     }
5851    
5852     return ({type => END_OF_DOCTYPE_TOKEN});
5853     redo A;
5854     } elsif ($self->{nc} == -1) {
5855     $self->{state} = DATA_STATE;
5856     $self->{s_kwd} = '';
5857     ## Reconsume.
5858     return ({type => END_OF_DOCTYPE_TOKEN});
5859     redo A;
5860     } else {
5861     ## Stay in the state.
5862    
5863     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5864     $self->{line_prev} = $self->{line};
5865     $self->{column_prev} = $self->{column};
5866     $self->{column}++;
5867     $self->{nc}
5868     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5869     } else {
5870     $self->{set_nc}->($self);
5871     }
5872    
5873     redo A;
5874     }
5875     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5876     if ($self->{nc} == 0x0021) { # !
5877 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5878 wakaba 1.13
5879     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5880     $self->{line_prev} = $self->{line};
5881     $self->{column_prev} = $self->{column};
5882     $self->{column}++;
5883     $self->{nc}
5884     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5885     } else {
5886     $self->{set_nc}->($self);
5887     }
5888    
5889     redo A;
5890     } elsif ($self->{nc} == 0x003F) { # ?
5891     $self->{state} = PI_STATE;
5892    
5893     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5894     $self->{line_prev} = $self->{line};
5895     $self->{column_prev} = $self->{column};
5896     $self->{column}++;
5897     $self->{nc}
5898     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5899     } else {
5900     $self->{set_nc}->($self);
5901     }
5902    
5903     redo A;
5904     } elsif ($self->{nc} == -1) {
5905     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5906     $self->{state} = DATA_STATE;
5907     $self->{s_kwd} = '';
5908     ## Reconsume.
5909     redo A;
5910     } else {
5911     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5912     line => $self->{line_prev},
5913     column => $self->{column_prev});
5914     $self->{state} = BOGUS_COMMENT_STATE;
5915     $self->{ct} = {type => COMMENT_TOKEN,
5916     data => '',
5917     }; ## NOTE: Will be discarded.
5918 wakaba 1.12
5919     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5920     $self->{line_prev} = $self->{line};
5921     $self->{column_prev} = $self->{column};
5922     $self->{column}++;
5923     $self->{nc}
5924     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5925     } else {
5926     $self->{set_nc}->($self);
5927     }
5928    
5929     redo A;
5930     }
5931 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5932     ## XML5: "DOCTYPE markup declaration state".
5933    
5934     if ($self->{nc} == 0x002D) { # -
5935     $self->{state} = MD_HYPHEN_STATE;
5936    
5937     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5938     $self->{line_prev} = $self->{line};
5939     $self->{column_prev} = $self->{column};
5940     $self->{column}++;
5941     $self->{nc}
5942     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5943     } else {
5944     $self->{set_nc}->($self);
5945     }
5946    
5947     redo A;
5948 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5949     $self->{nc} == 0x0065) { # e
5950 wakaba 1.14 $self->{state} = MD_E_STATE;
5951     $self->{kwd} = chr $self->{nc};
5952    
5953     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5954     $self->{line_prev} = $self->{line};
5955     $self->{column_prev} = $self->{column};
5956     $self->{column}++;
5957     $self->{nc}
5958     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5959     } else {
5960     $self->{set_nc}->($self);
5961     }
5962    
5963     redo A;
5964 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5965     $self->{nc} == 0x0061) { # a
5966 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5967     $self->{kwd} = chr $self->{nc};
5968    
5969     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5970     $self->{line_prev} = $self->{line};
5971     $self->{column_prev} = $self->{column};
5972     $self->{column}++;
5973     $self->{nc}
5974     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5975     } else {
5976     $self->{set_nc}->($self);
5977     }
5978    
5979     redo A;
5980 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5981     $self->{nc} == 0x006E) { # n
5982 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5983     $self->{kwd} = chr $self->{nc};
5984    
5985     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5986     $self->{line_prev} = $self->{line};
5987     $self->{column_prev} = $self->{column};
5988     $self->{column}++;
5989     $self->{nc}
5990     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5991     } else {
5992     $self->{set_nc}->($self);
5993     }
5994    
5995     redo A;
5996     } else {
5997     #
5998     }
5999    
6000     ## XML5: No parse error.
6001     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6002     line => $self->{line_prev},
6003     column => $self->{column_prev} - 1);
6004     ## Reconsume.
6005     $self->{state} = BOGUS_COMMENT_STATE;
6006     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
6007     redo A;
6008     } elsif ($self->{state} == MD_E_STATE) {
6009 wakaba 1.17 if ($self->{nc} == 0x004E or # N
6010     $self->{nc} == 0x006E) { # n
6011 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
6012     $self->{kwd} .= chr $self->{nc};
6013    
6014     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6015     $self->{line_prev} = $self->{line};
6016     $self->{column_prev} = $self->{column};
6017     $self->{column}++;
6018     $self->{nc}
6019     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6020     } else {
6021     $self->{set_nc}->($self);
6022     }
6023    
6024     redo A;
6025 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
6026     $self->{nc} == 0x006C) { # l
6027 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
6028     $self->{state} = MD_ELEMENT_STATE;
6029     $self->{kwd} .= chr $self->{nc};
6030    
6031     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6032     $self->{line_prev} = $self->{line};
6033     $self->{column_prev} = $self->{column};
6034     $self->{column}++;
6035     $self->{nc}
6036     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6037     } else {
6038     $self->{set_nc}->($self);
6039     }
6040    
6041     redo A;
6042     } else {
6043     ## XML5: No parse error.
6044     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6045     line => $self->{line_prev},
6046     column => $self->{column_prev} - 2
6047     + 1 * ($self->{nc} == -1));
6048     ## Reconsume.
6049     $self->{state} = BOGUS_COMMENT_STATE;
6050     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6051     redo A;
6052     }
6053     } elsif ($self->{state} == MD_ENTITY_STATE) {
6054 wakaba 1.17 if ($self->{nc} == [
6055     undef,
6056     undef,
6057     0x0054, # T
6058     0x0049, # I
6059     0x0054, # T
6060     ]->[length $self->{kwd}] or
6061     $self->{nc} == [
6062     undef,
6063     undef,
6064     0x0074, # t
6065     0x0069, # i
6066     0x0074, # t
6067     ]->[length $self->{kwd}]) {
6068 wakaba 1.14 ## Stay in the state.
6069     $self->{kwd} .= chr $self->{nc};
6070    
6071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6072     $self->{line_prev} = $self->{line};
6073     $self->{column_prev} = $self->{column};
6074     $self->{column}++;
6075     $self->{nc}
6076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6077     } else {
6078     $self->{set_nc}->($self);
6079     }
6080    
6081     redo A;
6082 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
6083     ($self->{nc} == 0x0059 or # Y
6084     $self->{nc} == 0x0079)) { # y
6085     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
6086     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6087     text => 'ENTITY',
6088     line => $self->{line_prev},
6089     column => $self->{column_prev} - 4);
6090     }
6091     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
6092 wakaba 1.14 line => $self->{line_prev},
6093     column => $self->{column_prev} - 6};
6094     $self->{state} = DOCTYPE_MD_STATE;
6095    
6096     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6097     $self->{line_prev} = $self->{line};
6098     $self->{column_prev} = $self->{column};
6099     $self->{column}++;
6100     $self->{nc}
6101     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6102     } else {
6103     $self->{set_nc}->($self);
6104     }
6105    
6106     redo A;
6107     } else {
6108     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6109     line => $self->{line_prev},
6110     column => $self->{column_prev} - 1
6111     - (length $self->{kwd})
6112     + 1 * ($self->{nc} == -1));
6113     $self->{state} = BOGUS_COMMENT_STATE;
6114     ## Reconsume.
6115     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6116     redo A;
6117     }
6118     } elsif ($self->{state} == MD_ELEMENT_STATE) {
6119 wakaba 1.17 if ($self->{nc} == [
6120     undef,
6121     undef,
6122     0x0045, # E
6123     0x004D, # M
6124     0x0045, # E
6125     0x004E, # N
6126     ]->[length $self->{kwd}] or
6127     $self->{nc} == [
6128     undef,
6129     undef,
6130     0x0065, # e
6131     0x006D, # m
6132     0x0065, # e
6133     0x006E, # n
6134     ]->[length $self->{kwd}]) {
6135 wakaba 1.14 ## Stay in the state.
6136     $self->{kwd} .= chr $self->{nc};
6137    
6138     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6139     $self->{line_prev} = $self->{line};
6140     $self->{column_prev} = $self->{column};
6141     $self->{column}++;
6142     $self->{nc}
6143     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6144     } else {
6145     $self->{set_nc}->($self);
6146     }
6147    
6148     redo A;
6149 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6150     ($self->{nc} == 0x0054 or # T
6151     $self->{nc} == 0x0074)) { # t
6152     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
6153     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6154     text => 'ELEMENT',
6155     line => $self->{line_prev},
6156     column => $self->{column_prev} - 5);
6157     }
6158 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6159     line => $self->{line_prev},
6160 wakaba 1.23 column => $self->{column_prev} - 7};
6161 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6162    
6163     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6164     $self->{line_prev} = $self->{line};
6165     $self->{column_prev} = $self->{column};
6166     $self->{column}++;
6167     $self->{nc}
6168     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6169     } else {
6170     $self->{set_nc}->($self);
6171     }
6172    
6173     redo A;
6174     } else {
6175     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6176     line => $self->{line_prev},
6177     column => $self->{column_prev} - 1
6178     - (length $self->{kwd})
6179     + 1 * ($self->{nc} == -1));
6180     $self->{state} = BOGUS_COMMENT_STATE;
6181     ## Reconsume.
6182     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6183     redo A;
6184     }
6185     } elsif ($self->{state} == MD_ATTLIST_STATE) {
6186 wakaba 1.17 if ($self->{nc} == [
6187     undef,
6188     0x0054, # T
6189     0x0054, # T
6190     0x004C, # L
6191     0x0049, # I
6192     0x0053, # S
6193     ]->[length $self->{kwd}] or
6194     $self->{nc} == [
6195     undef,
6196     0x0074, # t
6197     0x0074, # t
6198     0x006C, # l
6199     0x0069, # i
6200     0x0073, # s
6201     ]->[length $self->{kwd}]) {
6202 wakaba 1.14 ## Stay in the state.
6203     $self->{kwd} .= chr $self->{nc};
6204    
6205     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6206     $self->{line_prev} = $self->{line};
6207     $self->{column_prev} = $self->{column};
6208     $self->{column}++;
6209     $self->{nc}
6210     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6211     } else {
6212     $self->{set_nc}->($self);
6213     }
6214    
6215     redo A;
6216 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6217     ($self->{nc} == 0x0054 or # T
6218     $self->{nc} == 0x0074)) { # t
6219     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6220     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6221     text => 'ATTLIST',
6222     line => $self->{line_prev},
6223     column => $self->{column_prev} - 5);
6224     }
6225 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6226 wakaba 1.15 attrdefs => [],
6227 wakaba 1.14 line => $self->{line_prev},
6228 wakaba 1.23 column => $self->{column_prev} - 7};
6229 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6230    
6231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6232     $self->{line_prev} = $self->{line};
6233     $self->{column_prev} = $self->{column};
6234     $self->{column}++;
6235     $self->{nc}
6236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6237     } else {
6238     $self->{set_nc}->($self);
6239     }
6240    
6241     redo A;
6242     } else {
6243     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6244     line => $self->{line_prev},
6245     column => $self->{column_prev} - 1
6246     - (length $self->{kwd})
6247     + 1 * ($self->{nc} == -1));
6248     $self->{state} = BOGUS_COMMENT_STATE;
6249     ## Reconsume.
6250     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6251     redo A;
6252     }
6253     } elsif ($self->{state} == MD_NOTATION_STATE) {
6254 wakaba 1.17 if ($self->{nc} == [
6255     undef,
6256     0x004F, # O
6257     0x0054, # T
6258     0x0041, # A
6259     0x0054, # T
6260     0x0049, # I
6261     0x004F, # O
6262     ]->[length $self->{kwd}] or
6263     $self->{nc} == [
6264     undef,
6265     0x006F, # o
6266     0x0074, # t
6267     0x0061, # a
6268     0x0074, # t
6269     0x0069, # i
6270     0x006F, # o
6271     ]->[length $self->{kwd}]) {
6272 wakaba 1.14 ## Stay in the state.
6273     $self->{kwd} .= chr $self->{nc};
6274    
6275     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6276     $self->{line_prev} = $self->{line};
6277     $self->{column_prev} = $self->{column};
6278     $self->{column}++;
6279     $self->{nc}
6280     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6281     } else {
6282     $self->{set_nc}->($self);
6283     }
6284    
6285     redo A;
6286 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6287     ($self->{nc} == 0x004E or # N
6288     $self->{nc} == 0x006E)) { # n
6289     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6290     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6291     text => 'NOTATION',
6292     line => $self->{line_prev},
6293     column => $self->{column_prev} - 6);
6294     }
6295 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6296     line => $self->{line_prev},
6297 wakaba 1.23 column => $self->{column_prev} - 8};
6298 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6299    
6300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6301     $self->{line_prev} = $self->{line};
6302     $self->{column_prev} = $self->{column};
6303     $self->{column}++;
6304     $self->{nc}
6305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6306     } else {
6307     $self->{set_nc}->($self);
6308     }
6309    
6310     redo A;
6311     } else {
6312     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6313     line => $self->{line_prev},
6314     column => $self->{column_prev} - 1
6315     - (length $self->{kwd})
6316     + 1 * ($self->{nc} == -1));
6317     $self->{state} = BOGUS_COMMENT_STATE;
6318     ## Reconsume.
6319     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6320     redo A;
6321     }
6322     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6323     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6324     ## "DOCTYPE NOTATION state".
6325    
6326     if ($is_space->{$self->{nc}}) {
6327     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6328     $self->{state} = BEFORE_MD_NAME_STATE;
6329    
6330     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6331     $self->{line_prev} = $self->{line};
6332     $self->{column_prev} = $self->{column};
6333     $self->{column}++;
6334     $self->{nc}
6335     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6336     } else {
6337     $self->{set_nc}->($self);
6338     }
6339    
6340     redo A;
6341     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6342     $self->{nc} == 0x0025) { # %
6343     ## XML5: Switch to the "DOCTYPE bogus comment state".
6344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6345     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6346    
6347     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6348     $self->{line_prev} = $self->{line};
6349     $self->{column_prev} = $self->{column};
6350     $self->{column}++;
6351     $self->{nc}
6352     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6353     } else {
6354     $self->{set_nc}->($self);
6355     }
6356    
6357     redo A;
6358     } elsif ($self->{nc} == -1) {
6359     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6360     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6361     ## Reconsume.
6362     redo A;
6363     } elsif ($self->{nc} == 0x003E) { # >
6364     ## XML5: Switch to the "DOCTYPE bogus comment state".
6365     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6366     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6367    
6368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6369     $self->{line_prev} = $self->{line};
6370     $self->{column_prev} = $self->{column};
6371     $self->{column}++;
6372     $self->{nc}
6373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6374     } else {
6375     $self->{set_nc}->($self);
6376     }
6377    
6378     redo A;
6379     } else {
6380     ## XML5: Switch to the "DOCTYPE bogus comment state".
6381     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6382     $self->{state} = BEFORE_MD_NAME_STATE;
6383     redo A;
6384     }
6385     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6386     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6387     ## before state", "DOCTYPE ATTLIST name before state".
6388    
6389     if ($is_space->{$self->{nc}}) {
6390     ## Stay in the state.
6391    
6392     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6393     $self->{line_prev} = $self->{line};
6394     $self->{column_prev} = $self->{column};
6395     $self->{column}++;
6396     $self->{nc}
6397     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6398     } else {
6399     $self->{set_nc}->($self);
6400     }
6401    
6402     redo A;
6403     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6404     $self->{nc} == 0x0025) { # %
6405     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6406    
6407     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6408     $self->{line_prev} = $self->{line};
6409     $self->{column_prev} = $self->{column};
6410     $self->{column}++;
6411     $self->{nc}
6412     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6413     } else {
6414     $self->{set_nc}->($self);
6415     }
6416    
6417     redo A;
6418     } elsif ($self->{nc} == 0x003E) { # >
6419     ## XML5: Same as "Anything else".
6420     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6421     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6422    
6423     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6424     $self->{line_prev} = $self->{line};
6425     $self->{column_prev} = $self->{column};
6426     $self->{column}++;
6427     $self->{nc}
6428     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6429     } else {
6430     $self->{set_nc}->($self);
6431     }
6432    
6433     redo A;
6434     } elsif ($self->{nc} == -1) {
6435     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6436     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6437     ## Reconsume.
6438     redo A;
6439     } else {
6440     ## XML5: [ATTLIST] Not defined yet.
6441     $self->{ct}->{name} .= chr $self->{nc};
6442     $self->{state} = MD_NAME_STATE;
6443    
6444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6445     $self->{line_prev} = $self->{line};
6446     $self->{column_prev} = $self->{column};
6447     $self->{column}++;
6448     $self->{nc}
6449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6450     } else {
6451     $self->{set_nc}->($self);
6452     }
6453    
6454     redo A;
6455     }
6456     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6457     if ($is_space->{$self->{nc}}) {
6458     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6459     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6460     $self->{state} = BEFORE_MD_NAME_STATE;
6461 wakaba 1.8
6462 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6463     $self->{line_prev} = $self->{line};
6464     $self->{column_prev} = $self->{column};
6465     $self->{column}++;
6466     $self->{nc}
6467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6468     } else {
6469     $self->{set_nc}->($self);
6470     }
6471    
6472     redo A;
6473     } elsif ($self->{nc} == 0x003E) { # >
6474     ## XML5: Same as "Anything else".
6475     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6476     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6477    
6478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6479     $self->{line_prev} = $self->{line};
6480     $self->{column_prev} = $self->{column};
6481     $self->{column}++;
6482     $self->{nc}
6483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6484     } else {
6485     $self->{set_nc}->($self);
6486     }
6487    
6488     redo A;
6489     } elsif ($self->{nc} == -1) {
6490     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6491     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6492     ## Reconsume.
6493     redo A;
6494     } else {
6495     ## XML5: No parse error.
6496     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6497     $self->{state} = BOGUS_COMMENT_STATE;
6498     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6499     ## Reconsume.
6500     redo A;
6501     }
6502     } elsif ($self->{state} == MD_NAME_STATE) {
6503     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6504    
6505     if ($is_space->{$self->{nc}}) {
6506 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6507     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6508     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6509 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6510 wakaba 1.16 } else { # ENTITY/NOTATION
6511     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6512     }
6513 wakaba 1.14
6514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6515     $self->{line_prev} = $self->{line};
6516     $self->{column_prev} = $self->{column};
6517     $self->{column}++;
6518     $self->{nc}
6519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6520     } else {
6521     $self->{set_nc}->($self);
6522     }
6523    
6524     redo A;
6525     } elsif ($self->{nc} == 0x003E) { # >
6526     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6527     #
6528     } else {
6529 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6530 wakaba 1.14 }
6531     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6532    
6533     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6534     $self->{line_prev} = $self->{line};
6535     $self->{column_prev} = $self->{column};
6536     $self->{column}++;
6537     $self->{nc}
6538     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6539     } else {
6540     $self->{set_nc}->($self);
6541     }
6542    
6543     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6544     redo A;
6545     } elsif ($self->{nc} == -1) {
6546     ## XML5: [ATTLIST] No parse error.
6547     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6548     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6549     ## Reconsume.
6550     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6551     redo A;
6552     } else {
6553     ## XML5: [ATTLIST] Not defined yet.
6554     $self->{ct}->{name} .= chr $self->{nc};
6555     ## Stay in the state.
6556    
6557     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6558     $self->{line_prev} = $self->{line};
6559     $self->{column_prev} = $self->{column};
6560     $self->{column}++;
6561     $self->{nc}
6562     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6563     } else {
6564     $self->{set_nc}->($self);
6565     }
6566    
6567     redo A;
6568     }
6569     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6570     if ($is_space->{$self->{nc}}) {
6571     ## Stay in the state.
6572    
6573     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6574     $self->{line_prev} = $self->{line};
6575     $self->{column_prev} = $self->{column};
6576     $self->{column}++;
6577     $self->{nc}
6578     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6579     } else {
6580     $self->{set_nc}->($self);
6581     }
6582    
6583     redo A;
6584     } elsif ($self->{nc} == 0x003E) { # >
6585     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6586    
6587     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6588     $self->{line_prev} = $self->{line};
6589     $self->{column_prev} = $self->{column};
6590     $self->{column}++;
6591     $self->{nc}
6592     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6593     } else {
6594     $self->{set_nc}->($self);
6595     }
6596    
6597     return ($self->{ct}); # ATTLIST
6598     redo A;
6599     } elsif ($self->{nc} == -1) {
6600     ## XML5: No parse error.
6601     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6602     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6603 wakaba 1.15 return ($self->{ct});
6604 wakaba 1.14 redo A;
6605     } else {
6606     ## XML5: Not defined yet.
6607 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6608     tokens => [],
6609     line => $self->{line}, column => $self->{column}};
6610     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6611    
6612     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6613     $self->{line_prev} = $self->{line};
6614     $self->{column_prev} = $self->{column};
6615     $self->{column}++;
6616     $self->{nc}
6617     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6618     } else {
6619     $self->{set_nc}->($self);
6620     }
6621    
6622     redo A;
6623     }
6624     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6625     if ($is_space->{$self->{nc}}) {
6626     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6627    
6628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6629     $self->{line_prev} = $self->{line};
6630     $self->{column_prev} = $self->{column};
6631     $self->{column}++;
6632     $self->{nc}
6633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6634     } else {
6635     $self->{set_nc}->($self);
6636     }
6637    
6638     redo A;
6639     } elsif ($self->{nc} == 0x003E) { # >
6640     ## XML5: Same as "anything else".
6641     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6642     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6643    
6644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6645     $self->{line_prev} = $self->{line};
6646     $self->{column_prev} = $self->{column};
6647     $self->{column}++;
6648     $self->{nc}
6649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6650     } else {
6651     $self->{set_nc}->($self);
6652     }
6653    
6654     return ($self->{ct}); # ATTLIST
6655     redo A;
6656     } elsif ($self->{nc} == 0x0028) { # (
6657     ## XML5: Same as "anything else".
6658     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6659     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6660    
6661     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6662     $self->{line_prev} = $self->{line};
6663     $self->{column_prev} = $self->{column};
6664     $self->{column}++;
6665     $self->{nc}
6666     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6667     } else {
6668     $self->{set_nc}->($self);
6669     }
6670    
6671     redo A;
6672     } elsif ($self->{nc} == -1) {
6673     ## XML5: No parse error.
6674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6675     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6676    
6677     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6678     $self->{line_prev} = $self->{line};
6679     $self->{column_prev} = $self->{column};
6680     $self->{column}++;
6681     $self->{nc}
6682     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6683     } else {
6684     $self->{set_nc}->($self);
6685     }
6686    
6687     return ($self->{ct}); # ATTLIST
6688     redo A;
6689     } else {
6690     ## XML5: Not defined yet.
6691     $self->{ca}->{name} .= chr $self->{nc};
6692     ## Stay in the state.
6693    
6694     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6695     $self->{line_prev} = $self->{line};
6696     $self->{column_prev} = $self->{column};
6697     $self->{column}++;
6698     $self->{nc}
6699     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6700     } else {
6701     $self->{set_nc}->($self);
6702     }
6703    
6704 wakaba 1.14 redo A;
6705     }
6706 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6707     if ($is_space->{$self->{nc}}) {
6708     ## Stay in the state.
6709    
6710     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6711     $self->{line_prev} = $self->{line};
6712     $self->{column_prev} = $self->{column};
6713     $self->{column}++;
6714     $self->{nc}
6715     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6716     } else {
6717     $self->{set_nc}->($self);
6718     }
6719    
6720     redo A;
6721     } elsif ($self->{nc} == 0x003E) { # >
6722     ## XML5: Same as "anything else".
6723     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6724     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6725    
6726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6727     $self->{line_prev} = $self->{line};
6728     $self->{column_prev} = $self->{column};
6729     $self->{column}++;
6730     $self->{nc}
6731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6732     } else {
6733     $self->{set_nc}->($self);
6734     }
6735    
6736     return ($self->{ct}); # ATTLIST
6737     redo A;
6738     } elsif ($self->{nc} == 0x0028) { # (
6739     ## XML5: Same as "anything else".
6740     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6741    
6742     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6743     $self->{line_prev} = $self->{line};
6744     $self->{column_prev} = $self->{column};
6745     $self->{column}++;
6746     $self->{nc}
6747     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6748     } else {
6749     $self->{set_nc}->($self);
6750     }
6751    
6752     redo A;
6753     } elsif ($self->{nc} == -1) {
6754     ## XML5: No parse error.
6755     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6756     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6757    
6758     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6759     $self->{line_prev} = $self->{line};
6760     $self->{column_prev} = $self->{column};
6761     $self->{column}++;
6762     $self->{nc}
6763     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6764     } else {
6765     $self->{set_nc}->($self);
6766     }
6767    
6768     return ($self->{ct});
6769     redo A;
6770     } else {
6771     ## XML5: Not defined yet.
6772     $self->{ca}->{type} = chr $self->{nc};
6773     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6774    
6775     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6776     $self->{line_prev} = $self->{line};
6777     $self->{column_prev} = $self->{column};
6778     $self->{column}++;
6779     $self->{nc}
6780     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6781     } else {
6782     $self->{set_nc}->($self);
6783     }
6784    
6785     redo A;
6786     }
6787     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6788     if ($is_space->{$self->{nc}}) {
6789     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6790    
6791     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6792     $self->{line_prev} = $self->{line};
6793     $self->{column_prev} = $self->{column};
6794     $self->{column}++;
6795     $self->{nc}
6796     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6797     } else {
6798     $self->{set_nc}->($self);
6799     }
6800    
6801     redo A;
6802     } elsif ($self->{nc} == 0x0023) { # #
6803     ## XML5: Same as "anything else".
6804     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6805     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6806    
6807     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6808     $self->{line_prev} = $self->{line};
6809     $self->{column_prev} = $self->{column};
6810     $self->{column}++;
6811     $self->{nc}
6812     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6813     } else {
6814     $self->{set_nc}->($self);
6815     }
6816    
6817     redo A;
6818     } elsif ($self->{nc} == 0x0022) { # "
6819     ## XML5: Same as "anything else".
6820     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6821     $self->{ca}->{value} = '';
6822     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6823    
6824     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6825     $self->{line_prev} = $self->{line};
6826     $self->{column_prev} = $self->{column};
6827     $self->{column}++;
6828     $self->{nc}
6829     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6830     } else {
6831     $self->{set_nc}->($self);
6832     }
6833    
6834     redo A;
6835     } elsif ($self->{nc} == 0x0027) { # '
6836     ## XML5: Same as "anything else".
6837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6838     $self->{ca}->{value} = '';
6839     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6840    
6841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6842     $self->{line_prev} = $self->{line};
6843     $self->{column_prev} = $self->{column};
6844     $self->{column}++;
6845     $self->{nc}
6846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6847     } else {
6848     $self->{set_nc}->($self);
6849     }
6850    
6851     redo A;
6852     } elsif ($self->{nc} == 0x003E) { # >
6853     ## XML5: Same as "anything else".
6854     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6855     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6856    
6857     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6858     $self->{line_prev} = $self->{line};
6859     $self->{column_prev} = $self->{column};
6860     $self->{column}++;
6861     $self->{nc}
6862     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6863     } else {
6864     $self->{set_nc}->($self);
6865     }
6866    
6867     return ($self->{ct}); # ATTLIST
6868     redo A;
6869     } elsif ($self->{nc} == 0x0028) { # (
6870     ## XML5: Same as "anything else".
6871     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6872     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6873    
6874     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6875     $self->{line_prev} = $self->{line};
6876     $self->{column_prev} = $self->{column};
6877     $self->{column}++;
6878     $self->{nc}
6879     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6880     } else {
6881     $self->{set_nc}->($self);
6882     }
6883    
6884     redo A;
6885     } elsif ($self->{nc} == -1) {
6886     ## XML5: No parse error.
6887     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6888     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6889    
6890     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6891     $self->{line_prev} = $self->{line};
6892     $self->{column_prev} = $self->{column};
6893     $self->{column}++;
6894     $self->{nc}
6895     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6896     } else {
6897     $self->{set_nc}->($self);
6898     }
6899    
6900     return ($self->{ct});
6901     redo A;
6902     } else {
6903     ## XML5: Not defined yet.
6904     $self->{ca}->{type} .= chr $self->{nc};
6905     ## Stay in the state.
6906    
6907     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6908     $self->{line_prev} = $self->{line};
6909     $self->{column_prev} = $self->{column};
6910     $self->{column}++;
6911     $self->{nc}
6912     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6913     } else {
6914     $self->{set_nc}->($self);
6915     }
6916    
6917     redo A;
6918     }
6919     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6920     if ($is_space->{$self->{nc}}) {
6921     ## Stay in the state.
6922    
6923     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6924     $self->{line_prev} = $self->{line};
6925     $self->{column_prev} = $self->{column};
6926     $self->{column}++;
6927     $self->{nc}
6928     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6929     } else {
6930     $self->{set_nc}->($self);
6931     }
6932    
6933     redo A;
6934     } elsif ($self->{nc} == 0x0028) { # (
6935     ## XML5: Same as "anything else".
6936     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6937    
6938     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6939     $self->{line_prev} = $self->{line};
6940     $self->{column_prev} = $self->{column};
6941     $self->{column}++;
6942     $self->{nc}
6943     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6944     } else {
6945     $self->{set_nc}->($self);
6946     }
6947    
6948     redo A;
6949     } elsif ($self->{nc} == 0x0023) { # #
6950     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6951    
6952     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6953     $self->{line_prev} = $self->{line};
6954     $self->{column_prev} = $self->{column};
6955     $self->{column}++;
6956     $self->{nc}
6957     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6958     } else {
6959     $self->{set_nc}->($self);
6960     }
6961    
6962     redo A;
6963     } elsif ($self->{nc} == 0x0022) { # "
6964     ## XML5: Same as "anything else".
6965     $self->{ca}->{value} = '';
6966     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6967    
6968     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6969     $self->{line_prev} = $self->{line};
6970     $self->{column_prev} = $self->{column};
6971     $self->{column}++;
6972     $self->{nc}
6973     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6974     } else {
6975     $self->{set_nc}->($self);
6976     }
6977    
6978     redo A;
6979     } elsif ($self->{nc} == 0x0027) { # '
6980     ## XML5: Same as "anything else".
6981     $self->{ca}->{value} = '';
6982     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6983    
6984     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6985     $self->{line_prev} = $self->{line};
6986     $self->{column_prev} = $self->{column};
6987     $self->{column}++;
6988     $self->{nc}
6989     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6990     } else {
6991     $self->{set_nc}->($self);
6992     }
6993    
6994     redo A;
6995     } elsif ($self->{nc} == 0x003E) { # >
6996     ## XML5: Same as "anything else".
6997     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6998     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6999    
7000     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7001     $self->{line_prev} = $self->{line};
7002     $self->{column_prev} = $self->{column};
7003     $self->{column}++;
7004     $self->{nc}
7005     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7006     } else {
7007     $self->{set_nc}->($self);
7008     }
7009    
7010     return ($self->{ct}); # ATTLIST
7011     redo A;
7012     } elsif ($self->{nc} == -1) {
7013     ## XML5: No parse error.
7014     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7015     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7016    
7017     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7018     $self->{line_prev} = $self->{line};
7019     $self->{column_prev} = $self->{column};
7020     $self->{column}++;
7021     $self->{nc}
7022     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7023     } else {
7024     $self->{set_nc}->($self);
7025     }
7026    
7027     return ($self->{ct});
7028     redo A;
7029     } else {
7030     ## XML5: Switch to the "DOCTYPE bogus comment state".
7031     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7032     $self->{ca}->{value} = '';
7033     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7034     ## Reconsume.
7035     redo A;
7036     }
7037     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
7038     if ($is_space->{$self->{nc}}) {
7039     ## Stay in the state.
7040    
7041     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7042     $self->{line_prev} = $self->{line};
7043     $self->{column_prev} = $self->{column};
7044     $self->{column}++;
7045     $self->{nc}
7046     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7047     } else {
7048     $self->{set_nc}->($self);
7049     }
7050    
7051     redo A;
7052     } elsif ($self->{nc} == 0x007C) { # |
7053     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7054     ## Stay in the state.
7055    
7056     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7057     $self->{line_prev} = $self->{line};
7058     $self->{column_prev} = $self->{column};
7059     $self->{column}++;
7060     $self->{nc}
7061     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7062     } else {
7063     $self->{set_nc}->($self);
7064     }
7065    
7066     redo A;
7067     } elsif ($self->{nc} == 0x0029) { # )
7068     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7069     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7070    
7071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7072     $self->{line_prev} = $self->{line};
7073     $self->{column_prev} = $self->{column};
7074     $self->{column}++;
7075     $self->{nc}
7076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7077     } else {
7078     $self->{set_nc}->($self);
7079     }
7080    
7081     redo A;
7082     } elsif ($self->{nc} == 0x003E) { # >
7083     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7084     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7085    
7086     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7087     $self->{line_prev} = $self->{line};
7088     $self->{column_prev} = $self->{column};
7089     $self->{column}++;
7090     $self->{nc}
7091     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7092     } else {
7093     $self->{set_nc}->($self);
7094     }
7095    
7096     return ($self->{ct}); # ATTLIST
7097     redo A;
7098     } elsif ($self->{nc} == -1) {
7099     ## XML5: No parse error.
7100     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7101     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7102    
7103     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7104     $self->{line_prev} = $self->{line};
7105     $self->{column_prev} = $self->{column};
7106     $self->{column}++;
7107     $self->{nc}
7108     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7109     } else {
7110     $self->{set_nc}->($self);
7111     }
7112    
7113     return ($self->{ct});
7114     redo A;
7115     } else {
7116     push @{$self->{ca}->{tokens}}, chr $self->{nc};
7117     $self->{state} = ALLOWED_TOKEN_STATE;
7118    
7119     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7120     $self->{line_prev} = $self->{line};
7121     $self->{column_prev} = $self->{column};
7122     $self->{column}++;
7123     $self->{nc}
7124     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7125     } else {
7126     $self->{set_nc}->($self);
7127     }
7128    
7129     redo A;
7130     }
7131     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
7132     if ($is_space->{$self->{nc}}) {
7133     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7134    
7135     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7136     $self->{line_prev} = $self->{line};
7137     $self->{column_prev} = $self->{column};
7138     $self->{column}++;
7139     $self->{nc}
7140     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7141     } else {
7142     $self->{set_nc}->($self);
7143     }
7144    
7145     redo A;
7146     } elsif ($self->{nc} == 0x007C) { # |
7147     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7148    
7149     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7150     $self->{line_prev} = $self->{line};
7151     $self->{column_prev} = $self->{column};
7152     $self->{column}++;
7153     $self->{nc}
7154     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7155     } else {
7156     $self->{set_nc}->($self);
7157     }
7158    
7159     redo A;
7160     } elsif ($self->{nc} == 0x0029) { # )
7161     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7162    
7163     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7164     $self->{line_prev} = $self->{line};
7165     $self->{column_prev} = $self->{column};
7166     $self->{column}++;
7167     $self->{nc}
7168     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7169     } else {
7170     $self->{set_nc}->($self);
7171     }
7172    
7173     redo A;
7174     } elsif ($self->{nc} == 0x003E) { # >
7175     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7176     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7177    
7178     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7179     $self->{line_prev} = $self->{line};
7180     $self->{column_prev} = $self->{column};
7181     $self->{column}++;
7182     $self->{nc}
7183     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7184     } else {
7185     $self->{set_nc}->($self);
7186     }
7187    
7188     return ($self->{ct}); # ATTLIST
7189     redo A;
7190     } elsif ($self->{nc} == -1) {
7191     ## XML5: No parse error.
7192     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7193     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7194    
7195     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7196     $self->{line_prev} = $self->{line};
7197     $self->{column_prev} = $self->{column};
7198     $self->{column}++;
7199     $self->{nc}
7200     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7201     } else {
7202     $self->{set_nc}->($self);
7203     }
7204    
7205     return ($self->{ct});
7206     redo A;
7207     } else {
7208     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7209     ## Stay in the state.
7210    
7211     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7212     $self->{line_prev} = $self->{line};
7213     $self->{column_prev} = $self->{column};
7214     $self->{column}++;
7215     $self->{nc}
7216     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7217     } else {
7218     $self->{set_nc}->($self);
7219     }
7220    
7221     redo A;
7222     }
7223     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7224     if ($is_space->{$self->{nc}}) {
7225     ## Stay in the state.
7226    
7227     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7228     $self->{line_prev} = $self->{line};
7229     $self->{column_prev} = $self->{column};
7230     $self->{column}++;
7231     $self->{nc}
7232     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7233     } else {
7234     $self->{set_nc}->($self);
7235     }
7236    
7237     redo A;
7238     } elsif ($self->{nc} == 0x007C) { # |
7239     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7240    
7241     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7242     $self->{line_prev} = $self->{line};
7243     $self->{column_prev} = $self->{column};
7244     $self->{column}++;
7245     $self->{nc}
7246     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7247     } else {
7248     $self->{set_nc}->($self);
7249     }
7250    
7251     redo A;
7252     } elsif ($self->{nc} == 0x0029) { # )
7253     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7254    
7255     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7256     $self->{line_prev} = $self->{line};
7257     $self->{column_prev} = $self->{column};
7258     $self->{column}++;
7259     $self->{nc}
7260     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7261     } else {
7262     $self->{set_nc}->($self);
7263     }
7264    
7265     redo A;
7266     } elsif ($self->{nc} == 0x003E) { # >
7267     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7268     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7269    
7270     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7271     $self->{line_prev} = $self->{line};
7272     $self->{column_prev} = $self->{column};
7273     $self->{column}++;
7274     $self->{nc}
7275     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7276     } else {
7277     $self->{set_nc}->($self);
7278     }
7279    
7280     return ($self->{ct}); # ATTLIST
7281     redo A;
7282     } elsif ($self->{nc} == -1) {
7283     ## XML5: No parse error.
7284     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7285     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7286    
7287     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7288     $self->{line_prev} = $self->{line};
7289     $self->{column_prev} = $self->{column};
7290     $self->{column}++;
7291     $self->{nc}
7292     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7293     } else {
7294     $self->{set_nc}->($self);
7295     }
7296    
7297     return ($self->{ct});
7298     redo A;
7299     } else {
7300     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7301     line => $self->{line_prev},
7302     column => $self->{column_prev});
7303     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7304     $self->{state} = ALLOWED_TOKEN_STATE;
7305    
7306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7307     $self->{line_prev} = $self->{line};
7308     $self->{column_prev} = $self->{column};
7309     $self->{column}++;
7310     $self->{nc}
7311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7312     } else {
7313     $self->{set_nc}->($self);
7314     }
7315    
7316     redo A;
7317     }
7318     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7319     if ($is_space->{$self->{nc}}) {
7320     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7321    
7322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7323     $self->{line_prev} = $self->{line};
7324     $self->{column_prev} = $self->{column};
7325     $self->{column}++;
7326     $self->{nc}
7327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7328     } else {
7329     $self->{set_nc}->($self);
7330     }
7331    
7332     redo A;
7333     } elsif ($self->{nc} == 0x0023) { # #
7334     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7335     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7336    
7337     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7338     $self->{line_prev} = $self->{line};
7339     $self->{column_prev} = $self->{column};
7340     $self->{column}++;
7341     $self->{nc}
7342     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7343     } else {
7344     $self->{set_nc}->($self);
7345     }
7346    
7347     redo A;
7348     } elsif ($self->{nc} == 0x0022) { # "
7349     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7350     $self->{ca}->{value} = '';
7351     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7352    
7353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7354     $self->{line_prev} = $self->{line};
7355     $self->{column_prev} = $self->{column};
7356     $self->{column}++;
7357     $self->{nc}
7358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7359     } else {
7360     $self->{set_nc}->($self);
7361     }
7362    
7363     redo A;
7364     } elsif ($self->{nc} == 0x0027) { # '
7365     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7366     $self->{ca}->{value} = '';
7367     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7368    
7369     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7370     $self->{line_prev} = $self->{line};
7371     $self->{column_prev} = $self->{column};
7372     $self->{column}++;
7373     $self->{nc}
7374     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7375     } else {
7376     $self->{set_nc}->($self);
7377     }
7378    
7379     redo A;
7380     } elsif ($self->{nc} == 0x003E) { # >
7381     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7382     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7383    
7384     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7385     $self->{line_prev} = $self->{line};
7386     $self->{column_prev} = $self->{column};
7387     $self->{column}++;
7388     $self->{nc}
7389     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7390     } else {
7391     $self->{set_nc}->($self);
7392     }
7393    
7394     return ($self->{ct}); # ATTLIST
7395     redo A;
7396     } elsif ($self->{nc} == -1) {
7397     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7398     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7399    
7400     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7401     $self->{line_prev} = $self->{line};
7402     $self->{column_prev} = $self->{column};
7403     $self->{column}++;
7404     $self->{nc}
7405     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7406     } else {
7407     $self->{set_nc}->($self);
7408     }
7409    
7410     return ($self->{ct});
7411     redo A;
7412     } else {
7413     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7414     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7415     ## Reconsume.
7416     redo A;
7417     }
7418     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7419     if ($is_space->{$self->{nc}}) {
7420     ## Stay in the state.
7421    
7422     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7423     $self->{line_prev} = $self->{line};
7424     $self->{column_prev} = $self->{column};
7425     $self->{column}++;
7426     $self->{nc}
7427     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7428     } else {
7429     $self->{set_nc}->($self);
7430     }
7431    
7432     redo A;
7433     } elsif ($self->{nc} == 0x0023) { # #
7434     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7435    
7436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7437     $self->{line_prev} = $self->{line};
7438     $self->{column_prev} = $self->{column};
7439     $self->{column}++;
7440     $self->{nc}
7441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7442     } else {
7443     $self->{set_nc}->($self);
7444     }
7445    
7446     redo A;
7447     } elsif ($self->{nc} == 0x0022) { # "
7448     $self->{ca}->{value} = '';
7449     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7450    
7451     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7452     $self->{line_prev} = $self->{line};
7453     $self->{column_prev} = $self->{column};
7454     $self->{column}++;
7455     $self->{nc}
7456     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7457     } else {
7458     $self->{set_nc}->($self);
7459     }
7460    
7461     redo A;
7462     } elsif ($self->{nc} == 0x0027) { # '
7463     $self->{ca}->{value} = '';
7464     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7465    
7466     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7467     $self->{line_prev} = $self->{line};
7468     $self->{column_prev} = $self->{column};
7469     $self->{column}++;
7470     $self->{nc}
7471     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7472     } else {
7473     $self->{set_nc}->($self);
7474     }
7475    
7476     redo A;
7477     } elsif ($self->{nc} == 0x003E) { # >
7478     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7479     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7480    
7481     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7482     $self->{line_prev} = $self->{line};
7483     $self->{column_prev} = $self->{column};
7484     $self->{column}++;
7485     $self->{nc}
7486     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7487     } else {
7488     $self->{set_nc}->($self);
7489     }
7490    
7491     return ($self->{ct}); # ATTLIST
7492     redo A;
7493     } elsif ($self->{nc} == -1) {
7494     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7495     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7496    
7497     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7498     $self->{line_prev} = $self->{line};
7499     $self->{column_prev} = $self->{column};
7500     $self->{column}++;
7501     $self->{nc}
7502     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7503     } else {
7504     $self->{set_nc}->($self);
7505     }
7506    
7507     return ($self->{ct});
7508     redo A;
7509     } else {
7510     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7511     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7512     ## Reconsume.
7513     redo A;
7514     }
7515     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7516     if ($is_space->{$self->{nc}}) {
7517     ## XML5: No parse error.
7518     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7519 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7520 wakaba 1.15 ## Reconsume.
7521     redo A;
7522     } elsif ($self->{nc} == 0x0022) { # "
7523     ## XML5: Same as "anything else".
7524     $self->{ca}->{value} = '';
7525     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7526    
7527     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7528     $self->{line_prev} = $self->{line};
7529     $self->{column_prev} = $self->{column};
7530     $self->{column}++;
7531     $self->{nc}
7532     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7533     } else {
7534     $self->{set_nc}->($self);
7535     }
7536    
7537     redo A;
7538     } elsif ($self->{nc} == 0x0027) { # '
7539     ## XML5: Same as "anything else".
7540     $self->{ca}->{value} = '';
7541     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7542    
7543     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7544     $self->{line_prev} = $self->{line};
7545     $self->{column_prev} = $self->{column};
7546     $self->{column}++;
7547     $self->{nc}
7548     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7549     } else {
7550     $self->{set_nc}->($self);
7551     }
7552    
7553     redo A;
7554     } elsif ($self->{nc} == 0x003E) { # >
7555     ## XML5: Same as "anything else".
7556     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7557     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7558    
7559     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7560     $self->{line_prev} = $self->{line};
7561     $self->{column_prev} = $self->{column};
7562     $self->{column}++;
7563     $self->{nc}
7564     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7565     } else {
7566     $self->{set_nc}->($self);
7567     }
7568    
7569     return ($self->{ct}); # ATTLIST
7570     redo A;
7571     } elsif ($self->{nc} == -1) {
7572     ## XML5: No parse error.
7573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7574     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7575    
7576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7577     $self->{line_prev} = $self->{line};
7578     $self->{column_prev} = $self->{column};
7579     $self->{column}++;
7580     $self->{nc}
7581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7582     } else {
7583     $self->{set_nc}->($self);
7584     }
7585    
7586     return ($self->{ct});
7587     redo A;
7588     } else {
7589     $self->{ca}->{default} = chr $self->{nc};
7590     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7591    
7592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7593     $self->{line_prev} = $self->{line};
7594     $self->{column_prev} = $self->{column};
7595     $self->{column}++;
7596     $self->{nc}
7597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7598     } else {
7599     $self->{set_nc}->($self);
7600     }
7601    
7602     redo A;
7603     }
7604     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7605     if ($is_space->{$self->{nc}}) {
7606     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7607    
7608     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7609     $self->{line_prev} = $self->{line};
7610     $self->{column_prev} = $self->{column};
7611     $self->{column}++;
7612     $self->{nc}
7613     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7614     } else {
7615     $self->{set_nc}->($self);
7616     }
7617    
7618     redo A;
7619     } elsif ($self->{nc} == 0x0022) { # "
7620     ## XML5: Same as "anything else".
7621     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7622     $self->{ca}->{value} = '';
7623     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7624    
7625     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7626     $self->{line_prev} = $self->{line};
7627     $self->{column_prev} = $self->{column};
7628     $self->{column}++;
7629     $self->{nc}
7630     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7631     } else {
7632     $self->{set_nc}->($self);
7633     }
7634    
7635     redo A;
7636     } elsif ($self->{nc} == 0x0027) { # '
7637     ## XML5: Same as "anything else".
7638     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7639     $self->{ca}->{value} = '';
7640     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7641    
7642     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7643     $self->{line_prev} = $self->{line};
7644     $self->{column_prev} = $self->{column};
7645     $self->{column}++;
7646     $self->{nc}
7647     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7648     } else {
7649     $self->{set_nc}->($self);
7650     }
7651    
7652     redo A;
7653     } elsif ($self->{nc} == 0x003E) { # >
7654     ## XML5: Same as "anything else".
7655     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7656     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7657    
7658     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7659     $self->{line_prev} = $self->{line};
7660     $self->{column_prev} = $self->{column};
7661     $self->{column}++;
7662     $self->{nc}
7663     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7664     } else {
7665     $self->{set_nc}->($self);
7666     }
7667    
7668     return ($self->{ct}); # ATTLIST
7669     redo A;
7670     } elsif ($self->{nc} == -1) {
7671     ## XML5: No parse error.
7672     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7673     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7674     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7675    
7676     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7677     $self->{line_prev} = $self->{line};
7678     $self->{column_prev} = $self->{column};
7679     $self->{column}++;
7680     $self->{nc}
7681     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7682     } else {
7683     $self->{set_nc}->($self);
7684     }
7685    
7686     return ($self->{ct});
7687     redo A;
7688     } else {
7689     $self->{ca}->{default} .= chr $self->{nc};
7690     ## Stay in the state.
7691    
7692     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7693     $self->{line_prev} = $self->{line};
7694     $self->{column_prev} = $self->{column};
7695     $self->{column}++;
7696     $self->{nc}
7697     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7698     } else {
7699     $self->{set_nc}->($self);
7700     }
7701    
7702     redo A;
7703     }
7704     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7705     if ($is_space->{$self->{nc}}) {
7706     ## Stay in the state.
7707    
7708     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7709     $self->{line_prev} = $self->{line};
7710     $self->{column_prev} = $self->{column};
7711     $self->{column}++;
7712     $self->{nc}
7713     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7714     } else {
7715     $self->{set_nc}->($self);
7716     }
7717    
7718     redo A;
7719     } elsif ($self->{nc} == 0x0022) { # "
7720     $self->{ca}->{value} = '';
7721     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7722    
7723     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7724     $self->{line_prev} = $self->{line};
7725     $self->{column_prev} = $self->{column};
7726     $self->{column}++;
7727     $self->{nc}
7728     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7729     } else {
7730     $self->{set_nc}->($self);
7731     }
7732    
7733     redo A;
7734     } elsif ($self->{nc} == 0x0027) { # '
7735     $self->{ca}->{value} = '';
7736     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7737    
7738     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7739     $self->{line_prev} = $self->{line};
7740     $self->{column_prev} = $self->{column};
7741     $self->{column}++;
7742     $self->{nc}
7743     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7744     } else {
7745     $self->{set_nc}->($self);
7746     }
7747    
7748     redo A;
7749     } elsif ($self->{nc} == 0x003E) { # >
7750     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7751     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7752    
7753     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7754     $self->{line_prev} = $self->{line};
7755     $self->{column_prev} = $self->{column};
7756     $self->{column}++;
7757     $self->{nc}
7758     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7759     } else {
7760     $self->{set_nc}->($self);
7761     }
7762    
7763     return ($self->{ct}); # ATTLIST
7764     redo A;
7765     } elsif ($self->{nc} == -1) {
7766     ## XML5: No parse error.
7767     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7768     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7769     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7770    
7771     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7772     $self->{line_prev} = $self->{line};
7773     $self->{column_prev} = $self->{column};
7774     $self->{column}++;
7775     $self->{nc}
7776     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7777     } else {
7778     $self->{set_nc}->($self);
7779     }
7780    
7781     return ($self->{ct});
7782     redo A;
7783     } else {
7784     ## XML5: Not defined yet.
7785     if ($self->{ca}->{default} eq 'FIXED') {
7786     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7787     } else {
7788     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7789     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7790     }
7791     ## Reconsume.
7792     redo A;
7793     }
7794     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7795     if ($is_space->{$self->{nc}} or
7796     $self->{nc} == -1 or
7797     $self->{nc} == 0x003E) { # >
7798     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7799     ## Reconsume.
7800     redo A;
7801     } else {
7802     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7803     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7804     ## Reconsume.
7805     redo A;
7806 wakaba 1.16 }
7807 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7808     ## ASCII case-insensitive
7809     if ($self->{nc} == [
7810     undef,
7811     0x0044, # D
7812     0x0041, # A
7813     0x0054, # T
7814     ]->[length $self->{kwd}] or
7815     $self->{nc} == [
7816     undef,
7817     0x0064, # d
7818     0x0061, # a
7819     0x0074, # t
7820     ]->[length $self->{kwd}]) {
7821    
7822     ## Stay in the state.
7823     $self->{kwd} .= chr $self->{nc};
7824    
7825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7826     $self->{line_prev} = $self->{line};
7827     $self->{column_prev} = $self->{column};
7828     $self->{column}++;
7829     $self->{nc}
7830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7831     } else {
7832     $self->{set_nc}->($self);
7833     }
7834    
7835     redo A;
7836     } elsif ((length $self->{kwd}) == 4 and
7837     ($self->{nc} == 0x0041 or # A
7838     $self->{nc} == 0x0061)) { # a
7839     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7840    
7841     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7842     text => 'NDATA',
7843     line => $self->{line_prev},
7844     column => $self->{column_prev} - 4);
7845     } else {
7846    
7847     }
7848     $self->{state} = AFTER_NDATA_STATE;
7849    
7850     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7851     $self->{line_prev} = $self->{line};
7852     $self->{column_prev} = $self->{column};
7853     $self->{column}++;
7854     $self->{nc}
7855     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7856     } else {
7857     $self->{set_nc}->($self);
7858     }
7859    
7860     redo A;
7861     } else {
7862     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7863     line => $self->{line_prev},
7864     column => $self->{column_prev} + 1
7865     - length $self->{kwd});
7866    
7867     $self->{state} = BOGUS_MD_STATE;
7868     ## Reconsume.
7869     redo A;
7870     }
7871     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7872     if ($is_space->{$self->{nc}}) {
7873     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7874    
7875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7876     $self->{line_prev} = $self->{line};
7877     $self->{column_prev} = $self->{column};
7878     $self->{column}++;
7879     $self->{nc}
7880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7881     } else {
7882     $self->{set_nc}->($self);
7883     }
7884    
7885     redo A;
7886     } elsif ($self->{nc} == 0x003E) { # >
7887     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7888     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7889    
7890     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7891     $self->{line_prev} = $self->{line};
7892     $self->{column_prev} = $self->{column};
7893     $self->{column}++;
7894     $self->{nc}
7895     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7896     } else {
7897     $self->{set_nc}->($self);
7898     }
7899    
7900     return ($self->{ct}); # ENTITY
7901     redo A;
7902     } elsif ($self->{nc} == -1) {
7903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7904     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7905    
7906     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7907     $self->{line_prev} = $self->{line};
7908     $self->{column_prev} = $self->{column};
7909     $self->{column}++;
7910     $self->{nc}
7911     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7912     } else {
7913     $self->{set_nc}->($self);
7914     }
7915    
7916     return ($self->{ct}); # ENTITY
7917     redo A;
7918     } else {
7919     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7920     line => $self->{line_prev},
7921     column => $self->{column_prev} + 1
7922     - length $self->{kwd});
7923     $self->{state} = BOGUS_MD_STATE;
7924     ## Reconsume.
7925     redo A;
7926     }
7927     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7928     if ($is_space->{$self->{nc}}) {
7929     ## Stay in the state.
7930    
7931     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7932     $self->{line_prev} = $self->{line};
7933     $self->{column_prev} = $self->{column};
7934     $self->{column}++;
7935     $self->{nc}
7936     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7937     } else {
7938     $self->{set_nc}->($self);
7939     }
7940    
7941     redo A;
7942     } elsif ($self->{nc} == 0x003E) { # >
7943     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7944     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7945    
7946     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7947     $self->{line_prev} = $self->{line};
7948     $self->{column_prev} = $self->{column};
7949     $self->{column}++;
7950     $self->{nc}
7951     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7952     } else {
7953     $self->{set_nc}->($self);
7954     }
7955    
7956     return ($self->{ct}); # ENTITY
7957     redo A;
7958     } elsif ($self->{nc} == -1) {
7959     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7960     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7961    
7962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7963     $self->{line_prev} = $self->{line};
7964     $self->{column_prev} = $self->{column};
7965     $self->{column}++;
7966     $self->{nc}
7967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7968     } else {
7969     $self->{set_nc}->($self);
7970     }
7971    
7972     return ($self->{ct}); # ENTITY
7973     redo A;
7974     } else {
7975     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7976     $self->{state} = NOTATION_NAME_STATE;
7977    
7978     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7979     $self->{line_prev} = $self->{line};
7980     $self->{column_prev} = $self->{column};
7981     $self->{column}++;
7982     $self->{nc}
7983     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7984     } else {
7985     $self->{set_nc}->($self);
7986     }
7987    
7988     redo A;
7989     }
7990     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7991     if ($is_space->{$self->{nc}}) {
7992 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7993 wakaba 1.18
7994     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7995     $self->{line_prev} = $self->{line};
7996     $self->{column_prev} = $self->{column};
7997     $self->{column}++;
7998     $self->{nc}
7999     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8000     } else {
8001     $self->{set_nc}->($self);
8002     }
8003    
8004     redo A;
8005     } elsif ($self->{nc} == 0x003E) { # >
8006     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8007    
8008     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8009     $self->{line_prev} = $self->{line};
8010     $self->{column_prev} = $self->{column};
8011     $self->{column}++;
8012     $self->{nc}
8013     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8014     } else {
8015     $self->{set_nc}->($self);
8016     }
8017    
8018     return ($self->{ct}); # ENTITY
8019     redo A;
8020     } elsif ($self->{nc} == -1) {
8021     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8022     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8023    
8024     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8025     $self->{line_prev} = $self->{line};
8026     $self->{column_prev} = $self->{column};
8027     $self->{column}++;
8028     $self->{nc}
8029     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8030     } else {
8031     $self->{set_nc}->($self);
8032     }
8033    
8034     return ($self->{ct}); # ENTITY
8035     redo A;
8036     } else {
8037     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
8038     ## Stay in the state.
8039    
8040     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8041     $self->{line_prev} = $self->{line};
8042     $self->{column_prev} = $self->{column};
8043     $self->{column}++;
8044     $self->{nc}
8045     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8046     } else {
8047     $self->{set_nc}->($self);
8048     }
8049    
8050     redo A;
8051     }
8052 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
8053     if ($self->{nc} == 0x0022) { # "
8054 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
8055 wakaba 1.19
8056     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8057     $self->{line_prev} = $self->{line};
8058     $self->{column_prev} = $self->{column};
8059     $self->{column}++;
8060     $self->{nc}
8061     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8062     } else {
8063     $self->{set_nc}->($self);
8064     }
8065    
8066     redo A;
8067     } elsif ($self->{nc} == 0x0026) { # &
8068     $self->{prev_state} = $self->{state};
8069     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8070     $self->{entity_add} = 0x0022; # "
8071    
8072     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8073     $self->{line_prev} = $self->{line};
8074     $self->{column_prev} = $self->{column};
8075     $self->{column}++;
8076     $self->{nc}
8077     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8078     } else {
8079     $self->{set_nc}->($self);
8080     }
8081    
8082     redo A;
8083     ## TODO: %
8084     } elsif ($self->{nc} == -1) {
8085     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8086     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8087     ## Reconsume.
8088     return ($self->{ct}); # ENTITY
8089     redo A;
8090     } else {
8091     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8092    
8093     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8094     $self->{line_prev} = $self->{line};
8095     $self->{column_prev} = $self->{column};
8096     $self->{column}++;
8097     $self->{nc}
8098     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8099     } else {
8100     $self->{set_nc}->($self);
8101     }
8102    
8103     redo A;
8104     }
8105     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8106     if ($self->{nc} == 0x0027) { # '
8107 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
8108 wakaba 1.19
8109     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8110     $self->{line_prev} = $self->{line};
8111     $self->{column_prev} = $self->{column};
8112     $self->{column}++;
8113     $self->{nc}
8114     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8115     } else {
8116     $self->{set_nc}->($self);
8117     }
8118    
8119     redo A;
8120     } elsif ($self->{nc} == 0x0026) { # &
8121     $self->{prev_state} = $self->{state};
8122     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8123     $self->{entity_add} = 0x0027; # '
8124    
8125     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8126     $self->{line_prev} = $self->{line};
8127     $self->{column_prev} = $self->{column};
8128     $self->{column}++;
8129     $self->{nc}
8130     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8131     } else {
8132     $self->{set_nc}->($self);
8133     }
8134    
8135     redo A;
8136     ## TODO: %
8137     } elsif ($self->{nc} == -1) {
8138     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8139     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8140     ## Reconsume.
8141     return ($self->{ct}); # ENTITY
8142     redo A;
8143     } else {
8144     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8145    
8146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8147     $self->{line_prev} = $self->{line};
8148     $self->{column_prev} = $self->{column};
8149     $self->{column}++;
8150     $self->{nc}
8151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8152     } else {
8153     $self->{set_nc}->($self);
8154     }
8155    
8156     redo A;
8157     }
8158     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8159     if ($is_space->{$self->{nc}} or
8160     {
8161     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8162     $self->{entity_add} => 1,
8163     }->{$self->{nc}}) {
8164 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8165     line => $self->{line_prev},
8166     column => $self->{column_prev}
8167     + ($self->{nc} == -1 ? 1 : 0));
8168 wakaba 1.19 ## Don't consume
8169     ## Return nothing.
8170     #
8171     } elsif ($self->{nc} == 0x0023) { # #
8172     $self->{ca} = $self->{ct};
8173     $self->{state} = ENTITY_HASH_STATE;
8174     $self->{kwd} = '#';
8175    
8176     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8177     $self->{line_prev} = $self->{line};
8178     $self->{column_prev} = $self->{column};
8179     $self->{column}++;
8180     $self->{nc}
8181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8182     } else {
8183     $self->{set_nc}->($self);
8184     }
8185    
8186     redo A;
8187     } else {
8188     #
8189     }
8190    
8191     $self->{ct}->{value} .= '&';
8192     $self->{state} = $self->{prev_state};
8193     ## Reconsume.
8194     redo A;
8195 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8196     if ($is_space->{$self->{nc}}) {
8197     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8198    
8199     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8200     $self->{line_prev} = $self->{line};
8201     $self->{column_prev} = $self->{column};
8202     $self->{column}++;
8203     $self->{nc}
8204     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8205     } else {
8206     $self->{set_nc}->($self);
8207     }
8208    
8209     redo A;
8210     } elsif ($self->{nc} == 0x0028) { # (
8211     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8212     $self->{ct}->{content} = ['('];
8213     $self->{group_depth} = 1;
8214    
8215     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8216     $self->{line_prev} = $self->{line};
8217     $self->{column_prev} = $self->{column};
8218     $self->{column}++;
8219     $self->{nc}
8220     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8221     } else {
8222     $self->{set_nc}->($self);
8223     }
8224    
8225     redo A;
8226     } elsif ($self->{nc} == 0x003E) { # >
8227     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8228     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8229    
8230     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8231     $self->{line_prev} = $self->{line};
8232     $self->{column_prev} = $self->{column};
8233     $self->{column}++;
8234     $self->{nc}
8235     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8236     } else {
8237     $self->{set_nc}->($self);
8238     }
8239    
8240     return ($self->{ct}); # ELEMENT
8241     redo A;
8242     } elsif ($self->{nc} == -1) {
8243     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8244     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8245    
8246     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8247     $self->{line_prev} = $self->{line};
8248     $self->{column_prev} = $self->{column};
8249     $self->{column}++;
8250     $self->{nc}
8251     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8252     } else {
8253     $self->{set_nc}->($self);
8254     }
8255    
8256     return ($self->{ct}); # ELEMENT
8257     redo A;
8258     } else {
8259     $self->{ct}->{content} = [chr $self->{nc}];
8260     $self->{state} = CONTENT_KEYWORD_STATE;
8261    
8262     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8263     $self->{line_prev} = $self->{line};
8264     $self->{column_prev} = $self->{column};
8265     $self->{column}++;
8266     $self->{nc}
8267     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8268     } else {
8269     $self->{set_nc}->($self);
8270     }
8271    
8272     redo A;
8273     }
8274     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8275     if ($is_space->{$self->{nc}}) {
8276     $self->{state} = AFTER_MD_DEF_STATE;
8277    
8278     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8279     $self->{line_prev} = $self->{line};
8280     $self->{column_prev} = $self->{column};
8281     $self->{column}++;
8282     $self->{nc}
8283     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8284     } else {
8285     $self->{set_nc}->($self);
8286     }
8287    
8288     redo A;
8289     } elsif ($self->{nc} == 0x003E) { # >
8290     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8291    
8292     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8293     $self->{line_prev} = $self->{line};
8294     $self->{column_prev} = $self->{column};
8295     $self->{column}++;
8296     $self->{nc}
8297     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8298     } else {
8299     $self->{set_nc}->($self);
8300     }
8301    
8302     return ($self->{ct}); # ELEMENT
8303     redo A;
8304     } elsif ($self->{nc} == -1) {
8305     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8306     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8307    
8308     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8309     $self->{line_prev} = $self->{line};
8310     $self->{column_prev} = $self->{column};
8311     $self->{column}++;
8312     $self->{nc}
8313     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8314     } else {
8315     $self->{set_nc}->($self);
8316     }
8317    
8318     return ($self->{ct}); # ELEMENT
8319     redo A;
8320     } else {
8321     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8322     ## Stay in the state.
8323    
8324     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8325     $self->{line_prev} = $self->{line};
8326     $self->{column_prev} = $self->{column};
8327     $self->{column}++;
8328     $self->{nc}
8329     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8330     } else {
8331     $self->{set_nc}->($self);
8332     }
8333    
8334     redo A;
8335     }
8336     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8337     if ($is_space->{$self->{nc}}) {
8338     ## Stay in the state.
8339    
8340     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8341     $self->{line_prev} = $self->{line};
8342     $self->{column_prev} = $self->{column};
8343     $self->{column}++;
8344     $self->{nc}
8345     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8346     } else {
8347     $self->{set_nc}->($self);
8348     }
8349    
8350     redo A;
8351     } elsif ($self->{nc} == 0x0028) { # (
8352     $self->{group_depth}++;
8353     push @{$self->{ct}->{content}}, chr $self->{nc};
8354     ## Stay in the state.
8355    
8356     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8357     $self->{line_prev} = $self->{line};
8358     $self->{column_prev} = $self->{column};
8359     $self->{column}++;
8360     $self->{nc}
8361     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8362     } else {
8363     $self->{set_nc}->($self);
8364     }
8365    
8366     redo A;
8367     } elsif ($self->{nc} == 0x007C or # |
8368     $self->{nc} == 0x002C) { # ,
8369     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8370     ## Stay in the state.
8371    
8372     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8373     $self->{line_prev} = $self->{line};
8374     $self->{column_prev} = $self->{column};
8375     $self->{column}++;
8376     $self->{nc}
8377     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8378     } else {
8379     $self->{set_nc}->($self);
8380     }
8381    
8382     redo A;
8383     } elsif ($self->{nc} == 0x0029) { # )
8384     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8385     push @{$self->{ct}->{content}}, chr $self->{nc};
8386     $self->{group_depth}--;
8387     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8388    
8389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8390     $self->{line_prev} = $self->{line};
8391     $self->{column_prev} = $self->{column};
8392     $self->{column}++;
8393     $self->{nc}
8394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8395     } else {
8396     $self->{set_nc}->($self);
8397     }
8398    
8399     redo A;
8400     } elsif ($self->{nc} == 0x003E) { # >
8401     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8402     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8403     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8404    
8405     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8406     $self->{line_prev} = $self->{line};
8407     $self->{column_prev} = $self->{column};
8408     $self->{column}++;
8409     $self->{nc}
8410     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8411     } else {
8412     $self->{set_nc}->($self);
8413     }
8414    
8415     return ($self->{ct}); # ELEMENT
8416     redo A;
8417     } elsif ($self->{nc} == -1) {
8418     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8419     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8420     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8421    
8422     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8423     $self->{line_prev} = $self->{line};
8424     $self->{column_prev} = $self->{column};
8425     $self->{column}++;
8426     $self->{nc}
8427     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8428     } else {
8429     $self->{set_nc}->($self);
8430     }
8431    
8432     return ($self->{ct}); # ELEMENT
8433     redo A;
8434     } else {
8435     push @{$self->{ct}->{content}}, chr $self->{nc};
8436     $self->{state} = CM_ELEMENT_NAME_STATE;
8437    
8438     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8439     $self->{line_prev} = $self->{line};
8440     $self->{column_prev} = $self->{column};
8441     $self->{column}++;
8442     $self->{nc}
8443     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8444     } else {
8445     $self->{set_nc}->($self);
8446     }
8447    
8448     redo A;
8449     }
8450     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8451     if ($is_space->{$self->{nc}}) {
8452     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8453    
8454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8455     $self->{line_prev} = $self->{line};
8456     $self->{column_prev} = $self->{column};
8457     $self->{column}++;
8458     $self->{nc}
8459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8460     } else {
8461     $self->{set_nc}->($self);
8462     }
8463    
8464     redo A;
8465     } elsif ($self->{nc} == 0x002A or # *
8466     $self->{nc} == 0x002B or # +
8467     $self->{nc} == 0x003F) { # ?
8468     push @{$self->{ct}->{content}}, chr $self->{nc};
8469     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8470    
8471     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8472     $self->{line_prev} = $self->{line};
8473     $self->{column_prev} = $self->{column};
8474     $self->{column}++;
8475     $self->{nc}
8476     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8477     } else {
8478     $self->{set_nc}->($self);
8479     }
8480    
8481     redo A;
8482     } elsif ($self->{nc} == 0x007C or # |
8483     $self->{nc} == 0x002C) { # ,
8484     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8485     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8486    
8487     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8488     $self->{line_prev} = $self->{line};
8489     $self->{column_prev} = $self->{column};
8490     $self->{column}++;
8491     $self->{nc}
8492     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8493     } else {
8494     $self->{set_nc}->($self);
8495     }
8496    
8497     redo A;
8498     } elsif ($self->{nc} == 0x0029) { # )
8499     $self->{group_depth}--;
8500     push @{$self->{ct}->{content}}, chr $self->{nc};
8501     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8502    
8503     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8504     $self->{line_prev} = $self->{line};
8505     $self->{column_prev} = $self->{column};
8506     $self->{column}++;
8507     $self->{nc}
8508     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8509     } else {
8510     $self->{set_nc}->($self);
8511     }
8512    
8513     redo A;
8514     } elsif ($self->{nc} == 0x003E) { # >
8515     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8516     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8517     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8518    
8519     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8520     $self->{line_prev} = $self->{line};
8521     $self->{column_prev} = $self->{column};
8522     $self->{column}++;
8523     $self->{nc}
8524     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8525     } else {
8526     $self->{set_nc}->($self);
8527     }
8528    
8529     return ($self->{ct}); # ELEMENT
8530     redo A;
8531     } elsif ($self->{nc} == -1) {
8532     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8533     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8534     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8535    
8536     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8537     $self->{line_prev} = $self->{line};
8538     $self->{column_prev} = $self->{column};
8539     $self->{column}++;
8540     $self->{nc}
8541     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8542     } else {
8543     $self->{set_nc}->($self);
8544     }
8545    
8546     return ($self->{ct}); # ELEMENT
8547     redo A;
8548     } else {
8549     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8550     ## Stay in the state.
8551    
8552     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8553     $self->{line_prev} = $self->{line};
8554     $self->{column_prev} = $self->{column};
8555     $self->{column}++;
8556     $self->{nc}
8557     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8558     } else {
8559     $self->{set_nc}->($self);
8560     }
8561    
8562     redo A;
8563     }
8564     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8565     if ($is_space->{$self->{nc}}) {
8566     ## Stay in the state.
8567    
8568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8569     $self->{line_prev} = $self->{line};
8570     $self->{column_prev} = $self->{column};
8571     $self->{column}++;
8572     $self->{nc}
8573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8574     } else {
8575     $self->{set_nc}->($self);
8576     }
8577    
8578     redo A;
8579     } elsif ($self->{nc} == 0x007C or # |
8580     $self->{nc} == 0x002C) { # ,
8581     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8582     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8583    
8584     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8585     $self->{line_prev} = $self->{line};
8586     $self->{column_prev} = $self->{column};
8587     $self->{column}++;
8588     $self->{nc}
8589     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8590     } else {
8591     $self->{set_nc}->($self);
8592     }
8593    
8594     redo A;
8595     } elsif ($self->{nc} == 0x0029) { # )
8596     $self->{group_depth}--;
8597     push @{$self->{ct}->{content}}, chr $self->{nc};
8598     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8599    
8600     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8601     $self->{line_prev} = $self->{line};
8602     $self->{column_prev} = $self->{column};
8603     $self->{column}++;
8604     $self->{nc}
8605     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8606     } else {
8607     $self->{set_nc}->($self);
8608     }
8609    
8610     redo A;
8611     } elsif ($self->{nc} == 0x003E) { # >
8612     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8613     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8614     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8615    
8616     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8617     $self->{line_prev} = $self->{line};
8618     $self->{column_prev} = $self->{column};
8619     $self->{column}++;
8620     $self->{nc}
8621     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8622     } else {
8623     $self->{set_nc}->($self);
8624     }
8625    
8626     return ($self->{ct}); # ELEMENT
8627     redo A;
8628     } elsif ($self->{nc} == -1) {
8629     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8630     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8631     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8632    
8633     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8634     $self->{line_prev} = $self->{line};
8635     $self->{column_prev} = $self->{column};
8636     $self->{column}++;
8637     $self->{nc}
8638     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8639     } else {
8640     $self->{set_nc}->($self);
8641     }
8642    
8643     return ($self->{ct}); # ELEMENT
8644     redo A;
8645     } else {
8646     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8647     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8648     $self->{state} = BOGUS_MD_STATE;
8649    
8650     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8651     $self->{line_prev} = $self->{line};
8652     $self->{column_prev} = $self->{column};
8653     $self->{column}++;
8654     $self->{nc}
8655     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8656     } else {
8657     $self->{set_nc}->($self);
8658     }
8659    
8660     redo A;
8661     }
8662     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8663     if ($is_space->{$self->{nc}}) {
8664     if ($self->{group_depth}) {
8665     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8666     } else {
8667     $self->{state} = AFTER_MD_DEF_STATE;
8668     }
8669    
8670     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8671     $self->{line_prev} = $self->{line};
8672     $self->{column_prev} = $self->{column};
8673     $self->{column}++;
8674     $self->{nc}
8675     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8676     } else {
8677     $self->{set_nc}->($self);
8678     }
8679    
8680     redo A;
8681     } elsif ($self->{nc} == 0x002A or # *
8682     $self->{nc} == 0x002B or # +
8683     $self->{nc} == 0x003F) { # ?
8684     push @{$self->{ct}->{content}}, chr $self->{nc};
8685     if ($self->{group_depth}) {
8686     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8687     } else {
8688     $self->{state} = AFTER_MD_DEF_STATE;
8689     }
8690    
8691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8692     $self->{line_prev} = $self->{line};
8693     $self->{column_prev} = $self->{column};
8694     $self->{column}++;
8695     $self->{nc}
8696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8697     } else {
8698     $self->{set_nc}->($self);
8699     }
8700    
8701     redo A;
8702     } elsif ($self->{nc} == 0x0029) { # )
8703     if ($self->{group_depth}) {
8704     $self->{group_depth}--;
8705     push @{$self->{ct}->{content}}, chr $self->{nc};
8706     ## Stay in the state.
8707    
8708     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8709     $self->{line_prev} = $self->{line};
8710     $self->{column_prev} = $self->{column};
8711     $self->{column}++;
8712     $self->{nc}
8713     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8714     } else {
8715     $self->{set_nc}->($self);
8716     }
8717    
8718     redo A;
8719     } else {
8720     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8721     $self->{state} = BOGUS_MD_STATE;
8722     ## Reconsume.
8723     redo A;
8724     }
8725     } elsif ($self->{nc} == 0x003E) { # >
8726     if ($self->{group_depth}) {
8727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8728     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8729     }
8730     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8731    
8732     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8733     $self->{line_prev} = $self->{line};
8734     $self->{column_prev} = $self->{column};
8735     $self->{column}++;
8736     $self->{nc}
8737     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8738     } else {
8739     $self->{set_nc}->($self);
8740     }
8741    
8742     return ($self->{ct}); # ELEMENT
8743     redo A;
8744     } elsif ($self->{nc} == -1) {
8745     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8746     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8747     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8748    
8749     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8750     $self->{line_prev} = $self->{line};
8751     $self->{column_prev} = $self->{column};
8752     $self->{column}++;
8753     $self->{nc}
8754     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8755     } else {
8756     $self->{set_nc}->($self);
8757     }
8758    
8759     return ($self->{ct}); # ELEMENT
8760     redo A;
8761     } else {
8762     if ($self->{group_depth}) {
8763     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8764     } else {
8765     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8766     $self->{state} = BOGUS_MD_STATE;
8767     }
8768     ## Reconsume.
8769     redo A;
8770     }
8771     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8772 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8773     ## Stay in the state.
8774    
8775     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8776     $self->{line_prev} = $self->{line};
8777     $self->{column_prev} = $self->{column};
8778     $self->{column}++;
8779     $self->{nc}
8780     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8781     } else {
8782     $self->{set_nc}->($self);
8783     }
8784    
8785     redo A;
8786     } elsif ($self->{nc} == 0x003E) { # >
8787     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8788    
8789     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8790     $self->{line_prev} = $self->{line};
8791     $self->{column_prev} = $self->{column};
8792     $self->{column}++;
8793     $self->{nc}
8794     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8795     } else {
8796     $self->{set_nc}->($self);
8797     }
8798    
8799 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8800 wakaba 1.18 redo A;
8801     } elsif ($self->{nc} == -1) {
8802     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8803     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8804    
8805     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8806     $self->{line_prev} = $self->{line};
8807     $self->{column_prev} = $self->{column};
8808     $self->{column}++;
8809     $self->{nc}
8810     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8811     } else {
8812     $self->{set_nc}->($self);
8813     }
8814    
8815 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8816 wakaba 1.18 redo A;
8817     } else {
8818 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8819 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8820     ## Reconsume.
8821     redo A;
8822     }
8823 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8824     if ($self->{nc} == 0x003E) { # >
8825     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8826    
8827     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8828     $self->{line_prev} = $self->{line};
8829     $self->{column_prev} = $self->{column};
8830     $self->{column}++;
8831     $self->{nc}
8832     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8833     } else {
8834     $self->{set_nc}->($self);
8835     }
8836    
8837     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8838     redo A;
8839     } elsif ($self->{nc} == -1) {
8840     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8841     ## Reconsume.
8842     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8843     redo A;
8844     } else {
8845     ## Stay in the state.
8846    
8847     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8848     $self->{line_prev} = $self->{line};
8849     $self->{column_prev} = $self->{column};
8850     $self->{column}++;
8851     $self->{nc}
8852     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8853     } else {
8854     $self->{set_nc}->($self);
8855     }
8856    
8857     redo A;
8858     }
8859 wakaba 1.1 } else {
8860     die "$0: $self->{state}: Unknown state";
8861     }
8862     } # A
8863    
8864     die "$0: _get_next_token: unexpected case";
8865     } # _get_next_token
8866    
8867     1;
8868 wakaba 1.33 ## $Date: 2009/09/05 09:57:55 $
8869 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24