/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.31 - (hide annotations) (download)
Sat Sep 5 09:26:55 2009 UTC (15 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.30: +40 -11 lines
++ whatpm/t/ChangeLog	5 Sep 2009 09:26:39 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: Added test cases for "comment end bang
	state" (HTML5 revision 3191).

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 09:26:12 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src (_get_next_token): Implemented the "comment end
	bang state" (HTML5 revision 3191).

package Whatpm::HTML::Tokenizer;
use strict;
our $VERSION=do{my @r=(q$Revision: 1.30 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

## Export interface.  Nothing is exported by default; every token type
## constant can be requested by name, or all of them at once through
## the |:token| tag.
BEGIN {
  require Exporter;
  our @ISA;
  push @ISA, 'Exporter';

  ## The names of the token type constants, in the order they are
  ## offered for export.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_type_names;

  our %EXPORT_TAGS = (
    token => [@token_type_names],
  );
}
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Each token type is a small distinct integer, exposed as a
## parameterless constant subroutine.

## Token types used for both HTML and XML input
sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
sub COMMENT_TOKEN () { 2 }
sub START_TAG_TOKEN () { 3 }
sub END_TAG_TOKEN () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN () { 6 }

## Processing instruction token (XML only)
sub PI_TOKEN () { 7 }

## Token for internal processing
sub ABORT_TOKEN () { 8 }

## Token types that are XML only
sub END_OF_DOCTYPE_TOKEN () { 9 }
sub ATTLIST_TOKEN () { 10 }
sub ELEMENT_TOKEN () { 11 }
sub GENERAL_ENTITY_TOKEN () { 12 }
sub PARAMETER_ENTITY_TOKEN () { 13 }
sub NOTATION_TOKEN () { 14 }
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## Individual capability bits of a content model.
sub CM_ENTITY () { 0b001 }          # "&" is markup in data
sub CM_LIMITED_MARKUP () { 0b010 }  # "<" is markup in data (limited)
sub CM_FULL_MARKUP () { 0b100 }     # "<" is markup in data (any)

## The four content models, each a combination of the bits above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
89    
## Every state is a distinct integer exposed as a parameterless
## constant subroutine.  The numbers 1 and 12 are left unassigned (see
## the commented-out entries).

## Data and tag states
sub DATA_STATE () { 0 }
#sub ENTITY_DATA_STATE () { 1 }
sub TAG_OPEN_STATE () { 2 }
sub CLOSE_TAG_OPEN_STATE () { 3 }
sub TAG_NAME_STATE () { 4 }

## Attribute states
sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
sub ATTRIBUTE_NAME_STATE () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }

## Markup declaration and comment states
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub COMMENT_START_STATE () { 14 }
sub COMMENT_START_DASH_STATE () { 15 }
sub COMMENT_STATE () { 16 }
sub COMMENT_END_STATE () { 17 }
## LAST: 102 is the highest state number in use; it is out of
## sequence with its neighbours.
sub COMMENT_END_BANG_STATE () { 102 }
sub COMMENT_END_DASH_STATE () { 18 }
sub BOGUS_COMMENT_STATE () { 19 }

## DOCTYPE states
sub DOCTYPE_STATE () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
sub DOCTYPE_NAME_STATE () { 22 }
sub AFTER_DOCTYPE_NAME_STATE () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
sub BOGUS_DOCTYPE_STATE () { 32 }

sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
sub SELF_CLOSING_START_TAG_STATE () { 34 }
sub CDATA_SECTION_STATE () { 35 }

## Sub-states of the spec's "markup declaration open state"
sub MD_HYPHEN_STATE () { 36 }
sub MD_DOCTYPE_STATE () { 37 }
sub MD_CDATA_STATE () { 38 }

## Sub-state of the spec's "close tag open state"
sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 }

## Sub-states of the spec's "CDATA section state"
sub CDATA_SECTION_MSE1_STATE () { 40 }
sub CDATA_SECTION_MSE2_STATE () { 41 }

## Sub-states of the spec's "after DOCTYPE name state"
sub PUBLIC_STATE () { 42 }
sub SYSTEM_STATE () { 43 }

## NOTE: The spec's "entity data state", "entity in attribute value
## state", and "consume a character reference" algorithm are jointly
## implemented by the following six states:
sub ENTITY_STATE () { 44 }
sub ENTITY_HASH_STATE () { 45 }
sub NCR_NUM_STATE () { 46 }
sub HEXREF_X_STATE () { 47 }
sub HEXREF_HEX_STATE () { 48 }
sub ENTITY_NAME_STATE () { 49 }

## "data state" in the spec
sub PCDATA_STATE () { 50 }
145    
## XML-only states.  Numbering runs from 51 to 101.

## Processing instruction states
sub PI_STATE () { 51 }
sub PI_TARGET_STATE () { 52 }
sub PI_TARGET_AFTER_STATE () { 53 }
sub PI_DATA_STATE () { 54 }
sub PI_AFTER_STATE () { 55 }
sub PI_DATA_AFTER_STATE () { 56 }

## DOCTYPE internal subset states
sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
sub DOCTYPE_TAG_STATE () { 60 }
sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }

## Markup declaration keyword and name states
sub MD_ATTLIST_STATE () { 62 }
sub MD_E_STATE () { 63 }
sub MD_ELEMENT_STATE () { 64 }
sub MD_ENTITY_STATE () { 65 }
sub MD_NOTATION_STATE () { 66 }
sub DOCTYPE_MD_STATE () { 67 }
sub BEFORE_MD_NAME_STATE () { 68 }
sub MD_NAME_STATE () { 69 }
sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }

## ATTLIST states
sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
sub ALLOWED_TOKEN_STATE () { 77 }
sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }

## NDATA and notation name states
sub BEFORE_NDATA_STATE () { 85 }
sub NDATA_STATE () { 86 }
sub AFTER_NDATA_STATE () { 87 }
sub BEFORE_NOTATION_NAME_STATE () { 88 }
sub NOTATION_NAME_STATE () { 89 }

## Entity value states
sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
sub ENTITY_VALUE_ENTITY_STATE () { 92 }

## ELEMENT declaration states
sub AFTER_ELEMENT_NAME_STATE () { 93 }
sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
sub CONTENT_KEYWORD_STATE () { 95 }
sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
sub CM_ELEMENT_NAME_STATE () { 97 }
sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }

## End of a markup declaration
sub AFTER_MD_DEF_STATE () { 100 }
sub BOGUS_MD_STATE () { 101 }
198 wakaba 1.8
199 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
200     ## list and descriptions)
201    
## NOTE(review): Both literals below evaluate to the same number
## (2048, i.e. bit 11).  Presumably they are bits of two different
## flag sets -- confirm against Whatpm::HTML.
sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
sub FOREIGN_EL () { 0b1_00000000000 }
204    
205     ## Character reference mappings
206    
## Replacement table for code points: maps a code point to the code
## point that is substituted for it.  Code points that are not keys of
## this table have no replacement defined here.
my $charref_map = {
  ## CARRIAGE RETURN is replaced by LINE FEED.
  0x0D => 0x000A,

  ## Code points 0x80..0x9F.  Five of them (0x81, 0x8D, 0x8F, 0x90
  ## and 0x9D) are replaced by U+FFFD.
  0x80 => 0x20AC,
  0x81 => 0xFFFD,
  0x82 => 0x201A,
  0x83 => 0x0192,
  0x84 => 0x201E,
  0x85 => 0x2026,
  0x86 => 0x2020,
  0x87 => 0x2021,
  0x88 => 0x02C6,
  0x89 => 0x2030,
  0x8A => 0x0160,
  0x8B => 0x2039,
  0x8C => 0x0152,
  0x8D => 0xFFFD,
  0x8E => 0x017D,
  0x8F => 0xFFFD,
  0x90 => 0xFFFD,
  0x91 => 0x2018,
  0x92 => 0x2019,
  0x93 => 0x201C,
  0x94 => 0x201D,
  0x95 => 0x2022,
  0x96 => 0x2013,
  0x97 => 0x2014,
  0x98 => 0x02DC,
  0x99 => 0x2122,
  0x9A => 0x0161,
  0x9B => 0x203A,
  0x9C => 0x0153,
  0x9D => 0xFFFD,
  0x9E => 0x017E,
  0x9F => 0x0178,
}; # $charref_map

## Everything below is replaced by U+FFFD REPLACEMENT CHARACTER.
$charref_map->{$_} = 0xFFFD
    for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        ## Surrogate code points.
        0xD800..0xDFFF,
        ## The contiguous noncharacter block.  Unicode defines it as
        ## U+FDD0..U+FDEF (32 code points); the range previously
        ## stopped at 0xFDDF and so covered only the first half.
        0xFDD0..0xFDEF,
        ## The last two code points (xFFFE and xFFFF) of each of the
        ## 17 planes, 0x00 through 0x10.
        (map { (($_ << 16) | 0xFFFE, ($_ << 16) | 0xFFFF) } 0x00..0x10);
250    
251     ## Implementations MUST act as if they used the state machine in the spec.
252    
## Puts the tokenizer fields of |$self| into their initial values and
## loads the first input character.
##
## Argument: |$self|, the hash-based object holding the tokenizer
## fields.  Its |set_nc| field must be a code reference; it is invoked
## as |$self->{set_nc}->($self)|.
##
## Returns the value of the final assignment, i.e. the new (empty)
## |$self->{token}| array reference.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL;
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer was emptied just above, so there is never a
  ## buffered character to take here: the first input character always
  ## comes from the |set_nc| callback.  (The former "read from
  ## |char_buffer|" branch could not be reached and has been removed.)
  $self->{set_nc}->($self);

  $self->{token} = []; # queue of tokens waiting to be returned
  # $self->{escape}
} # _initialize_tokenizer
291    
292     ## A token has:
293     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
294 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
295 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
296     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
297 wakaba 1.11 ## ->{target} (PI_TOKEN)
298 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
299     ## ->{sysid} (DOCTYPE_TOKEN)
300     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
301     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
302     ## ->{name}
303     ## ->{value}
304     ## ->{has_reference} == 1 or 0
305 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
306     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
307 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
308 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
309 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
310    
311 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
312     ## A token's |->{self_closing}| is used to save the value of |$self->{self_closing}|
313     ## while the token is pushed back to the stack.
314    
315     ## Emitted token MUST immediately be handled by the tree construction state.
316    
317     ## Before each step, UA MAY check to see if either one of the scripts in
318     ## "list of scripts that will execute as soon as possible" or the first
319     ## script in the "list of scripts that will execute asynchronously",
320     ## has completed loading. If one has, then it MUST be executed
321     ## and removed from the list.
322    
323     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
324     ## (This requirement was dropped from HTML5 spec, unfortunately.)
325    
## Set of code points treated as space characters: keys are code
## points, values are true.  LINE TABULATION (0x000B) and CARRIAGE
## RETURN (0x000D) are deliberately not members.
my $is_space = {
  map { ($_ => 1) } (
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    0x0020, # SPACE (SP)
  )
};
334    
335     sub _get_next_token ($) {
336     my $self = shift;
337    
338     if ($self->{self_closing}) {
339     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
340     ## NOTE: The |self_closing| flag is only set by start tag token.
341     ## In addition, when a start tag token is emitted, it is always set to
342     ## |ct|.
343     delete $self->{self_closing};
344     }
345    
346     if (@{$self->{token}}) {
347     $self->{self_closing} = $self->{token}->[0]->{self_closing};
348     return shift @{$self->{token}};
349     }
350    
351     A: {
352     if ($self->{state} == PCDATA_STATE) {
353     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
354    
355     if ($self->{nc} == 0x0026) { # &
356    
357     ## NOTE: In the spec, the tokenizer is switched to the
358     ## "entity data state". In this implementation, the tokenizer
359     ## is switched to the |ENTITY_STATE|, which is an implementation
360     ## of the "consume a character reference" algorithm.
361     $self->{entity_add} = -1;
362     $self->{prev_state} = DATA_STATE;
363     $self->{state} = ENTITY_STATE;
364    
365     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
366     $self->{line_prev} = $self->{line};
367     $self->{column_prev} = $self->{column};
368     $self->{column}++;
369     $self->{nc}
370     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
371     } else {
372     $self->{set_nc}->($self);
373     }
374    
375     redo A;
376     } elsif ($self->{nc} == 0x003C) { # <
377    
378     $self->{state} = TAG_OPEN_STATE;
379    
380     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
381     $self->{line_prev} = $self->{line};
382     $self->{column_prev} = $self->{column};
383     $self->{column}++;
384     $self->{nc}
385     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
386     } else {
387     $self->{set_nc}->($self);
388     }
389    
390     redo A;
391     } elsif ($self->{nc} == -1) {
392    
393     return ({type => END_OF_FILE_TOKEN,
394     line => $self->{line}, column => $self->{column}});
395     last A; ## TODO: ok?
396     } else {
397    
398     #
399     }
400    
401     # Anything else
402     my $token = {type => CHARACTER_TOKEN,
403     data => chr $self->{nc},
404     line => $self->{line}, column => $self->{column},
405     };
406     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
407    
408     ## Stay in the state.
409    
410     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
411     $self->{line_prev} = $self->{line};
412     $self->{column_prev} = $self->{column};
413     $self->{column}++;
414     $self->{nc}
415     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
416     } else {
417     $self->{set_nc}->($self);
418     }
419    
420     return ($token);
421     redo A;
422     } elsif ($self->{state} == DATA_STATE) {
423     $self->{s_kwd} = '' unless defined $self->{s_kwd};
424     if ($self->{nc} == 0x0026) { # &
425     $self->{s_kwd} = '';
426     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
427     not $self->{escape}) {
428    
429     ## NOTE: In the spec, the tokenizer is switched to the
430     ## "entity data state". In this implementation, the tokenizer
431     ## is switched to the |ENTITY_STATE|, which is an implementation
432     ## of the "consume a character reference" algorithm.
433     $self->{entity_add} = -1;
434     $self->{prev_state} = DATA_STATE;
435     $self->{state} = ENTITY_STATE;
436    
437     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
438     $self->{line_prev} = $self->{line};
439     $self->{column_prev} = $self->{column};
440     $self->{column}++;
441     $self->{nc}
442     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
443     } else {
444     $self->{set_nc}->($self);
445     }
446    
447     redo A;
448     } else {
449    
450     #
451     }
452     } elsif ($self->{nc} == 0x002D) { # -
453     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
454 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
455 wakaba 1.1
456     $self->{escape} = 1; # unless $self->{escape};
457     $self->{s_kwd} = '--';
458     #
459 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
460 wakaba 1.1
461     $self->{s_kwd} = '--';
462     #
463 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
464    
465     $self->{s_kwd} .= '-';
466     #
467 wakaba 1.1 } else {
468    
469 wakaba 1.5 $self->{s_kwd} = '-';
470 wakaba 1.1 #
471     }
472     }
473    
474     #
475     } elsif ($self->{nc} == 0x0021) { # !
476     if (length $self->{s_kwd}) {
477    
478     $self->{s_kwd} .= '!';
479     #
480     } else {
481    
482     #$self->{s_kwd} = '';
483     #
484     }
485     #
486     } elsif ($self->{nc} == 0x003C) { # <
487     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
488     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
489     not $self->{escape})) {
490    
491     $self->{state} = TAG_OPEN_STATE;
492    
493     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
494     $self->{line_prev} = $self->{line};
495     $self->{column_prev} = $self->{column};
496     $self->{column}++;
497     $self->{nc}
498     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
499     } else {
500     $self->{set_nc}->($self);
501     }
502    
503     redo A;
504     } else {
505    
506     $self->{s_kwd} = '';
507     #
508     }
509     } elsif ($self->{nc} == 0x003E) { # >
510     if ($self->{escape} and
511     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
512     if ($self->{s_kwd} eq '--') {
513    
514     delete $self->{escape};
515 wakaba 1.5 #
516 wakaba 1.1 } else {
517    
518 wakaba 1.5 #
519 wakaba 1.1 }
520 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
521    
522     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
523     line => $self->{line_prev},
524     column => $self->{column_prev} - 1);
525     #
526 wakaba 1.1 } else {
527    
528 wakaba 1.5 #
529 wakaba 1.1 }
530    
531     $self->{s_kwd} = '';
532     #
533 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
534     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
535    
536     $self->{s_kwd} .= ']';
537     } elsif ($self->{s_kwd} eq ']]') {
538    
539     #
540     } else {
541    
542     $self->{s_kwd} = '';
543     }
544     #
545 wakaba 1.1 } elsif ($self->{nc} == -1) {
546    
547     $self->{s_kwd} = '';
548     return ({type => END_OF_FILE_TOKEN,
549     line => $self->{line}, column => $self->{column}});
550     last A; ## TODO: ok?
551     } else {
552    
553     $self->{s_kwd} = '';
554     #
555     }
556    
557     # Anything else
558     my $token = {type => CHARACTER_TOKEN,
559     data => chr $self->{nc},
560     line => $self->{line}, column => $self->{column},
561     };
562 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
563 wakaba 1.1 length $token->{data})) {
564     $self->{s_kwd} = '';
565     }
566    
567     ## Stay in the data state.
568 wakaba 1.5 if (not $self->{is_xml} and
569     $self->{content_model} == PCDATA_CONTENT_MODEL) {
570 wakaba 1.1
571     $self->{state} = PCDATA_STATE;
572     } else {
573    
574     ## Stay in the state.
575     }
576    
577     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
578     $self->{line_prev} = $self->{line};
579     $self->{column_prev} = $self->{column};
580     $self->{column}++;
581     $self->{nc}
582     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
583     } else {
584     $self->{set_nc}->($self);
585     }
586    
587     return ($token);
588     redo A;
589     } elsif ($self->{state} == TAG_OPEN_STATE) {
590 wakaba 1.10 ## XML5: "tag state".
591    
592 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
593     if ($self->{nc} == 0x002F) { # /
594    
595    
596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
597     $self->{line_prev} = $self->{line};
598     $self->{column_prev} = $self->{column};
599     $self->{column}++;
600     $self->{nc}
601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
602     } else {
603     $self->{set_nc}->($self);
604     }
605    
606     $self->{state} = CLOSE_TAG_OPEN_STATE;
607     redo A;
608     } elsif ($self->{nc} == 0x0021) { # !
609    
610 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
611 wakaba 1.1 #
612     } else {
613    
614 wakaba 1.12 $self->{s_kwd} = '';
615 wakaba 1.1 #
616     }
617    
618     ## reconsume
619     $self->{state} = DATA_STATE;
620     return ({type => CHARACTER_TOKEN, data => '<',
621     line => $self->{line_prev},
622     column => $self->{column_prev},
623     });
624     redo A;
625     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
626     if ($self->{nc} == 0x0021) { # !
627    
628     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
629    
630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
631     $self->{line_prev} = $self->{line};
632     $self->{column_prev} = $self->{column};
633     $self->{column}++;
634     $self->{nc}
635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
636     } else {
637     $self->{set_nc}->($self);
638     }
639    
640     redo A;
641     } elsif ($self->{nc} == 0x002F) { # /
642    
643     $self->{state} = CLOSE_TAG_OPEN_STATE;
644    
645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
646     $self->{line_prev} = $self->{line};
647     $self->{column_prev} = $self->{column};
648     $self->{column}++;
649     $self->{nc}
650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
651     } else {
652     $self->{set_nc}->($self);
653     }
654    
655     redo A;
656     } elsif (0x0041 <= $self->{nc} and
657     $self->{nc} <= 0x005A) { # A..Z
658    
659     $self->{ct}
660     = {type => START_TAG_TOKEN,
661 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
662 wakaba 1.1 line => $self->{line_prev},
663     column => $self->{column_prev}};
664     $self->{state} = TAG_NAME_STATE;
665    
666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
667     $self->{line_prev} = $self->{line};
668     $self->{column_prev} = $self->{column};
669     $self->{column}++;
670     $self->{nc}
671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
672     } else {
673     $self->{set_nc}->($self);
674     }
675    
676     redo A;
677     } elsif (0x0061 <= $self->{nc} and
678     $self->{nc} <= 0x007A) { # a..z
679    
680     $self->{ct} = {type => START_TAG_TOKEN,
681     tag_name => chr ($self->{nc}),
682     line => $self->{line_prev},
683     column => $self->{column_prev}};
684     $self->{state} = TAG_NAME_STATE;
685    
686     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
687     $self->{line_prev} = $self->{line};
688     $self->{column_prev} = $self->{column};
689     $self->{column}++;
690     $self->{nc}
691     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
692     } else {
693     $self->{set_nc}->($self);
694     }
695    
696     redo A;
697     } elsif ($self->{nc} == 0x003E) { # >
698    
699     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
700     line => $self->{line_prev},
701     column => $self->{column_prev});
702     $self->{state} = DATA_STATE;
703 wakaba 1.5 $self->{s_kwd} = '';
704 wakaba 1.1
705     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
706     $self->{line_prev} = $self->{line};
707     $self->{column_prev} = $self->{column};
708     $self->{column}++;
709     $self->{nc}
710     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
711     } else {
712     $self->{set_nc}->($self);
713     }
714    
715    
716     return ({type => CHARACTER_TOKEN, data => '<>',
717     line => $self->{line_prev},
718     column => $self->{column_prev},
719     });
720    
721     redo A;
722     } elsif ($self->{nc} == 0x003F) { # ?
723 wakaba 1.8 if ($self->{is_xml}) {
724    
725     $self->{state} = PI_STATE;
726    
727     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
728     $self->{line_prev} = $self->{line};
729     $self->{column_prev} = $self->{column};
730     $self->{column}++;
731     $self->{nc}
732     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
733     } else {
734     $self->{set_nc}->($self);
735     }
736    
737     redo A;
738     } else {
739    
740     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
741     line => $self->{line_prev},
742     column => $self->{column_prev});
743     $self->{state} = BOGUS_COMMENT_STATE;
744     $self->{ct} = {type => COMMENT_TOKEN, data => '',
745     line => $self->{line_prev},
746     column => $self->{column_prev},
747     };
748     ## $self->{nc} is intentionally left as is
749     redo A;
750     }
751 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
752 wakaba 1.1
753     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
754     line => $self->{line_prev},
755     column => $self->{column_prev});
756     $self->{state} = DATA_STATE;
757 wakaba 1.5 $self->{s_kwd} = '';
758 wakaba 1.1 ## reconsume
759    
760     return ({type => CHARACTER_TOKEN, data => '<',
761     line => $self->{line_prev},
762     column => $self->{column_prev},
763     });
764    
765     redo A;
766 wakaba 1.9 } else {
767     ## XML5: "<:" is a parse error.
768    
769     $self->{ct} = {type => START_TAG_TOKEN,
770     tag_name => chr ($self->{nc}),
771     line => $self->{line_prev},
772     column => $self->{column_prev}};
773     $self->{state} = TAG_NAME_STATE;
774    
775     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
776     $self->{line_prev} = $self->{line};
777     $self->{column_prev} = $self->{column};
778     $self->{column}++;
779     $self->{nc}
780     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
781     } else {
782     $self->{set_nc}->($self);
783     }
784    
785     redo A;
786 wakaba 1.1 }
787     } else {
788     die "$0: $self->{content_model} in tag open";
789     }
790     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
791     ## NOTE: The "close tag open state" in the spec is implemented as
792     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
793    
794 wakaba 1.10 ## XML5: "end tag state".
795    
796 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
797     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
798     if (defined $self->{last_stag_name}) {
799     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
800 wakaba 1.12 $self->{kwd} = '';
801 wakaba 1.1 ## Reconsume.
802     redo A;
803     } else {
804     ## No start tag token has ever been emitted
805     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
806    
807     $self->{state} = DATA_STATE;
808 wakaba 1.5 $self->{s_kwd} = '';
809 wakaba 1.1 ## Reconsume.
810     return ({type => CHARACTER_TOKEN, data => '</',
811     line => $l, column => $c,
812     });
813     redo A;
814     }
815     }
816    
817     if (0x0041 <= $self->{nc} and
818     $self->{nc} <= 0x005A) { # A..Z
819    
820     $self->{ct}
821     = {type => END_TAG_TOKEN,
822 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
823 wakaba 1.1 line => $l, column => $c};
824     $self->{state} = TAG_NAME_STATE;
825    
826     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
827     $self->{line_prev} = $self->{line};
828     $self->{column_prev} = $self->{column};
829     $self->{column}++;
830     $self->{nc}
831     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
832     } else {
833     $self->{set_nc}->($self);
834     }
835    
836     redo A;
837     } elsif (0x0061 <= $self->{nc} and
838     $self->{nc} <= 0x007A) { # a..z
839    
840     $self->{ct} = {type => END_TAG_TOKEN,
841     tag_name => chr ($self->{nc}),
842     line => $l, column => $c};
843     $self->{state} = TAG_NAME_STATE;
844    
845     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
846     $self->{line_prev} = $self->{line};
847     $self->{column_prev} = $self->{column};
848     $self->{column}++;
849     $self->{nc}
850     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
851     } else {
852     $self->{set_nc}->($self);
853     }
854    
855     redo A;
856     } elsif ($self->{nc} == 0x003E) { # >
857     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
858     line => $self->{line_prev}, ## "<" in "</>"
859     column => $self->{column_prev} - 1);
860     $self->{state} = DATA_STATE;
861 wakaba 1.5 $self->{s_kwd} = '';
862 wakaba 1.10 if ($self->{is_xml}) {
863    
864     ## XML5: No parse error.
865    
866     ## NOTE: This parser raises a parse error, since it supports
867     ## XML1, not XML5.
868    
869     ## NOTE: A short end tag token.
870     my $ct = {type => END_TAG_TOKEN,
871     tag_name => '',
872     line => $self->{line_prev},
873     column => $self->{column_prev} - 1,
874     };
875    
876     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
877     $self->{line_prev} = $self->{line};
878     $self->{column_prev} = $self->{column};
879     $self->{column}++;
880     $self->{nc}
881     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
882     } else {
883     $self->{set_nc}->($self);
884     }
885    
886     return ($ct);
887     } else {
888    
889    
890 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
891     $self->{line_prev} = $self->{line};
892     $self->{column_prev} = $self->{column};
893     $self->{column}++;
894     $self->{nc}
895     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
896     } else {
897     $self->{set_nc}->($self);
898     }
899    
900 wakaba 1.10 }
901 wakaba 1.1 redo A;
902     } elsif ($self->{nc} == -1) {
903    
904     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
905 wakaba 1.5 $self->{s_kwd} = '';
906 wakaba 1.1 $self->{state} = DATA_STATE;
907     # reconsume
908    
909     return ({type => CHARACTER_TOKEN, data => '</',
910     line => $l, column => $c,
911     });
912    
913     redo A;
914 wakaba 1.10 } elsif (not $self->{is_xml} or
915     $is_space->{$self->{nc}}) {
916 wakaba 1.1
917 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
918     line => $self->{line_prev}, # "<" of "</"
919     column => $self->{column_prev} - 1);
920 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
921     $self->{ct} = {type => COMMENT_TOKEN, data => '',
922     line => $self->{line_prev}, # "<" of "</"
923     column => $self->{column_prev} - 1,
924     };
925     ## NOTE: $self->{nc} is intentionally left as is.
926     ## Although the "anything else" case of the spec does not
927     ## explicitly state that the next input character is to be
928     ## reconsumed, it will be included in the |data| of the comment token
929     ## generated from the bogus end tag, as defined in the
930     ## "bogus comment state" entry.
931     redo A;
932 wakaba 1.10 } else {
933     ## XML5: "</:" is a parse error.
934    
935     $self->{ct} = {type => END_TAG_TOKEN,
936     tag_name => chr ($self->{nc}),
937     line => $l, column => $c};
938     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
939    
940     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
941     $self->{line_prev} = $self->{line};
942     $self->{column_prev} = $self->{column};
943     $self->{column}++;
944     $self->{nc}
945     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
946     } else {
947     $self->{set_nc}->($self);
948     }
949    
950     redo A;
951 wakaba 1.1 }
952     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
953 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
954 wakaba 1.1 if (length $ch) {
955     my $CH = $ch;
956     $ch =~ tr/a-z/A-Z/;
957     my $nch = chr $self->{nc};
958     if ($nch eq $ch or $nch eq $CH) {
959    
960     ## Stay in the state.
961 wakaba 1.12 $self->{kwd} .= $nch;
962 wakaba 1.1
963     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
964     $self->{line_prev} = $self->{line};
965     $self->{column_prev} = $self->{column};
966     $self->{column}++;
967     $self->{nc}
968     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
969     } else {
970     $self->{set_nc}->($self);
971     }
972    
973     redo A;
974     } else {
975    
976     $self->{state} = DATA_STATE;
977 wakaba 1.5 $self->{s_kwd} = '';
978 wakaba 1.1 ## Reconsume.
979     return ({type => CHARACTER_TOKEN,
980 wakaba 1.12 data => '</' . $self->{kwd},
981 wakaba 1.1 line => $self->{line_prev},
982 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
983 wakaba 1.1 });
984     redo A;
985     }
986     } else { # after "<{tag-name}"
987     unless ($is_space->{$self->{nc}} or
988     {
989     0x003E => 1, # >
990     0x002F => 1, # /
991     -1 => 1, # EOF
992     }->{$self->{nc}}) {
993    
994     ## Reconsume.
995     $self->{state} = DATA_STATE;
996 wakaba 1.5 $self->{s_kwd} = '';
997 wakaba 1.1 return ({type => CHARACTER_TOKEN,
998 wakaba 1.12 data => '</' . $self->{kwd},
999 wakaba 1.1 line => $self->{line_prev},
1000 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1001 wakaba 1.1 });
1002     redo A;
1003     } else {
1004    
1005     $self->{ct}
1006     = {type => END_TAG_TOKEN,
1007     tag_name => $self->{last_stag_name},
1008     line => $self->{line_prev},
1009 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1010 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1011     ## Reconsume.
1012     redo A;
1013     }
1014     }
1015     } elsif ($self->{state} == TAG_NAME_STATE) {
1016     if ($is_space->{$self->{nc}}) {
1017    
1018     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1019    
1020     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1021     $self->{line_prev} = $self->{line};
1022     $self->{column_prev} = $self->{column};
1023     $self->{column}++;
1024     $self->{nc}
1025     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1026     } else {
1027     $self->{set_nc}->($self);
1028     }
1029    
1030     redo A;
1031     } elsif ($self->{nc} == 0x003E) { # >
1032     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1033    
1034     $self->{last_stag_name} = $self->{ct}->{tag_name};
1035     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1036     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1037     #if ($self->{ct}->{attributes}) {
1038     # ## NOTE: This should never be reached.
1039     # !!! cp (36);
1040     # !!! parse-error (type => 'end tag attribute');
1041     #} else {
1042    
1043     #}
1044     } else {
1045     die "$0: $self->{ct}->{type}: Unknown token type";
1046     }
1047     $self->{state} = DATA_STATE;
1048 wakaba 1.5 $self->{s_kwd} = '';
1049 wakaba 1.1
1050     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1051     $self->{line_prev} = $self->{line};
1052     $self->{column_prev} = $self->{column};
1053     $self->{column}++;
1054     $self->{nc}
1055     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1056     } else {
1057     $self->{set_nc}->($self);
1058     }
1059    
1060    
1061     return ($self->{ct}); # start tag or end tag
1062    
1063     redo A;
1064     } elsif (0x0041 <= $self->{nc} and
1065     $self->{nc} <= 0x005A) { # A..Z
1066    
1067 wakaba 1.4 $self->{ct}->{tag_name}
1068     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1069 wakaba 1.1 # start tag or end tag
1070     ## Stay in this state
1071    
1072     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1073     $self->{line_prev} = $self->{line};
1074     $self->{column_prev} = $self->{column};
1075     $self->{column}++;
1076     $self->{nc}
1077     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1078     } else {
1079     $self->{set_nc}->($self);
1080     }
1081    
1082     redo A;
1083     } elsif ($self->{nc} == -1) {
1084     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1085     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1086    
1087     $self->{last_stag_name} = $self->{ct}->{tag_name};
1088     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1089     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1090     #if ($self->{ct}->{attributes}) {
1091     # ## NOTE: This state should never be reached.
1092     # !!! cp (40);
1093     # !!! parse-error (type => 'end tag attribute');
1094     #} else {
1095    
1096     #}
1097     } else {
1098     die "$0: $self->{ct}->{type}: Unknown token type";
1099     }
1100     $self->{state} = DATA_STATE;
1101 wakaba 1.5 $self->{s_kwd} = '';
1102 wakaba 1.1 # reconsume
1103    
1104     return ($self->{ct}); # start tag or end tag
1105    
1106     redo A;
1107     } elsif ($self->{nc} == 0x002F) { # /
1108    
1109     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1110    
1111     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1112     $self->{line_prev} = $self->{line};
1113     $self->{column_prev} = $self->{column};
1114     $self->{column}++;
1115     $self->{nc}
1116     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1117     } else {
1118     $self->{set_nc}->($self);
1119     }
1120    
1121     redo A;
1122     } else {
1123    
1124     $self->{ct}->{tag_name} .= chr $self->{nc};
1125     # start tag or end tag
1126     ## Stay in the state
1127    
1128     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1129     $self->{line_prev} = $self->{line};
1130     $self->{column_prev} = $self->{column};
1131     $self->{column}++;
1132     $self->{nc}
1133     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1134     } else {
1135     $self->{set_nc}->($self);
1136     }
1137    
1138     redo A;
1139     }
1140     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1141 wakaba 1.11 ## XML5: "Tag attribute name before state".
1142    
1143 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1144    
1145     ## Stay in the state
1146    
1147     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1148     $self->{line_prev} = $self->{line};
1149     $self->{column_prev} = $self->{column};
1150     $self->{column}++;
1151     $self->{nc}
1152     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1153     } else {
1154     $self->{set_nc}->($self);
1155     }
1156    
1157     redo A;
1158     } elsif ($self->{nc} == 0x003E) { # >
1159     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1160    
1161     $self->{last_stag_name} = $self->{ct}->{tag_name};
1162     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1163     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1164     if ($self->{ct}->{attributes}) {
1165    
1166     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1167     } else {
1168    
1169     }
1170     } else {
1171     die "$0: $self->{ct}->{type}: Unknown token type";
1172     }
1173     $self->{state} = DATA_STATE;
1174 wakaba 1.5 $self->{s_kwd} = '';
1175 wakaba 1.1
1176     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1177     $self->{line_prev} = $self->{line};
1178     $self->{column_prev} = $self->{column};
1179     $self->{column}++;
1180     $self->{nc}
1181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1182     } else {
1183     $self->{set_nc}->($self);
1184     }
1185    
1186    
1187     return ($self->{ct}); # start tag or end tag
1188    
1189     redo A;
1190     } elsif (0x0041 <= $self->{nc} and
1191     $self->{nc} <= 0x005A) { # A..Z
1192    
1193     $self->{ca}
1194 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1195 wakaba 1.1 value => '',
1196     line => $self->{line}, column => $self->{column}};
1197     $self->{state} = ATTRIBUTE_NAME_STATE;
1198    
1199     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1200     $self->{line_prev} = $self->{line};
1201     $self->{column_prev} = $self->{column};
1202     $self->{column}++;
1203     $self->{nc}
1204     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1205     } else {
1206     $self->{set_nc}->($self);
1207     }
1208    
1209     redo A;
1210     } elsif ($self->{nc} == 0x002F) { # /
1211    
1212     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1213    
1214     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1215     $self->{line_prev} = $self->{line};
1216     $self->{column_prev} = $self->{column};
1217     $self->{column}++;
1218     $self->{nc}
1219     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1220     } else {
1221     $self->{set_nc}->($self);
1222     }
1223    
1224     redo A;
1225     } elsif ($self->{nc} == -1) {
1226     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1227     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1228    
1229     $self->{last_stag_name} = $self->{ct}->{tag_name};
1230     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1231     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1232     if ($self->{ct}->{attributes}) {
1233    
1234     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1235     } else {
1236    
1237     }
1238     } else {
1239     die "$0: $self->{ct}->{type}: Unknown token type";
1240     }
1241     $self->{state} = DATA_STATE;
1242 wakaba 1.5 $self->{s_kwd} = '';
1243 wakaba 1.1 # reconsume
1244    
1245     return ($self->{ct}); # start tag or end tag
1246    
1247     redo A;
1248     } else {
1249     if ({
1250     0x0022 => 1, # "
1251     0x0027 => 1, # '
1252 wakaba 1.30 0x003C => 1, # <
1253 wakaba 1.1 0x003D => 1, # =
1254     }->{$self->{nc}}) {
1255    
1256 wakaba 1.11 ## XML5: Not a parse error.
1257 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1258     } else {
1259    
1260 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1261 wakaba 1.1 }
1262     $self->{ca}
1263     = {name => chr ($self->{nc}),
1264     value => '',
1265     line => $self->{line}, column => $self->{column}};
1266     $self->{state} = ATTRIBUTE_NAME_STATE;
1267    
1268     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1269     $self->{line_prev} = $self->{line};
1270     $self->{column_prev} = $self->{column};
1271     $self->{column}++;
1272     $self->{nc}
1273     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1274     } else {
1275     $self->{set_nc}->($self);
1276     }
1277    
1278     redo A;
1279     }
1280     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1281 wakaba 1.11 ## XML5: "Tag attribute name state".
1282    
## $before_leave: helper invoked by the branches of this state that
## finish the current attribute name (space, "=", ">", "/" and EOF
## below).  It commits the attribute under construction, $self->{ca},
## to the current tag token, $self->{ct}, unless the token already has
## an attribute with the same name.  Takes no arguments; works purely
## through $self.
1283 wakaba 1.1 my $before_leave = sub {
## NOTE: Perl's |exists| on a nested key autovivifies the intermediate
## |attributes| hash reference if the token did not have one yet.
1284     if (exists $self->{ct}->{attributes} # start tag or end tag
1285     ->{$self->{ca}->{name}}) { # MUST
1286    
## Duplicate name: report the error at the position recorded when the
## attribute was created; the attribute is not stored in the token.
1287     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1288     ## Discard $self->{ca} # MUST
1289     } else {
1290    
## First occurrence of this name: store the attribute keyed by name.
1291     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1292     = $self->{ca};
## Number the stored attribute using the token's |last_index| counter,
## which is incremented for every attribute that is kept.
1293 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1294 wakaba 1.1 }
1295     }; # $before_leave
1296    
1297     if ($is_space->{$self->{nc}}) {
1298    
1299     $before_leave->();
1300     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1301    
1302     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1303     $self->{line_prev} = $self->{line};
1304     $self->{column_prev} = $self->{column};
1305     $self->{column}++;
1306     $self->{nc}
1307     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1308     } else {
1309     $self->{set_nc}->($self);
1310     }
1311    
1312     redo A;
1313     } elsif ($self->{nc} == 0x003D) { # =
1314    
1315     $before_leave->();
1316     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1317    
1318     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1319     $self->{line_prev} = $self->{line};
1320     $self->{column_prev} = $self->{column};
1321     $self->{column}++;
1322     $self->{nc}
1323     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1324     } else {
1325     $self->{set_nc}->($self);
1326     }
1327    
1328     redo A;
1329     } elsif ($self->{nc} == 0x003E) { # >
1330 wakaba 1.11 if ($self->{is_xml}) {
1331    
1332     ## XML5: Not a parse error.
1333     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1334     } else {
1335    
1336     }
1337    
1338 wakaba 1.1 $before_leave->();
1339     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1340    
1341     $self->{last_stag_name} = $self->{ct}->{tag_name};
1342     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1343    
1344     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1345     if ($self->{ct}->{attributes}) {
1346     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1347     }
1348     } else {
1349     die "$0: $self->{ct}->{type}: Unknown token type";
1350     }
1351     $self->{state} = DATA_STATE;
1352 wakaba 1.5 $self->{s_kwd} = '';
1353 wakaba 1.1
1354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1355     $self->{line_prev} = $self->{line};
1356     $self->{column_prev} = $self->{column};
1357     $self->{column}++;
1358     $self->{nc}
1359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1360     } else {
1361     $self->{set_nc}->($self);
1362     }
1363    
1364    
1365     return ($self->{ct}); # start tag or end tag
1366    
1367     redo A;
1368     } elsif (0x0041 <= $self->{nc} and
1369     $self->{nc} <= 0x005A) { # A..Z
1370    
1371 wakaba 1.4 $self->{ca}->{name}
1372     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1373 wakaba 1.1 ## Stay in the state
1374    
1375     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1376     $self->{line_prev} = $self->{line};
1377     $self->{column_prev} = $self->{column};
1378     $self->{column}++;
1379     $self->{nc}
1380     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1381     } else {
1382     $self->{set_nc}->($self);
1383     }
1384    
1385     redo A;
1386     } elsif ($self->{nc} == 0x002F) { # /
1387 wakaba 1.11 if ($self->{is_xml}) {
1388    
1389     ## XML5: Not a parse error.
1390     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1391     } else {
1392    
1393     }
1394 wakaba 1.1
1395     $before_leave->();
1396     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1397    
1398     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1399     $self->{line_prev} = $self->{line};
1400     $self->{column_prev} = $self->{column};
1401     $self->{column}++;
1402     $self->{nc}
1403     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1404     } else {
1405     $self->{set_nc}->($self);
1406     }
1407    
1408     redo A;
1409     } elsif ($self->{nc} == -1) {
1410     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1411     $before_leave->();
1412     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1413    
1414     $self->{last_stag_name} = $self->{ct}->{tag_name};
1415     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1416     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1417     if ($self->{ct}->{attributes}) {
1418    
1419     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1420     } else {
1421     ## NOTE: This state should never be reached.
1422    
1423     }
1424     } else {
1425     die "$0: $self->{ct}->{type}: Unknown token type";
1426     }
1427     $self->{state} = DATA_STATE;
1428 wakaba 1.5 $self->{s_kwd} = '';
1429 wakaba 1.1 # reconsume
1430    
1431     return ($self->{ct}); # start tag or end tag
1432    
1433     redo A;
1434     } else {
1435 wakaba 1.30 if ({
1436     0x0022 => 1, # "
1437     0x0027 => 1, # '
1438     0x003C => 1, # <
1439     }->{$self->{nc}}) {
1440 wakaba 1.1
1441 wakaba 1.11 ## XML5: Not a parse error.
1442 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1443     } else {
1444    
1445     }
1446     $self->{ca}->{name} .= chr ($self->{nc});
1447     ## Stay in the state
1448    
1449     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1450     $self->{line_prev} = $self->{line};
1451     $self->{column_prev} = $self->{column};
1452     $self->{column}++;
1453     $self->{nc}
1454     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1455     } else {
1456     $self->{set_nc}->($self);
1457     }
1458    
1459     redo A;
1460     }
1461     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1462 wakaba 1.11 ## XML5: "Tag attribute name after state".
1463    
1464 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1465    
1466     ## Stay in the state
1467    
1468     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1469     $self->{line_prev} = $self->{line};
1470     $self->{column_prev} = $self->{column};
1471     $self->{column}++;
1472     $self->{nc}
1473     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1474     } else {
1475     $self->{set_nc}->($self);
1476     }
1477    
1478     redo A;
1479     } elsif ($self->{nc} == 0x003D) { # =
1480    
1481     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1482    
1483     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1484     $self->{line_prev} = $self->{line};
1485     $self->{column_prev} = $self->{column};
1486     $self->{column}++;
1487     $self->{nc}
1488     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1489     } else {
1490     $self->{set_nc}->($self);
1491     }
1492    
1493     redo A;
1494     } elsif ($self->{nc} == 0x003E) { # >
1495 wakaba 1.11 if ($self->{is_xml}) {
1496    
1497     ## XML5: Not a parse error.
1498     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1499     } else {
1500    
1501     }
1502    
1503 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1504    
1505     $self->{last_stag_name} = $self->{ct}->{tag_name};
1506     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1507     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1508     if ($self->{ct}->{attributes}) {
1509    
1510     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1511     } else {
1512     ## NOTE: This state should never be reached.
1513    
1514     }
1515     } else {
1516     die "$0: $self->{ct}->{type}: Unknown token type";
1517     }
1518     $self->{state} = DATA_STATE;
1519 wakaba 1.5 $self->{s_kwd} = '';
1520 wakaba 1.1
1521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1522     $self->{line_prev} = $self->{line};
1523     $self->{column_prev} = $self->{column};
1524     $self->{column}++;
1525     $self->{nc}
1526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1527     } else {
1528     $self->{set_nc}->($self);
1529     }
1530    
1531    
1532     return ($self->{ct}); # start tag or end tag
1533    
1534     redo A;
1535     } elsif (0x0041 <= $self->{nc} and
1536     $self->{nc} <= 0x005A) { # A..Z
1537    
1538     $self->{ca}
1539 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1540 wakaba 1.1 value => '',
1541     line => $self->{line}, column => $self->{column}};
1542     $self->{state} = ATTRIBUTE_NAME_STATE;
1543    
1544     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1545     $self->{line_prev} = $self->{line};
1546     $self->{column_prev} = $self->{column};
1547     $self->{column}++;
1548     $self->{nc}
1549     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1550     } else {
1551     $self->{set_nc}->($self);
1552     }
1553    
1554     redo A;
1555     } elsif ($self->{nc} == 0x002F) { # /
1556 wakaba 1.11 if ($self->{is_xml}) {
1557    
1558     ## XML5: Not a parse error.
1559     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1560     } else {
1561    
1562     }
1563 wakaba 1.1
1564     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1565    
1566     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1567     $self->{line_prev} = $self->{line};
1568     $self->{column_prev} = $self->{column};
1569     $self->{column}++;
1570     $self->{nc}
1571     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1572     } else {
1573     $self->{set_nc}->($self);
1574     }
1575    
1576     redo A;
1577     } elsif ($self->{nc} == -1) {
1578     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1579     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1580    
1581     $self->{last_stag_name} = $self->{ct}->{tag_name};
1582     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1583     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1584     if ($self->{ct}->{attributes}) {
1585    
1586     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1587     } else {
1588     ## NOTE: This state should never be reached.
1589    
1590     }
1591     } else {
1592     die "$0: $self->{ct}->{type}: Unknown token type";
1593     }
1594 wakaba 1.5 $self->{s_kwd} = '';
1595 wakaba 1.1 $self->{state} = DATA_STATE;
1596     # reconsume
1597    
1598     return ($self->{ct}); # start tag or end tag
1599    
1600     redo A;
1601     } else {
1602 wakaba 1.11 if ($self->{is_xml}) {
1603    
1604     ## XML5: Not a parse error.
1605     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1606     } else {
1607    
1608     }
1609    
1610 wakaba 1.30 if ({
1611     0x0022 => 1, # "
1612     0x0027 => 1, # '
1613     0x003C => 1, # <
1614     }->{$self->{nc}}) {
1615 wakaba 1.1
1616 wakaba 1.11 ## XML5: Not a parse error.
1617 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1618     } else {
1619    
1620     }
1621     $self->{ca}
1622     = {name => chr ($self->{nc}),
1623     value => '',
1624     line => $self->{line}, column => $self->{column}};
1625     $self->{state} = ATTRIBUTE_NAME_STATE;
1626    
1627     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1628     $self->{line_prev} = $self->{line};
1629     $self->{column_prev} = $self->{column};
1630     $self->{column}++;
1631     $self->{nc}
1632     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1633     } else {
1634     $self->{set_nc}->($self);
1635     }
1636    
1637     redo A;
1638     }
1639     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1640 wakaba 1.11 ## XML5: "Tag attribute value before state".
1641    
1642 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1643    
1644     ## Stay in the state
1645    
1646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1647     $self->{line_prev} = $self->{line};
1648     $self->{column_prev} = $self->{column};
1649     $self->{column}++;
1650     $self->{nc}
1651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1652     } else {
1653     $self->{set_nc}->($self);
1654     }
1655    
1656     redo A;
1657     } elsif ($self->{nc} == 0x0022) { # "
1658    
1659     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1660    
1661     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1662     $self->{line_prev} = $self->{line};
1663     $self->{column_prev} = $self->{column};
1664     $self->{column}++;
1665     $self->{nc}
1666     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1667     } else {
1668     $self->{set_nc}->($self);
1669     }
1670    
1671     redo A;
1672     } elsif ($self->{nc} == 0x0026) { # &
1673    
1674     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1675     ## reconsume
1676     redo A;
1677     } elsif ($self->{nc} == 0x0027) { # '
1678    
1679     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1680    
1681     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1682     $self->{line_prev} = $self->{line};
1683     $self->{column_prev} = $self->{column};
1684     $self->{column}++;
1685     $self->{nc}
1686     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1687     } else {
1688     $self->{set_nc}->($self);
1689     }
1690    
1691     redo A;
1692     } elsif ($self->{nc} == 0x003E) { # >
1693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1694     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1695    
1696     $self->{last_stag_name} = $self->{ct}->{tag_name};
1697     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1698     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1699     if ($self->{ct}->{attributes}) {
1700    
1701     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1702     } else {
1703     ## NOTE: This state should never be reached.
1704    
1705     }
1706     } else {
1707     die "$0: $self->{ct}->{type}: Unknown token type";
1708     }
1709     $self->{state} = DATA_STATE;
1710 wakaba 1.5 $self->{s_kwd} = '';
1711 wakaba 1.1
1712     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1713     $self->{line_prev} = $self->{line};
1714     $self->{column_prev} = $self->{column};
1715     $self->{column}++;
1716     $self->{nc}
1717     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1718     } else {
1719     $self->{set_nc}->($self);
1720     }
1721    
1722    
1723     return ($self->{ct}); # start tag or end tag
1724    
1725     redo A;
1726     } elsif ($self->{nc} == -1) {
1727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1728     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1729    
1730     $self->{last_stag_name} = $self->{ct}->{tag_name};
1731     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1732     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1733     if ($self->{ct}->{attributes}) {
1734    
1735     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1736     } else {
1737     ## NOTE: This state should never be reached.
1738    
1739     }
1740     } else {
1741     die "$0: $self->{ct}->{type}: Unknown token type";
1742     }
1743     $self->{state} = DATA_STATE;
1744 wakaba 1.5 $self->{s_kwd} = '';
1745 wakaba 1.1 ## reconsume
1746    
1747     return ($self->{ct}); # start tag or end tag
1748    
1749     redo A;
1750     } else {
1751 wakaba 1.26 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1752 wakaba 1.1
1753 wakaba 1.11 ## XML5: Not a parse error.
1754 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1755 wakaba 1.11 } elsif ($self->{is_xml}) {
1756    
1757     ## XML5: No parse error.
1758     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1759 wakaba 1.1 } else {
1760    
1761     }
1762     $self->{ca}->{value} .= chr ($self->{nc});
1763     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1764    
1765     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1766     $self->{line_prev} = $self->{line};
1767     $self->{column_prev} = $self->{column};
1768     $self->{column}++;
1769     $self->{nc}
1770     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1771     } else {
1772     $self->{set_nc}->($self);
1773     }
1774    
1775     redo A;
1776     }
1777     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1778 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1779     ## ATTLIST attribute value double quoted state".
1780 wakaba 1.11
1781 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1782 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1783    
1784     ## XML5: "DOCTYPE ATTLIST name after state".
1785     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1786     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1787     } else {
1788    
1789     ## XML5: "Tag attribute name before state".
1790     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1791     }
1792 wakaba 1.1
1793     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1794     $self->{line_prev} = $self->{line};
1795     $self->{column_prev} = $self->{column};
1796     $self->{column}++;
1797     $self->{nc}
1798     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1799     } else {
1800     $self->{set_nc}->($self);
1801     }
1802    
1803     redo A;
1804     } elsif ($self->{nc} == 0x0026) { # &
1805    
1806 wakaba 1.11 ## XML5: Not defined yet.
1807    
1808 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1809     ## "entity in attribute value state". In this implementation, the
1810     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1811     ## implementation of the "consume a character reference" algorithm.
1812     $self->{prev_state} = $self->{state};
1813     $self->{entity_add} = 0x0022; # "
1814     $self->{state} = ENTITY_STATE;
1815    
1816     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1817     $self->{line_prev} = $self->{line};
1818     $self->{column_prev} = $self->{column};
1819     $self->{column}++;
1820     $self->{nc}
1821     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1822     } else {
1823     $self->{set_nc}->($self);
1824     }
1825    
1826     redo A;
1827 wakaba 1.25 } elsif ($self->{is_xml} and
1828     $is_space->{$self->{nc}}) {
1829    
1830     $self->{ca}->{value} .= ' ';
1831     ## Stay in the state.
1832    
1833     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1834     $self->{line_prev} = $self->{line};
1835     $self->{column_prev} = $self->{column};
1836     $self->{column}++;
1837     $self->{nc}
1838     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1839     } else {
1840     $self->{set_nc}->($self);
1841     }
1842    
1843     redo A;
1844 wakaba 1.1 } elsif ($self->{nc} == -1) {
1845     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1846     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1847    
1848     $self->{last_stag_name} = $self->{ct}->{tag_name};
1849 wakaba 1.15
1850     $self->{state} = DATA_STATE;
1851     $self->{s_kwd} = '';
1852     ## reconsume
1853     return ($self->{ct}); # start tag
1854     redo A;
1855 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1856     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1857     if ($self->{ct}->{attributes}) {
1858    
1859     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1860     } else {
1861     ## NOTE: This state should never be reached.
1862    
1863     }
1864 wakaba 1.15
1865     $self->{state} = DATA_STATE;
1866     $self->{s_kwd} = '';
1867     ## reconsume
1868     return ($self->{ct}); # end tag
1869     redo A;
1870     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1871     ## XML5: No parse error above; not defined yet.
1872     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1873     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1874     ## Reconsume.
1875     return ($self->{ct}); # ATTLIST
1876     redo A;
1877 wakaba 1.1 } else {
1878     die "$0: $self->{ct}->{type}: Unknown token type";
1879     }
1880     } else {
1881 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1882 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1883    
1884     ## XML5: Not a parse error.
1885     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1886     } else {
1887    
1888     }
1889 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1890     $self->{read_until}->($self->{ca}->{value},
1891 wakaba 1.25 qq["&<\x09\x0C\x20],
1892 wakaba 1.1 length $self->{ca}->{value});
1893    
1894     ## Stay in the state
1895    
1896     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1897     $self->{line_prev} = $self->{line};
1898     $self->{column_prev} = $self->{column};
1899     $self->{column}++;
1900     $self->{nc}
1901     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1902     } else {
1903     $self->{set_nc}->($self);
1904     }
1905    
1906     redo A;
1907     }
1908     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1909 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1910     ## ATTLIST attribute value single quoted state".
1911 wakaba 1.11
1912 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1913 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1914    
1915     ## XML5: "DOCTYPE ATTLIST name after state".
1916     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1917     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1918     } else {
1919    
1920     ## XML5: "Before attribute name state" (sic).
1921     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1922     }
1923 wakaba 1.1
1924     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1925     $self->{line_prev} = $self->{line};
1926     $self->{column_prev} = $self->{column};
1927     $self->{column}++;
1928     $self->{nc}
1929     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1930     } else {
1931     $self->{set_nc}->($self);
1932     }
1933    
1934     redo A;
1935     } elsif ($self->{nc} == 0x0026) { # &
1936    
1937 wakaba 1.11 ## XML5: Not defined yet.
1938    
1939 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1940     ## "entity in attribute value state". In this implementation, the
1941     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1942     ## implementation of the "consume a character reference" algorithm.
1943     $self->{entity_add} = 0x0027; # '
1944     $self->{prev_state} = $self->{state};
1945     $self->{state} = ENTITY_STATE;
1946    
1947     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1948     $self->{line_prev} = $self->{line};
1949     $self->{column_prev} = $self->{column};
1950     $self->{column}++;
1951     $self->{nc}
1952     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1953     } else {
1954     $self->{set_nc}->($self);
1955     }
1956    
1957     redo A;
1958 wakaba 1.25 } elsif ($self->{is_xml} and
1959     $is_space->{$self->{nc}}) {
1960    
1961     $self->{ca}->{value} .= ' ';
1962     ## Stay in the state.
1963    
1964     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1965     $self->{line_prev} = $self->{line};
1966     $self->{column_prev} = $self->{column};
1967     $self->{column}++;
1968     $self->{nc}
1969     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1970     } else {
1971     $self->{set_nc}->($self);
1972     }
1973    
1974     redo A;
1975 wakaba 1.1 } elsif ($self->{nc} == -1) {
1976     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1977     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1978    
1979     $self->{last_stag_name} = $self->{ct}->{tag_name};
1980 wakaba 1.15
1981     $self->{state} = DATA_STATE;
1982     $self->{s_kwd} = '';
1983     ## reconsume
1984     return ($self->{ct}); # start tag
1985     redo A;
1986 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1987     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1988     if ($self->{ct}->{attributes}) {
1989    
1990     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1991     } else {
1992     ## NOTE: This state should never be reached.
1993    
1994     }
1995 wakaba 1.15
1996     $self->{state} = DATA_STATE;
1997     $self->{s_kwd} = '';
1998     ## reconsume
1999     return ($self->{ct}); # end tag
2000     redo A;
2001     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2002     ## XML5: No parse error above; not defined yet.
2003     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2004     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2005     ## Reconsume.
2006     return ($self->{ct}); # ATTLIST
2007     redo A;
2008 wakaba 1.1 } else {
2009     die "$0: $self->{ct}->{type}: Unknown token type";
2010     }
2011     } else {
2012 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
2013 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2014    
2015     ## XML5: Not a parse error.
2016     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2017     } else {
2018    
2019     }
2020 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
2021     $self->{read_until}->($self->{ca}->{value},
2022 wakaba 1.25 qq['&<\x09\x0C\x20],
2023 wakaba 1.1 length $self->{ca}->{value});
2024    
2025     ## Stay in the state
2026    
2027     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2028     $self->{line_prev} = $self->{line};
2029     $self->{column_prev} = $self->{column};
2030     $self->{column}++;
2031     $self->{nc}
2032     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2033     } else {
2034     $self->{set_nc}->($self);
2035     }
2036    
2037     redo A;
2038     }
2039     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2040 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
2041    
2042 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2043 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2044    
2045     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2046     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2047     } else {
2048    
2049     ## XML5: "Tag attribute name before state".
2050     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2051     }
2052 wakaba 1.1
2053     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2054     $self->{line_prev} = $self->{line};
2055     $self->{column_prev} = $self->{column};
2056     $self->{column}++;
2057     $self->{nc}
2058     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2059     } else {
2060     $self->{set_nc}->($self);
2061     }
2062    
2063     redo A;
2064     } elsif ($self->{nc} == 0x0026) { # &
2065    
2066 wakaba 1.11
2067     ## XML5: Not defined yet.
2068    
2069 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2070     ## "entity in attribute value state". In this implementation, the
2071     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2072     ## implementation of the "consume a character reference" algorithm.
2073     $self->{entity_add} = -1;
2074     $self->{prev_state} = $self->{state};
2075     $self->{state} = ENTITY_STATE;
2076    
2077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2078     $self->{line_prev} = $self->{line};
2079     $self->{column_prev} = $self->{column};
2080     $self->{column}++;
2081     $self->{nc}
2082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2083     } else {
2084     $self->{set_nc}->($self);
2085     }
2086    
2087     redo A;
2088     } elsif ($self->{nc} == 0x003E) { # >
2089     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2090    
2091     $self->{last_stag_name} = $self->{ct}->{tag_name};
2092 wakaba 1.15
2093     $self->{state} = DATA_STATE;
2094     $self->{s_kwd} = '';
2095    
2096     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2097     $self->{line_prev} = $self->{line};
2098     $self->{column_prev} = $self->{column};
2099     $self->{column}++;
2100     $self->{nc}
2101     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2102     } else {
2103     $self->{set_nc}->($self);
2104     }
2105    
2106     return ($self->{ct}); # start tag
2107     redo A;
2108 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2109     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2110     if ($self->{ct}->{attributes}) {
2111    
2112     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2113     } else {
2114     ## NOTE: This state should never be reached.
2115    
2116     }
2117 wakaba 1.15
2118     $self->{state} = DATA_STATE;
2119     $self->{s_kwd} = '';
2120    
2121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2122     $self->{line_prev} = $self->{line};
2123     $self->{column_prev} = $self->{column};
2124     $self->{column}++;
2125     $self->{nc}
2126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2127     } else {
2128     $self->{set_nc}->($self);
2129     }
2130    
2131     return ($self->{ct}); # end tag
2132     redo A;
2133     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2134     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2135     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2136    
2137 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2138     $self->{line_prev} = $self->{line};
2139     $self->{column_prev} = $self->{column};
2140     $self->{column}++;
2141     $self->{nc}
2142     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2143     } else {
2144     $self->{set_nc}->($self);
2145     }
2146    
2147 wakaba 1.15 return ($self->{ct}); # ATTLIST
2148     redo A;
2149     } else {
2150     die "$0: $self->{ct}->{type}: Unknown token type";
2151     }
2152 wakaba 1.1 } elsif ($self->{nc} == -1) {
2153     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2154    
2155 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2156 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2157 wakaba 1.15
2158     $self->{state} = DATA_STATE;
2159     $self->{s_kwd} = '';
2160     ## reconsume
2161     return ($self->{ct}); # start tag
2162     redo A;
2163 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2164 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2165 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2166     if ($self->{ct}->{attributes}) {
2167    
2168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2169     } else {
2170     ## NOTE: This state should never be reached.
2171    
2172     }
2173 wakaba 1.15
2174     $self->{state} = DATA_STATE;
2175     $self->{s_kwd} = '';
2176     ## reconsume
2177     return ($self->{ct}); # end tag
2178     redo A;
2179     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2180     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2181     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2182     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2183     ## Reconsume.
2184     return ($self->{ct}); # ATTLIST
2185     redo A;
2186 wakaba 1.1 } else {
2187     die "$0: $self->{ct}->{type}: Unknown token type";
2188     }
2189     } else {
2190     if ({
2191     0x0022 => 1, # "
2192     0x0027 => 1, # '
2193     0x003D => 1, # =
2194 wakaba 1.26 0x003C => 1, # <
2195 wakaba 1.1 }->{$self->{nc}}) {
2196    
2197 wakaba 1.11 ## XML5: Not a parse error.
2198 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2199     } else {
2200    
2201     }
2202     $self->{ca}->{value} .= chr ($self->{nc});
2203     $self->{read_until}->($self->{ca}->{value},
2204 wakaba 1.25 qq["'=& \x09\x0C>],
2205 wakaba 1.1 length $self->{ca}->{value});
2206    
2207     ## Stay in the state
2208    
2209     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2210     $self->{line_prev} = $self->{line};
2211     $self->{column_prev} = $self->{column};
2212     $self->{column}++;
2213     $self->{nc}
2214     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2215     } else {
2216     $self->{set_nc}->($self);
2217     }
2218    
2219     redo A;
2220     }
2221     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2222     if ($is_space->{$self->{nc}}) {
2223    
2224     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2225    
2226     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2227     $self->{line_prev} = $self->{line};
2228     $self->{column_prev} = $self->{column};
2229     $self->{column}++;
2230     $self->{nc}
2231     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2232     } else {
2233     $self->{set_nc}->($self);
2234     }
2235    
2236     redo A;
2237     } elsif ($self->{nc} == 0x003E) { # >
2238     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2239    
2240     $self->{last_stag_name} = $self->{ct}->{tag_name};
2241     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2242     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2243     if ($self->{ct}->{attributes}) {
2244    
2245     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2246     } else {
2247     ## NOTE: This state should never be reached.
2248    
2249     }
2250     } else {
2251     die "$0: $self->{ct}->{type}: Unknown token type";
2252     }
2253     $self->{state} = DATA_STATE;
2254 wakaba 1.5 $self->{s_kwd} = '';
2255 wakaba 1.1
2256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2257     $self->{line_prev} = $self->{line};
2258     $self->{column_prev} = $self->{column};
2259     $self->{column}++;
2260     $self->{nc}
2261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2262     } else {
2263     $self->{set_nc}->($self);
2264     }
2265    
2266    
2267     return ($self->{ct}); # start tag or end tag
2268    
2269     redo A;
2270     } elsif ($self->{nc} == 0x002F) { # /
2271    
2272     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2273    
2274     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2275     $self->{line_prev} = $self->{line};
2276     $self->{column_prev} = $self->{column};
2277     $self->{column}++;
2278     $self->{nc}
2279     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2280     } else {
2281     $self->{set_nc}->($self);
2282     }
2283    
2284     redo A;
2285     } elsif ($self->{nc} == -1) {
2286     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2287     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2288    
2289     $self->{last_stag_name} = $self->{ct}->{tag_name};
2290     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2291     if ($self->{ct}->{attributes}) {
2292    
2293     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2294     } else {
2295     ## NOTE: This state should never be reached.
2296    
2297     }
2298     } else {
2299     die "$0: $self->{ct}->{type}: Unknown token type";
2300     }
2301     $self->{state} = DATA_STATE;
2302 wakaba 1.5 $self->{s_kwd} = '';
2303 wakaba 1.1 ## Reconsume.
2304     return ($self->{ct}); # start tag or end tag
2305     redo A;
2306     } else {
2307    
2308     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2309     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2310     ## reconsume
2311     redo A;
2312     }
2313     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2314 wakaba 1.11 ## XML5: "Empty tag state".
2315    
2316 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2317     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2318    
2319     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2320     ## TODO: Different type than slash in start tag
2321     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2322     if ($self->{ct}->{attributes}) {
2323    
2324     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2325     } else {
2326    
2327     }
2328     ## TODO: Test |<title></title/>|
2329     } else {
2330    
2331     $self->{self_closing} = 1;
2332     }
2333    
2334     $self->{state} = DATA_STATE;
2335 wakaba 1.5 $self->{s_kwd} = '';
2336 wakaba 1.1
2337     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2338     $self->{line_prev} = $self->{line};
2339     $self->{column_prev} = $self->{column};
2340     $self->{column}++;
2341     $self->{nc}
2342     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2343     } else {
2344     $self->{set_nc}->($self);
2345     }
2346    
2347    
2348     return ($self->{ct}); # start tag or end tag
2349    
2350     redo A;
2351     } elsif ($self->{nc} == -1) {
2352     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2353     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2354    
2355     $self->{last_stag_name} = $self->{ct}->{tag_name};
2356     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2357     if ($self->{ct}->{attributes}) {
2358    
2359     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2360     } else {
2361     ## NOTE: This state should never be reached.
2362    
2363     }
2364     } else {
2365     die "$0: $self->{ct}->{type}: Unknown token type";
2366     }
2367 wakaba 1.11 ## XML5: "Tag attribute name before state".
2368 wakaba 1.1 $self->{state} = DATA_STATE;
2369 wakaba 1.5 $self->{s_kwd} = '';
2370 wakaba 1.1 ## Reconsume.
2371     return ($self->{ct}); # start tag or end tag
2372     redo A;
2373     } else {
2374    
2375     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2376     ## TODO: This error type is wrong.
2377     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2378     ## Reconsume.
2379     redo A;
2380     }
2381     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2382 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2383    
2384 wakaba 1.1 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2385     ## consumes characters on a one-by-one basis.
2386    
2387     if ($self->{nc} == 0x003E) { # >
2388 wakaba 1.13 if ($self->{in_subset}) {
2389    
2390     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2391     } else {
2392    
2393     $self->{state} = DATA_STATE;
2394     $self->{s_kwd} = '';
2395     }
2396 wakaba 1.1
2397     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2398     $self->{line_prev} = $self->{line};
2399     $self->{column_prev} = $self->{column};
2400     $self->{column}++;
2401     $self->{nc}
2402     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2403     } else {
2404     $self->{set_nc}->($self);
2405     }
2406    
2407    
2408     return ($self->{ct}); # comment
2409     redo A;
2410     } elsif ($self->{nc} == -1) {
2411 wakaba 1.13 if ($self->{in_subset}) {
2412    
2413     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2414     } else {
2415    
2416     $self->{state} = DATA_STATE;
2417     $self->{s_kwd} = '';
2418     }
2419 wakaba 1.1 ## reconsume
2420    
2421     return ($self->{ct}); # comment
2422     redo A;
2423     } else {
2424    
2425     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2426     $self->{read_until}->($self->{ct}->{data},
2427     q[>],
2428     length $self->{ct}->{data});
2429    
2430     ## Stay in the state.
2431    
2432     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2433     $self->{line_prev} = $self->{line};
2434     $self->{column_prev} = $self->{column};
2435     $self->{column}++;
2436     $self->{nc}
2437     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2438     } else {
2439     $self->{set_nc}->($self);
2440     }
2441    
2442     redo A;
2443     }
2444     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2445 wakaba 1.14 ## XML5: "Markup declaration state".
2446 wakaba 1.1
2447     if ($self->{nc} == 0x002D) { # -
2448    
2449     $self->{state} = MD_HYPHEN_STATE;
2450    
2451     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2452     $self->{line_prev} = $self->{line};
2453     $self->{column_prev} = $self->{column};
2454     $self->{column}++;
2455     $self->{nc}
2456     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2457     } else {
2458     $self->{set_nc}->($self);
2459     }
2460    
2461     redo A;
2462     } elsif ($self->{nc} == 0x0044 or # D
2463     $self->{nc} == 0x0064) { # d
2464     ## ASCII case-insensitive.
2465    
2466     $self->{state} = MD_DOCTYPE_STATE;
2467 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2468 wakaba 1.1
2469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2470     $self->{line_prev} = $self->{line};
2471     $self->{column_prev} = $self->{column};
2472     $self->{column}++;
2473     $self->{nc}
2474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2475     } else {
2476     $self->{set_nc}->($self);
2477     }
2478    
2479     redo A;
2480 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2481     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2482     $self->{is_xml}) and
2483 wakaba 1.1 $self->{nc} == 0x005B) { # [
2484    
2485     $self->{state} = MD_CDATA_STATE;
2486 wakaba 1.12 $self->{kwd} = '[';
2487 wakaba 1.1
2488     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2489     $self->{line_prev} = $self->{line};
2490     $self->{column_prev} = $self->{column};
2491     $self->{column}++;
2492     $self->{nc}
2493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2494     } else {
2495     $self->{set_nc}->($self);
2496     }
2497    
2498     redo A;
2499     } else {
2500    
2501     }
2502    
2503     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2504     line => $self->{line_prev},
2505     column => $self->{column_prev} - 1);
2506     ## Reconsume.
2507     $self->{state} = BOGUS_COMMENT_STATE;
2508     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2509     line => $self->{line_prev},
2510     column => $self->{column_prev} - 1,
2511     };
2512     redo A;
2513     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2514     if ($self->{nc} == 0x002D) { # -
2515    
2516     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2517     line => $self->{line_prev},
2518     column => $self->{column_prev} - 2,
2519     };
2520 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2521 wakaba 1.1
2522     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2523     $self->{line_prev} = $self->{line};
2524     $self->{column_prev} = $self->{column};
2525     $self->{column}++;
2526     $self->{nc}
2527     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2528     } else {
2529     $self->{set_nc}->($self);
2530     }
2531    
2532     redo A;
2533     } else {
2534    
2535     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2536     line => $self->{line_prev},
2537     column => $self->{column_prev} - 2);
2538     $self->{state} = BOGUS_COMMENT_STATE;
2539     ## Reconsume.
2540     $self->{ct} = {type => COMMENT_TOKEN,
2541     data => '-',
2542     line => $self->{line_prev},
2543     column => $self->{column_prev} - 2,
2544     };
2545     redo A;
2546     }
2547     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2548     ## ASCII case-insensitive.
2549     if ($self->{nc} == [
2550     undef,
2551     0x004F, # O
2552     0x0043, # C
2553     0x0054, # T
2554     0x0059, # Y
2555     0x0050, # P
2556 wakaba 1.12 ]->[length $self->{kwd}] or
2557 wakaba 1.1 $self->{nc} == [
2558     undef,
2559     0x006F, # o
2560     0x0063, # c
2561     0x0074, # t
2562     0x0079, # y
2563     0x0070, # p
2564 wakaba 1.12 ]->[length $self->{kwd}]) {
2565 wakaba 1.1
2566     ## Stay in the state.
2567 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2568 wakaba 1.1
2569     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2570     $self->{line_prev} = $self->{line};
2571     $self->{column_prev} = $self->{column};
2572     $self->{column}++;
2573     $self->{nc}
2574     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2575     } else {
2576     $self->{set_nc}->($self);
2577     }
2578    
2579     redo A;
2580 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2581 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2582     $self->{nc} == 0x0065)) { # e
2583 wakaba 1.12 if ($self->{is_xml} and
2584     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2585 wakaba 1.10
2586     ## XML5: case-sensitive.
2587     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2588     text => 'DOCTYPE',
2589     line => $self->{line_prev},
2590     column => $self->{column_prev} - 5);
2591     } else {
2592    
2593     }
2594 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2595     $self->{ct} = {type => DOCTYPE_TOKEN,
2596     quirks => 1,
2597     line => $self->{line_prev},
2598     column => $self->{column_prev} - 7,
2599     };
2600    
2601     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2602     $self->{line_prev} = $self->{line};
2603     $self->{column_prev} = $self->{column};
2604     $self->{column}++;
2605     $self->{nc}
2606     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2607     } else {
2608     $self->{set_nc}->($self);
2609     }
2610    
2611     redo A;
2612     } else {
2613    
2614     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2615     line => $self->{line_prev},
2616 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2617 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2618     ## Reconsume.
2619     $self->{ct} = {type => COMMENT_TOKEN,
2620 wakaba 1.12 data => $self->{kwd},
2621 wakaba 1.1 line => $self->{line_prev},
2622 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2623 wakaba 1.1 };
2624     redo A;
2625     }
2626     } elsif ($self->{state} == MD_CDATA_STATE) {
2627     if ($self->{nc} == {
2628     '[' => 0x0043, # C
2629     '[C' => 0x0044, # D
2630     '[CD' => 0x0041, # A
2631     '[CDA' => 0x0054, # T
2632     '[CDAT' => 0x0041, # A
2633 wakaba 1.12 }->{$self->{kwd}}) {
2634 wakaba 1.1
2635     ## Stay in the state.
2636 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2637 wakaba 1.1
2638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2639     $self->{line_prev} = $self->{line};
2640     $self->{column_prev} = $self->{column};
2641     $self->{column}++;
2642     $self->{nc}
2643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2644     } else {
2645     $self->{set_nc}->($self);
2646     }
2647    
2648     redo A;
2649 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2650 wakaba 1.1 $self->{nc} == 0x005B) { # [
2651 wakaba 1.6 if ($self->{is_xml} and
2652     not $self->{tainted} and
2653     @{$self->{open_elements} or []} == 0) {
2654 wakaba 1.8
2655 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2656     line => $self->{line_prev},
2657     column => $self->{column_prev} - 7);
2658     $self->{tainted} = 1;
2659 wakaba 1.8 } else {
2660    
2661 wakaba 1.6 }
2662    
2663 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2664     data => '',
2665     line => $self->{line_prev},
2666     column => $self->{column_prev} - 7};
2667     $self->{state} = CDATA_SECTION_STATE;
2668    
2669     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2670     $self->{line_prev} = $self->{line};
2671     $self->{column_prev} = $self->{column};
2672     $self->{column}++;
2673     $self->{nc}
2674     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2675     } else {
2676     $self->{set_nc}->($self);
2677     }
2678    
2679     redo A;
2680     } else {
2681    
2682     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2683     line => $self->{line_prev},
2684 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2685 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2686     ## Reconsume.
2687     $self->{ct} = {type => COMMENT_TOKEN,
2688 wakaba 1.12 data => $self->{kwd},
2689 wakaba 1.1 line => $self->{line_prev},
2690 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2691 wakaba 1.1 };
2692     redo A;
2693     }
2694     } elsif ($self->{state} == COMMENT_START_STATE) {
2695     if ($self->{nc} == 0x002D) { # -
2696    
2697     $self->{state} = COMMENT_START_DASH_STATE;
2698    
2699     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2700     $self->{line_prev} = $self->{line};
2701     $self->{column_prev} = $self->{column};
2702     $self->{column}++;
2703     $self->{nc}
2704     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2705     } else {
2706     $self->{set_nc}->($self);
2707     }
2708    
2709     redo A;
2710     } elsif ($self->{nc} == 0x003E) { # >
2711     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2712 wakaba 1.13 if ($self->{in_subset}) {
2713    
2714     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2715     } else {
2716    
2717     $self->{state} = DATA_STATE;
2718     $self->{s_kwd} = '';
2719     }
2720 wakaba 1.1
2721     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2722     $self->{line_prev} = $self->{line};
2723     $self->{column_prev} = $self->{column};
2724     $self->{column}++;
2725     $self->{nc}
2726     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2727     } else {
2728     $self->{set_nc}->($self);
2729     }
2730    
2731    
2732     return ($self->{ct}); # comment
2733    
2734     redo A;
2735     } elsif ($self->{nc} == -1) {
2736     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2737 wakaba 1.13 if ($self->{in_subset}) {
2738    
2739     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2740     } else {
2741    
2742     $self->{state} = DATA_STATE;
2743     $self->{s_kwd} = '';
2744     }
2745 wakaba 1.1 ## reconsume
2746    
2747     return ($self->{ct}); # comment
2748    
2749     redo A;
2750     } else {
2751    
2752     $self->{ct}->{data} # comment
2753     .= chr ($self->{nc});
2754     $self->{state} = COMMENT_STATE;
2755    
2756     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2757     $self->{line_prev} = $self->{line};
2758     $self->{column_prev} = $self->{column};
2759     $self->{column}++;
2760     $self->{nc}
2761     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2762     } else {
2763     $self->{set_nc}->($self);
2764     }
2765    
2766     redo A;
2767     }
2768     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2769     if ($self->{nc} == 0x002D) { # -
2770    
2771     $self->{state} = COMMENT_END_STATE;
2772    
2773     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2774     $self->{line_prev} = $self->{line};
2775     $self->{column_prev} = $self->{column};
2776     $self->{column}++;
2777     $self->{nc}
2778     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2779     } else {
2780     $self->{set_nc}->($self);
2781     }
2782    
2783     redo A;
2784     } elsif ($self->{nc} == 0x003E) { # >
2785     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2786 wakaba 1.13 if ($self->{in_subset}) {
2787    
2788     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2789     } else {
2790    
2791     $self->{state} = DATA_STATE;
2792     $self->{s_kwd} = '';
2793     }
2794 wakaba 1.1
2795     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2796     $self->{line_prev} = $self->{line};
2797     $self->{column_prev} = $self->{column};
2798     $self->{column}++;
2799     $self->{nc}
2800     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2801     } else {
2802     $self->{set_nc}->($self);
2803     }
2804    
2805    
2806     return ($self->{ct}); # comment
2807    
2808     redo A;
2809     } elsif ($self->{nc} == -1) {
2810     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2811 wakaba 1.13 if ($self->{in_subset}) {
2812    
2813     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2814     } else {
2815    
2816     $self->{state} = DATA_STATE;
2817     $self->{s_kwd} = '';
2818     }
2819 wakaba 1.1 ## reconsume
2820    
2821     return ($self->{ct}); # comment
2822    
2823     redo A;
2824     } else {
2825    
2826     $self->{ct}->{data} # comment
2827     .= '-' . chr ($self->{nc});
2828     $self->{state} = COMMENT_STATE;
2829    
2830     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2831     $self->{line_prev} = $self->{line};
2832     $self->{column_prev} = $self->{column};
2833     $self->{column}++;
2834     $self->{nc}
2835     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2836     } else {
2837     $self->{set_nc}->($self);
2838     }
2839    
2840     redo A;
2841     }
2842     } elsif ($self->{state} == COMMENT_STATE) {
2843 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2844    
2845 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2846    
2847     $self->{state} = COMMENT_END_DASH_STATE;
2848    
2849     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2850     $self->{line_prev} = $self->{line};
2851     $self->{column_prev} = $self->{column};
2852     $self->{column}++;
2853     $self->{nc}
2854     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2855     } else {
2856     $self->{set_nc}->($self);
2857     }
2858    
2859     redo A;
2860     } elsif ($self->{nc} == -1) {
2861     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2862 wakaba 1.13 if ($self->{in_subset}) {
2863    
2864     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2865     } else {
2866    
2867     $self->{state} = DATA_STATE;
2868     $self->{s_kwd} = '';
2869     }
2870 wakaba 1.1 ## reconsume
2871    
2872     return ($self->{ct}); # comment
2873    
2874     redo A;
2875     } else {
2876    
2877     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2878     $self->{read_until}->($self->{ct}->{data},
2879     q[-],
2880     length $self->{ct}->{data});
2881    
2882     ## Stay in the state
2883    
2884     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2885     $self->{line_prev} = $self->{line};
2886     $self->{column_prev} = $self->{column};
2887     $self->{column}++;
2888     $self->{nc}
2889     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2890     } else {
2891     $self->{set_nc}->($self);
2892     }
2893    
2894     redo A;
2895     }
2896     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2897 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2898 wakaba 1.10
2899 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2900    
2901     $self->{state} = COMMENT_END_STATE;
2902    
2903     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2904     $self->{line_prev} = $self->{line};
2905     $self->{column_prev} = $self->{column};
2906     $self->{column}++;
2907     $self->{nc}
2908     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2909     } else {
2910     $self->{set_nc}->($self);
2911     }
2912    
2913     redo A;
2914     } elsif ($self->{nc} == -1) {
2915     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2916 wakaba 1.13 if ($self->{in_subset}) {
2917    
2918     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2919     } else {
2920    
2921     $self->{state} = DATA_STATE;
2922     $self->{s_kwd} = '';
2923     }
2924 wakaba 1.1 ## reconsume
2925    
2926     return ($self->{ct}); # comment
2927    
2928     redo A;
2929     } else {
2930    
2931     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2932     $self->{state} = COMMENT_STATE;
2933    
2934     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2935     $self->{line_prev} = $self->{line};
2936     $self->{column_prev} = $self->{column};
2937     $self->{column}++;
2938     $self->{nc}
2939     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2940     } else {
2941     $self->{set_nc}->($self);
2942     }
2943    
2944     redo A;
2945     }
2946 wakaba 1.31 } elsif ($self->{state} == COMMENT_END_STATE or
2947     $self->{state} == COMMENT_END_BANG_STATE) {
2948 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2949 wakaba 1.31 ## (No comment end bang state.)
2950 wakaba 1.14
2951 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2952 wakaba 1.13 if ($self->{in_subset}) {
2953    
2954     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955     } else {
2956    
2957     $self->{state} = DATA_STATE;
2958     $self->{s_kwd} = '';
2959     }
2960 wakaba 1.1
2961     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2962     $self->{line_prev} = $self->{line};
2963     $self->{column_prev} = $self->{column};
2964     $self->{column}++;
2965     $self->{nc}
2966     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2967     } else {
2968     $self->{set_nc}->($self);
2969     }
2970    
2971    
2972     return ($self->{ct}); # comment
2973    
2974     redo A;
2975     } elsif ($self->{nc} == 0x002D) { # -
2976 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
2977    
2978     $self->{ct}->{data} .= '--!'; # comment
2979     $self->{state} = COMMENT_END_DASH_STATE;
2980     } else {
2981    
2982     ## XML5: Not a parse error.
2983     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2984     line => $self->{line_prev},
2985     column => $self->{column_prev});
2986     $self->{ct}->{data} .= '-'; # comment
2987     ## Stay in the state
2988     }
2989 wakaba 1.1
2990 wakaba 1.31 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2991     $self->{line_prev} = $self->{line};
2992     $self->{column_prev} = $self->{column};
2993     $self->{column}++;
2994     $self->{nc}
2995     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2996     } else {
2997     $self->{set_nc}->($self);
2998     }
2999    
3000     redo A;
3001     } elsif ($self->{nc} == 0x0021 and # !
3002     $self->{state} != COMMENT_END_BANG_STATE) {
3003     $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3004     $self->{state} = COMMENT_END_BANG_STATE;
3005 wakaba 1.1
3006     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3007     $self->{line_prev} = $self->{line};
3008     $self->{column_prev} = $self->{column};
3009     $self->{column}++;
3010     $self->{nc}
3011     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3012     } else {
3013     $self->{set_nc}->($self);
3014     }
3015    
3016     redo A;
3017     } elsif ($self->{nc} == -1) {
3018     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3019 wakaba 1.13 if ($self->{in_subset}) {
3020    
3021     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3022     } else {
3023    
3024     $self->{state} = DATA_STATE;
3025     $self->{s_kwd} = '';
3026     }
3027 wakaba 1.31 ## Reconsume.
3028 wakaba 1.1
3029     return ($self->{ct}); # comment
3030    
3031     redo A;
3032     } else {
3033    
3034 wakaba 1.31 if ($self->{state} == COMMENT_END_BANG_STATE) {
3035     $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3036     } else {
3037     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3038     }
3039 wakaba 1.1 $self->{state} = COMMENT_STATE;
3040    
3041     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3042     $self->{line_prev} = $self->{line};
3043     $self->{column_prev} = $self->{column};
3044     $self->{column}++;
3045     $self->{nc}
3046     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3047     } else {
3048     $self->{set_nc}->($self);
3049     }
3050    
3051     redo A;
3052     }
3053     } elsif ($self->{state} == DOCTYPE_STATE) {
3054     if ($is_space->{$self->{nc}}) {
3055    
3056     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3057    
3058     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3059     $self->{line_prev} = $self->{line};
3060     $self->{column_prev} = $self->{column};
3061     $self->{column}++;
3062     $self->{nc}
3063     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3064     } else {
3065     $self->{set_nc}->($self);
3066     }
3067    
3068     redo A;
3069 wakaba 1.28 } elsif ($self->{nc} == -1) {
3070    
3071     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3072     $self->{ct}->{quirks} = 1;
3073    
3074     $self->{state} = DATA_STATE;
3075     ## Reconsume.
3076     return ($self->{ct}); # DOCTYPE (quirks)
3077    
3078     redo A;
3079 wakaba 1.1 } else {
3080    
3081 wakaba 1.28 ## XML5: Switch to the bogus comment state.
3082 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3083     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3084     ## reconsume
3085     redo A;
3086     }
3087     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3088 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3089    
3090 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3091    
3092     ## Stay in the state
3093    
3094     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3095     $self->{line_prev} = $self->{line};
3096     $self->{column_prev} = $self->{column};
3097     $self->{column}++;
3098     $self->{nc}
3099     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3100     } else {
3101     $self->{set_nc}->($self);
3102     }
3103    
3104     redo A;
3105     } elsif ($self->{nc} == 0x003E) { # >
3106    
3107 wakaba 1.12 ## XML5: No parse error.
3108 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3109     $self->{state} = DATA_STATE;
3110 wakaba 1.5 $self->{s_kwd} = '';
3111 wakaba 1.1
3112     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3113     $self->{line_prev} = $self->{line};
3114     $self->{column_prev} = $self->{column};
3115     $self->{column}++;
3116     $self->{nc}
3117     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3118     } else {
3119     $self->{set_nc}->($self);
3120     }
3121    
3122    
3123     return ($self->{ct}); # DOCTYPE (quirks)
3124    
3125     redo A;
3126 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3127    
3128     $self->{ct}->{name} # DOCTYPE
3129     = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3130     delete $self->{ct}->{quirks};
3131     $self->{state} = DOCTYPE_NAME_STATE;
3132    
3133     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3134     $self->{line_prev} = $self->{line};
3135     $self->{column_prev} = $self->{column};
3136     $self->{column}++;
3137     $self->{nc}
3138     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3139     } else {
3140     $self->{set_nc}->($self);
3141     }
3142    
3143     redo A;
3144 wakaba 1.1 } elsif ($self->{nc} == -1) {
3145    
3146     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3147     $self->{state} = DATA_STATE;
3148 wakaba 1.5 $self->{s_kwd} = '';
3149 wakaba 1.1 ## reconsume
3150    
3151     return ($self->{ct}); # DOCTYPE (quirks)
3152    
3153     redo A;
3154 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3155    
3156     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3157     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3158 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3159     $self->{in_subset} = 1;
3160 wakaba 1.12
3161     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3162     $self->{line_prev} = $self->{line};
3163     $self->{column_prev} = $self->{column};
3164     $self->{column}++;
3165     $self->{nc}
3166     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3167     } else {
3168     $self->{set_nc}->($self);
3169     }
3170    
3171 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3172 wakaba 1.12 redo A;
3173 wakaba 1.1 } else {
3174    
3175     $self->{ct}->{name} = chr $self->{nc};
3176     delete $self->{ct}->{quirks};
3177     $self->{state} = DOCTYPE_NAME_STATE;
3178    
3179     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3180     $self->{line_prev} = $self->{line};
3181     $self->{column_prev} = $self->{column};
3182     $self->{column}++;
3183     $self->{nc}
3184     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3185     } else {
3186     $self->{set_nc}->($self);
3187     }
3188    
3189     redo A;
3190     }
3191     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3192 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3193    
3194     ## ISSUE: Redundant "First," in the spec.
3195    
3196 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3197    
3198     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3199    
3200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3201     $self->{line_prev} = $self->{line};
3202     $self->{column_prev} = $self->{column};
3203     $self->{column}++;
3204     $self->{nc}
3205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3206     } else {
3207     $self->{set_nc}->($self);
3208     }
3209    
3210     redo A;
3211     } elsif ($self->{nc} == 0x003E) { # >
3212    
3213     $self->{state} = DATA_STATE;
3214 wakaba 1.5 $self->{s_kwd} = '';
3215 wakaba 1.1
3216     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3217     $self->{line_prev} = $self->{line};
3218     $self->{column_prev} = $self->{column};
3219     $self->{column}++;
3220     $self->{nc}
3221     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3222     } else {
3223     $self->{set_nc}->($self);
3224     }
3225    
3226    
3227     return ($self->{ct}); # DOCTYPE
3228    
3229     redo A;
3230 wakaba 1.29 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3231    
3232     $self->{ct}->{name} # DOCTYPE
3233     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3234     delete $self->{ct}->{quirks};
3235     ## Stay in the state.
3236    
3237     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238     $self->{line_prev} = $self->{line};
3239     $self->{column_prev} = $self->{column};
3240     $self->{column}++;
3241     $self->{nc}
3242     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3243     } else {
3244     $self->{set_nc}->($self);
3245     }
3246    
3247     redo A;
3248 wakaba 1.1 } elsif ($self->{nc} == -1) {
3249    
3250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3251     $self->{state} = DATA_STATE;
3252 wakaba 1.5 $self->{s_kwd} = '';
3253 wakaba 1.1 ## reconsume
3254    
3255     $self->{ct}->{quirks} = 1;
3256     return ($self->{ct}); # DOCTYPE
3257    
3258     redo A;
3259 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3260    
3261     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3262 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3263     $self->{in_subset} = 1;
3264 wakaba 1.12
3265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3266     $self->{line_prev} = $self->{line};
3267     $self->{column_prev} = $self->{column};
3268     $self->{column}++;
3269     $self->{nc}
3270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3271     } else {
3272     $self->{set_nc}->($self);
3273     }
3274    
3275 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3276 wakaba 1.12 redo A;
3277 wakaba 1.1 } else {
3278    
3279 wakaba 1.29 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3280     ## Stay in the state.
3281 wakaba 1.1
3282     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283     $self->{line_prev} = $self->{line};
3284     $self->{column_prev} = $self->{column};
3285     $self->{column}++;
3286     $self->{nc}
3287     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288     } else {
3289     $self->{set_nc}->($self);
3290     }
3291    
3292     redo A;
3293     }
3294     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3295 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3296     ## state", but implemented differently.
3297    
3298 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3299    
3300     ## Stay in the state
3301    
3302     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3303     $self->{line_prev} = $self->{line};
3304     $self->{column_prev} = $self->{column};
3305     $self->{column}++;
3306     $self->{nc}
3307     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3308     } else {
3309     $self->{set_nc}->($self);
3310     }
3311    
3312     redo A;
3313     } elsif ($self->{nc} == 0x003E) { # >
3314 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3315    
3316     $self->{state} = DATA_STATE;
3317     $self->{s_kwd} = '';
3318     } else {
3319    
3320     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3321     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3322     }
3323 wakaba 1.1
3324    
3325     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3326     $self->{line_prev} = $self->{line};
3327     $self->{column_prev} = $self->{column};
3328     $self->{column}++;
3329     $self->{nc}
3330     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3331     } else {
3332     $self->{set_nc}->($self);
3333     }
3334    
3335 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3336 wakaba 1.1 redo A;
3337     } elsif ($self->{nc} == -1) {
3338 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3339    
3340     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3341     $self->{state} = DATA_STATE;
3342     $self->{s_kwd} = '';
3343     $self->{ct}->{quirks} = 1;
3344     } else {
3345    
3346     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3347     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3348     }
3349 wakaba 1.1
3350 wakaba 1.16 ## Reconsume.
3351     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3352 wakaba 1.1 redo A;
3353     } elsif ($self->{nc} == 0x0050 or # P
3354     $self->{nc} == 0x0070) { # p
3355 wakaba 1.12
3356 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3357 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3358 wakaba 1.1
3359     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3360     $self->{line_prev} = $self->{line};
3361     $self->{column_prev} = $self->{column};
3362     $self->{column}++;
3363     $self->{nc}
3364     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3365     } else {
3366     $self->{set_nc}->($self);
3367     }
3368    
3369     redo A;
3370     } elsif ($self->{nc} == 0x0053 or # S
3371     $self->{nc} == 0x0073) { # s
3372 wakaba 1.12
3373 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3374 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3375    
3376     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3377     $self->{line_prev} = $self->{line};
3378     $self->{column_prev} = $self->{column};
3379     $self->{column}++;
3380     $self->{nc}
3381     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3382     } else {
3383     $self->{set_nc}->($self);
3384     }
3385    
3386     redo A;
3387 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3388     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3389     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3390    
3391     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3392     $self->{ct}->{value} = ''; # ENTITY
3393    
3394     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3395     $self->{line_prev} = $self->{line};
3396     $self->{column_prev} = $self->{column};
3397     $self->{column}++;
3398     $self->{nc}
3399     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3400     } else {
3401     $self->{set_nc}->($self);
3402     }
3403    
3404     redo A;
3405     } elsif ($self->{nc} == 0x0027 and # '
3406     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3407     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3408    
3409     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3410     $self->{ct}->{value} = ''; # ENTITY
3411    
3412     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3413     $self->{line_prev} = $self->{line};
3414     $self->{column_prev} = $self->{column};
3415     $self->{column}++;
3416     $self->{nc}
3417     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3418     } else {
3419     $self->{set_nc}->($self);
3420     }
3421    
3422     redo A;
3423 wakaba 1.16 } elsif ($self->{is_xml} and
3424     $self->{ct}->{type} == DOCTYPE_TOKEN and
3425     $self->{nc} == 0x005B) { # [
3426 wakaba 1.12
3427     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3428     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3429 wakaba 1.13 $self->{in_subset} = 1;
3430 wakaba 1.1
3431     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3432     $self->{line_prev} = $self->{line};
3433     $self->{column_prev} = $self->{column};
3434     $self->{column}++;
3435     $self->{nc}
3436     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3437     } else {
3438     $self->{set_nc}->($self);
3439     }
3440    
3441 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3442 wakaba 1.1 redo A;
3443     } else {
3444 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3445    
3446     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3447    
3448     $self->{ct}->{quirks} = 1;
3449     $self->{state} = BOGUS_DOCTYPE_STATE;
3450     } else {
3451    
3452     $self->{state} = BOGUS_MD_STATE;
3453     }
3454 wakaba 1.1
3455    
3456     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3457     $self->{line_prev} = $self->{line};
3458     $self->{column_prev} = $self->{column};
3459     $self->{column}++;
3460     $self->{nc}
3461     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3462     } else {
3463     $self->{set_nc}->($self);
3464     }
3465    
3466     redo A;
3467     }
3468     } elsif ($self->{state} == PUBLIC_STATE) {
3469     ## ASCII case-insensitive
3470     if ($self->{nc} == [
3471     undef,
3472     0x0055, # U
3473     0x0042, # B
3474     0x004C, # L
3475     0x0049, # I
3476 wakaba 1.12 ]->[length $self->{kwd}] or
3477 wakaba 1.1 $self->{nc} == [
3478     undef,
3479     0x0075, # u
3480     0x0062, # b
3481     0x006C, # l
3482     0x0069, # i
3483 wakaba 1.12 ]->[length $self->{kwd}]) {
3484 wakaba 1.1
3485     ## Stay in the state.
3486 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3487 wakaba 1.1
3488     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3489     $self->{line_prev} = $self->{line};
3490     $self->{column_prev} = $self->{column};
3491     $self->{column}++;
3492     $self->{nc}
3493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3494     } else {
3495     $self->{set_nc}->($self);
3496     }
3497    
3498     redo A;
3499 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3500 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3501     $self->{nc} == 0x0063)) { # c
3502 wakaba 1.12 if ($self->{is_xml} and
3503     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3504    
3505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3506     text => 'PUBLIC',
3507     line => $self->{line_prev},
3508     column => $self->{column_prev} - 4);
3509     } else {
3510    
3511     }
3512 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3513    
3514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3515     $self->{line_prev} = $self->{line};
3516     $self->{column_prev} = $self->{column};
3517     $self->{column}++;
3518     $self->{nc}
3519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3520     } else {
3521     $self->{set_nc}->($self);
3522     }
3523    
3524     redo A;
3525     } else {
3526 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3527 wakaba 1.1 line => $self->{line_prev},
3528 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3529 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3530    
3531     $self->{ct}->{quirks} = 1;
3532     $self->{state} = BOGUS_DOCTYPE_STATE;
3533     } else {
3534    
3535     $self->{state} = BOGUS_MD_STATE;
3536     }
3537 wakaba 1.1 ## Reconsume.
3538     redo A;
3539     }
3540     } elsif ($self->{state} == SYSTEM_STATE) {
3541     ## ASCII case-insensitive
3542     if ($self->{nc} == [
3543     undef,
3544     0x0059, # Y
3545     0x0053, # S
3546     0x0054, # T
3547     0x0045, # E
3548 wakaba 1.12 ]->[length $self->{kwd}] or
3549 wakaba 1.1 $self->{nc} == [
3550     undef,
3551     0x0079, # y
3552     0x0073, # s
3553     0x0074, # t
3554     0x0065, # e
3555 wakaba 1.12 ]->[length $self->{kwd}]) {
3556 wakaba 1.1
3557     ## Stay in the state.
3558 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3559 wakaba 1.1
3560     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3561     $self->{line_prev} = $self->{line};
3562     $self->{column_prev} = $self->{column};
3563     $self->{column}++;
3564     $self->{nc}
3565     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3566     } else {
3567     $self->{set_nc}->($self);
3568     }
3569    
3570     redo A;
3571 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3572 wakaba 1.1 ($self->{nc} == 0x004D or # M
3573     $self->{nc} == 0x006D)) { # m
3574 wakaba 1.12 if ($self->{is_xml} and
3575     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3576    
3577     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3578     text => 'SYSTEM',
3579     line => $self->{line_prev},
3580     column => $self->{column_prev} - 4);
3581     } else {
3582    
3583     }
3584 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3585    
3586     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3587     $self->{line_prev} = $self->{line};
3588     $self->{column_prev} = $self->{column};
3589     $self->{column}++;
3590     $self->{nc}
3591     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3592     } else {
3593     $self->{set_nc}->($self);
3594     }
3595    
3596     redo A;
3597     } else {
3598 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3599 wakaba 1.1 line => $self->{line_prev},
3600 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3601 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3602    
3603     $self->{ct}->{quirks} = 1;
3604     $self->{state} = BOGUS_DOCTYPE_STATE;
3605     } else {
3606    
3607     $self->{state} = BOGUS_MD_STATE;
3608     }
3609 wakaba 1.1 ## Reconsume.
3610     redo A;
3611     }
3612     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3613     if ($is_space->{$self->{nc}}) {
3614    
3615     ## Stay in the state
3616    
3617     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3618     $self->{line_prev} = $self->{line};
3619     $self->{column_prev} = $self->{column};
3620     $self->{column}++;
3621     $self->{nc}
3622     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3623     } else {
3624     $self->{set_nc}->($self);
3625     }
3626    
3627     redo A;
3628     } elsif ($self->{nc} eq 0x0022) { # "
3629    
3630     $self->{ct}->{pubid} = ''; # DOCTYPE
3631     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3632    
3633     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3634     $self->{line_prev} = $self->{line};
3635     $self->{column_prev} = $self->{column};
3636     $self->{column}++;
3637     $self->{nc}
3638     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3639     } else {
3640     $self->{set_nc}->($self);
3641     }
3642    
3643     redo A;
3644     } elsif ($self->{nc} eq 0x0027) { # '
3645    
3646     $self->{ct}->{pubid} = ''; # DOCTYPE
3647     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3648    
3649     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3650     $self->{line_prev} = $self->{line};
3651     $self->{column_prev} = $self->{column};
3652     $self->{column}++;
3653     $self->{nc}
3654     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3655     } else {
3656     $self->{set_nc}->($self);
3657     }
3658    
3659     redo A;
3660     } elsif ($self->{nc} eq 0x003E) { # >
3661 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3662    
3663     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3664    
3665     $self->{state} = DATA_STATE;
3666     $self->{s_kwd} = '';
3667     $self->{ct}->{quirks} = 1;
3668     } else {
3669    
3670     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3671     }
3672 wakaba 1.1
3673    
3674     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3675     $self->{line_prev} = $self->{line};
3676     $self->{column_prev} = $self->{column};
3677     $self->{column}++;
3678     $self->{nc}
3679     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3680     } else {
3681     $self->{set_nc}->($self);
3682     }
3683    
3684 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3685 wakaba 1.1 redo A;
3686     } elsif ($self->{nc} == -1) {
3687 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3688    
3689     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3690     $self->{state} = DATA_STATE;
3691     $self->{s_kwd} = '';
3692     $self->{ct}->{quirks} = 1;
3693     } else {
3694    
3695     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3696     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3697     }
3698 wakaba 1.1
3699     ## reconsume
3700     return ($self->{ct}); # DOCTYPE
3701     redo A;
3702 wakaba 1.16 } elsif ($self->{is_xml} and
3703     $self->{ct}->{type} == DOCTYPE_TOKEN and
3704     $self->{nc} == 0x005B) { # [
3705 wakaba 1.12
3706     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3707     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3708     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3709 wakaba 1.13 $self->{in_subset} = 1;
3710 wakaba 1.12
3711     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3712     $self->{line_prev} = $self->{line};
3713     $self->{column_prev} = $self->{column};
3714     $self->{column}++;
3715     $self->{nc}
3716     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3717     } else {
3718     $self->{set_nc}->($self);
3719     }
3720    
3721 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3722 wakaba 1.12 redo A;
3723 wakaba 1.1 } else {
3724     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3725    
3726 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3727    
3728     $self->{ct}->{quirks} = 1;
3729     $self->{state} = BOGUS_DOCTYPE_STATE;
3730     } else {
3731    
3732     $self->{state} = BOGUS_MD_STATE;
3733     }
3734    
3735 wakaba 1.1
3736     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3737     $self->{line_prev} = $self->{line};
3738     $self->{column_prev} = $self->{column};
3739     $self->{column}++;
3740     $self->{nc}
3741     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3742     } else {
3743     $self->{set_nc}->($self);
3744     }
3745    
3746     redo A;
3747     }
3748     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3749     if ($self->{nc} == 0x0022) { # "
3750    
3751     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3752    
3753     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3754     $self->{line_prev} = $self->{line};
3755     $self->{column_prev} = $self->{column};
3756     $self->{column}++;
3757     $self->{nc}
3758     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3759     } else {
3760     $self->{set_nc}->($self);
3761     }
3762    
3763     redo A;
3764     } elsif ($self->{nc} == 0x003E) { # >
3765     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3766    
3767 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3768    
3769     $self->{state} = DATA_STATE;
3770     $self->{s_kwd} = '';
3771     $self->{ct}->{quirks} = 1;
3772     } else {
3773    
3774     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3775     }
3776    
3777 wakaba 1.1
3778     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3779     $self->{line_prev} = $self->{line};
3780     $self->{column_prev} = $self->{column};
3781     $self->{column}++;
3782     $self->{nc}
3783     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3784     } else {
3785     $self->{set_nc}->($self);
3786     }
3787    
3788 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3789 wakaba 1.1 redo A;
3790     } elsif ($self->{nc} == -1) {
3791     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3792    
3793 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3794    
3795     $self->{state} = DATA_STATE;
3796     $self->{s_kwd} = '';
3797     $self->{ct}->{quirks} = 1;
3798     } else {
3799    
3800     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3801     }
3802    
3803     ## Reconsume.
3804 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3805     redo A;
3806     } else {
3807    
3808 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3809 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3810     length $self->{ct}->{pubid});
3811    
3812     ## Stay in the state
3813    
3814     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3815     $self->{line_prev} = $self->{line};
3816     $self->{column_prev} = $self->{column};
3817     $self->{column}++;
3818     $self->{nc}
3819     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3820     } else {
3821     $self->{set_nc}->($self);
3822     }
3823    
3824     redo A;
3825     }
3826     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3827     if ($self->{nc} == 0x0027) { # '
3828    
3829     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3830    
3831     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3832     $self->{line_prev} = $self->{line};
3833     $self->{column_prev} = $self->{column};
3834     $self->{column}++;
3835     $self->{nc}
3836     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3837     } else {
3838     $self->{set_nc}->($self);
3839     }
3840    
3841     redo A;
3842     } elsif ($self->{nc} == 0x003E) { # >
3843     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3844    
3845 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3846    
3847     $self->{state} = DATA_STATE;
3848     $self->{s_kwd} = '';
3849     $self->{ct}->{quirks} = 1;
3850     } else {
3851    
3852     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3853     }
3854    
3855 wakaba 1.1
3856     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3857     $self->{line_prev} = $self->{line};
3858     $self->{column_prev} = $self->{column};
3859     $self->{column}++;
3860     $self->{nc}
3861     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3862     } else {
3863     $self->{set_nc}->($self);
3864     }
3865    
3866 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3867 wakaba 1.1 redo A;
3868     } elsif ($self->{nc} == -1) {
3869     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3870    
3871 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3872    
3873     $self->{state} = DATA_STATE;
3874     $self->{s_kwd} = '';
3875     $self->{ct}->{quirks} = 1;
3876     } else {
3877    
3878     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3879     }
3880    
3881 wakaba 1.1 ## reconsume
3882 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3883 wakaba 1.1 redo A;
3884     } else {
3885    
3886 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3887 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3888     length $self->{ct}->{pubid});
3889    
3890     ## Stay in the state
3891    
3892     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3893     $self->{line_prev} = $self->{line};
3894     $self->{column_prev} = $self->{column};
3895     $self->{column}++;
3896     $self->{nc}
3897     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3898     } else {
3899     $self->{set_nc}->($self);
3900     }
3901    
3902     redo A;
3903     }
3904     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3905     if ($is_space->{$self->{nc}}) {
3906    
3907     ## Stay in the state
3908    
3909     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3910     $self->{line_prev} = $self->{line};
3911     $self->{column_prev} = $self->{column};
3912     $self->{column}++;
3913     $self->{nc}
3914     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3915     } else {
3916     $self->{set_nc}->($self);
3917     }
3918    
3919     redo A;
3920     } elsif ($self->{nc} == 0x0022) { # "
3921    
3922 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3923 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3924    
3925     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3926     $self->{line_prev} = $self->{line};
3927     $self->{column_prev} = $self->{column};
3928     $self->{column}++;
3929     $self->{nc}
3930     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3931     } else {
3932     $self->{set_nc}->($self);
3933     }
3934    
3935     redo A;
3936     } elsif ($self->{nc} == 0x0027) { # '
3937    
3938 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3939 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3940    
3941     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3942     $self->{line_prev} = $self->{line};
3943     $self->{column_prev} = $self->{column};
3944     $self->{column}++;
3945     $self->{nc}
3946     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3947     } else {
3948     $self->{set_nc}->($self);
3949     }
3950    
3951     redo A;
3952     } elsif ($self->{nc} == 0x003E) { # >
3953 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3954     if ($self->{is_xml}) {
3955    
3956     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3957     } else {
3958    
3959     }
3960     $self->{state} = DATA_STATE;
3961     $self->{s_kwd} = '';
3962 wakaba 1.12 } else {
3963 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3964    
3965     } else {
3966    
3967     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3968     }
3969     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3970 wakaba 1.12 }
3971 wakaba 1.16
3972 wakaba 1.1
3973     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3974     $self->{line_prev} = $self->{line};
3975     $self->{column_prev} = $self->{column};
3976     $self->{column}++;
3977     $self->{nc}
3978     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3979     } else {
3980     $self->{set_nc}->($self);
3981     }
3982    
3983 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3984 wakaba 1.1 redo A;
3985     } elsif ($self->{nc} == -1) {
3986 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3987    
3988     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3989    
3990     $self->{state} = DATA_STATE;
3991     $self->{s_kwd} = '';
3992     $self->{ct}->{quirks} = 1;
3993     } else {
3994     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3995     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3996     }
3997 wakaba 1.1
3998     ## reconsume
3999 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4000 wakaba 1.1 redo A;
4001 wakaba 1.16 } elsif ($self->{is_xml} and
4002     $self->{ct}->{type} == DOCTYPE_TOKEN and
4003     $self->{nc} == 0x005B) { # [
4004 wakaba 1.12
4005     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4006     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4007     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4008 wakaba 1.13 $self->{in_subset} = 1;
4009 wakaba 1.12
4010     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4011     $self->{line_prev} = $self->{line};
4012     $self->{column_prev} = $self->{column};
4013     $self->{column}++;
4014     $self->{nc}
4015     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4016     } else {
4017     $self->{set_nc}->($self);
4018     }
4019    
4020 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4021 wakaba 1.12 redo A;
4022 wakaba 1.1 } else {
4023     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
4024    
4025 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4026    
4027     $self->{ct}->{quirks} = 1;
4028     $self->{state} = BOGUS_DOCTYPE_STATE;
4029     } else {
4030    
4031     $self->{state} = BOGUS_MD_STATE;
4032     }
4033    
4034 wakaba 1.1
4035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036     $self->{line_prev} = $self->{line};
4037     $self->{column_prev} = $self->{column};
4038     $self->{column}++;
4039     $self->{nc}
4040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041     } else {
4042     $self->{set_nc}->($self);
4043     }
4044    
4045     redo A;
4046     }
4047     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4048     if ($is_space->{$self->{nc}}) {
4049    
4050     ## Stay in the state
4051    
4052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4053     $self->{line_prev} = $self->{line};
4054     $self->{column_prev} = $self->{column};
4055     $self->{column}++;
4056     $self->{nc}
4057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4058     } else {
4059     $self->{set_nc}->($self);
4060     }
4061    
4062     redo A;
4063     } elsif ($self->{nc} == 0x0022) { # "
4064    
4065     $self->{ct}->{sysid} = ''; # DOCTYPE
4066     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4067    
4068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4069     $self->{line_prev} = $self->{line};
4070     $self->{column_prev} = $self->{column};
4071     $self->{column}++;
4072     $self->{nc}
4073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4074     } else {
4075     $self->{set_nc}->($self);
4076     }
4077    
4078     redo A;
4079     } elsif ($self->{nc} == 0x0027) { # '
4080    
4081     $self->{ct}->{sysid} = ''; # DOCTYPE
4082     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4083    
4084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4085     $self->{line_prev} = $self->{line};
4086     $self->{column_prev} = $self->{column};
4087     $self->{column}++;
4088     $self->{nc}
4089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4090     } else {
4091     $self->{set_nc}->($self);
4092     }
4093    
4094     redo A;
4095     } elsif ($self->{nc} == 0x003E) { # >
4096     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4097    
4098     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4099     $self->{line_prev} = $self->{line};
4100     $self->{column_prev} = $self->{column};
4101     $self->{column}++;
4102     $self->{nc}
4103     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4104     } else {
4105     $self->{set_nc}->($self);
4106     }
4107    
4108    
4109 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4110    
4111     $self->{state} = DATA_STATE;
4112     $self->{s_kwd} = '';
4113     $self->{ct}->{quirks} = 1;
4114     } else {
4115    
4116     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4117     }
4118 wakaba 1.1
4119 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4120 wakaba 1.1 redo A;
4121     } elsif ($self->{nc} == -1) {
4122 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4123    
4124     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4125     $self->{state} = DATA_STATE;
4126     $self->{s_kwd} = '';
4127     $self->{ct}->{quirks} = 1;
4128     } else {
4129    
4130     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4131     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4132     }
4133 wakaba 1.1
4134     ## reconsume
4135 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4136 wakaba 1.1 redo A;
4137 wakaba 1.16 } elsif ($self->{is_xml} and
4138     $self->{ct}->{type} == DOCTYPE_TOKEN and
4139     $self->{nc} == 0x005B) { # [
4140 wakaba 1.12
4141     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4142    
4143     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4144     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4145 wakaba 1.13 $self->{in_subset} = 1;
4146 wakaba 1.12
4147     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4148     $self->{line_prev} = $self->{line};
4149     $self->{column_prev} = $self->{column};
4150     $self->{column}++;
4151     $self->{nc}
4152     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4153     } else {
4154     $self->{set_nc}->($self);
4155     }
4156    
4157 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4158 wakaba 1.12 redo A;
4159 wakaba 1.1 } else {
4160     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4161    
4162 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4163    
4164     $self->{ct}->{quirks} = 1;
4165     $self->{state} = BOGUS_DOCTYPE_STATE;
4166     } else {
4167    
4168     $self->{state} = BOGUS_MD_STATE;
4169     }
4170    
4171 wakaba 1.1
4172     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4173     $self->{line_prev} = $self->{line};
4174     $self->{column_prev} = $self->{column};
4175     $self->{column}++;
4176     $self->{nc}
4177     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4178     } else {
4179     $self->{set_nc}->($self);
4180     }
4181    
4182     redo A;
4183     }
4184     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4185     if ($self->{nc} == 0x0022) { # "
4186    
4187     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4188    
4189     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4190     $self->{line_prev} = $self->{line};
4191     $self->{column_prev} = $self->{column};
4192     $self->{column}++;
4193     $self->{nc}
4194     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4195     } else {
4196     $self->{set_nc}->($self);
4197     }
4198    
4199     redo A;
4200 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4201 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4202    
4203 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4204    
4205     $self->{state} = DATA_STATE;
4206     $self->{s_kwd} = '';
4207     $self->{ct}->{quirks} = 1;
4208     } else {
4209    
4210     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4211     }
4212    
4213 wakaba 1.1
4214     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4215     $self->{line_prev} = $self->{line};
4216     $self->{column_prev} = $self->{column};
4217     $self->{column}++;
4218     $self->{nc}
4219     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4220     } else {
4221     $self->{set_nc}->($self);
4222     }
4223    
4224 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4225 wakaba 1.1 redo A;
4226     } elsif ($self->{nc} == -1) {
4227     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4228    
4229 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4230    
4231     $self->{state} = DATA_STATE;
4232     $self->{s_kwd} = '';
4233     $self->{ct}->{quirks} = 1;
4234     } else {
4235    
4236     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4237     }
4238    
4239 wakaba 1.1 ## reconsume
4240 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4241 wakaba 1.1 redo A;
4242     } else {
4243    
4244 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4245 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4246     length $self->{ct}->{sysid});
4247    
4248     ## Stay in the state
4249    
4250     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4251     $self->{line_prev} = $self->{line};
4252     $self->{column_prev} = $self->{column};
4253     $self->{column}++;
4254     $self->{nc}
4255     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4256     } else {
4257     $self->{set_nc}->($self);
4258     }
4259    
4260     redo A;
4261     }
4262     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4263     if ($self->{nc} == 0x0027) { # '
4264    
4265     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4266    
4267     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4268     $self->{line_prev} = $self->{line};
4269     $self->{column_prev} = $self->{column};
4270     $self->{column}++;
4271     $self->{nc}
4272     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4273     } else {
4274     $self->{set_nc}->($self);
4275     }
4276    
4277     redo A;
4278 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4279 wakaba 1.1
4280     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4281    
4282     $self->{state} = DATA_STATE;
4283 wakaba 1.5 $self->{s_kwd} = '';
4284 wakaba 1.1
4285     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4286     $self->{line_prev} = $self->{line};
4287     $self->{column_prev} = $self->{column};
4288     $self->{column}++;
4289     $self->{nc}
4290     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4291     } else {
4292     $self->{set_nc}->($self);
4293     }
4294    
4295    
4296     $self->{ct}->{quirks} = 1;
4297     return ($self->{ct}); # DOCTYPE
4298    
4299     redo A;
4300     } elsif ($self->{nc} == -1) {
4301     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4302    
4303 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4304    
4305     $self->{state} = DATA_STATE;
4306     $self->{s_kwd} = '';
4307     $self->{ct}->{quirks} = 1;
4308     } else {
4309    
4310     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4311     }
4312    
4313 wakaba 1.1 ## reconsume
4314 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4315 wakaba 1.1 redo A;
4316     } else {
4317    
4318 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4319 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4320     length $self->{ct}->{sysid});
4321    
4322     ## Stay in the state
4323    
4324     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4325     $self->{line_prev} = $self->{line};
4326     $self->{column_prev} = $self->{column};
4327     $self->{column}++;
4328     $self->{nc}
4329     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4330     } else {
4331     $self->{set_nc}->($self);
4332     }
4333    
4334     redo A;
4335     }
4336     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4337     if ($is_space->{$self->{nc}}) {
4338 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4339    
4340     $self->{state} = BEFORE_NDATA_STATE;
4341     } else {
4342    
4343     ## Stay in the state
4344     }
4345 wakaba 1.1
4346     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4347     $self->{line_prev} = $self->{line};
4348     $self->{column_prev} = $self->{column};
4349     $self->{column}++;
4350     $self->{nc}
4351     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4352     } else {
4353     $self->{set_nc}->($self);
4354     }
4355    
4356     redo A;
4357     } elsif ($self->{nc} == 0x003E) { # >
4358 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4359    
4360     $self->{state} = DATA_STATE;
4361     $self->{s_kwd} = '';
4362     } else {
4363    
4364     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4365     }
4366    
4367 wakaba 1.1
4368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4369     $self->{line_prev} = $self->{line};
4370     $self->{column_prev} = $self->{column};
4371     $self->{column}++;
4372     $self->{nc}
4373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4374     } else {
4375     $self->{set_nc}->($self);
4376     }
4377    
4378 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4379 wakaba 1.1 redo A;
4380 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4381     ($self->{nc} == 0x004E or # N
4382     $self->{nc} == 0x006E)) { # n
4383    
4384     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4385     $self->{state} = NDATA_STATE;
4386     $self->{kwd} = chr $self->{nc};
4387    
4388     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4389     $self->{line_prev} = $self->{line};
4390     $self->{column_prev} = $self->{column};
4391     $self->{column}++;
4392     $self->{nc}
4393     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4394     } else {
4395     $self->{set_nc}->($self);
4396     }
4397    
4398     redo A;
4399 wakaba 1.1 } elsif ($self->{nc} == -1) {
4400 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4401    
4402     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4403     $self->{state} = DATA_STATE;
4404     $self->{s_kwd} = '';
4405     $self->{ct}->{quirks} = 1;
4406     } else {
4407    
4408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4409     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4410     }
4411    
4412 wakaba 1.1 ## reconsume
4413 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4414 wakaba 1.1 redo A;
4415 wakaba 1.16 } elsif ($self->{is_xml} and
4416     $self->{ct}->{type} == DOCTYPE_TOKEN and
4417     $self->{nc} == 0x005B) { # [
4418 wakaba 1.12
4419     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4420     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4421 wakaba 1.13 $self->{in_subset} = 1;
4422 wakaba 1.12
4423     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4424     $self->{line_prev} = $self->{line};
4425     $self->{column_prev} = $self->{column};
4426     $self->{column}++;
4427     $self->{nc}
4428     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4429     } else {
4430     $self->{set_nc}->($self);
4431     }
4432    
4433 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4434 wakaba 1.12 redo A;
4435 wakaba 1.1 } else {
4436     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4437    
4438 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4439    
4440     #$self->{ct}->{quirks} = 1;
4441     $self->{state} = BOGUS_DOCTYPE_STATE;
4442     } else {
4443    
4444     $self->{state} = BOGUS_MD_STATE;
4445     }
4446    
4447 wakaba 1.1
4448     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4449     $self->{line_prev} = $self->{line};
4450     $self->{column_prev} = $self->{column};
4451     $self->{column}++;
4452     $self->{nc}
4453     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4454     } else {
4455     $self->{set_nc}->($self);
4456     }
4457    
4458     redo A;
4459     }
4460 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4461     if ($is_space->{$self->{nc}}) {
4462    
4463     ## Stay in the state.
4464    
4465     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4466     $self->{line_prev} = $self->{line};
4467     $self->{column_prev} = $self->{column};
4468     $self->{column}++;
4469     $self->{nc}
4470     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4471     } else {
4472     $self->{set_nc}->($self);
4473     }
4474    
4475     redo A;
4476     } elsif ($self->{nc} == 0x003E) { # >
4477    
4478     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4479    
4480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4481     $self->{line_prev} = $self->{line};
4482     $self->{column_prev} = $self->{column};
4483     $self->{column}++;
4484     $self->{nc}
4485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4486     } else {
4487     $self->{set_nc}->($self);
4488     }
4489    
4490     return ($self->{ct}); # ENTITY
4491     redo A;
4492     } elsif ($self->{nc} == 0x004E or # N
4493     $self->{nc} == 0x006E) { # n
4494    
4495     $self->{state} = NDATA_STATE;
4496     $self->{kwd} = chr $self->{nc};
4497    
4498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499     $self->{line_prev} = $self->{line};
4500     $self->{column_prev} = $self->{column};
4501     $self->{column}++;
4502     $self->{nc}
4503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504     } else {
4505     $self->{set_nc}->($self);
4506     }
4507    
4508     redo A;
4509     } elsif ($self->{nc} == -1) {
4510    
4511     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4512     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4513     ## reconsume
4514     return ($self->{ct}); # ENTITY
4515     redo A;
4516     } else {
4517    
4518     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4519     $self->{state} = BOGUS_MD_STATE;
4520    
4521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4522     $self->{line_prev} = $self->{line};
4523     $self->{column_prev} = $self->{column};
4524     $self->{column}++;
4525     $self->{nc}
4526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4527     } else {
4528     $self->{set_nc}->($self);
4529     }
4530    
4531     redo A;
4532     }
4533 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4534     if ($self->{nc} == 0x003E) { # >
4535    
4536     $self->{state} = DATA_STATE;
4537 wakaba 1.5 $self->{s_kwd} = '';
4538 wakaba 1.1
4539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4540     $self->{line_prev} = $self->{line};
4541     $self->{column_prev} = $self->{column};
4542     $self->{column}++;
4543     $self->{nc}
4544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4545     } else {
4546     $self->{set_nc}->($self);
4547     }
4548    
4549    
4550     return ($self->{ct}); # DOCTYPE
4551    
4552     redo A;
4553 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4554 wakaba 1.13
4555     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4556     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4557     $self->{in_subset} = 1;
4558    
4559 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4560     $self->{line_prev} = $self->{line};
4561     $self->{column_prev} = $self->{column};
4562     $self->{column}++;
4563     $self->{nc}
4564     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4565     } else {
4566     $self->{set_nc}->($self);
4567     }
4568    
4569 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4570     redo A;
4571 wakaba 1.1 } elsif ($self->{nc} == -1) {
4572    
4573     $self->{state} = DATA_STATE;
4574 wakaba 1.5 $self->{s_kwd} = '';
4575 wakaba 1.1 ## reconsume
4576    
4577     return ($self->{ct}); # DOCTYPE
4578    
4579     redo A;
4580     } else {
4581    
4582     my $s = '';
4583 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4584 wakaba 1.1
4585     ## Stay in the state
4586    
4587     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4588     $self->{line_prev} = $self->{line};
4589     $self->{column_prev} = $self->{column};
4590     $self->{column}++;
4591     $self->{nc}
4592     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4593     } else {
4594     $self->{set_nc}->($self);
4595     }
4596    
4597     redo A;
4598     }
4599     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4600     ## NOTE: "CDATA section state" in the spec is jointly implemented
4601     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4602     ## and |CDATA_SECTION_MSE2_STATE|.
4603 wakaba 1.10
4604     ## XML5: "CDATA state".
4605 wakaba 1.1
4606     if ($self->{nc} == 0x005D) { # ]
4607    
4608     $self->{state} = CDATA_SECTION_MSE1_STATE;
4609    
4610     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4611     $self->{line_prev} = $self->{line};
4612     $self->{column_prev} = $self->{column};
4613     $self->{column}++;
4614     $self->{nc}
4615     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4616     } else {
4617     $self->{set_nc}->($self);
4618     }
4619    
4620     redo A;
4621     } elsif ($self->{nc} == -1) {
4622 wakaba 1.6 if ($self->{is_xml}) {
4623 wakaba 1.8
4624 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4625 wakaba 1.8 } else {
4626    
4627 wakaba 1.6 }
4628    
4629 wakaba 1.1 $self->{state} = DATA_STATE;
4630 wakaba 1.5 $self->{s_kwd} = '';
4631 wakaba 1.10 ## Reconsume.
4632 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4633    
4634     return ($self->{ct}); # character
4635     } else {
4636    
4637     ## No token to emit. $self->{ct} is discarded.
4638     }
4639     redo A;
4640     } else {
4641    
4642     $self->{ct}->{data} .= chr $self->{nc};
4643     $self->{read_until}->($self->{ct}->{data},
4644     q<]>,
4645     length $self->{ct}->{data});
4646    
4647     ## Stay in the state.
4648    
4649     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4650     $self->{line_prev} = $self->{line};
4651     $self->{column_prev} = $self->{column};
4652     $self->{column}++;
4653     $self->{nc}
4654     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4655     } else {
4656     $self->{set_nc}->($self);
4657     }
4658    
4659     redo A;
4660     }
4661    
4662     ## ISSUE: "text tokens" in spec.
4663     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4664 wakaba 1.10 ## XML5: "CDATA bracket state".
4665    
4666 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4667    
4668     $self->{state} = CDATA_SECTION_MSE2_STATE;
4669    
4670     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4671     $self->{line_prev} = $self->{line};
4672     $self->{column_prev} = $self->{column};
4673     $self->{column}++;
4674     $self->{nc}
4675     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4676     } else {
4677     $self->{set_nc}->($self);
4678     }
4679    
4680     redo A;
4681     } else {
4682    
4683 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4684 wakaba 1.1 $self->{ct}->{data} .= ']';
4685 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4686 wakaba 1.1 ## Reconsume.
4687     redo A;
4688     }
4689     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4690 wakaba 1.10 ## XML5: "CDATA end state".
4691    
4692 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4693     $self->{state} = DATA_STATE;
4694 wakaba 1.5 $self->{s_kwd} = '';
4695 wakaba 1.1
4696     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4697     $self->{line_prev} = $self->{line};
4698     $self->{column_prev} = $self->{column};
4699     $self->{column}++;
4700     $self->{nc}
4701     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4702     } else {
4703     $self->{set_nc}->($self);
4704     }
4705    
4706     if (length $self->{ct}->{data}) { # character
4707    
4708     return ($self->{ct}); # character
4709     } else {
4710    
4711     ## No token to emit. $self->{ct} is discarded.
4712     }
4713     redo A;
4714     } elsif ($self->{nc} == 0x005D) { # ]
4715     # character
4716     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4717     ## Stay in the state.
4718    
4719     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4720     $self->{line_prev} = $self->{line};
4721     $self->{column_prev} = $self->{column};
4722     $self->{column}++;
4723     $self->{nc}
4724     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4725     } else {
4726     $self->{set_nc}->($self);
4727     }
4728    
4729     redo A;
4730     } else {
4731    
4732     $self->{ct}->{data} .= ']]'; # character
4733     $self->{state} = CDATA_SECTION_STATE;
4734 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4735 wakaba 1.1 redo A;
4736     }
4737     } elsif ($self->{state} == ENTITY_STATE) {
4738     if ($is_space->{$self->{nc}} or
4739     {
4740     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4741     $self->{entity_add} => 1,
4742     }->{$self->{nc}}) {
4743 wakaba 1.22 if ($self->{is_xml}) {
4744    
4745     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4746     line => $self->{line_prev},
4747     column => $self->{column_prev}
4748     + ($self->{nc} == -1 ? 1 : 0));
4749     } else {
4750    
4751     ## No error
4752     }
4753 wakaba 1.1 ## Don't consume
4754     ## Return nothing.
4755     #
4756     } elsif ($self->{nc} == 0x0023) { # #
4757    
4758     $self->{state} = ENTITY_HASH_STATE;
4759 wakaba 1.12 $self->{kwd} = '#';
4760 wakaba 1.1
4761     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4762     $self->{line_prev} = $self->{line};
4763     $self->{column_prev} = $self->{column};
4764     $self->{column}++;
4765     $self->{nc}
4766     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4767     } else {
4768     $self->{set_nc}->($self);
4769     }
4770    
4771     redo A;
4772 wakaba 1.22 } elsif ($self->{is_xml} or
4773     (0x0041 <= $self->{nc} and
4774 wakaba 1.1 $self->{nc} <= 0x005A) or # A..Z
4775     (0x0061 <= $self->{nc} and
4776     $self->{nc} <= 0x007A)) { # a..z
4777    
4778     require Whatpm::_NamedEntityList;
4779     $self->{state} = ENTITY_NAME_STATE;
4780 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4781     $self->{entity__value} = $self->{kwd};
4782 wakaba 1.1 $self->{entity__match} = 0;
4783    
4784     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4785     $self->{line_prev} = $self->{line};
4786     $self->{column_prev} = $self->{column};
4787     $self->{column}++;
4788     $self->{nc}
4789     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4790     } else {
4791     $self->{set_nc}->($self);
4792     }
4793    
4794     redo A;
4795     } else {
4796    
4797     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4798     ## Return nothing.
4799     #
4800     }
4801    
4802     ## NOTE: No character is consumed by the "consume a character
4803     ## reference" algorithm. In other words, there is an "&" character
4804     ## that does not introduce a character reference, which will be
4805     ## appended to the parent element or the attribute value in later
4806     ## processing by the tokenizer.
4807    
4808     if ($self->{prev_state} == DATA_STATE) {
4809    
4810     $self->{state} = $self->{prev_state};
4811 wakaba 1.5 $self->{s_kwd} = '';
4812 wakaba 1.1 ## Reconsume.
4813     return ({type => CHARACTER_TOKEN, data => '&',
4814     line => $self->{line_prev},
4815     column => $self->{column_prev},
4816     });
4817     redo A;
4818     } else {
4819    
4820     $self->{ca}->{value} .= '&';
4821     $self->{state} = $self->{prev_state};
4822 wakaba 1.5 $self->{s_kwd} = '';
4823 wakaba 1.1 ## Reconsume.
4824     redo A;
4825     }
4826     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4827 wakaba 1.21 if ($self->{nc} == 0x0078) { # x
4828 wakaba 1.1
4829     $self->{state} = HEXREF_X_STATE;
4830 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4831 wakaba 1.1
4832     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4833     $self->{line_prev} = $self->{line};
4834     $self->{column_prev} = $self->{column};
4835     $self->{column}++;
4836     $self->{nc}
4837     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4838     } else {
4839     $self->{set_nc}->($self);
4840     }
4841    
4842     redo A;
4843 wakaba 1.21 } elsif ($self->{nc} == 0x0058) { # X
4844    
4845     if ($self->{is_xml}) {
4846     $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4847     }
4848     $self->{state} = HEXREF_X_STATE;
4849     $self->{kwd} .= chr $self->{nc};
4850    
4851     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4852     $self->{line_prev} = $self->{line};
4853     $self->{column_prev} = $self->{column};
4854     $self->{column}++;
4855     $self->{nc}
4856     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4857     } else {
4858     $self->{set_nc}->($self);
4859     }
4860    
4861     redo A;
4862 wakaba 1.1 } elsif (0x0030 <= $self->{nc} and
4863     $self->{nc} <= 0x0039) { # 0..9
4864    
4865     $self->{state} = NCR_NUM_STATE;
4866 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4867 wakaba 1.1
4868     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4869     $self->{line_prev} = $self->{line};
4870     $self->{column_prev} = $self->{column};
4871     $self->{column}++;
4872     $self->{nc}
4873     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4874     } else {
4875     $self->{set_nc}->($self);
4876     }
4877    
4878     redo A;
4879     } else {
4880     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4881     line => $self->{line_prev},
4882     column => $self->{column_prev} - 1);
4883    
4884     ## NOTE: According to the spec algorithm, nothing is returned,
4885     ## and then "&#" is appended to the parent element or the attribute
4886     ## value in the later processing.
4887    
4888     if ($self->{prev_state} == DATA_STATE) {
4889    
4890     $self->{state} = $self->{prev_state};
4891 wakaba 1.5 $self->{s_kwd} = '';
4892 wakaba 1.1 ## Reconsume.
4893     return ({type => CHARACTER_TOKEN,
4894     data => '&#',
4895     line => $self->{line_prev},
4896     column => $self->{column_prev} - 1,
4897     });
4898     redo A;
4899     } else {
4900    
4901     $self->{ca}->{value} .= '&#';
4902     $self->{state} = $self->{prev_state};
4903 wakaba 1.5 $self->{s_kwd} = '';
4904 wakaba 1.1 ## Reconsume.
4905     redo A;
4906     }
4907     }
4908     } elsif ($self->{state} == NCR_NUM_STATE) {
4909     if (0x0030 <= $self->{nc} and
4910     $self->{nc} <= 0x0039) { # 0..9
4911    
4912 wakaba 1.12 $self->{kwd} *= 10;
4913     $self->{kwd} += $self->{nc} - 0x0030;
4914 wakaba 1.1
4915     ## Stay in the state.
4916    
4917     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4918     $self->{line_prev} = $self->{line};
4919     $self->{column_prev} = $self->{column};
4920     $self->{column}++;
4921     $self->{nc}
4922     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4923     } else {
4924     $self->{set_nc}->($self);
4925     }
4926    
4927     redo A;
4928     } elsif ($self->{nc} == 0x003B) { # ;
4929    
4930    
4931     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4932     $self->{line_prev} = $self->{line};
4933     $self->{column_prev} = $self->{column};
4934     $self->{column}++;
4935     $self->{nc}
4936     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4937     } else {
4938     $self->{set_nc}->($self);
4939     }
4940    
4941     #
4942     } else {
4943    
4944     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4945     ## Reconsume.
4946     #
4947     }
4948    
4949 wakaba 1.12 my $code = $self->{kwd};
4950 wakaba 1.1 my $l = $self->{line_prev};
4951     my $c = $self->{column_prev};
4952 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
4953     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4954     ($self->{is_xml} and $code == 0x0000)) {
4955 wakaba 1.1
4956     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4957     text => (sprintf 'U+%04X', $code),
4958     line => $l, column => $c);
4959     $code = $charref_map->{$code};
4960     } elsif ($code > 0x10FFFF) {
4961    
4962     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4963     text => (sprintf 'U-%08X', $code),
4964     line => $l, column => $c);
4965     $code = 0xFFFD;
4966     }
4967    
4968     if ($self->{prev_state} == DATA_STATE) {
4969    
4970     $self->{state} = $self->{prev_state};
4971 wakaba 1.5 $self->{s_kwd} = '';
4972 wakaba 1.1 ## Reconsume.
4973     return ({type => CHARACTER_TOKEN, data => chr $code,
4974 wakaba 1.7 has_reference => 1,
4975 wakaba 1.1 line => $l, column => $c,
4976     });
4977     redo A;
4978     } else {
4979    
4980     $self->{ca}->{value} .= chr $code;
4981     $self->{ca}->{has_reference} = 1;
4982     $self->{state} = $self->{prev_state};
4983 wakaba 1.5 $self->{s_kwd} = '';
4984 wakaba 1.1 ## Reconsume.
4985     redo A;
4986     }
4987     } elsif ($self->{state} == HEXREF_X_STATE) {
4988     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4989     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4990     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4991     # 0..9, A..F, a..f
4992    
4993     $self->{state} = HEXREF_HEX_STATE;
4994 wakaba 1.12 $self->{kwd} = 0;
4995 wakaba 1.1 ## Reconsume.
4996     redo A;
4997     } else {
4998     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4999     line => $self->{line_prev},
5000     column => $self->{column_prev} - 2);
5001    
5002     ## NOTE: According to the spec algorithm, nothing is returned,
5003     ## and then "&#" followed by "X" or "x" is appended to the parent
5004     ## element or the attribute value in the later processing.
5005    
5006     if ($self->{prev_state} == DATA_STATE) {
5007    
5008     $self->{state} = $self->{prev_state};
5009 wakaba 1.5 $self->{s_kwd} = '';
5010 wakaba 1.1 ## Reconsume.
5011     return ({type => CHARACTER_TOKEN,
5012 wakaba 1.12 data => '&' . $self->{kwd},
5013 wakaba 1.1 line => $self->{line_prev},
5014 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
5015 wakaba 1.1 });
5016     redo A;
5017     } else {
5018    
5019 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
5020 wakaba 1.1 $self->{state} = $self->{prev_state};
5021 wakaba 1.5 $self->{s_kwd} = '';
5022 wakaba 1.1 ## Reconsume.
5023     redo A;
5024     }
5025     }
5026     } elsif ($self->{state} == HEXREF_HEX_STATE) {
5027     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
5028     # 0..9
5029    
5030 wakaba 1.12 $self->{kwd} *= 0x10;
5031     $self->{kwd} += $self->{nc} - 0x0030;
5032 wakaba 1.1 ## Stay in the state.
5033    
5034     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5035     $self->{line_prev} = $self->{line};
5036     $self->{column_prev} = $self->{column};
5037     $self->{column}++;
5038     $self->{nc}
5039     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5040     } else {
5041     $self->{set_nc}->($self);
5042     }
5043    
5044     redo A;
5045     } elsif (0x0061 <= $self->{nc} and
5046     $self->{nc} <= 0x0066) { # a..f
5047    
5048 wakaba 1.12 $self->{kwd} *= 0x10;
5049     $self->{kwd} += $self->{nc} - 0x0060 + 9;
5050 wakaba 1.1 ## Stay in the state.
5051    
5052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5053     $self->{line_prev} = $self->{line};
5054     $self->{column_prev} = $self->{column};
5055     $self->{column}++;
5056     $self->{nc}
5057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5058     } else {
5059     $self->{set_nc}->($self);
5060     }
5061    
5062     redo A;
5063     } elsif (0x0041 <= $self->{nc} and
5064     $self->{nc} <= 0x0046) { # A..F
5065    
5066 wakaba 1.12 $self->{kwd} *= 0x10;
5067     $self->{kwd} += $self->{nc} - 0x0040 + 9;
5068 wakaba 1.1 ## Stay in the state.
5069    
5070     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5071     $self->{line_prev} = $self->{line};
5072     $self->{column_prev} = $self->{column};
5073     $self->{column}++;
5074     $self->{nc}
5075     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5076     } else {
5077     $self->{set_nc}->($self);
5078     }
5079    
5080     redo A;
5081     } elsif ($self->{nc} == 0x003B) { # ;
5082    
5083    
5084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5085     $self->{line_prev} = $self->{line};
5086     $self->{column_prev} = $self->{column};
5087     $self->{column}++;
5088     $self->{nc}
5089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5090     } else {
5091     $self->{set_nc}->($self);
5092     }
5093    
5094     #
5095     } else {
5096    
5097     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5098     line => $self->{line},
5099     column => $self->{column});
5100     ## Reconsume.
5101     #
5102     }
5103    
5104 wakaba 1.12 my $code = $self->{kwd};
5105 wakaba 1.1 my $l = $self->{line_prev};
5106     my $c = $self->{column_prev};
5107 wakaba 1.25 if ((not $self->{is_xml} and $charref_map->{$code}) or
5108     ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5109     ($self->{is_xml} and $code == 0x0000)) {
5110 wakaba 1.1
5111     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5112     text => (sprintf 'U+%04X', $code),
5113     line => $l, column => $c);
5114     $code = $charref_map->{$code};
5115     } elsif ($code > 0x10FFFF) {
5116    
5117     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5118     text => (sprintf 'U-%08X', $code),
5119     line => $l, column => $c);
5120     $code = 0xFFFD;
5121     }
5122    
5123     if ($self->{prev_state} == DATA_STATE) {
5124    
5125     $self->{state} = $self->{prev_state};
5126 wakaba 1.5 $self->{s_kwd} = '';
5127 wakaba 1.1 ## Reconsume.
5128     return ({type => CHARACTER_TOKEN, data => chr $code,
5129 wakaba 1.7 has_reference => 1,
5130 wakaba 1.1 line => $l, column => $c,
5131     });
5132     redo A;
5133     } else {
5134    
5135     $self->{ca}->{value} .= chr $code;
5136     $self->{ca}->{has_reference} = 1;
5137     $self->{state} = $self->{prev_state};
5138 wakaba 1.5 $self->{s_kwd} = '';
5139 wakaba 1.1 ## Reconsume.
5140     redo A;
5141     }
5142     } elsif ($self->{state} == ENTITY_NAME_STATE) {
5143 wakaba 1.21 if ((0x0041 <= $self->{nc} and # A
5144     $self->{nc} <= 0x005A) or # Z
5145     (0x0061 <= $self->{nc} and # a
5146     $self->{nc} <= 0x007A) or # z
5147     (0x0030 <= $self->{nc} and # 0
5148     $self->{nc} <= 0x0039) or # 9
5149 wakaba 1.22 $self->{nc} == 0x003B or # ;
5150     ($self->{is_xml} and
5151     not ($is_space->{$self->{nc}} or
5152     {
5153     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5154     $self->{entity_add} => 1,
5155     }->{$self->{nc}}))) {
5156 wakaba 1.1 our $EntityChar;
5157 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5158 wakaba 1.21 if (defined $EntityChar->{$self->{kwd}} or
5159     $self->{ge}->{$self->{kwd}}) {
5160 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5161 wakaba 1.21 if (defined $self->{ge}->{$self->{kwd}}) {
5162     if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5163    
5164     $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5165     } else {
5166     if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5167    
5168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5169     value => $self->{kwd});
5170     } else {
5171    
5172     }
5173     $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5174     }
5175     } else {
5176     if ($self->{is_xml}) {
5177    
5178     $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5179     value => $self->{kwd},
5180     level => {
5181     'amp;' => $self->{level}->{warn},
5182     'quot;' => $self->{level}->{warn},
5183     'lt;' => $self->{level}->{warn},
5184     'gt;' => $self->{level}->{warn},
5185     'apos;' => $self->{level}->{warn},
5186     }->{$self->{kwd}} ||
5187     $self->{level}->{must});
5188     } else {
5189    
5190     }
5191     $self->{entity__value} = $EntityChar->{$self->{kwd}};
5192     }
5193 wakaba 1.1 $self->{entity__match} = 1;
5194    
5195     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5196     $self->{line_prev} = $self->{line};
5197     $self->{column_prev} = $self->{column};
5198     $self->{column}++;
5199     $self->{nc}
5200     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5201     } else {
5202     $self->{set_nc}->($self);
5203     }
5204    
5205     #
5206     } else {
5207    
5208 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5209 wakaba 1.1 $self->{entity__match} = -1;
5210     ## Stay in the state.
5211    
5212     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5213     $self->{line_prev} = $self->{line};
5214     $self->{column_prev} = $self->{column};
5215     $self->{column}++;
5216     $self->{nc}
5217     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5218     } else {
5219     $self->{set_nc}->($self);
5220     }
5221    
5222     redo A;
5223     }
5224     } else {
5225    
5226     $self->{entity__value} .= chr $self->{nc};
5227     $self->{entity__match} *= 2;
5228     ## Stay in the state.
5229    
5230     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5231     $self->{line_prev} = $self->{line};
5232     $self->{column_prev} = $self->{column};
5233     $self->{column}++;
5234     $self->{nc}
5235     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5236     } else {
5237     $self->{set_nc}->($self);
5238     }
5239    
5240     redo A;
5241     }
5242     }
5243    
5244     my $data;
5245     my $has_ref;
5246     if ($self->{entity__match} > 0) {
5247    
5248     $data = $self->{entity__value};
5249     $has_ref = 1;
5250     #
5251     } elsif ($self->{entity__match} < 0) {
5252     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5253     if ($self->{prev_state} != DATA_STATE and # in attribute
5254     $self->{entity__match} < -1) {
5255    
5256 wakaba 1.12 $data = '&' . $self->{kwd};
5257 wakaba 1.1 #
5258     } else {
5259    
5260     $data = $self->{entity__value};
5261     $has_ref = 1;
5262     #
5263     }
5264     } else {
5265    
5266     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5267     line => $self->{line_prev},
5268 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5269     $data = '&' . $self->{kwd};
5270 wakaba 1.1 #
5271     }
5272    
5273     ## NOTE: In these cases, when a character reference is found,
5274     ## it is consumed and a character token is returned, or, otherwise,
5275     ## nothing is consumed and returned, according to the spec algorithm.
5276     ## In this implementation, anything that has been examined by the
5277     ## tokenizer is appended to the parent element or the attribute value
5278     ## as string, either literal string when no character reference or
5279     ## entity-replaced string otherwise, in this stage, since any characters
5280     ## that would not be consumed are appended in the data state or in an
5281     ## appropriate attribute value state anyway.
5282    
5283     if ($self->{prev_state} == DATA_STATE) {
5284    
5285     $self->{state} = $self->{prev_state};
5286 wakaba 1.5 $self->{s_kwd} = '';
5287 wakaba 1.1 ## Reconsume.
5288     return ({type => CHARACTER_TOKEN,
5289     data => $data,
5290 wakaba 1.7 has_reference => $has_ref,
5291 wakaba 1.1 line => $self->{line_prev},
5292 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5293 wakaba 1.1 });
5294     redo A;
5295     } else {
5296    
5297     $self->{ca}->{value} .= $data;
5298     $self->{ca}->{has_reference} = 1 if $has_ref;
5299     $self->{state} = $self->{prev_state};
5300 wakaba 1.5 $self->{s_kwd} = '';
5301 wakaba 1.1 ## Reconsume.
5302     redo A;
5303     }
5304 wakaba 1.8
5305     ## XML-only states
5306    
5307     } elsif ($self->{state} == PI_STATE) {
5308 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5309    
5310 wakaba 1.8 if ($is_space->{$self->{nc}} or
5311 wakaba 1.14 $self->{nc} == 0x003F or # ?
5312 wakaba 1.8 $self->{nc} == -1) {
5313 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5314     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5315     ## "DOCTYPE pi state": Parse error, switch to the "data
5316     ## state".
5317 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5318     line => $self->{line_prev},
5319     column => $self->{column_prev}
5320     - 1 * ($self->{nc} != -1));
5321     $self->{state} = BOGUS_COMMENT_STATE;
5322     ## Reconsume.
5323     $self->{ct} = {type => COMMENT_TOKEN,
5324     data => '?',
5325     line => $self->{line_prev},
5326     column => $self->{column_prev}
5327     - 1 * ($self->{nc} != -1),
5328     };
5329     redo A;
5330     } else {
5331 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5332 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5333     target => chr $self->{nc},
5334     data => '',
5335     line => $self->{line_prev},
5336     column => $self->{column_prev} - 1,
5337     };
5338     $self->{state} = PI_TARGET_STATE;
5339    
5340     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5341     $self->{line_prev} = $self->{line};
5342     $self->{column_prev} = $self->{column};
5343     $self->{column}++;
5344     $self->{nc}
5345     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5346     } else {
5347     $self->{set_nc}->($self);
5348     }
5349    
5350     redo A;
5351     }
5352     } elsif ($self->{state} == PI_TARGET_STATE) {
5353     if ($is_space->{$self->{nc}}) {
5354     $self->{state} = PI_TARGET_AFTER_STATE;
5355    
5356     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5357     $self->{line_prev} = $self->{line};
5358     $self->{column_prev} = $self->{column};
5359     $self->{column}++;
5360     $self->{nc}
5361     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5362     } else {
5363     $self->{set_nc}->($self);
5364     }
5365    
5366     redo A;
5367     } elsif ($self->{nc} == -1) {
5368     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5369 wakaba 1.13 if ($self->{in_subset}) {
5370     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5371     } else {
5372     $self->{state} = DATA_STATE;
5373     $self->{s_kwd} = '';
5374     }
5375 wakaba 1.8 ## Reconsume.
5376     return ($self->{ct}); # pi
5377     redo A;
5378     } elsif ($self->{nc} == 0x003F) { # ?
5379     $self->{state} = PI_AFTER_STATE;
5380    
5381     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5382     $self->{line_prev} = $self->{line};
5383     $self->{column_prev} = $self->{column};
5384     $self->{column}++;
5385     $self->{nc}
5386     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5387     } else {
5388     $self->{set_nc}->($self);
5389     }
5390    
5391     redo A;
5392     } else {
5393     ## XML5: typo ("tag name" -> "target")
5394     $self->{ct}->{target} .= chr $self->{nc}; # pi
5395    
5396     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5397     $self->{line_prev} = $self->{line};
5398     $self->{column_prev} = $self->{column};
5399     $self->{column}++;
5400     $self->{nc}
5401     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5402     } else {
5403     $self->{set_nc}->($self);
5404     }
5405    
5406     redo A;
5407     }
5408     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5409     if ($is_space->{$self->{nc}}) {
5410     ## Stay in the state.
5411    
5412     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5413     $self->{line_prev} = $self->{line};
5414     $self->{column_prev} = $self->{column};
5415     $self->{column}++;
5416     $self->{nc}
5417     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5418     } else {
5419     $self->{set_nc}->($self);
5420     }
5421    
5422     redo A;
5423     } else {
5424     $self->{state} = PI_DATA_STATE;
5425     ## Reprocess.
5426     redo A;
5427     }
5428     } elsif ($self->{state} == PI_DATA_STATE) {
5429     if ($self->{nc} == 0x003F) { # ?
5430     $self->{state} = PI_DATA_AFTER_STATE;
5431    
5432     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5433     $self->{line_prev} = $self->{line};
5434     $self->{column_prev} = $self->{column};
5435     $self->{column}++;
5436     $self->{nc}
5437     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5438     } else {
5439     $self->{set_nc}->($self);
5440     }
5441    
5442     redo A;
5443     } elsif ($self->{nc} == -1) {
5444     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5445 wakaba 1.13 if ($self->{in_subset}) {
5446 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5447 wakaba 1.13 } else {
5448     $self->{state} = DATA_STATE;
5449     $self->{s_kwd} = '';
5450     }
5451 wakaba 1.8 ## Reprocess.
5452     return ($self->{ct}); # pi
5453     redo A;
5454     } else {
5455     $self->{ct}->{data} .= chr $self->{nc}; # pi
5456     $self->{read_until}->($self->{ct}->{data}, q[?],
5457     length $self->{ct}->{data});
5458     ## Stay in the state.
5459    
5460     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5461     $self->{line_prev} = $self->{line};
5462     $self->{column_prev} = $self->{column};
5463     $self->{column}++;
5464     $self->{nc}
5465     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5466     } else {
5467     $self->{set_nc}->($self);
5468     }
5469    
5470     ## The next input character has already been consumed above; continue in this state.
5471     redo A;
5472     }
5473     } elsif ($self->{state} == PI_AFTER_STATE) {
5474 wakaba 1.14 ## XML5: Part of "Pi after state".
5475    
5476 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5477 wakaba 1.13 if ($self->{in_subset}) {
5478     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5479     } else {
5480     $self->{state} = DATA_STATE;
5481     $self->{s_kwd} = '';
5482     }
5483 wakaba 1.8
5484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5485     $self->{line_prev} = $self->{line};
5486     $self->{column_prev} = $self->{column};
5487     $self->{column}++;
5488     $self->{nc}
5489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5490     } else {
5491     $self->{set_nc}->($self);
5492     }
5493    
5494     return ($self->{ct}); # pi
5495     redo A;
5496     } elsif ($self->{nc} == 0x003F) { # ?
5497     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5498     line => $self->{line_prev},
5499     column => $self->{column_prev}); ## XML5: no error
5500     $self->{ct}->{data} .= '?';
5501     $self->{state} = PI_DATA_AFTER_STATE;
5502    
5503     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5504     $self->{line_prev} = $self->{line};
5505     $self->{column_prev} = $self->{column};
5506     $self->{column}++;
5507     $self->{nc}
5508     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5509     } else {
5510     $self->{set_nc}->($self);
5511     }
5512    
5513     redo A;
5514     } else {
5515     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5516     line => $self->{line_prev},
5517     column => $self->{column_prev}
5518     + 1 * ($self->{nc} == -1)); ## XML5: no error
5519     $self->{ct}->{data} .= '?'; ## XML5: not appended
5520     $self->{state} = PI_DATA_STATE;
5521     ## Reprocess.
5522     redo A;
5523     }
5524     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5525 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5526    
5527 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5528 wakaba 1.13 if ($self->{in_subset}) {
5529     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5530     } else {
5531     $self->{state} = DATA_STATE;
5532     $self->{s_kwd} = '';
5533     }
5534 wakaba 1.8
5535     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5536     $self->{line_prev} = $self->{line};
5537     $self->{column_prev} = $self->{column};
5538     $self->{column}++;
5539     $self->{nc}
5540     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5541     } else {
5542     $self->{set_nc}->($self);
5543     }
5544    
5545     return ($self->{ct}); # pi
5546     redo A;
5547     } elsif ($self->{nc} == 0x003F) { # ?
5548     $self->{ct}->{data} .= '?';
5549     ## Stay in the state.
5550    
5551     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5552     $self->{line_prev} = $self->{line};
5553     $self->{column_prev} = $self->{column};
5554     $self->{column}++;
5555     $self->{nc}
5556     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5557     } else {
5558     $self->{set_nc}->($self);
5559     }
5560    
5561     redo A;
5562     } else {
5563     $self->{ct}->{data} .= '?'; ## XML5: not appended
5564     $self->{state} = PI_DATA_STATE;
5565     ## Reprocess.
5566     redo A;
5567     }
5568 wakaba 1.12
5569     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5570     if ($self->{nc} == 0x003C) { # <
5571 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5572 wakaba 1.12
5573     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5574     $self->{line_prev} = $self->{line};
5575     $self->{column_prev} = $self->{column};
5576     $self->{column}++;
5577     $self->{nc}
5578     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5579     } else {
5580     $self->{set_nc}->($self);
5581     }
5582    
5583     redo A;
5584     } elsif ($self->{nc} == 0x0025) { # %
5585     ## XML5: Not defined yet.
5586    
5587     ## TODO:
5588 wakaba 1.24
5589     if (not $self->{stop_processing} and
5590     not $self->{document}->xml_standalone) {
5591     $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5592     level => $self->{level}->{info});
5593     $self->{stop_processing} = 1;
5594     }
5595    
5596 wakaba 1.12
5597     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5598     $self->{line_prev} = $self->{line};
5599     $self->{column_prev} = $self->{column};
5600     $self->{column}++;
5601     $self->{nc}
5602     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5603     } else {
5604     $self->{set_nc}->($self);
5605     }
5606    
5607     redo A;
5608     } elsif ($self->{nc} == 0x005D) { # ]
5609 wakaba 1.13 delete $self->{in_subset};
5610 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5611    
5612     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5613     $self->{line_prev} = $self->{line};
5614     $self->{column_prev} = $self->{column};
5615     $self->{column}++;
5616     $self->{nc}
5617     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5618     } else {
5619     $self->{set_nc}->($self);
5620     }
5621    
5622     redo A;
5623     } elsif ($is_space->{$self->{nc}}) {
5624     ## Stay in the state.
5625    
5626     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5627     $self->{line_prev} = $self->{line};
5628     $self->{column_prev} = $self->{column};
5629     $self->{column}++;
5630     $self->{nc}
5631     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5632     } else {
5633     $self->{set_nc}->($self);
5634     }
5635    
5636     redo A;
5637     } elsif ($self->{nc} == -1) {
5638     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5639 wakaba 1.13 delete $self->{in_subset};
5640 wakaba 1.12 $self->{state} = DATA_STATE;
5641     $self->{s_kwd} = '';
5642     ## Reconsume.
5643 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5644 wakaba 1.12 redo A;
5645     } else {
5646     unless ($self->{internal_subset_tainted}) {
5647     ## XML5: No parse error.
5648     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5649     $self->{internal_subset_tainted} = 1;
5650     }
5651     ## Stay in the state.
5652    
5653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5654     $self->{line_prev} = $self->{line};
5655     $self->{column_prev} = $self->{column};
5656     $self->{column}++;
5657     $self->{nc}
5658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5659     } else {
5660     $self->{set_nc}->($self);
5661     }
5662    
5663     redo A;
5664     }
5665     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5666     if ($self->{nc} == 0x003E) { # >
5667     $self->{state} = DATA_STATE;
5668     $self->{s_kwd} = '';
5669    
5670     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5671     $self->{line_prev} = $self->{line};
5672     $self->{column_prev} = $self->{column};
5673     $self->{column}++;
5674     $self->{nc}
5675     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5676     } else {
5677     $self->{set_nc}->($self);
5678     }
5679    
5680 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5681 wakaba 1.12 redo A;
5682     } elsif ($self->{nc} == -1) {
5683     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5684     $self->{state} = DATA_STATE;
5685     $self->{s_kwd} = '';
5686     ## Reconsume.
5687 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5688 wakaba 1.12 redo A;
5689     } else {
5690     ## XML5: No parse error and stay in the state.
5691     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5692    
5693 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5694    
5695     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5696     $self->{line_prev} = $self->{line};
5697     $self->{column_prev} = $self->{column};
5698     $self->{column}++;
5699     $self->{nc}
5700     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5701     } else {
5702     $self->{set_nc}->($self);
5703     }
5704    
5705     redo A;
5706     }
5707     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5708     if ($self->{nc} == 0x003E) { # >
5709     $self->{state} = DATA_STATE;
5710     $self->{s_kwd} = '';
5711    
5712     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5713     $self->{line_prev} = $self->{line};
5714     $self->{column_prev} = $self->{column};
5715     $self->{column}++;
5716     $self->{nc}
5717     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5718     } else {
5719     $self->{set_nc}->($self);
5720     }
5721    
5722     return ({type => END_OF_DOCTYPE_TOKEN});
5723     redo A;
5724     } elsif ($self->{nc} == -1) {
5725     $self->{state} = DATA_STATE;
5726     $self->{s_kwd} = '';
5727     ## Reconsume.
5728     return ({type => END_OF_DOCTYPE_TOKEN});
5729     redo A;
5730     } else {
5731     ## Stay in the state.
5732    
5733     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5734     $self->{line_prev} = $self->{line};
5735     $self->{column_prev} = $self->{column};
5736     $self->{column}++;
5737     $self->{nc}
5738     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5739     } else {
5740     $self->{set_nc}->($self);
5741     }
5742    
5743     redo A;
5744     }
5745     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5746     if ($self->{nc} == 0x0021) { # !
5747 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5748 wakaba 1.13
5749     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5750     $self->{line_prev} = $self->{line};
5751     $self->{column_prev} = $self->{column};
5752     $self->{column}++;
5753     $self->{nc}
5754     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5755     } else {
5756     $self->{set_nc}->($self);
5757     }
5758    
5759     redo A;
5760     } elsif ($self->{nc} == 0x003F) { # ?
5761     $self->{state} = PI_STATE;
5762    
5763     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5764     $self->{line_prev} = $self->{line};
5765     $self->{column_prev} = $self->{column};
5766     $self->{column}++;
5767     $self->{nc}
5768     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5769     } else {
5770     $self->{set_nc}->($self);
5771     }
5772    
5773     redo A;
5774     } elsif ($self->{nc} == -1) {
5775     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5776     $self->{state} = DATA_STATE;
5777     $self->{s_kwd} = '';
5778     ## Reconsume.
5779     redo A;
5780     } else {
5781     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5782     line => $self->{line_prev},
5783     column => $self->{column_prev});
5784     $self->{state} = BOGUS_COMMENT_STATE;
5785     $self->{ct} = {type => COMMENT_TOKEN,
5786     data => '',
5787     }; ## NOTE: Will be discarded.
5788 wakaba 1.12
5789     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5790     $self->{line_prev} = $self->{line};
5791     $self->{column_prev} = $self->{column};
5792     $self->{column}++;
5793     $self->{nc}
5794     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5795     } else {
5796     $self->{set_nc}->($self);
5797     }
5798    
5799     redo A;
5800     }
5801 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5802     ## XML5: "DOCTYPE markup declaration state".
5803    
5804     if ($self->{nc} == 0x002D) { # -
5805     $self->{state} = MD_HYPHEN_STATE;
5806    
5807     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5808     $self->{line_prev} = $self->{line};
5809     $self->{column_prev} = $self->{column};
5810     $self->{column}++;
5811     $self->{nc}
5812     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5813     } else {
5814     $self->{set_nc}->($self);
5815     }
5816    
5817     redo A;
5818 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5819     $self->{nc} == 0x0065) { # e
5820 wakaba 1.14 $self->{state} = MD_E_STATE;
5821     $self->{kwd} = chr $self->{nc};
5822    
5823     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5824     $self->{line_prev} = $self->{line};
5825     $self->{column_prev} = $self->{column};
5826     $self->{column}++;
5827     $self->{nc}
5828     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5829     } else {
5830     $self->{set_nc}->($self);
5831     }
5832    
5833     redo A;
5834 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5835     $self->{nc} == 0x0061) { # a
5836 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5837     $self->{kwd} = chr $self->{nc};
5838    
5839     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5840     $self->{line_prev} = $self->{line};
5841     $self->{column_prev} = $self->{column};
5842     $self->{column}++;
5843     $self->{nc}
5844     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5845     } else {
5846     $self->{set_nc}->($self);
5847     }
5848    
5849     redo A;
5850 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5851     $self->{nc} == 0x006E) { # n
5852 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5853     $self->{kwd} = chr $self->{nc};
5854    
5855     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5856     $self->{line_prev} = $self->{line};
5857     $self->{column_prev} = $self->{column};
5858     $self->{column}++;
5859     $self->{nc}
5860     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5861     } else {
5862     $self->{set_nc}->($self);
5863     }
5864    
5865     redo A;
5866     } else {
5867     #
5868     }
5869    
5870     ## XML5: No parse error.
5871     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5872     line => $self->{line_prev},
5873     column => $self->{column_prev} - 1);
5874     ## Reconsume.
5875     $self->{state} = BOGUS_COMMENT_STATE;
5876     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5877     redo A;
5878     } elsif ($self->{state} == MD_E_STATE) {
5879 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5880     $self->{nc} == 0x006E) { # n
5881 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5882     $self->{kwd} .= chr $self->{nc};
5883    
5884     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5885     $self->{line_prev} = $self->{line};
5886     $self->{column_prev} = $self->{column};
5887     $self->{column}++;
5888     $self->{nc}
5889     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5890     } else {
5891     $self->{set_nc}->($self);
5892     }
5893    
5894     redo A;
5895 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5896     $self->{nc} == 0x006C) { # l
5897 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5898     $self->{state} = MD_ELEMENT_STATE;
5899     $self->{kwd} .= chr $self->{nc};
5900    
5901     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5902     $self->{line_prev} = $self->{line};
5903     $self->{column_prev} = $self->{column};
5904     $self->{column}++;
5905     $self->{nc}
5906     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5907     } else {
5908     $self->{set_nc}->($self);
5909     }
5910    
5911     redo A;
5912     } else {
5913     ## XML5: No parse error.
5914     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5915     line => $self->{line_prev},
5916     column => $self->{column_prev} - 2
5917     + 1 * ($self->{nc} == -1));
5918     ## Reconsume.
5919     $self->{state} = BOGUS_COMMENT_STATE;
5920     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5921     redo A;
5922     }
5923     } elsif ($self->{state} == MD_ENTITY_STATE) {
5924 wakaba 1.17 if ($self->{nc} == [
5925     undef,
5926     undef,
5927     0x0054, # T
5928     0x0049, # I
5929     0x0054, # T
5930     ]->[length $self->{kwd}] or
5931     $self->{nc} == [
5932     undef,
5933     undef,
5934     0x0074, # t
5935     0x0069, # i
5936     0x0074, # t
5937     ]->[length $self->{kwd}]) {
5938 wakaba 1.14 ## Stay in the state.
5939     $self->{kwd} .= chr $self->{nc};
5940    
5941     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5942     $self->{line_prev} = $self->{line};
5943     $self->{column_prev} = $self->{column};
5944     $self->{column}++;
5945     $self->{nc}
5946     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5947     } else {
5948     $self->{set_nc}->($self);
5949     }
5950    
5951     redo A;
5952 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5953     ($self->{nc} == 0x0059 or # Y
5954     $self->{nc} == 0x0079)) { # y
5955     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5956     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5957     text => 'ENTITY',
5958     line => $self->{line_prev},
5959     column => $self->{column_prev} - 4);
5960     }
5961     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5962 wakaba 1.14 line => $self->{line_prev},
5963     column => $self->{column_prev} - 6};
5964     $self->{state} = DOCTYPE_MD_STATE;
5965    
5966     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5967     $self->{line_prev} = $self->{line};
5968     $self->{column_prev} = $self->{column};
5969     $self->{column}++;
5970     $self->{nc}
5971     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5972     } else {
5973     $self->{set_nc}->($self);
5974     }
5975    
5976     redo A;
5977     } else {
5978     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5979     line => $self->{line_prev},
5980     column => $self->{column_prev} - 1
5981     - (length $self->{kwd})
5982     + 1 * ($self->{nc} == -1));
5983     $self->{state} = BOGUS_COMMENT_STATE;
5984     ## Reconsume.
5985     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5986     redo A;
5987     }
5988     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5989 wakaba 1.17 if ($self->{nc} == [
5990     undef,
5991     undef,
5992     0x0045, # E
5993     0x004D, # M
5994     0x0045, # E
5995     0x004E, # N
5996     ]->[length $self->{kwd}] or
5997     $self->{nc} == [
5998     undef,
5999     undef,
6000     0x0065, # e
6001     0x006D, # m
6002     0x0065, # e
6003     0x006E, # n
6004     ]->[length $self->{kwd}]) {
6005 wakaba 1.14 ## Stay in the state.
6006     $self->{kwd} .= chr $self->{nc};
6007    
6008     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6009     $self->{line_prev} = $self->{line};
6010     $self->{column_prev} = $self->{column};
6011     $self->{column}++;
6012     $self->{nc}
6013     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6014     } else {
6015     $self->{set_nc}->($self);
6016     }
6017    
6018     redo A;
6019 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6020     ($self->{nc} == 0x0054 or # T
6021     $self->{nc} == 0x0074)) { # t
6022     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
6023     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6024     text => 'ELEMENT',
6025     line => $self->{line_prev},
6026     column => $self->{column_prev} - 5);
6027     }
6028 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6029     line => $self->{line_prev},
6030 wakaba 1.23 column => $self->{column_prev} - 7};
6031 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6032    
6033     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6034     $self->{line_prev} = $self->{line};
6035     $self->{column_prev} = $self->{column};
6036     $self->{column}++;
6037     $self->{nc}
6038     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6039     } else {
6040     $self->{set_nc}->($self);
6041     }
6042    
6043     redo A;
6044     } else {
6045     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6046     line => $self->{line_prev},
6047     column => $self->{column_prev} - 1
6048     - (length $self->{kwd})
6049     + 1 * ($self->{nc} == -1));
6050     $self->{state} = BOGUS_COMMENT_STATE;
6051     ## Reconsume.
6052     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6053     redo A;
6054     }
6055     } elsif ($self->{state} == MD_ATTLIST_STATE) {
6056 wakaba 1.17 if ($self->{nc} == [
6057     undef,
6058     0x0054, # T
6059     0x0054, # T
6060     0x004C, # L
6061     0x0049, # I
6062     0x0053, # S
6063     ]->[length $self->{kwd}] or
6064     $self->{nc} == [
6065     undef,
6066     0x0074, # t
6067     0x0074, # t
6068     0x006C, # l
6069     0x0069, # i
6070     0x0073, # s
6071     ]->[length $self->{kwd}]) {
6072 wakaba 1.14 ## Stay in the state.
6073     $self->{kwd} .= chr $self->{nc};
6074    
6075     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6076     $self->{line_prev} = $self->{line};
6077     $self->{column_prev} = $self->{column};
6078     $self->{column}++;
6079     $self->{nc}
6080     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6081     } else {
6082     $self->{set_nc}->($self);
6083     }
6084    
6085     redo A;
6086 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
6087     ($self->{nc} == 0x0054 or # T
6088     $self->{nc} == 0x0074)) { # t
6089     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6090     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6091     text => 'ATTLIST',
6092     line => $self->{line_prev},
6093     column => $self->{column_prev} - 5);
6094     }
6095 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6096 wakaba 1.15 attrdefs => [],
6097 wakaba 1.14 line => $self->{line_prev},
6098 wakaba 1.23 column => $self->{column_prev} - 7};
6099 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6100    
6101     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6102     $self->{line_prev} = $self->{line};
6103     $self->{column_prev} = $self->{column};
6104     $self->{column}++;
6105     $self->{nc}
6106     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6107     } else {
6108     $self->{set_nc}->($self);
6109     }
6110    
6111     redo A;
6112     } else {
6113     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6114     line => $self->{line_prev},
6115     column => $self->{column_prev} - 1
6116     - (length $self->{kwd})
6117     + 1 * ($self->{nc} == -1));
6118     $self->{state} = BOGUS_COMMENT_STATE;
6119     ## Reconsume.
6120     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6121     redo A;
6122     }
6123     } elsif ($self->{state} == MD_NOTATION_STATE) {
6124 wakaba 1.17 if ($self->{nc} == [
6125     undef,
6126     0x004F, # O
6127     0x0054, # T
6128     0x0041, # A
6129     0x0054, # T
6130     0x0049, # I
6131     0x004F, # O
6132     ]->[length $self->{kwd}] or
6133     $self->{nc} == [
6134     undef,
6135     0x006F, # o
6136     0x0074, # t
6137     0x0061, # a
6138     0x0074, # t
6139     0x0069, # i
6140     0x006F, # o
6141     ]->[length $self->{kwd}]) {
6142 wakaba 1.14 ## Stay in the state.
6143     $self->{kwd} .= chr $self->{nc};
6144    
6145     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6146     $self->{line_prev} = $self->{line};
6147     $self->{column_prev} = $self->{column};
6148     $self->{column}++;
6149     $self->{nc}
6150     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6151     } else {
6152     $self->{set_nc}->($self);
6153     }
6154    
6155     redo A;
6156 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
6157     ($self->{nc} == 0x004E or # N
6158     $self->{nc} == 0x006E)) { # n
6159     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6160     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6161     text => 'NOTATION',
6162     line => $self->{line_prev},
6163     column => $self->{column_prev} - 6);
6164     }
6165 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6166     line => $self->{line_prev},
6167 wakaba 1.23 column => $self->{column_prev} - 8};
6168 wakaba 1.14 $self->{state} = DOCTYPE_MD_STATE;
6169    
6170     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6171     $self->{line_prev} = $self->{line};
6172     $self->{column_prev} = $self->{column};
6173     $self->{column}++;
6174     $self->{nc}
6175     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6176     } else {
6177     $self->{set_nc}->($self);
6178     }
6179    
6180     redo A;
6181     } else {
6182     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6183     line => $self->{line_prev},
6184     column => $self->{column_prev} - 1
6185     - (length $self->{kwd})
6186     + 1 * ($self->{nc} == -1));
6187     $self->{state} = BOGUS_COMMENT_STATE;
6188     ## Reconsume.
6189     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6190     redo A;
6191     }
6192     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6193     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6194     ## "DOCTYPE NOTATION state".
6195    
6196     if ($is_space->{$self->{nc}}) {
6197     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6198     $self->{state} = BEFORE_MD_NAME_STATE;
6199    
6200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6201     $self->{line_prev} = $self->{line};
6202     $self->{column_prev} = $self->{column};
6203     $self->{column}++;
6204     $self->{nc}
6205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6206     } else {
6207     $self->{set_nc}->($self);
6208     }
6209    
6210     redo A;
6211     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6212     $self->{nc} == 0x0025) { # %
6213     ## XML5: Switch to the "DOCTYPE bogus comment state".
6214     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6215     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6216    
6217     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6218     $self->{line_prev} = $self->{line};
6219     $self->{column_prev} = $self->{column};
6220     $self->{column}++;
6221     $self->{nc}
6222     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6223     } else {
6224     $self->{set_nc}->($self);
6225     }
6226    
6227     redo A;
6228     } elsif ($self->{nc} == -1) {
6229     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6230     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6231     ## Reconsume.
6232     redo A;
6233     } elsif ($self->{nc} == 0x003E) { # >
6234     ## XML5: Switch to the "DOCTYPE bogus comment state".
6235     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6236     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6237    
6238     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6239     $self->{line_prev} = $self->{line};
6240     $self->{column_prev} = $self->{column};
6241     $self->{column}++;
6242     $self->{nc}
6243     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6244     } else {
6245     $self->{set_nc}->($self);
6246     }
6247    
6248     redo A;
6249     } else {
6250     ## XML5: Switch to the "DOCTYPE bogus comment state".
6251     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6252     $self->{state} = BEFORE_MD_NAME_STATE;
6253     redo A;
6254     }
6255     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6256     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6257     ## before state", "DOCTYPE ATTLIST name before state".
6258    
6259     if ($is_space->{$self->{nc}}) {
6260     ## Stay in the state.
6261    
6262     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6263     $self->{line_prev} = $self->{line};
6264     $self->{column_prev} = $self->{column};
6265     $self->{column}++;
6266     $self->{nc}
6267     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6268     } else {
6269     $self->{set_nc}->($self);
6270     }
6271    
6272     redo A;
6273     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6274     $self->{nc} == 0x0025) { # %
6275     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6276    
6277     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6278     $self->{line_prev} = $self->{line};
6279     $self->{column_prev} = $self->{column};
6280     $self->{column}++;
6281     $self->{nc}
6282     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6283     } else {
6284     $self->{set_nc}->($self);
6285     }
6286    
6287     redo A;
6288     } elsif ($self->{nc} == 0x003E) { # >
6289     ## XML5: Same as "Anything else".
6290     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6291     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6292    
6293     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6294     $self->{line_prev} = $self->{line};
6295     $self->{column_prev} = $self->{column};
6296     $self->{column}++;
6297     $self->{nc}
6298     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6299     } else {
6300     $self->{set_nc}->($self);
6301     }
6302    
6303     redo A;
6304     } elsif ($self->{nc} == -1) {
6305     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6306     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6307     ## Reconsume.
6308     redo A;
6309     } else {
6310     ## XML5: [ATTLIST] Not defined yet.
6311     $self->{ct}->{name} .= chr $self->{nc};
6312     $self->{state} = MD_NAME_STATE;
6313    
6314     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6315     $self->{line_prev} = $self->{line};
6316     $self->{column_prev} = $self->{column};
6317     $self->{column}++;
6318     $self->{nc}
6319     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6320     } else {
6321     $self->{set_nc}->($self);
6322     }
6323    
6324     redo A;
6325     }
6326     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6327     if ($is_space->{$self->{nc}}) {
6328     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6329     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6330     $self->{state} = BEFORE_MD_NAME_STATE;
6331 wakaba 1.8
6332 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6333     $self->{line_prev} = $self->{line};
6334     $self->{column_prev} = $self->{column};
6335     $self->{column}++;
6336     $self->{nc}
6337     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6338     } else {
6339     $self->{set_nc}->($self);
6340     }
6341    
6342     redo A;
6343     } elsif ($self->{nc} == 0x003E) { # >
6344     ## XML5: Same as "Anything else".
6345     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6346     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6347    
6348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6349     $self->{line_prev} = $self->{line};
6350     $self->{column_prev} = $self->{column};
6351     $self->{column}++;
6352     $self->{nc}
6353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6354     } else {
6355     $self->{set_nc}->($self);
6356     }
6357    
6358     redo A;
6359     } elsif ($self->{nc} == -1) {
6360     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6361     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6362     ## Reconsume.
6363     redo A;
6364     } else {
6365     ## XML5: No parse error.
6366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6367     $self->{state} = BOGUS_COMMENT_STATE;
6368     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6369     ## Reconsume.
6370     redo A;
6371     }
6372     } elsif ($self->{state} == MD_NAME_STATE) {
6373     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6374    
6375     if ($is_space->{$self->{nc}}) {
6376 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6377     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6378     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6379 wakaba 1.20 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6380 wakaba 1.16 } else { # ENTITY/NOTATION
6381     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6382     }
6383 wakaba 1.14
6384     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6385     $self->{line_prev} = $self->{line};
6386     $self->{column_prev} = $self->{column};
6387     $self->{column}++;
6388     $self->{nc}
6389     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6390     } else {
6391     $self->{set_nc}->($self);
6392     }
6393    
6394     redo A;
6395     } elsif ($self->{nc} == 0x003E) { # >
6396     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6397     #
6398     } else {
6399 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6400 wakaba 1.14 }
6401     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6402    
6403     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6404     $self->{line_prev} = $self->{line};
6405     $self->{column_prev} = $self->{column};
6406     $self->{column}++;
6407     $self->{nc}
6408     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6409     } else {
6410     $self->{set_nc}->($self);
6411     }
6412    
6413     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6414     redo A;
6415     } elsif ($self->{nc} == -1) {
6416     ## XML5: [ATTLIST] No parse error.
6417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6418     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6419     ## Reconsume.
6420     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6421     redo A;
6422     } else {
6423     ## XML5: [ATTLIST] Not defined yet.
6424     $self->{ct}->{name} .= chr $self->{nc};
6425     ## Stay in the state.
6426    
6427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6428     $self->{line_prev} = $self->{line};
6429     $self->{column_prev} = $self->{column};
6430     $self->{column}++;
6431     $self->{nc}
6432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6433     } else {
6434     $self->{set_nc}->($self);
6435     }
6436    
6437     redo A;
6438     }
6439     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6440     if ($is_space->{$self->{nc}}) {
6441     ## Stay in the state.
6442    
6443     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6444     $self->{line_prev} = $self->{line};
6445     $self->{column_prev} = $self->{column};
6446     $self->{column}++;
6447     $self->{nc}
6448     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6449     } else {
6450     $self->{set_nc}->($self);
6451     }
6452    
6453     redo A;
6454     } elsif ($self->{nc} == 0x003E) { # >
6455     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6456    
6457     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6458     $self->{line_prev} = $self->{line};
6459     $self->{column_prev} = $self->{column};
6460     $self->{column}++;
6461     $self->{nc}
6462     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6463     } else {
6464     $self->{set_nc}->($self);
6465     }
6466    
6467     return ($self->{ct}); # ATTLIST
6468     redo A;
6469     } elsif ($self->{nc} == -1) {
6470     ## XML5: No parse error.
6471     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6472     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6473 wakaba 1.15 return ($self->{ct});
6474 wakaba 1.14 redo A;
6475     } else {
6476     ## XML5: Not defined yet.
6477 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6478     tokens => [],
6479     line => $self->{line}, column => $self->{column}};
6480     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6481    
6482     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6483     $self->{line_prev} = $self->{line};
6484     $self->{column_prev} = $self->{column};
6485     $self->{column}++;
6486     $self->{nc}
6487     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6488     } else {
6489     $self->{set_nc}->($self);
6490     }
6491    
6492     redo A;
6493     }
6494     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6495     if ($is_space->{$self->{nc}}) {
6496     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6497    
6498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6499     $self->{line_prev} = $self->{line};
6500     $self->{column_prev} = $self->{column};
6501     $self->{column}++;
6502     $self->{nc}
6503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6504     } else {
6505     $self->{set_nc}->($self);
6506     }
6507    
6508     redo A;
6509     } elsif ($self->{nc} == 0x003E) { # >
6510     ## XML5: Same as "anything else".
6511     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6512     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6513    
6514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6515     $self->{line_prev} = $self->{line};
6516     $self->{column_prev} = $self->{column};
6517     $self->{column}++;
6518     $self->{nc}
6519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6520     } else {
6521     $self->{set_nc}->($self);
6522     }
6523    
6524     return ($self->{ct}); # ATTLIST
6525     redo A;
6526     } elsif ($self->{nc} == 0x0028) { # (
6527     ## XML5: Same as "anything else".
6528     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6529     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6530    
6531     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6532     $self->{line_prev} = $self->{line};
6533     $self->{column_prev} = $self->{column};
6534     $self->{column}++;
6535     $self->{nc}
6536     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6537     } else {
6538     $self->{set_nc}->($self);
6539     }
6540    
6541     redo A;
6542     } elsif ($self->{nc} == -1) {
6543     ## XML5: No parse error.
6544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6545     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6546    
6547     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6548     $self->{line_prev} = $self->{line};
6549     $self->{column_prev} = $self->{column};
6550     $self->{column}++;
6551     $self->{nc}
6552     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6553     } else {
6554     $self->{set_nc}->($self);
6555     }
6556    
6557     return ($self->{ct}); # ATTLIST
6558     redo A;
6559     } else {
6560     ## XML5: Not defined yet.
6561     $self->{ca}->{name} .= chr $self->{nc};
6562     ## Stay in the state.
6563    
6564     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6565     $self->{line_prev} = $self->{line};
6566     $self->{column_prev} = $self->{column};
6567     $self->{column}++;
6568     $self->{nc}
6569     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6570     } else {
6571     $self->{set_nc}->($self);
6572     }
6573    
6574 wakaba 1.14 redo A;
6575     }
6576 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6577     if ($is_space->{$self->{nc}}) {
6578     ## Stay in the state.
6579    
6580     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6581     $self->{line_prev} = $self->{line};
6582     $self->{column_prev} = $self->{column};
6583     $self->{column}++;
6584     $self->{nc}
6585     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6586     } else {
6587     $self->{set_nc}->($self);
6588     }
6589    
6590     redo A;
6591     } elsif ($self->{nc} == 0x003E) { # >
6592     ## XML5: Same as "anything else".
6593     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6594     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6595    
6596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6597     $self->{line_prev} = $self->{line};
6598     $self->{column_prev} = $self->{column};
6599     $self->{column}++;
6600     $self->{nc}
6601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6602     } else {
6603     $self->{set_nc}->($self);
6604     }
6605    
6606     return ($self->{ct}); # ATTLIST
6607     redo A;
6608     } elsif ($self->{nc} == 0x0028) { # (
6609     ## XML5: Same as "anything else".
6610     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6611    
6612     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6613     $self->{line_prev} = $self->{line};
6614     $self->{column_prev} = $self->{column};
6615     $self->{column}++;
6616     $self->{nc}
6617     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6618     } else {
6619     $self->{set_nc}->($self);
6620     }
6621    
6622     redo A;
6623     } elsif ($self->{nc} == -1) {
6624     ## XML5: No parse error.
6625     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6626     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6627    
6628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6629     $self->{line_prev} = $self->{line};
6630     $self->{column_prev} = $self->{column};
6631     $self->{column}++;
6632     $self->{nc}
6633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6634     } else {
6635     $self->{set_nc}->($self);
6636     }
6637    
6638     return ($self->{ct});
6639     redo A;
6640     } else {
6641     ## XML5: Not defined yet.
6642     $self->{ca}->{type} = chr $self->{nc};
6643     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6644    
6645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6646     $self->{line_prev} = $self->{line};
6647     $self->{column_prev} = $self->{column};
6648     $self->{column}++;
6649     $self->{nc}
6650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6651     } else {
6652     $self->{set_nc}->($self);
6653     }
6654    
6655     redo A;
6656     }
6657     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6658     if ($is_space->{$self->{nc}}) {
6659     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6660    
6661     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6662     $self->{line_prev} = $self->{line};
6663     $self->{column_prev} = $self->{column};
6664     $self->{column}++;
6665     $self->{nc}
6666     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6667     } else {
6668     $self->{set_nc}->($self);
6669     }
6670    
6671     redo A;
6672     } elsif ($self->{nc} == 0x0023) { # #
6673     ## XML5: Same as "anything else".
6674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6675     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6676    
6677     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6678     $self->{line_prev} = $self->{line};
6679     $self->{column_prev} = $self->{column};
6680     $self->{column}++;
6681     $self->{nc}
6682     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6683     } else {
6684     $self->{set_nc}->($self);
6685     }
6686    
6687     redo A;
6688     } elsif ($self->{nc} == 0x0022) { # "
6689     ## XML5: Same as "anything else".
6690     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6691     $self->{ca}->{value} = '';
6692     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6693    
6694     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6695     $self->{line_prev} = $self->{line};
6696     $self->{column_prev} = $self->{column};
6697     $self->{column}++;
6698     $self->{nc}
6699     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6700     } else {
6701     $self->{set_nc}->($self);
6702     }
6703    
6704     redo A;
6705     } elsif ($self->{nc} == 0x0027) { # '
6706     ## XML5: Same as "anything else".
6707     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6708     $self->{ca}->{value} = '';
6709     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6710    
6711     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6712     $self->{line_prev} = $self->{line};
6713     $self->{column_prev} = $self->{column};
6714     $self->{column}++;
6715     $self->{nc}
6716     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6717     } else {
6718     $self->{set_nc}->($self);
6719     }
6720    
6721     redo A;
6722     } elsif ($self->{nc} == 0x003E) { # >
6723     ## XML5: Same as "anything else".
6724     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6725     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6726    
6727     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6728     $self->{line_prev} = $self->{line};
6729     $self->{column_prev} = $self->{column};
6730     $self->{column}++;
6731     $self->{nc}
6732     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6733     } else {
6734     $self->{set_nc}->($self);
6735     }
6736    
6737     return ($self->{ct}); # ATTLIST
6738     redo A;
6739     } elsif ($self->{nc} == 0x0028) { # (
6740     ## XML5: Same as "anything else".
6741     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6742     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6743    
6744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6745     $self->{line_prev} = $self->{line};
6746     $self->{column_prev} = $self->{column};
6747     $self->{column}++;
6748     $self->{nc}
6749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6750     } else {
6751     $self->{set_nc}->($self);
6752     }
6753    
6754     redo A;
6755     } elsif ($self->{nc} == -1) {
6756     ## XML5: No parse error.
6757     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6758     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6759    
6760     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6761     $self->{line_prev} = $self->{line};
6762     $self->{column_prev} = $self->{column};
6763     $self->{column}++;
6764     $self->{nc}
6765     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6766     } else {
6767     $self->{set_nc}->($self);
6768     }
6769    
6770     return ($self->{ct});
6771     redo A;
6772     } else {
6773     ## XML5: Not defined yet.
6774     $self->{ca}->{type} .= chr $self->{nc};
6775     ## Stay in the state.
6776    
6777     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6778     $self->{line_prev} = $self->{line};
6779     $self->{column_prev} = $self->{column};
6780     $self->{column}++;
6781     $self->{nc}
6782     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6783     } else {
6784     $self->{set_nc}->($self);
6785     }
6786    
6787     redo A;
6788     }
6789     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6790     if ($is_space->{$self->{nc}}) {
6791     ## Stay in the state.
6792    
6793     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6794     $self->{line_prev} = $self->{line};
6795     $self->{column_prev} = $self->{column};
6796     $self->{column}++;
6797     $self->{nc}
6798     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6799     } else {
6800     $self->{set_nc}->($self);
6801     }
6802    
6803     redo A;
6804     } elsif ($self->{nc} == 0x0028) { # (
6805     ## XML5: Same as "anything else".
6806     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6807    
6808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6809     $self->{line_prev} = $self->{line};
6810     $self->{column_prev} = $self->{column};
6811     $self->{column}++;
6812     $self->{nc}
6813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6814     } else {
6815     $self->{set_nc}->($self);
6816     }
6817    
6818     redo A;
6819     } elsif ($self->{nc} == 0x0023) { # #
6820     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6821    
6822     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6823     $self->{line_prev} = $self->{line};
6824     $self->{column_prev} = $self->{column};
6825     $self->{column}++;
6826     $self->{nc}
6827     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6828     } else {
6829     $self->{set_nc}->($self);
6830     }
6831    
6832     redo A;
6833     } elsif ($self->{nc} == 0x0022) { # "
6834     ## XML5: Same as "anything else".
6835     $self->{ca}->{value} = '';
6836     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6837    
6838     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6839     $self->{line_prev} = $self->{line};
6840     $self->{column_prev} = $self->{column};
6841     $self->{column}++;
6842     $self->{nc}
6843     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6844     } else {
6845     $self->{set_nc}->($self);
6846     }
6847    
6848     redo A;
6849     } elsif ($self->{nc} == 0x0027) { # '
6850     ## XML5: Same as "anything else".
6851     $self->{ca}->{value} = '';
6852     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6853    
6854     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6855     $self->{line_prev} = $self->{line};
6856     $self->{column_prev} = $self->{column};
6857     $self->{column}++;
6858     $self->{nc}
6859     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6860     } else {
6861     $self->{set_nc}->($self);
6862     }
6863    
6864     redo A;
6865     } elsif ($self->{nc} == 0x003E) { # >
6866     ## XML5: Same as "anything else".
6867     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6868     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6869    
6870     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6871     $self->{line_prev} = $self->{line};
6872     $self->{column_prev} = $self->{column};
6873     $self->{column}++;
6874     $self->{nc}
6875     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6876     } else {
6877     $self->{set_nc}->($self);
6878     }
6879    
6880     return ($self->{ct}); # ATTLIST
6881     redo A;
6882     } elsif ($self->{nc} == -1) {
6883     ## XML5: No parse error.
6884     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6885     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6886    
6887     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6888     $self->{line_prev} = $self->{line};
6889     $self->{column_prev} = $self->{column};
6890     $self->{column}++;
6891     $self->{nc}
6892     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6893     } else {
6894     $self->{set_nc}->($self);
6895     }
6896    
6897     return ($self->{ct});
6898     redo A;
6899     } else {
6900     ## XML5: Switch to the "DOCTYPE bogus comment state".
6901     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6902     $self->{ca}->{value} = '';
6903     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6904     ## Reconsume.
6905     redo A;
6906     }
6907     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6908     if ($is_space->{$self->{nc}}) {
6909     ## Stay in the state.
6910    
6911     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6912     $self->{line_prev} = $self->{line};
6913     $self->{column_prev} = $self->{column};
6914     $self->{column}++;
6915     $self->{nc}
6916     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6917     } else {
6918     $self->{set_nc}->($self);
6919     }
6920    
6921     redo A;
6922     } elsif ($self->{nc} == 0x007C) { # |
6923     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6924     ## Stay in the state.
6925    
6926     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6927     $self->{line_prev} = $self->{line};
6928     $self->{column_prev} = $self->{column};
6929     $self->{column}++;
6930     $self->{nc}
6931     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6932     } else {
6933     $self->{set_nc}->($self);
6934     }
6935    
6936     redo A;
6937     } elsif ($self->{nc} == 0x0029) { # )
6938     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6939     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6940    
6941     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6942     $self->{line_prev} = $self->{line};
6943     $self->{column_prev} = $self->{column};
6944     $self->{column}++;
6945     $self->{nc}
6946     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6947     } else {
6948     $self->{set_nc}->($self);
6949     }
6950    
6951     redo A;
6952     } elsif ($self->{nc} == 0x003E) { # >
6953     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6954     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6955    
6956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6957     $self->{line_prev} = $self->{line};
6958     $self->{column_prev} = $self->{column};
6959     $self->{column}++;
6960     $self->{nc}
6961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6962     } else {
6963     $self->{set_nc}->($self);
6964     }
6965    
6966     return ($self->{ct}); # ATTLIST
6967     redo A;
6968     } elsif ($self->{nc} == -1) {
6969     ## XML5: No parse error.
6970     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6971     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6972    
6973     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6974     $self->{line_prev} = $self->{line};
6975     $self->{column_prev} = $self->{column};
6976     $self->{column}++;
6977     $self->{nc}
6978     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6979     } else {
6980     $self->{set_nc}->($self);
6981     }
6982    
6983     return ($self->{ct});
6984     redo A;
6985     } else {
6986     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6987     $self->{state} = ALLOWED_TOKEN_STATE;
6988    
6989     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6990     $self->{line_prev} = $self->{line};
6991     $self->{column_prev} = $self->{column};
6992     $self->{column}++;
6993     $self->{nc}
6994     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6995     } else {
6996     $self->{set_nc}->($self);
6997     }
6998    
6999     redo A;
7000     }
7001     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
7002     if ($is_space->{$self->{nc}}) {
7003     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7004    
7005     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7006     $self->{line_prev} = $self->{line};
7007     $self->{column_prev} = $self->{column};
7008     $self->{column}++;
7009     $self->{nc}
7010     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7011     } else {
7012     $self->{set_nc}->($self);
7013     }
7014    
7015     redo A;
7016     } elsif ($self->{nc} == 0x007C) { # |
7017     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7018    
7019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7020     $self->{line_prev} = $self->{line};
7021     $self->{column_prev} = $self->{column};
7022     $self->{column}++;
7023     $self->{nc}
7024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7025     } else {
7026     $self->{set_nc}->($self);
7027     }
7028    
7029     redo A;
7030     } elsif ($self->{nc} == 0x0029) { # )
7031     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7032    
7033     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7034     $self->{line_prev} = $self->{line};
7035     $self->{column_prev} = $self->{column};
7036     $self->{column}++;
7037     $self->{nc}
7038     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7039     } else {
7040     $self->{set_nc}->($self);
7041     }
7042    
7043     redo A;
7044     } elsif ($self->{nc} == 0x003E) { # >
7045     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7046     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7047    
7048     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7049     $self->{line_prev} = $self->{line};
7050     $self->{column_prev} = $self->{column};
7051     $self->{column}++;
7052     $self->{nc}
7053     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7054     } else {
7055     $self->{set_nc}->($self);
7056     }
7057    
7058     return ($self->{ct}); # ATTLIST
7059     redo A;
7060     } elsif ($self->{nc} == -1) {
7061     ## XML5: No parse error.
7062     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7063     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7064    
7065     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7066     $self->{line_prev} = $self->{line};
7067     $self->{column_prev} = $self->{column};
7068     $self->{column}++;
7069     $self->{nc}
7070     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7071     } else {
7072     $self->{set_nc}->($self);
7073     }
7074    
7075     return ($self->{ct});
7076     redo A;
7077     } else {
7078     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7079     ## Stay in the state.
7080    
7081     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7082     $self->{line_prev} = $self->{line};
7083     $self->{column_prev} = $self->{column};
7084     $self->{column}++;
7085     $self->{nc}
7086     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7087     } else {
7088     $self->{set_nc}->($self);
7089     }
7090    
7091     redo A;
7092     }
7093     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7094     if ($is_space->{$self->{nc}}) {
7095     ## Stay in the state.
7096    
7097     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7098     $self->{line_prev} = $self->{line};
7099     $self->{column_prev} = $self->{column};
7100     $self->{column}++;
7101     $self->{nc}
7102     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7103     } else {
7104     $self->{set_nc}->($self);
7105     }
7106    
7107     redo A;
7108     } elsif ($self->{nc} == 0x007C) { # |
7109     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7110    
7111     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7112     $self->{line_prev} = $self->{line};
7113     $self->{column_prev} = $self->{column};
7114     $self->{column}++;
7115     $self->{nc}
7116     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7117     } else {
7118     $self->{set_nc}->($self);
7119     }
7120    
7121     redo A;
7122     } elsif ($self->{nc} == 0x0029) { # )
7123     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7124    
7125     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7126     $self->{line_prev} = $self->{line};
7127     $self->{column_prev} = $self->{column};
7128     $self->{column}++;
7129     $self->{nc}
7130     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7131     } else {
7132     $self->{set_nc}->($self);
7133     }
7134    
7135     redo A;
7136     } elsif ($self->{nc} == 0x003E) { # >
7137     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7138     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7139    
7140     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7141     $self->{line_prev} = $self->{line};
7142     $self->{column_prev} = $self->{column};
7143     $self->{column}++;
7144     $self->{nc}
7145     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7146     } else {
7147     $self->{set_nc}->($self);
7148     }
7149    
7150     return ($self->{ct}); # ATTLIST
7151     redo A;
7152     } elsif ($self->{nc} == -1) {
7153     ## XML5: No parse error.
7154     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7155     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7156    
7157     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7158     $self->{line_prev} = $self->{line};
7159     $self->{column_prev} = $self->{column};
7160     $self->{column}++;
7161     $self->{nc}
7162     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7163     } else {
7164     $self->{set_nc}->($self);
7165     }
7166    
7167     return ($self->{ct});
7168     redo A;
7169     } else {
7170     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7171     line => $self->{line_prev},
7172     column => $self->{column_prev});
7173     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7174     $self->{state} = ALLOWED_TOKEN_STATE;
7175    
7176     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7177     $self->{line_prev} = $self->{line};
7178     $self->{column_prev} = $self->{column};
7179     $self->{column}++;
7180     $self->{nc}
7181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7182     } else {
7183     $self->{set_nc}->($self);
7184     }
7185    
7186     redo A;
7187     }
7188     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7189     if ($is_space->{$self->{nc}}) {
7190     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7191    
7192     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7193     $self->{line_prev} = $self->{line};
7194     $self->{column_prev} = $self->{column};
7195     $self->{column}++;
7196     $self->{nc}
7197     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7198     } else {
7199     $self->{set_nc}->($self);
7200     }
7201    
7202     redo A;
7203     } elsif ($self->{nc} == 0x0023) { # #
7204     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7205     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7206    
7207     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7208     $self->{line_prev} = $self->{line};
7209     $self->{column_prev} = $self->{column};
7210     $self->{column}++;
7211     $self->{nc}
7212     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7213     } else {
7214     $self->{set_nc}->($self);
7215     }
7216    
7217     redo A;
7218     } elsif ($self->{nc} == 0x0022) { # "
7219     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7220     $self->{ca}->{value} = '';
7221     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7222    
7223     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7224     $self->{line_prev} = $self->{line};
7225     $self->{column_prev} = $self->{column};
7226     $self->{column}++;
7227     $self->{nc}
7228     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7229     } else {
7230     $self->{set_nc}->($self);
7231     }
7232    
7233     redo A;
7234     } elsif ($self->{nc} == 0x0027) { # '
7235     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7236     $self->{ca}->{value} = '';
7237     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7238    
7239     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7240     $self->{line_prev} = $self->{line};
7241     $self->{column_prev} = $self->{column};
7242     $self->{column}++;
7243     $self->{nc}
7244     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7245     } else {
7246     $self->{set_nc}->($self);
7247     }
7248    
7249     redo A;
7250     } elsif ($self->{nc} == 0x003E) { # >
7251     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7252     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7253    
7254     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7255     $self->{line_prev} = $self->{line};
7256     $self->{column_prev} = $self->{column};
7257     $self->{column}++;
7258     $self->{nc}
7259     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7260     } else {
7261     $self->{set_nc}->($self);
7262     }
7263    
7264     return ($self->{ct}); # ATTLIST
7265     redo A;
7266     } elsif ($self->{nc} == -1) {
7267     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7268     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7269    
7270     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7271     $self->{line_prev} = $self->{line};
7272     $self->{column_prev} = $self->{column};
7273     $self->{column}++;
7274     $self->{nc}
7275     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7276     } else {
7277     $self->{set_nc}->($self);
7278     }
7279    
7280     return ($self->{ct});
7281     redo A;
7282     } else {
7283     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7284     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7285     ## Reconsume.
7286     redo A;
7287     }
7288     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7289     if ($is_space->{$self->{nc}}) {
7290     ## Stay in the state.
7291    
7292     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7293     $self->{line_prev} = $self->{line};
7294     $self->{column_prev} = $self->{column};
7295     $self->{column}++;
7296     $self->{nc}
7297     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7298     } else {
7299     $self->{set_nc}->($self);
7300     }
7301    
7302     redo A;
7303     } elsif ($self->{nc} == 0x0023) { # #
7304     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7305    
7306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7307     $self->{line_prev} = $self->{line};
7308     $self->{column_prev} = $self->{column};
7309     $self->{column}++;
7310     $self->{nc}
7311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7312     } else {
7313     $self->{set_nc}->($self);
7314     }
7315    
7316     redo A;
7317     } elsif ($self->{nc} == 0x0022) { # "
7318     $self->{ca}->{value} = '';
7319     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7320    
7321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7322     $self->{line_prev} = $self->{line};
7323     $self->{column_prev} = $self->{column};
7324     $self->{column}++;
7325     $self->{nc}
7326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7327     } else {
7328     $self->{set_nc}->($self);
7329     }
7330    
7331     redo A;
7332     } elsif ($self->{nc} == 0x0027) { # '
7333     $self->{ca}->{value} = '';
7334     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7335    
7336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7337     $self->{line_prev} = $self->{line};
7338     $self->{column_prev} = $self->{column};
7339     $self->{column}++;
7340     $self->{nc}
7341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7342     } else {
7343     $self->{set_nc}->($self);
7344     }
7345    
7346     redo A;
7347     } elsif ($self->{nc} == 0x003E) { # >
7348     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7349     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7350    
7351     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7352     $self->{line_prev} = $self->{line};
7353     $self->{column_prev} = $self->{column};
7354     $self->{column}++;
7355     $self->{nc}
7356     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7357     } else {
7358     $self->{set_nc}->($self);
7359     }
7360    
7361     return ($self->{ct}); # ATTLIST
7362     redo A;
7363     } elsif ($self->{nc} == -1) {
7364     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7365     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7366    
7367     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7368     $self->{line_prev} = $self->{line};
7369     $self->{column_prev} = $self->{column};
7370     $self->{column}++;
7371     $self->{nc}
7372     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7373     } else {
7374     $self->{set_nc}->($self);
7375     }
7376    
7377     return ($self->{ct});
7378     redo A;
7379     } else {
7380     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7381     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7382     ## Reconsume.
7383     redo A;
7384     }
7385     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7386     if ($is_space->{$self->{nc}}) {
7387     ## XML5: No parse error.
7388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7389 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7390 wakaba 1.15 ## Reconsume.
7391     redo A;
7392     } elsif ($self->{nc} == 0x0022) { # "
7393     ## XML5: Same as "anything else".
7394     $self->{ca}->{value} = '';
7395     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7396    
7397     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7398     $self->{line_prev} = $self->{line};
7399     $self->{column_prev} = $self->{column};
7400     $self->{column}++;
7401     $self->{nc}
7402     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7403     } else {
7404     $self->{set_nc}->($self);
7405     }
7406    
7407     redo A;
7408     } elsif ($self->{nc} == 0x0027) { # '
7409     ## XML5: Same as "anything else".
7410     $self->{ca}->{value} = '';
7411     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7412    
7413     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7414     $self->{line_prev} = $self->{line};
7415     $self->{column_prev} = $self->{column};
7416     $self->{column}++;
7417     $self->{nc}
7418     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7419     } else {
7420     $self->{set_nc}->($self);
7421     }
7422    
7423     redo A;
7424     } elsif ($self->{nc} == 0x003E) { # >
7425     ## XML5: Same as "anything else".
7426     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7427     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7428    
7429     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7430     $self->{line_prev} = $self->{line};
7431     $self->{column_prev} = $self->{column};
7432     $self->{column}++;
7433     $self->{nc}
7434     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7435     } else {
7436     $self->{set_nc}->($self);
7437     }
7438    
7439     return ($self->{ct}); # ATTLIST
7440     redo A;
7441     } elsif ($self->{nc} == -1) {
7442     ## XML5: No parse error.
7443     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7444     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7445    
7446     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7447     $self->{line_prev} = $self->{line};
7448     $self->{column_prev} = $self->{column};
7449     $self->{column}++;
7450     $self->{nc}
7451     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7452     } else {
7453     $self->{set_nc}->($self);
7454     }
7455    
7456     return ($self->{ct});
7457     redo A;
7458     } else {
7459     $self->{ca}->{default} = chr $self->{nc};
7460     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7461    
7462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7463     $self->{line_prev} = $self->{line};
7464     $self->{column_prev} = $self->{column};
7465     $self->{column}++;
7466     $self->{nc}
7467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7468     } else {
7469     $self->{set_nc}->($self);
7470     }
7471    
7472     redo A;
7473     }
7474     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7475     if ($is_space->{$self->{nc}}) {
7476     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7477    
7478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7479     $self->{line_prev} = $self->{line};
7480     $self->{column_prev} = $self->{column};
7481     $self->{column}++;
7482     $self->{nc}
7483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7484     } else {
7485     $self->{set_nc}->($self);
7486     }
7487    
7488     redo A;
7489     } elsif ($self->{nc} == 0x0022) { # "
7490     ## XML5: Same as "anything else".
7491     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7492     $self->{ca}->{value} = '';
7493     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7494    
7495     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7496     $self->{line_prev} = $self->{line};
7497     $self->{column_prev} = $self->{column};
7498     $self->{column}++;
7499     $self->{nc}
7500     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7501     } else {
7502     $self->{set_nc}->($self);
7503     }
7504    
7505     redo A;
7506     } elsif ($self->{nc} == 0x0027) { # '
7507     ## XML5: Same as "anything else".
7508     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7509     $self->{ca}->{value} = '';
7510     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7511    
7512     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7513     $self->{line_prev} = $self->{line};
7514     $self->{column_prev} = $self->{column};
7515     $self->{column}++;
7516     $self->{nc}
7517     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7518     } else {
7519     $self->{set_nc}->($self);
7520     }
7521    
7522     redo A;
7523     } elsif ($self->{nc} == 0x003E) { # >
7524     ## XML5: Same as "anything else".
7525     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7526     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7527    
7528     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7529     $self->{line_prev} = $self->{line};
7530     $self->{column_prev} = $self->{column};
7531     $self->{column}++;
7532     $self->{nc}
7533     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7534     } else {
7535     $self->{set_nc}->($self);
7536     }
7537    
7538     return ($self->{ct}); # ATTLIST
7539     redo A;
7540     } elsif ($self->{nc} == -1) {
7541     ## XML5: No parse error.
7542     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7543     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7544     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7545    
7546     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7547     $self->{line_prev} = $self->{line};
7548     $self->{column_prev} = $self->{column};
7549     $self->{column}++;
7550     $self->{nc}
7551     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7552     } else {
7553     $self->{set_nc}->($self);
7554     }
7555    
7556     return ($self->{ct});
7557     redo A;
7558     } else {
7559     $self->{ca}->{default} .= chr $self->{nc};
7560     ## Stay in the state.
7561    
7562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7563     $self->{line_prev} = $self->{line};
7564     $self->{column_prev} = $self->{column};
7565     $self->{column}++;
7566     $self->{nc}
7567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7568     } else {
7569     $self->{set_nc}->($self);
7570     }
7571    
7572     redo A;
7573     }
7574     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7575     if ($is_space->{$self->{nc}}) {
7576     ## Stay in the state.
7577    
7578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7579     $self->{line_prev} = $self->{line};
7580     $self->{column_prev} = $self->{column};
7581     $self->{column}++;
7582     $self->{nc}
7583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7584     } else {
7585     $self->{set_nc}->($self);
7586     }
7587    
7588     redo A;
7589     } elsif ($self->{nc} == 0x0022) { # "
7590     $self->{ca}->{value} = '';
7591     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7592    
7593     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7594     $self->{line_prev} = $self->{line};
7595     $self->{column_prev} = $self->{column};
7596     $self->{column}++;
7597     $self->{nc}
7598     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7599     } else {
7600     $self->{set_nc}->($self);
7601     }
7602    
7603     redo A;
7604     } elsif ($self->{nc} == 0x0027) { # '
7605     $self->{ca}->{value} = '';
7606     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7607    
7608     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7609     $self->{line_prev} = $self->{line};
7610     $self->{column_prev} = $self->{column};
7611     $self->{column}++;
7612     $self->{nc}
7613     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7614     } else {
7615     $self->{set_nc}->($self);
7616     }
7617    
7618     redo A;
7619     } elsif ($self->{nc} == 0x003E) { # >
7620     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7621     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7622    
7623     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7624     $self->{line_prev} = $self->{line};
7625     $self->{column_prev} = $self->{column};
7626     $self->{column}++;
7627     $self->{nc}
7628     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7629     } else {
7630     $self->{set_nc}->($self);
7631     }
7632    
7633     return ($self->{ct}); # ATTLIST
7634     redo A;
7635     } elsif ($self->{nc} == -1) {
7636     ## XML5: No parse error.
7637     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7638     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7639     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7640    
7641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7642     $self->{line_prev} = $self->{line};
7643     $self->{column_prev} = $self->{column};
7644     $self->{column}++;
7645     $self->{nc}
7646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7647     } else {
7648     $self->{set_nc}->($self);
7649     }
7650    
7651     return ($self->{ct});
7652     redo A;
7653     } else {
7654     ## XML5: Not defined yet.
7655     if ($self->{ca}->{default} eq 'FIXED') {
7656     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7657     } else {
7658     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7659     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7660     }
7661     ## Reconsume.
7662     redo A;
7663     }
7664     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7665     if ($is_space->{$self->{nc}} or
7666     $self->{nc} == -1 or
7667     $self->{nc} == 0x003E) { # >
7668     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7669     ## Reconsume.
7670     redo A;
7671     } else {
7672     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7673     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7674     ## Reconsume.
7675     redo A;
7676 wakaba 1.16 }
7677 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7678     ## ASCII case-insensitive
7679     if ($self->{nc} == [
7680     undef,
7681     0x0044, # D
7682     0x0041, # A
7683     0x0054, # T
7684     ]->[length $self->{kwd}] or
7685     $self->{nc} == [
7686     undef,
7687     0x0064, # d
7688     0x0061, # a
7689     0x0074, # t
7690     ]->[length $self->{kwd}]) {
7691    
7692     ## Stay in the state.
7693     $self->{kwd} .= chr $self->{nc};
7694    
7695     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7696     $self->{line_prev} = $self->{line};
7697     $self->{column_prev} = $self->{column};
7698     $self->{column}++;
7699     $self->{nc}
7700     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7701     } else {
7702     $self->{set_nc}->($self);
7703     }
7704    
7705     redo A;
7706     } elsif ((length $self->{kwd}) == 4 and
7707     ($self->{nc} == 0x0041 or # A
7708     $self->{nc} == 0x0061)) { # a
7709     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7710    
7711     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7712     text => 'NDATA',
7713     line => $self->{line_prev},
7714     column => $self->{column_prev} - 4);
7715     } else {
7716    
7717     }
7718     $self->{state} = AFTER_NDATA_STATE;
7719    
7720     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7721     $self->{line_prev} = $self->{line};
7722     $self->{column_prev} = $self->{column};
7723     $self->{column}++;
7724     $self->{nc}
7725     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7726     } else {
7727     $self->{set_nc}->($self);
7728     }
7729    
7730     redo A;
7731     } else {
7732     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7733     line => $self->{line_prev},
7734     column => $self->{column_prev} + 1
7735     - length $self->{kwd});
7736    
7737     $self->{state} = BOGUS_MD_STATE;
7738     ## Reconsume.
7739     redo A;
7740     }
7741     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7742     if ($is_space->{$self->{nc}}) {
7743     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7744    
7745     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7746     $self->{line_prev} = $self->{line};
7747     $self->{column_prev} = $self->{column};
7748     $self->{column}++;
7749     $self->{nc}
7750     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7751     } else {
7752     $self->{set_nc}->($self);
7753     }
7754    
7755     redo A;
7756     } elsif ($self->{nc} == 0x003E) { # >
7757     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7758     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7759    
7760     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7761     $self->{line_prev} = $self->{line};
7762     $self->{column_prev} = $self->{column};
7763     $self->{column}++;
7764     $self->{nc}
7765     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7766     } else {
7767     $self->{set_nc}->($self);
7768     }
7769    
7770     return ($self->{ct}); # ENTITY
7771     redo A;
7772     } elsif ($self->{nc} == -1) {
7773     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7774     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7775    
7776     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7777     $self->{line_prev} = $self->{line};
7778     $self->{column_prev} = $self->{column};
7779     $self->{column}++;
7780     $self->{nc}
7781     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7782     } else {
7783     $self->{set_nc}->($self);
7784     }
7785    
7786     return ($self->{ct}); # ENTITY
7787     redo A;
7788     } else {
7789     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7790     line => $self->{line_prev},
7791     column => $self->{column_prev} + 1
7792     - length $self->{kwd});
7793     $self->{state} = BOGUS_MD_STATE;
7794     ## Reconsume.
7795     redo A;
7796     }
7797     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7798     if ($is_space->{$self->{nc}}) {
7799     ## Stay in the state.
7800    
7801     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7802     $self->{line_prev} = $self->{line};
7803     $self->{column_prev} = $self->{column};
7804     $self->{column}++;
7805     $self->{nc}
7806     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7807     } else {
7808     $self->{set_nc}->($self);
7809     }
7810    
7811     redo A;
7812     } elsif ($self->{nc} == 0x003E) { # >
7813     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7814     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7815    
7816     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7817     $self->{line_prev} = $self->{line};
7818     $self->{column_prev} = $self->{column};
7819     $self->{column}++;
7820     $self->{nc}
7821     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7822     } else {
7823     $self->{set_nc}->($self);
7824     }
7825    
7826     return ($self->{ct}); # ENTITY
7827     redo A;
7828     } elsif ($self->{nc} == -1) {
7829     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7830     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7831    
7832     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7833     $self->{line_prev} = $self->{line};
7834     $self->{column_prev} = $self->{column};
7835     $self->{column}++;
7836     $self->{nc}
7837     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7838     } else {
7839     $self->{set_nc}->($self);
7840     }
7841    
7842     return ($self->{ct}); # ENTITY
7843     redo A;
7844     } else {
7845     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7846     $self->{state} = NOTATION_NAME_STATE;
7847    
7848     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7849     $self->{line_prev} = $self->{line};
7850     $self->{column_prev} = $self->{column};
7851     $self->{column}++;
7852     $self->{nc}
7853     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7854     } else {
7855     $self->{set_nc}->($self);
7856     }
7857    
7858     redo A;
7859     }
7860     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7861     if ($is_space->{$self->{nc}}) {
7862 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7863 wakaba 1.18
7864     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7865     $self->{line_prev} = $self->{line};
7866     $self->{column_prev} = $self->{column};
7867     $self->{column}++;
7868     $self->{nc}
7869     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7870     } else {
7871     $self->{set_nc}->($self);
7872     }
7873    
7874     redo A;
7875     } elsif ($self->{nc} == 0x003E) { # >
7876     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7877    
7878     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7879     $self->{line_prev} = $self->{line};
7880     $self->{column_prev} = $self->{column};
7881     $self->{column}++;
7882     $self->{nc}
7883     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7884     } else {
7885     $self->{set_nc}->($self);
7886     }
7887    
7888     return ($self->{ct}); # ENTITY
7889     redo A;
7890     } elsif ($self->{nc} == -1) {
7891     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7892     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7893    
7894     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7895     $self->{line_prev} = $self->{line};
7896     $self->{column_prev} = $self->{column};
7897     $self->{column}++;
7898     $self->{nc}
7899     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7900     } else {
7901     $self->{set_nc}->($self);
7902     }
7903    
7904     return ($self->{ct}); # ENTITY
7905     redo A;
7906     } else {
7907     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7908     ## Stay in the state.
7909    
7910     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7911     $self->{line_prev} = $self->{line};
7912     $self->{column_prev} = $self->{column};
7913     $self->{column}++;
7914     $self->{nc}
7915     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7916     } else {
7917     $self->{set_nc}->($self);
7918     }
7919    
7920     redo A;
7921     }
7922 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7923     if ($self->{nc} == 0x0022) { # "
7924 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7925 wakaba 1.19
7926     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7927     $self->{line_prev} = $self->{line};
7928     $self->{column_prev} = $self->{column};
7929     $self->{column}++;
7930     $self->{nc}
7931     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7932     } else {
7933     $self->{set_nc}->($self);
7934     }
7935    
7936     redo A;
7937     } elsif ($self->{nc} == 0x0026) { # &
7938     $self->{prev_state} = $self->{state};
7939     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7940     $self->{entity_add} = 0x0022; # "
7941    
7942     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7943     $self->{line_prev} = $self->{line};
7944     $self->{column_prev} = $self->{column};
7945     $self->{column}++;
7946     $self->{nc}
7947     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7948     } else {
7949     $self->{set_nc}->($self);
7950     }
7951    
7952     redo A;
7953     ## TODO: %
7954     } elsif ($self->{nc} == -1) {
7955     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7956     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7957     ## Reconsume.
7958     return ($self->{ct}); # ENTITY
7959     redo A;
7960     } else {
7961     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7962    
7963     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7964     $self->{line_prev} = $self->{line};
7965     $self->{column_prev} = $self->{column};
7966     $self->{column}++;
7967     $self->{nc}
7968     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7969     } else {
7970     $self->{set_nc}->($self);
7971     }
7972    
7973     redo A;
7974     }
7975     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7976     if ($self->{nc} == 0x0027) { # '
7977 wakaba 1.20 $self->{state} = AFTER_MD_DEF_STATE;
7978 wakaba 1.19
7979     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7980     $self->{line_prev} = $self->{line};
7981     $self->{column_prev} = $self->{column};
7982     $self->{column}++;
7983     $self->{nc}
7984     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7985     } else {
7986     $self->{set_nc}->($self);
7987     }
7988    
7989     redo A;
7990     } elsif ($self->{nc} == 0x0026) { # &
7991     $self->{prev_state} = $self->{state};
7992     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7993     $self->{entity_add} = 0x0027; # '
7994    
7995     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7996     $self->{line_prev} = $self->{line};
7997     $self->{column_prev} = $self->{column};
7998     $self->{column}++;
7999     $self->{nc}
8000     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8001     } else {
8002     $self->{set_nc}->($self);
8003     }
8004    
8005     redo A;
8006     ## TODO: %
8007     } elsif ($self->{nc} == -1) {
8008     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8009     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8010     ## Reconsume.
8011     return ($self->{ct}); # ENTITY
8012     redo A;
8013     } else {
8014     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8015    
8016     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8017     $self->{line_prev} = $self->{line};
8018     $self->{column_prev} = $self->{column};
8019     $self->{column}++;
8020     $self->{nc}
8021     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8022     } else {
8023     $self->{set_nc}->($self);
8024     }
8025    
8026     redo A;
8027     }
8028     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8029     if ($is_space->{$self->{nc}} or
8030     {
8031     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8032     $self->{entity_add} => 1,
8033     }->{$self->{nc}}) {
8034 wakaba 1.22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8035     line => $self->{line_prev},
8036     column => $self->{column_prev}
8037     + ($self->{nc} == -1 ? 1 : 0));
8038 wakaba 1.19 ## Don't consume
8039     ## Return nothing.
8040     #
8041     } elsif ($self->{nc} == 0x0023) { # #
8042     $self->{ca} = $self->{ct};
8043     $self->{state} = ENTITY_HASH_STATE;
8044     $self->{kwd} = '#';
8045    
8046     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8047     $self->{line_prev} = $self->{line};
8048     $self->{column_prev} = $self->{column};
8049     $self->{column}++;
8050     $self->{nc}
8051     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8052     } else {
8053     $self->{set_nc}->($self);
8054     }
8055    
8056     redo A;
8057     } else {
8058     #
8059     }
8060    
8061     $self->{ct}->{value} .= '&';
8062     $self->{state} = $self->{prev_state};
8063     ## Reconsume.
8064     redo A;
8065 wakaba 1.20 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8066     if ($is_space->{$self->{nc}}) {
8067     $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8068    
8069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8070     $self->{line_prev} = $self->{line};
8071     $self->{column_prev} = $self->{column};
8072     $self->{column}++;
8073     $self->{nc}
8074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8075     } else {
8076     $self->{set_nc}->($self);
8077     }
8078    
8079     redo A;
8080     } elsif ($self->{nc} == 0x0028) { # (
8081     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8082     $self->{ct}->{content} = ['('];
8083     $self->{group_depth} = 1;
8084    
8085     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8086     $self->{line_prev} = $self->{line};
8087     $self->{column_prev} = $self->{column};
8088     $self->{column}++;
8089     $self->{nc}
8090     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8091     } else {
8092     $self->{set_nc}->($self);
8093     }
8094    
8095     redo A;
8096     } elsif ($self->{nc} == 0x003E) { # >
8097     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8098     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8099    
8100     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8101     $self->{line_prev} = $self->{line};
8102     $self->{column_prev} = $self->{column};
8103     $self->{column}++;
8104     $self->{nc}
8105     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8106     } else {
8107     $self->{set_nc}->($self);
8108     }
8109    
8110     return ($self->{ct}); # ELEMENT
8111     redo A;
8112     } elsif ($self->{nc} == -1) {
8113     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8114     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8115    
8116     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8117     $self->{line_prev} = $self->{line};
8118     $self->{column_prev} = $self->{column};
8119     $self->{column}++;
8120     $self->{nc}
8121     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8122     } else {
8123     $self->{set_nc}->($self);
8124     }
8125    
8126     return ($self->{ct}); # ELEMENT
8127     redo A;
8128     } else {
8129     $self->{ct}->{content} = [chr $self->{nc}];
8130     $self->{state} = CONTENT_KEYWORD_STATE;
8131    
8132     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8133     $self->{line_prev} = $self->{line};
8134     $self->{column_prev} = $self->{column};
8135     $self->{column}++;
8136     $self->{nc}
8137     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8138     } else {
8139     $self->{set_nc}->($self);
8140     }
8141    
8142     redo A;
8143     }
8144     } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8145     if ($is_space->{$self->{nc}}) {
8146     $self->{state} = AFTER_MD_DEF_STATE;
8147    
8148     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8149     $self->{line_prev} = $self->{line};
8150     $self->{column_prev} = $self->{column};
8151     $self->{column}++;
8152     $self->{nc}
8153     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8154     } else {
8155     $self->{set_nc}->($self);
8156     }
8157    
8158     redo A;
8159     } elsif ($self->{nc} == 0x003E) { # >
8160     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8161    
8162     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8163     $self->{line_prev} = $self->{line};
8164     $self->{column_prev} = $self->{column};
8165     $self->{column}++;
8166     $self->{nc}
8167     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8168     } else {
8169     $self->{set_nc}->($self);
8170     }
8171    
8172     return ($self->{ct}); # ELEMENT
8173     redo A;
8174     } elsif ($self->{nc} == -1) {
8175     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8176     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8177    
8178     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8179     $self->{line_prev} = $self->{line};
8180     $self->{column_prev} = $self->{column};
8181     $self->{column}++;
8182     $self->{nc}
8183     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8184     } else {
8185     $self->{set_nc}->($self);
8186     }
8187    
8188     return ($self->{ct}); # ELEMENT
8189     redo A;
8190     } else {
8191     $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8192     ## Stay in the state.
8193    
8194     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8195     $self->{line_prev} = $self->{line};
8196     $self->{column_prev} = $self->{column};
8197     $self->{column}++;
8198     $self->{nc}
8199     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8200     } else {
8201     $self->{set_nc}->($self);
8202     }
8203    
8204     redo A;
8205     }
8206     } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8207     if ($is_space->{$self->{nc}}) {
8208     ## Stay in the state.
8209    
8210     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8211     $self->{line_prev} = $self->{line};
8212     $self->{column_prev} = $self->{column};
8213     $self->{column}++;
8214     $self->{nc}
8215     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8216     } else {
8217     $self->{set_nc}->($self);
8218     }
8219    
8220     redo A;
8221     } elsif ($self->{nc} == 0x0028) { # (
8222     $self->{group_depth}++;
8223     push @{$self->{ct}->{content}}, chr $self->{nc};
8224     ## Stay in the state.
8225    
8226     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8227     $self->{line_prev} = $self->{line};
8228     $self->{column_prev} = $self->{column};
8229     $self->{column}++;
8230     $self->{nc}
8231     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8232     } else {
8233     $self->{set_nc}->($self);
8234     }
8235    
8236     redo A;
8237     } elsif ($self->{nc} == 0x007C or # |
8238     $self->{nc} == 0x002C) { # ,
8239     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8240     ## Stay in the state.
8241    
8242     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8243     $self->{line_prev} = $self->{line};
8244     $self->{column_prev} = $self->{column};
8245     $self->{column}++;
8246     $self->{nc}
8247     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8248     } else {
8249     $self->{set_nc}->($self);
8250     }
8251    
8252     redo A;
8253     } elsif ($self->{nc} == 0x0029) { # )
8254     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8255     push @{$self->{ct}->{content}}, chr $self->{nc};
8256     $self->{group_depth}--;
8257     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8258    
8259     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8260     $self->{line_prev} = $self->{line};
8261     $self->{column_prev} = $self->{column};
8262     $self->{column}++;
8263     $self->{nc}
8264     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8265     } else {
8266     $self->{set_nc}->($self);
8267     }
8268    
8269     redo A;
8270     } elsif ($self->{nc} == 0x003E) { # >
8271     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8272     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8273     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8274    
8275     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8276     $self->{line_prev} = $self->{line};
8277     $self->{column_prev} = $self->{column};
8278     $self->{column}++;
8279     $self->{nc}
8280     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8281     } else {
8282     $self->{set_nc}->($self);
8283     }
8284    
8285     return ($self->{ct}); # ELEMENT
8286     redo A;
8287     } elsif ($self->{nc} == -1) {
8288     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8289     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8290     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8291    
8292     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8293     $self->{line_prev} = $self->{line};
8294     $self->{column_prev} = $self->{column};
8295     $self->{column}++;
8296     $self->{nc}
8297     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8298     } else {
8299     $self->{set_nc}->($self);
8300     }
8301    
8302     return ($self->{ct}); # ELEMENT
8303     redo A;
8304     } else {
8305     push @{$self->{ct}->{content}}, chr $self->{nc};
8306     $self->{state} = CM_ELEMENT_NAME_STATE;
8307    
8308     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8309     $self->{line_prev} = $self->{line};
8310     $self->{column_prev} = $self->{column};
8311     $self->{column}++;
8312     $self->{nc}
8313     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8314     } else {
8315     $self->{set_nc}->($self);
8316     }
8317    
8318     redo A;
8319     }
8320     } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8321     if ($is_space->{$self->{nc}}) {
8322     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8323    
8324     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8325     $self->{line_prev} = $self->{line};
8326     $self->{column_prev} = $self->{column};
8327     $self->{column}++;
8328     $self->{nc}
8329     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8330     } else {
8331     $self->{set_nc}->($self);
8332     }
8333    
8334     redo A;
8335     } elsif ($self->{nc} == 0x002A or # *
8336     $self->{nc} == 0x002B or # +
8337     $self->{nc} == 0x003F) { # ?
8338     push @{$self->{ct}->{content}}, chr $self->{nc};
8339     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8340    
8341     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8342     $self->{line_prev} = $self->{line};
8343     $self->{column_prev} = $self->{column};
8344     $self->{column}++;
8345     $self->{nc}
8346     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8347     } else {
8348     $self->{set_nc}->($self);
8349     }
8350    
8351     redo A;
8352     } elsif ($self->{nc} == 0x007C or # |
8353     $self->{nc} == 0x002C) { # ,
8354     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8355     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8356    
8357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8358     $self->{line_prev} = $self->{line};
8359     $self->{column_prev} = $self->{column};
8360     $self->{column}++;
8361     $self->{nc}
8362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8363     } else {
8364     $self->{set_nc}->($self);
8365     }
8366    
8367     redo A;
8368     } elsif ($self->{nc} == 0x0029) { # )
8369     $self->{group_depth}--;
8370     push @{$self->{ct}->{content}}, chr $self->{nc};
8371     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8372    
8373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8374     $self->{line_prev} = $self->{line};
8375     $self->{column_prev} = $self->{column};
8376     $self->{column}++;
8377     $self->{nc}
8378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8379     } else {
8380     $self->{set_nc}->($self);
8381     }
8382    
8383     redo A;
8384     } elsif ($self->{nc} == 0x003E) { # >
8385     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8386     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8387     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8388    
8389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8390     $self->{line_prev} = $self->{line};
8391     $self->{column_prev} = $self->{column};
8392     $self->{column}++;
8393     $self->{nc}
8394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8395     } else {
8396     $self->{set_nc}->($self);
8397     }
8398    
8399     return ($self->{ct}); # ELEMENT
8400     redo A;
8401     } elsif ($self->{nc} == -1) {
8402     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8403     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8404     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8405    
8406     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8407     $self->{line_prev} = $self->{line};
8408     $self->{column_prev} = $self->{column};
8409     $self->{column}++;
8410     $self->{nc}
8411     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8412     } else {
8413     $self->{set_nc}->($self);
8414     }
8415    
8416     return ($self->{ct}); # ELEMENT
8417     redo A;
8418     } else {
8419     $self->{ct}->{content}->[-1] .= chr $self->{nc};
8420     ## Stay in the state.
8421    
8422     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8423     $self->{line_prev} = $self->{line};
8424     $self->{column_prev} = $self->{column};
8425     $self->{column}++;
8426     $self->{nc}
8427     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8428     } else {
8429     $self->{set_nc}->($self);
8430     }
8431    
8432     redo A;
8433     }
8434     } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8435     if ($is_space->{$self->{nc}}) {
8436     ## Stay in the state.
8437    
8438     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8439     $self->{line_prev} = $self->{line};
8440     $self->{column_prev} = $self->{column};
8441     $self->{column}++;
8442     $self->{nc}
8443     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8444     } else {
8445     $self->{set_nc}->($self);
8446     }
8447    
8448     redo A;
8449     } elsif ($self->{nc} == 0x007C or # |
8450     $self->{nc} == 0x002C) { # ,
8451     push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8452     $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8453    
8454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8455     $self->{line_prev} = $self->{line};
8456     $self->{column_prev} = $self->{column};
8457     $self->{column}++;
8458     $self->{nc}
8459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8460     } else {
8461     $self->{set_nc}->($self);
8462     }
8463    
8464     redo A;
8465     } elsif ($self->{nc} == 0x0029) { # )
8466     $self->{group_depth}--;
8467     push @{$self->{ct}->{content}}, chr $self->{nc};
8468     $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8469    
8470     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8471     $self->{line_prev} = $self->{line};
8472     $self->{column_prev} = $self->{column};
8473     $self->{column}++;
8474     $self->{nc}
8475     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8476     } else {
8477     $self->{set_nc}->($self);
8478     }
8479    
8480     redo A;
8481     } elsif ($self->{nc} == 0x003E) { # >
8482     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8483     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8484     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8485    
8486     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8487     $self->{line_prev} = $self->{line};
8488     $self->{column_prev} = $self->{column};
8489     $self->{column}++;
8490     $self->{nc}
8491     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8492     } else {
8493     $self->{set_nc}->($self);
8494     }
8495    
8496     return ($self->{ct}); # ELEMENT
8497     redo A;
8498     } elsif ($self->{nc} == -1) {
8499     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8500     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8501     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8502    
8503     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8504     $self->{line_prev} = $self->{line};
8505     $self->{column_prev} = $self->{column};
8506     $self->{column}++;
8507     $self->{nc}
8508     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8509     } else {
8510     $self->{set_nc}->($self);
8511     }
8512    
8513     return ($self->{ct}); # ELEMENT
8514     redo A;
8515     } else {
8516     $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8517     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8518     $self->{state} = BOGUS_MD_STATE;
8519    
8520     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8521     $self->{line_prev} = $self->{line};
8522     $self->{column_prev} = $self->{column};
8523     $self->{column}++;
8524     $self->{nc}
8525     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8526     } else {
8527     $self->{set_nc}->($self);
8528     }
8529    
8530     redo A;
8531     }
8532     } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8533     if ($is_space->{$self->{nc}}) {
8534     if ($self->{group_depth}) {
8535     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8536     } else {
8537     $self->{state} = AFTER_MD_DEF_STATE;
8538     }
8539    
8540     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8541     $self->{line_prev} = $self->{line};
8542     $self->{column_prev} = $self->{column};
8543     $self->{column}++;
8544     $self->{nc}
8545     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8546     } else {
8547     $self->{set_nc}->($self);
8548     }
8549    
8550     redo A;
8551     } elsif ($self->{nc} == 0x002A or # *
8552     $self->{nc} == 0x002B or # +
8553     $self->{nc} == 0x003F) { # ?
8554     push @{$self->{ct}->{content}}, chr $self->{nc};
8555     if ($self->{group_depth}) {
8556     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8557     } else {
8558     $self->{state} = AFTER_MD_DEF_STATE;
8559     }
8560    
8561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8562     $self->{line_prev} = $self->{line};
8563     $self->{column_prev} = $self->{column};
8564     $self->{column}++;
8565     $self->{nc}
8566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8567     } else {
8568     $self->{set_nc}->($self);
8569     }
8570    
8571     redo A;
8572     } elsif ($self->{nc} == 0x0029) { # )
8573     if ($self->{group_depth}) {
8574     $self->{group_depth}--;
8575     push @{$self->{ct}->{content}}, chr $self->{nc};
8576     ## Stay in the state.
8577    
8578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8579     $self->{line_prev} = $self->{line};
8580     $self->{column_prev} = $self->{column};
8581     $self->{column}++;
8582     $self->{nc}
8583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8584     } else {
8585     $self->{set_nc}->($self);
8586     }
8587    
8588     redo A;
8589     } else {
8590     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8591     $self->{state} = BOGUS_MD_STATE;
8592     ## Reconsume.
8593     redo A;
8594     }
8595     } elsif ($self->{nc} == 0x003E) { # >
8596     if ($self->{group_depth}) {
8597     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8598     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8599     }
8600     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8601    
8602     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8603     $self->{line_prev} = $self->{line};
8604     $self->{column_prev} = $self->{column};
8605     $self->{column}++;
8606     $self->{nc}
8607     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8608     } else {
8609     $self->{set_nc}->($self);
8610     }
8611    
8612     return ($self->{ct}); # ELEMENT
8613     redo A;
8614     } elsif ($self->{nc} == -1) {
8615     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8616     push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8617     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8618    
8619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8620     $self->{line_prev} = $self->{line};
8621     $self->{column_prev} = $self->{column};
8622     $self->{column}++;
8623     $self->{nc}
8624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8625     } else {
8626     $self->{set_nc}->($self);
8627     }
8628    
8629     return ($self->{ct}); # ELEMENT
8630     redo A;
8631     } else {
8632     if ($self->{group_depth}) {
8633     $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8634     } else {
8635     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8636     $self->{state} = BOGUS_MD_STATE;
8637     }
8638     ## Reconsume.
8639     redo A;
8640     }
8641     } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8642 wakaba 1.18 if ($is_space->{$self->{nc}}) {
8643     ## Stay in the state.
8644    
8645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8646     $self->{line_prev} = $self->{line};
8647     $self->{column_prev} = $self->{column};
8648     $self->{column}++;
8649     $self->{nc}
8650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8651     } else {
8652     $self->{set_nc}->($self);
8653     }
8654    
8655     redo A;
8656     } elsif ($self->{nc} == 0x003E) { # >
8657     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8658    
8659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8660     $self->{line_prev} = $self->{line};
8661     $self->{column_prev} = $self->{column};
8662     $self->{column}++;
8663     $self->{nc}
8664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8665     } else {
8666     $self->{set_nc}->($self);
8667     }
8668    
8669 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8670 wakaba 1.18 redo A;
8671     } elsif ($self->{nc} == -1) {
8672     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8673     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8674    
8675     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8676     $self->{line_prev} = $self->{line};
8677     $self->{column_prev} = $self->{column};
8678     $self->{column}++;
8679     $self->{nc}
8680     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8681     } else {
8682     $self->{set_nc}->($self);
8683     }
8684    
8685 wakaba 1.20 return ($self->{ct}); # ENTITY/ELEMENT
8686 wakaba 1.18 redo A;
8687     } else {
8688 wakaba 1.20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8689 wakaba 1.18 $self->{state} = BOGUS_MD_STATE;
8690     ## Reconsume.
8691     redo A;
8692     }
8693 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
8694     if ($self->{nc} == 0x003E) { # >
8695     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8696    
8697     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8698     $self->{line_prev} = $self->{line};
8699     $self->{column_prev} = $self->{column};
8700     $self->{column}++;
8701     $self->{nc}
8702     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8703     } else {
8704     $self->{set_nc}->($self);
8705     }
8706    
8707     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8708     redo A;
8709     } elsif ($self->{nc} == -1) {
8710     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8711     ## Reconsume.
8712     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8713     redo A;
8714     } else {
8715     ## Stay in the state.
8716    
8717     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8718     $self->{line_prev} = $self->{line};
8719     $self->{column_prev} = $self->{column};
8720     $self->{column}++;
8721     $self->{nc}
8722     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8723     } else {
8724     $self->{set_nc}->($self);
8725     }
8726    
8727     redo A;
8728     }
8729 wakaba 1.1 } else {
8730     die "$0: $self->{state}: Unknown state";
8731     }
8732     } # A
8733    
8734     die "$0: _get_next_token: unexpected case";
8735     } # _get_next_token
8736    
8737     1;
8738 wakaba 1.31 ## $Date: 2009/08/16 05:24:47 $
8739 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24