/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (hide annotations) (download)
Sat Oct 18 11:34:49 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.15: +336 -170 lines
++ whatpm/t/ChangeLog	18 Oct 2008 11:34:40 -0000
2008-10-18  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/notations-1.dat" added.

++ whatpm/t/xml/ChangeLog	18 Oct 2008 11:25:41 -0000
	* attlists-1.dat: A test result updated.

	* notations-1.dat: New test result file.

2008-10-18  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	18 Oct 2008 11:31:41 -0000
	* NanoDOM.pm (public_id, system_id): New attributes.

2008-10-18  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	18 Oct 2008 11:34:04 -0000
	* Tokenizer.pm.src: Modifies PUBLIC/SYSTEM identifier tokenizer
	states such that <!ENTITY> and <!NOTATION> can be tokenized by
	those states as well.
	(BOGUS_MD_STATE): A new state; used for bogus markup declarations,
	in favor of BOGUS_COMMENT_STATE.

2008-10-18  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	18 Oct 2008 11:34:26 -0000
	* Parser.pm.src: Set public_id and system_id attributes of Entity
	and Notation nodes.

2008-10-18  Wakaba  <wakaba@suika.fam.cx>

package Whatpm::HTML::Tokenizer;
use strict;

## The module version is derived from the CVS |Revision| keyword: every
## run of digits is extracted and formatted as "major.minor" with the
## minor part zero-padded to two digits.
our $VERSION = do {
  my @r = (q$Revision: 1.15 $ =~ /\d+/g);
  sprintf "%d." . "%02d" x $#r, @r;
};

## Exporter setup.  Nothing is exported by default; the token type
## constants can be requested one by one or all together through the
## |:token| tag.
BEGIN {
  require Exporter;
  our @ISA;
  push @ISA, 'Exporter';

  ## Names of the token type constants.  The same list feeds both
  ## |@EXPORT_OK| and the |:token| tag so that they cannot drift apart.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_type_names;
  our %EXPORT_TAGS = (token => [@token_type_names]);
}
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
## Token types
##
## Numeric codes stored in the |type| field of a token.  Each name is an
## inlinable constant sub with an empty prototype.

use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74    
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
## Content model flags
##
## Bit flags describing which kinds of markup are recognized in data.

use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, expressed as combinations of the flags above.  They
## are declared in a second statement because the flag constants must
## already be compiled before they can be referenced here.

use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
87    
## Tokenizer states
##
## Numeric identifiers compared against |$self->{state}|.  The numbers 1
## and 12 are intentionally unused: they belonged to the states that are
## shown commented out below.

use constant {
  DATA_STATE                                    => 0,
  #ENTITY_DATA_STATE                            => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  #ENTITY_IN_ATTRIBUTE_VALUE_STATE              => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_SECTION_STATE                           => 35,
  MD_HYPHEN_STATE              => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE             => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE               => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE     => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE     => 41, # "CDATA section state" in the spec
  PUBLIC_STATE                 => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE                 => 43, # "after DOCTYPE name state" in the spec
  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE                 => 44,
  ENTITY_HASH_STATE            => 45,
  NCR_NUM_STATE                => 46,
  HEXREF_X_STATE               => 47,
  HEXREF_HEX_STATE             => 48,
  ENTITY_NAME_STATE            => 49,
  PCDATA_STATE                 => 50, # "data state" in the spec
};

## XML-only states
use constant {
  PI_STATE                                           => 51,
  PI_TARGET_STATE                                    => 52,
  PI_TARGET_AFTER_STATE                              => 53,
  PI_DATA_STATE                                      => 54,
  PI_AFTER_STATE                                     => 55,
  PI_DATA_AFTER_STATE                                => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE                      => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE                => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE          => 59,
  DOCTYPE_TAG_STATE                                  => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE              => 61,
  MD_ATTLIST_STATE                                   => 62,
  MD_E_STATE                                         => 63,
  MD_ELEMENT_STATE                                   => 64,
  MD_ENTITY_STATE                                    => 65,
  MD_NOTATION_STATE                                  => 66,
  DOCTYPE_MD_STATE                                   => 67,
  BEFORE_MD_NAME_STATE                               => 68,
  MD_NAME_STATE                                      => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE              => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE                   => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE               => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE         => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE               => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE         => 75,
  BEFORE_ALLOWED_TOKEN_STATE                         => 76,
  ALLOWED_TOKEN_STATE                                => 77,
  AFTER_ALLOWED_TOKEN_STATE                          => 78,
  AFTER_ALLOWED_TOKENS_STATE                         => 79,
  BEFORE_ATTR_DEFAULT_STATE                          => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE        => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE  => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE              => 84,
  BOGUS_MD_STATE                                     => 85,
};
181 wakaba 1.8
## Tree constructor state constants (see Whatpm::HTML for the full
## list and descriptions)
##
## NOTE: Both literals below evaluate to the same number (bit 11 set).

use constant {
  IN_FOREIGN_CONTENT_IM => 0b100000000000,
  FOREIGN_EL            => 0b1_00000000000,
};
187    
## Character reference mappings
##
## |$charref_map| is a hash reference keyed by the code point written in
## a numeric character reference; the value is the code point that is
## used instead of it.

my $charref_map = do {
  ## CARRIAGE RETURN is replaced by LINE FEED.
  my %replacement_for = (0x0D => 0x000A);

  ## Replacements for 0x80..0x9F, listed in code point order (the
  ## targets correspond to the Windows-1252 interpretation of those
  ## bytes; unassigned positions become U+FFFD).
  @replacement_for{0x80 .. 0x9F} = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80..87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88..8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90..97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98..9F
  );

  ## Everything below is replaced by U+FFFD REPLACEMENT CHARACTER.
  $replacement_for{$_} = 0xFFFD
      for 0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
          0xD800 .. 0xDFFF,
          0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
          ## U+xFFFE and U+xFFFF of every plane, 0x00 through 0x10.
          (map {
            my $plane_base = $_ << 16;
            ($plane_base | 0xFFFE, $plane_base | 0xFFFF);
          } 0x00 .. 0x10);

  \%replacement_for;
}; # $charref_map
233    
234     ## Implementations MUST act as if they used the state machine in the spec.
235    
## Resets the per-document tokenizer fields of |$self| and loads the
## first input character.
##
## The object must already carry the fields set by the |new|
## constructor (listed below); in particular |set_nc| must be a code
## reference, because it is invoked here with |$self| as its only
## argument.  The sub's value is that of its last statement, i.e. the
## fresh (empty) array reference stored in |$self->{token}|.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## Fetch the first input character.  |char_buffer| has just been
  ## emptied, so there is never a buffered character to take here and
  ## the input callback is always the one that supplies it (presumably
  ## by updating |$self->{nc}| -- the callback is defined elsewhere).
  $self->{set_nc}->($self);

  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
274    
275     ## A token has:
276     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
277 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
278 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
279     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
280 wakaba 1.11 ## ->{target} (PI_TOKEN)
281 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
282     ## ->{sysid} (DOCTYPE_TOKEN)
283     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
284     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
285     ## ->{name}
286     ## ->{value}
287     ## ->{has_reference} == 1 or 0
288 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
289     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
290 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
291 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
292 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
293    
294 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
295     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
296     ## while the token is pushed back to the stack.
297    
298     ## Emitted token MUST immediately be handled by the tree construction state.
299    
300     ## Before each step, UA MAY check to see if either one of the scripts in
301     ## "list of scripts that will execute as soon as possible" or the first
302     ## script in the "list of scripts that will execute asynchronously",
303     ## has completed loading. If one has, then it MUST be executed
304     ## and removed from the list.
305    
306     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
307     ## (This requirement was dropped from HTML5 spec, unfortunately.)
308    
## Lookup table of the code points that are treated as space
## characters: the key is the code point, the value is true.  LINE
## TABULATION (VT, 0x000B) and CARRIAGE RETURN (CR, 0x000D) are
## deliberately not members.
my $is_space = {
  map { $_ => 1 } (
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    0x0020, # SPACE (SP)
  )
};
317    
318     sub _get_next_token ($) {
319     my $self = shift;
320    
321     if ($self->{self_closing}) {
322     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
323     ## NOTE: The |self_closing| flag is only set by start tag token.
324     ## In addition, when a start tag token is emitted, it is always set to
325     ## |ct|.
326     delete $self->{self_closing};
327     }
328    
329     if (@{$self->{token}}) {
330     $self->{self_closing} = $self->{token}->[0]->{self_closing};
331     return shift @{$self->{token}};
332     }
333    
334     A: {
335     if ($self->{state} == PCDATA_STATE) {
336     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
337    
338     if ($self->{nc} == 0x0026) { # &
339    
340     ## NOTE: In the spec, the tokenizer is switched to the
341     ## "entity data state". In this implementation, the tokenizer
342     ## is switched to the |ENTITY_STATE|, which is an implementation
343     ## of the "consume a character reference" algorithm.
344     $self->{entity_add} = -1;
345     $self->{prev_state} = DATA_STATE;
346     $self->{state} = ENTITY_STATE;
347    
348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
349     $self->{line_prev} = $self->{line};
350     $self->{column_prev} = $self->{column};
351     $self->{column}++;
352     $self->{nc}
353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
354     } else {
355     $self->{set_nc}->($self);
356     }
357    
358     redo A;
359     } elsif ($self->{nc} == 0x003C) { # <
360    
361     $self->{state} = TAG_OPEN_STATE;
362    
363     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
364     $self->{line_prev} = $self->{line};
365     $self->{column_prev} = $self->{column};
366     $self->{column}++;
367     $self->{nc}
368     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
369     } else {
370     $self->{set_nc}->($self);
371     }
372    
373     redo A;
374     } elsif ($self->{nc} == -1) {
375    
376     return ({type => END_OF_FILE_TOKEN,
377     line => $self->{line}, column => $self->{column}});
378     last A; ## TODO: ok?
379     } else {
380    
381     #
382     }
383    
384     # Anything else
385     my $token = {type => CHARACTER_TOKEN,
386     data => chr $self->{nc},
387     line => $self->{line}, column => $self->{column},
388     };
389     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
390    
391     ## Stay in the state.
392    
393     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
394     $self->{line_prev} = $self->{line};
395     $self->{column_prev} = $self->{column};
396     $self->{column}++;
397     $self->{nc}
398     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
399     } else {
400     $self->{set_nc}->($self);
401     }
402    
403     return ($token);
404     redo A;
405     } elsif ($self->{state} == DATA_STATE) {
406     $self->{s_kwd} = '' unless defined $self->{s_kwd};
407     if ($self->{nc} == 0x0026) { # &
408     $self->{s_kwd} = '';
409     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
410     not $self->{escape}) {
411    
412     ## NOTE: In the spec, the tokenizer is switched to the
413     ## "entity data state". In this implementation, the tokenizer
414     ## is switched to the |ENTITY_STATE|, which is an implementation
415     ## of the "consume a character reference" algorithm.
416     $self->{entity_add} = -1;
417     $self->{prev_state} = DATA_STATE;
418     $self->{state} = ENTITY_STATE;
419    
420     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
421     $self->{line_prev} = $self->{line};
422     $self->{column_prev} = $self->{column};
423     $self->{column}++;
424     $self->{nc}
425     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
426     } else {
427     $self->{set_nc}->($self);
428     }
429    
430     redo A;
431     } else {
432    
433     #
434     }
435     } elsif ($self->{nc} == 0x002D) { # -
436     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
437 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
438 wakaba 1.1
439     $self->{escape} = 1; # unless $self->{escape};
440     $self->{s_kwd} = '--';
441     #
442 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
443 wakaba 1.1
444     $self->{s_kwd} = '--';
445     #
446 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
447    
448     $self->{s_kwd} .= '-';
449     #
450 wakaba 1.1 } else {
451    
452 wakaba 1.5 $self->{s_kwd} = '-';
453 wakaba 1.1 #
454     }
455     }
456    
457     #
458     } elsif ($self->{nc} == 0x0021) { # !
459     if (length $self->{s_kwd}) {
460    
461     $self->{s_kwd} .= '!';
462     #
463     } else {
464    
465     #$self->{s_kwd} = '';
466     #
467     }
468     #
469     } elsif ($self->{nc} == 0x003C) { # <
470     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
471     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
472     not $self->{escape})) {
473    
474     $self->{state} = TAG_OPEN_STATE;
475    
476     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
477     $self->{line_prev} = $self->{line};
478     $self->{column_prev} = $self->{column};
479     $self->{column}++;
480     $self->{nc}
481     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
482     } else {
483     $self->{set_nc}->($self);
484     }
485    
486     redo A;
487     } else {
488    
489     $self->{s_kwd} = '';
490     #
491     }
492     } elsif ($self->{nc} == 0x003E) { # >
493     if ($self->{escape} and
494     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
495     if ($self->{s_kwd} eq '--') {
496    
497     delete $self->{escape};
498 wakaba 1.5 #
499 wakaba 1.1 } else {
500    
501 wakaba 1.5 #
502 wakaba 1.1 }
503 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
504    
505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
506     line => $self->{line_prev},
507     column => $self->{column_prev} - 1);
508     #
509 wakaba 1.1 } else {
510    
511 wakaba 1.5 #
512 wakaba 1.1 }
513    
514     $self->{s_kwd} = '';
515     #
516 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
517     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
518    
519     $self->{s_kwd} .= ']';
520     } elsif ($self->{s_kwd} eq ']]') {
521    
522     #
523     } else {
524    
525     $self->{s_kwd} = '';
526     }
527     #
528 wakaba 1.1 } elsif ($self->{nc} == -1) {
529    
530     $self->{s_kwd} = '';
531     return ({type => END_OF_FILE_TOKEN,
532     line => $self->{line}, column => $self->{column}});
533     last A; ## TODO: ok?
534     } else {
535    
536     $self->{s_kwd} = '';
537     #
538     }
539    
540     # Anything else
541     my $token = {type => CHARACTER_TOKEN,
542     data => chr $self->{nc},
543     line => $self->{line}, column => $self->{column},
544     };
545 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
546 wakaba 1.1 length $token->{data})) {
547     $self->{s_kwd} = '';
548     }
549    
550     ## Stay in the data state.
551 wakaba 1.5 if (not $self->{is_xml} and
552     $self->{content_model} == PCDATA_CONTENT_MODEL) {
553 wakaba 1.1
554     $self->{state} = PCDATA_STATE;
555     } else {
556    
557     ## Stay in the state.
558     }
559    
560     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
561     $self->{line_prev} = $self->{line};
562     $self->{column_prev} = $self->{column};
563     $self->{column}++;
564     $self->{nc}
565     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
566     } else {
567     $self->{set_nc}->($self);
568     }
569    
570     return ($token);
571     redo A;
572     } elsif ($self->{state} == TAG_OPEN_STATE) {
573 wakaba 1.10 ## XML5: "tag state".
574    
575 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
576     if ($self->{nc} == 0x002F) { # /
577    
578    
579     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
580     $self->{line_prev} = $self->{line};
581     $self->{column_prev} = $self->{column};
582     $self->{column}++;
583     $self->{nc}
584     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
585     } else {
586     $self->{set_nc}->($self);
587     }
588    
589     $self->{state} = CLOSE_TAG_OPEN_STATE;
590     redo A;
591     } elsif ($self->{nc} == 0x0021) { # !
592    
593 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
594 wakaba 1.1 #
595     } else {
596    
597 wakaba 1.12 $self->{s_kwd} = '';
598 wakaba 1.1 #
599     }
600    
601     ## reconsume
602     $self->{state} = DATA_STATE;
603     return ({type => CHARACTER_TOKEN, data => '<',
604     line => $self->{line_prev},
605     column => $self->{column_prev},
606     });
607     redo A;
608     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
609     if ($self->{nc} == 0x0021) { # !
610    
611     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
612    
613     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
614     $self->{line_prev} = $self->{line};
615     $self->{column_prev} = $self->{column};
616     $self->{column}++;
617     $self->{nc}
618     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
619     } else {
620     $self->{set_nc}->($self);
621     }
622    
623     redo A;
624     } elsif ($self->{nc} == 0x002F) { # /
625    
626     $self->{state} = CLOSE_TAG_OPEN_STATE;
627    
628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
629     $self->{line_prev} = $self->{line};
630     $self->{column_prev} = $self->{column};
631     $self->{column}++;
632     $self->{nc}
633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
634     } else {
635     $self->{set_nc}->($self);
636     }
637    
638     redo A;
639     } elsif (0x0041 <= $self->{nc} and
640     $self->{nc} <= 0x005A) { # A..Z
641    
642     $self->{ct}
643     = {type => START_TAG_TOKEN,
644 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
645 wakaba 1.1 line => $self->{line_prev},
646     column => $self->{column_prev}};
647     $self->{state} = TAG_NAME_STATE;
648    
649     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
650     $self->{line_prev} = $self->{line};
651     $self->{column_prev} = $self->{column};
652     $self->{column}++;
653     $self->{nc}
654     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
655     } else {
656     $self->{set_nc}->($self);
657     }
658    
659     redo A;
660     } elsif (0x0061 <= $self->{nc} and
661     $self->{nc} <= 0x007A) { # a..z
662    
663     $self->{ct} = {type => START_TAG_TOKEN,
664     tag_name => chr ($self->{nc}),
665     line => $self->{line_prev},
666     column => $self->{column_prev}};
667     $self->{state} = TAG_NAME_STATE;
668    
669     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
670     $self->{line_prev} = $self->{line};
671     $self->{column_prev} = $self->{column};
672     $self->{column}++;
673     $self->{nc}
674     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
675     } else {
676     $self->{set_nc}->($self);
677     }
678    
679     redo A;
680     } elsif ($self->{nc} == 0x003E) { # >
681    
682     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
683     line => $self->{line_prev},
684     column => $self->{column_prev});
685     $self->{state} = DATA_STATE;
686 wakaba 1.5 $self->{s_kwd} = '';
687 wakaba 1.1
688     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
689     $self->{line_prev} = $self->{line};
690     $self->{column_prev} = $self->{column};
691     $self->{column}++;
692     $self->{nc}
693     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
694     } else {
695     $self->{set_nc}->($self);
696     }
697    
698    
699     return ({type => CHARACTER_TOKEN, data => '<>',
700     line => $self->{line_prev},
701     column => $self->{column_prev},
702     });
703    
704     redo A;
705     } elsif ($self->{nc} == 0x003F) { # ?
706 wakaba 1.8 if ($self->{is_xml}) {
707    
708     $self->{state} = PI_STATE;
709    
710     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
711     $self->{line_prev} = $self->{line};
712     $self->{column_prev} = $self->{column};
713     $self->{column}++;
714     $self->{nc}
715     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
716     } else {
717     $self->{set_nc}->($self);
718     }
719    
720     redo A;
721     } else {
722    
723     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
724     line => $self->{line_prev},
725     column => $self->{column_prev});
726     $self->{state} = BOGUS_COMMENT_STATE;
727     $self->{ct} = {type => COMMENT_TOKEN, data => '',
728     line => $self->{line_prev},
729     column => $self->{column_prev},
730     };
731     ## $self->{nc} is intentionally left as is
732     redo A;
733     }
734 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
735 wakaba 1.1
736     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
737     line => $self->{line_prev},
738     column => $self->{column_prev});
739     $self->{state} = DATA_STATE;
740 wakaba 1.5 $self->{s_kwd} = '';
741 wakaba 1.1 ## reconsume
742    
743     return ({type => CHARACTER_TOKEN, data => '<',
744     line => $self->{line_prev},
745     column => $self->{column_prev},
746     });
747    
748     redo A;
749 wakaba 1.9 } else {
750     ## XML5: "<:" is a parse error.
751    
752     $self->{ct} = {type => START_TAG_TOKEN,
753     tag_name => chr ($self->{nc}),
754     line => $self->{line_prev},
755     column => $self->{column_prev}};
756     $self->{state} = TAG_NAME_STATE;
757    
758     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
759     $self->{line_prev} = $self->{line};
760     $self->{column_prev} = $self->{column};
761     $self->{column}++;
762     $self->{nc}
763     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
764     } else {
765     $self->{set_nc}->($self);
766     }
767    
768     redo A;
769 wakaba 1.1 }
770     } else {
771     die "$0: $self->{content_model} in tag open";
772     }
773     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
774     ## NOTE: The "close tag open state" in the spec is implemented as
775     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
776    
777 wakaba 1.10 ## XML5: "end tag state".
778    
779 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
780     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
781     if (defined $self->{last_stag_name}) {
782     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
783 wakaba 1.12 $self->{kwd} = '';
784 wakaba 1.1 ## Reconsume.
785     redo A;
786     } else {
787     ## No start tag token has ever been emitted
788     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
789    
790     $self->{state} = DATA_STATE;
791 wakaba 1.5 $self->{s_kwd} = '';
792 wakaba 1.1 ## Reconsume.
793     return ({type => CHARACTER_TOKEN, data => '</',
794     line => $l, column => $c,
795     });
796     redo A;
797     }
798     }
799    
800     if (0x0041 <= $self->{nc} and
801     $self->{nc} <= 0x005A) { # A..Z
802    
803     $self->{ct}
804     = {type => END_TAG_TOKEN,
805 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
806 wakaba 1.1 line => $l, column => $c};
807     $self->{state} = TAG_NAME_STATE;
808    
809     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
810     $self->{line_prev} = $self->{line};
811     $self->{column_prev} = $self->{column};
812     $self->{column}++;
813     $self->{nc}
814     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
815     } else {
816     $self->{set_nc}->($self);
817     }
818    
819     redo A;
820     } elsif (0x0061 <= $self->{nc} and
821     $self->{nc} <= 0x007A) { # a..z
822    
823     $self->{ct} = {type => END_TAG_TOKEN,
824     tag_name => chr ($self->{nc}),
825     line => $l, column => $c};
826     $self->{state} = TAG_NAME_STATE;
827    
828     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
829     $self->{line_prev} = $self->{line};
830     $self->{column_prev} = $self->{column};
831     $self->{column}++;
832     $self->{nc}
833     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
834     } else {
835     $self->{set_nc}->($self);
836     }
837    
838     redo A;
839     } elsif ($self->{nc} == 0x003E) { # >
840     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
841     line => $self->{line_prev}, ## "<" in "</>"
842     column => $self->{column_prev} - 1);
843     $self->{state} = DATA_STATE;
844 wakaba 1.5 $self->{s_kwd} = '';
845 wakaba 1.10 if ($self->{is_xml}) {
846    
847     ## XML5: No parse error.
848    
849     ## NOTE: This parser raises a parse error, since it supports
850     ## XML1, not XML5.
851    
852     ## NOTE: A short end tag token.
853     my $ct = {type => END_TAG_TOKEN,
854     tag_name => '',
855     line => $self->{line_prev},
856     column => $self->{column_prev} - 1,
857     };
858    
859     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
860     $self->{line_prev} = $self->{line};
861     $self->{column_prev} = $self->{column};
862     $self->{column}++;
863     $self->{nc}
864     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
865     } else {
866     $self->{set_nc}->($self);
867     }
868    
869     return ($ct);
870     } else {
871    
872    
873 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
874     $self->{line_prev} = $self->{line};
875     $self->{column_prev} = $self->{column};
876     $self->{column}++;
877     $self->{nc}
878     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
879     } else {
880     $self->{set_nc}->($self);
881     }
882    
883 wakaba 1.10 }
884 wakaba 1.1 redo A;
885     } elsif ($self->{nc} == -1) {
886    
887     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
888 wakaba 1.5 $self->{s_kwd} = '';
889 wakaba 1.1 $self->{state} = DATA_STATE;
890     # reconsume
891    
892     return ({type => CHARACTER_TOKEN, data => '</',
893     line => $l, column => $c,
894     });
895    
896     redo A;
897 wakaba 1.10 } elsif (not $self->{is_xml} or
898     $is_space->{$self->{nc}}) {
899 wakaba 1.1
900 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
901     line => $self->{line_prev}, # "<" of "</"
902     column => $self->{column_prev} - 1);
903 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
904     $self->{ct} = {type => COMMENT_TOKEN, data => '',
905     line => $self->{line_prev}, # "<" of "</"
906     column => $self->{column_prev} - 1,
907     };
908     ## NOTE: $self->{nc} is intentionally left as is.
909     ## Although the "anything else" case of the spec does not explicitly
910     ## state that the next input character is to be reconsumed,
911     ## it will be included in the |data| of the comment token
912     ## generated from the bogus end tag, as defined in the
913     ## "bogus comment state" entry.
914     redo A;
915 wakaba 1.10 } else {
916     ## XML5: "</:" is a parse error.
917    
918     $self->{ct} = {type => END_TAG_TOKEN,
919     tag_name => chr ($self->{nc}),
920     line => $l, column => $c};
921     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
922    
923     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
924     $self->{line_prev} = $self->{line};
925     $self->{column_prev} = $self->{column};
926     $self->{column}++;
927     $self->{nc}
928     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
929     } else {
930     $self->{set_nc}->($self);
931     }
932    
933     redo A;
934 wakaba 1.1 }
935     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
936 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
937 wakaba 1.1 if (length $ch) {
938     my $CH = $ch;
939     $ch =~ tr/a-z/A-Z/;
940     my $nch = chr $self->{nc};
941     if ($nch eq $ch or $nch eq $CH) {
942    
943     ## Stay in the state.
944 wakaba 1.12 $self->{kwd} .= $nch;
945 wakaba 1.1
946     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
947     $self->{line_prev} = $self->{line};
948     $self->{column_prev} = $self->{column};
949     $self->{column}++;
950     $self->{nc}
951     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
952     } else {
953     $self->{set_nc}->($self);
954     }
955    
956     redo A;
957     } else {
958    
959     $self->{state} = DATA_STATE;
960 wakaba 1.5 $self->{s_kwd} = '';
961 wakaba 1.1 ## Reconsume.
962     return ({type => CHARACTER_TOKEN,
963 wakaba 1.12 data => '</' . $self->{kwd},
964 wakaba 1.1 line => $self->{line_prev},
965 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
966 wakaba 1.1 });
967     redo A;
968     }
969     } else { # after "<{tag-name}"
970     unless ($is_space->{$self->{nc}} or
971     {
972     0x003E => 1, # >
973     0x002F => 1, # /
974     -1 => 1, # EOF
975     }->{$self->{nc}}) {
976    
977     ## Reconsume.
978     $self->{state} = DATA_STATE;
979 wakaba 1.5 $self->{s_kwd} = '';
980 wakaba 1.1 return ({type => CHARACTER_TOKEN,
981 wakaba 1.12 data => '</' . $self->{kwd},
982 wakaba 1.1 line => $self->{line_prev},
983 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
984 wakaba 1.1 });
985     redo A;
986     } else {
987    
988     $self->{ct}
989     = {type => END_TAG_TOKEN,
990     tag_name => $self->{last_stag_name},
991     line => $self->{line_prev},
992 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
993 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
994     ## Reconsume.
995     redo A;
996     }
997     }
998     } elsif ($self->{state} == TAG_NAME_STATE) {
999     if ($is_space->{$self->{nc}}) {
1000    
1001     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1002    
1003     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1004     $self->{line_prev} = $self->{line};
1005     $self->{column_prev} = $self->{column};
1006     $self->{column}++;
1007     $self->{nc}
1008     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1009     } else {
1010     $self->{set_nc}->($self);
1011     }
1012    
1013     redo A;
1014     } elsif ($self->{nc} == 0x003E) { # >
1015     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1016    
1017     $self->{last_stag_name} = $self->{ct}->{tag_name};
1018     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1019     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1020     #if ($self->{ct}->{attributes}) {
1021     # ## NOTE: This should never be reached.
1022     # !!! cp (36);
1023     # !!! parse-error (type => 'end tag attribute');
1024     #} else {
1025    
1026     #}
1027     } else {
1028     die "$0: $self->{ct}->{type}: Unknown token type";
1029     }
1030     $self->{state} = DATA_STATE;
1031 wakaba 1.5 $self->{s_kwd} = '';
1032 wakaba 1.1
1033     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1034     $self->{line_prev} = $self->{line};
1035     $self->{column_prev} = $self->{column};
1036     $self->{column}++;
1037     $self->{nc}
1038     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1039     } else {
1040     $self->{set_nc}->($self);
1041     }
1042    
1043    
1044     return ($self->{ct}); # start tag or end tag
1045    
1046     redo A;
1047     } elsif (0x0041 <= $self->{nc} and
1048     $self->{nc} <= 0x005A) { # A..Z
1049    
1050 wakaba 1.4 $self->{ct}->{tag_name}
1051     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1052 wakaba 1.1 # start tag or end tag
1053     ## Stay in this state
1054    
1055     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1056     $self->{line_prev} = $self->{line};
1057     $self->{column_prev} = $self->{column};
1058     $self->{column}++;
1059     $self->{nc}
1060     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1061     } else {
1062     $self->{set_nc}->($self);
1063     }
1064    
1065     redo A;
1066     } elsif ($self->{nc} == -1) {
1067     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1068     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1069    
1070     $self->{last_stag_name} = $self->{ct}->{tag_name};
1071     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1072     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1073     #if ($self->{ct}->{attributes}) {
1074     # ## NOTE: This state should never be reached.
1075     # !!! cp (40);
1076     # !!! parse-error (type => 'end tag attribute');
1077     #} else {
1078    
1079     #}
1080     } else {
1081     die "$0: $self->{ct}->{type}: Unknown token type";
1082     }
1083     $self->{state} = DATA_STATE;
1084 wakaba 1.5 $self->{s_kwd} = '';
1085 wakaba 1.1 # reconsume
1086    
1087     return ($self->{ct}); # start tag or end tag
1088    
1089     redo A;
1090     } elsif ($self->{nc} == 0x002F) { # /
1091    
1092     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1093    
1094     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1095     $self->{line_prev} = $self->{line};
1096     $self->{column_prev} = $self->{column};
1097     $self->{column}++;
1098     $self->{nc}
1099     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1100     } else {
1101     $self->{set_nc}->($self);
1102     }
1103    
1104     redo A;
1105     } else {
1106    
1107     $self->{ct}->{tag_name} .= chr $self->{nc};
1108     # start tag or end tag
1109     ## Stay in the state
1110    
1111     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1112     $self->{line_prev} = $self->{line};
1113     $self->{column_prev} = $self->{column};
1114     $self->{column}++;
1115     $self->{nc}
1116     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1117     } else {
1118     $self->{set_nc}->($self);
1119     }
1120    
1121     redo A;
1122     }
1123     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1124 wakaba 1.11 ## XML5: "Tag attribute name before state".
1125    
1126 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1127    
1128     ## Stay in the state
1129    
1130     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1131     $self->{line_prev} = $self->{line};
1132     $self->{column_prev} = $self->{column};
1133     $self->{column}++;
1134     $self->{nc}
1135     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1136     } else {
1137     $self->{set_nc}->($self);
1138     }
1139    
1140     redo A;
1141     } elsif ($self->{nc} == 0x003E) { # >
1142     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1143    
1144     $self->{last_stag_name} = $self->{ct}->{tag_name};
1145     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1146     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1147     if ($self->{ct}->{attributes}) {
1148    
1149     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1150     } else {
1151    
1152     }
1153     } else {
1154     die "$0: $self->{ct}->{type}: Unknown token type";
1155     }
1156     $self->{state} = DATA_STATE;
1157 wakaba 1.5 $self->{s_kwd} = '';
1158 wakaba 1.1
1159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1160     $self->{line_prev} = $self->{line};
1161     $self->{column_prev} = $self->{column};
1162     $self->{column}++;
1163     $self->{nc}
1164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1165     } else {
1166     $self->{set_nc}->($self);
1167     }
1168    
1169    
1170     return ($self->{ct}); # start tag or end tag
1171    
1172     redo A;
1173     } elsif (0x0041 <= $self->{nc} and
1174     $self->{nc} <= 0x005A) { # A..Z
1175    
1176     $self->{ca}
1177 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1178 wakaba 1.1 value => '',
1179     line => $self->{line}, column => $self->{column}};
1180     $self->{state} = ATTRIBUTE_NAME_STATE;
1181    
1182     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1183     $self->{line_prev} = $self->{line};
1184     $self->{column_prev} = $self->{column};
1185     $self->{column}++;
1186     $self->{nc}
1187     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1188     } else {
1189     $self->{set_nc}->($self);
1190     }
1191    
1192     redo A;
1193     } elsif ($self->{nc} == 0x002F) { # /
1194    
1195     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1196    
1197     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1198     $self->{line_prev} = $self->{line};
1199     $self->{column_prev} = $self->{column};
1200     $self->{column}++;
1201     $self->{nc}
1202     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1203     } else {
1204     $self->{set_nc}->($self);
1205     }
1206    
1207     redo A;
1208     } elsif ($self->{nc} == -1) {
1209     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1210     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1211    
1212     $self->{last_stag_name} = $self->{ct}->{tag_name};
1213     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1214     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1215     if ($self->{ct}->{attributes}) {
1216    
1217     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1218     } else {
1219    
1220     }
1221     } else {
1222     die "$0: $self->{ct}->{type}: Unknown token type";
1223     }
1224     $self->{state} = DATA_STATE;
1225 wakaba 1.5 $self->{s_kwd} = '';
1226 wakaba 1.1 # reconsume
1227    
1228     return ($self->{ct}); # start tag or end tag
1229    
1230     redo A;
1231     } else {
1232     if ({
1233     0x0022 => 1, # "
1234     0x0027 => 1, # '
1235     0x003D => 1, # =
1236     }->{$self->{nc}}) {
1237    
1238 wakaba 1.11 ## XML5: Not a parse error.
1239 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1240     } else {
1241    
1242 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1243 wakaba 1.1 }
1244     $self->{ca}
1245     = {name => chr ($self->{nc}),
1246     value => '',
1247     line => $self->{line}, column => $self->{column}};
1248     $self->{state} = ATTRIBUTE_NAME_STATE;
1249    
1250     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1251     $self->{line_prev} = $self->{line};
1252     $self->{column_prev} = $self->{column};
1253     $self->{column}++;
1254     $self->{nc}
1255     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1256     } else {
1257     $self->{set_nc}->($self);
1258     }
1259    
1260     redo A;
1261     }
1262     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1263 wakaba 1.11 ## XML5: "Tag attribute name state".
1264    
1265 wakaba 1.1 my $before_leave = sub {
1266     if (exists $self->{ct}->{attributes} # start tag or end tag
1267     ->{$self->{ca}->{name}}) { # MUST
1268    
1269     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1270     ## Discard $self->{ca} # MUST
1271     } else {
1272    
1273     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1274     = $self->{ca};
1275 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1276 wakaba 1.1 }
1277     }; # $before_leave
1278    
1279     if ($is_space->{$self->{nc}}) {
1280    
1281     $before_leave->();
1282     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1283    
1284     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1285     $self->{line_prev} = $self->{line};
1286     $self->{column_prev} = $self->{column};
1287     $self->{column}++;
1288     $self->{nc}
1289     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1290     } else {
1291     $self->{set_nc}->($self);
1292     }
1293    
1294     redo A;
1295     } elsif ($self->{nc} == 0x003D) { # =
1296    
1297     $before_leave->();
1298     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1299    
1300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301     $self->{line_prev} = $self->{line};
1302     $self->{column_prev} = $self->{column};
1303     $self->{column}++;
1304     $self->{nc}
1305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306     } else {
1307     $self->{set_nc}->($self);
1308     }
1309    
1310     redo A;
1311     } elsif ($self->{nc} == 0x003E) { # >
1312 wakaba 1.11 if ($self->{is_xml}) {
1313    
1314     ## XML5: Not a parse error.
1315     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1316     } else {
1317    
1318     }
1319    
1320 wakaba 1.1 $before_leave->();
1321     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1322    
1323     $self->{last_stag_name} = $self->{ct}->{tag_name};
1324     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1325    
1326     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1327     if ($self->{ct}->{attributes}) {
1328     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1329     }
1330     } else {
1331     die "$0: $self->{ct}->{type}: Unknown token type";
1332     }
1333     $self->{state} = DATA_STATE;
1334 wakaba 1.5 $self->{s_kwd} = '';
1335 wakaba 1.1
1336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1337     $self->{line_prev} = $self->{line};
1338     $self->{column_prev} = $self->{column};
1339     $self->{column}++;
1340     $self->{nc}
1341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1342     } else {
1343     $self->{set_nc}->($self);
1344     }
1345    
1346    
1347     return ($self->{ct}); # start tag or end tag
1348    
1349     redo A;
1350     } elsif (0x0041 <= $self->{nc} and
1351     $self->{nc} <= 0x005A) { # A..Z
1352    
1353 wakaba 1.4 $self->{ca}->{name}
1354     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1355 wakaba 1.1 ## Stay in the state
1356    
1357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1358     $self->{line_prev} = $self->{line};
1359     $self->{column_prev} = $self->{column};
1360     $self->{column}++;
1361     $self->{nc}
1362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1363     } else {
1364     $self->{set_nc}->($self);
1365     }
1366    
1367     redo A;
1368     } elsif ($self->{nc} == 0x002F) { # /
1369 wakaba 1.11 if ($self->{is_xml}) {
1370    
1371     ## XML5: Not a parse error.
1372     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1373     } else {
1374    
1375     }
1376 wakaba 1.1
1377     $before_leave->();
1378     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1379    
1380     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1381     $self->{line_prev} = $self->{line};
1382     $self->{column_prev} = $self->{column};
1383     $self->{column}++;
1384     $self->{nc}
1385     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1386     } else {
1387     $self->{set_nc}->($self);
1388     }
1389    
1390     redo A;
1391     } elsif ($self->{nc} == -1) {
1392     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1393     $before_leave->();
1394     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1395    
1396     $self->{last_stag_name} = $self->{ct}->{tag_name};
1397     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1398     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1399     if ($self->{ct}->{attributes}) {
1400    
1401     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1402     } else {
1403     ## NOTE: This state should never be reached.
1404    
1405     }
1406     } else {
1407     die "$0: $self->{ct}->{type}: Unknown token type";
1408     }
1409     $self->{state} = DATA_STATE;
1410 wakaba 1.5 $self->{s_kwd} = '';
1411 wakaba 1.1 # reconsume
1412    
1413     return ($self->{ct}); # start tag or end tag
1414    
1415     redo A;
1416     } else {
1417     if ($self->{nc} == 0x0022 or # "
1418     $self->{nc} == 0x0027) { # '
1419    
1420 wakaba 1.11 ## XML5: Not a parse error.
1421 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1422     } else {
1423    
1424     }
1425     $self->{ca}->{name} .= chr ($self->{nc});
1426     ## Stay in the state
1427    
1428     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1429     $self->{line_prev} = $self->{line};
1430     $self->{column_prev} = $self->{column};
1431     $self->{column}++;
1432     $self->{nc}
1433     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1434     } else {
1435     $self->{set_nc}->($self);
1436     }
1437    
1438     redo A;
1439     }
1440     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1441 wakaba 1.11 ## XML5: "Tag attribute name after state".
1442    
1443 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1444    
1445     ## Stay in the state
1446    
1447     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1448     $self->{line_prev} = $self->{line};
1449     $self->{column_prev} = $self->{column};
1450     $self->{column}++;
1451     $self->{nc}
1452     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1453     } else {
1454     $self->{set_nc}->($self);
1455     }
1456    
1457     redo A;
1458     } elsif ($self->{nc} == 0x003D) { # =
1459    
1460     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1461    
1462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1463     $self->{line_prev} = $self->{line};
1464     $self->{column_prev} = $self->{column};
1465     $self->{column}++;
1466     $self->{nc}
1467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1468     } else {
1469     $self->{set_nc}->($self);
1470     }
1471    
1472     redo A;
1473     } elsif ($self->{nc} == 0x003E) { # >
1474 wakaba 1.11 if ($self->{is_xml}) {
1475    
1476     ## XML5: Not a parse error.
1477     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1478     } else {
1479    
1480     }
1481    
1482 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1483    
1484     $self->{last_stag_name} = $self->{ct}->{tag_name};
1485     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1486     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1487     if ($self->{ct}->{attributes}) {
1488    
1489     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1490     } else {
1491     ## NOTE: This state should never be reached.
1492    
1493     }
1494     } else {
1495     die "$0: $self->{ct}->{type}: Unknown token type";
1496     }
1497     $self->{state} = DATA_STATE;
1498 wakaba 1.5 $self->{s_kwd} = '';
1499 wakaba 1.1
1500     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1501     $self->{line_prev} = $self->{line};
1502     $self->{column_prev} = $self->{column};
1503     $self->{column}++;
1504     $self->{nc}
1505     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1506     } else {
1507     $self->{set_nc}->($self);
1508     }
1509    
1510    
1511     return ($self->{ct}); # start tag or end tag
1512    
1513     redo A;
1514     } elsif (0x0041 <= $self->{nc} and
1515     $self->{nc} <= 0x005A) { # A..Z
1516    
1517     $self->{ca}
1518 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1519 wakaba 1.1 value => '',
1520     line => $self->{line}, column => $self->{column}};
1521     $self->{state} = ATTRIBUTE_NAME_STATE;
1522    
1523     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1524     $self->{line_prev} = $self->{line};
1525     $self->{column_prev} = $self->{column};
1526     $self->{column}++;
1527     $self->{nc}
1528     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1529     } else {
1530     $self->{set_nc}->($self);
1531     }
1532    
1533     redo A;
1534     } elsif ($self->{nc} == 0x002F) { # /
1535 wakaba 1.11 if ($self->{is_xml}) {
1536    
1537     ## XML5: Not a parse error.
1538     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1539     } else {
1540    
1541     }
1542 wakaba 1.1
1543     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1544    
1545     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1546     $self->{line_prev} = $self->{line};
1547     $self->{column_prev} = $self->{column};
1548     $self->{column}++;
1549     $self->{nc}
1550     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1551     } else {
1552     $self->{set_nc}->($self);
1553     }
1554    
1555     redo A;
1556     } elsif ($self->{nc} == -1) {
1557     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1558     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1559    
1560     $self->{last_stag_name} = $self->{ct}->{tag_name};
1561     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1562     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1563     if ($self->{ct}->{attributes}) {
1564    
1565     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1566     } else {
1567     ## NOTE: This state should never be reached.
1568    
1569     }
1570     } else {
1571     die "$0: $self->{ct}->{type}: Unknown token type";
1572     }
1573 wakaba 1.5 $self->{s_kwd} = '';
1574 wakaba 1.1 $self->{state} = DATA_STATE;
1575     # reconsume
1576    
1577     return ($self->{ct}); # start tag or end tag
1578    
1579     redo A;
1580     } else {
1581 wakaba 1.11 if ($self->{is_xml}) {
1582    
1583     ## XML5: Not a parse error.
1584     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1585     } else {
1586    
1587     }
1588    
1589 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1590     $self->{nc} == 0x0027) { # '
1591    
1592 wakaba 1.11 ## XML5: Not a parse error.
1593 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1594     } else {
1595    
1596     }
1597     $self->{ca}
1598     = {name => chr ($self->{nc}),
1599     value => '',
1600     line => $self->{line}, column => $self->{column}};
1601     $self->{state} = ATTRIBUTE_NAME_STATE;
1602    
1603     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1604     $self->{line_prev} = $self->{line};
1605     $self->{column_prev} = $self->{column};
1606     $self->{column}++;
1607     $self->{nc}
1608     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1609     } else {
1610     $self->{set_nc}->($self);
1611     }
1612    
1613     redo A;
1614     }
1615     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1616 wakaba 1.11 ## XML5: "Tag attribute value before state".
1617    
1618 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1619    
1620     ## Stay in the state
1621    
1622     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1623     $self->{line_prev} = $self->{line};
1624     $self->{column_prev} = $self->{column};
1625     $self->{column}++;
1626     $self->{nc}
1627     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1628     } else {
1629     $self->{set_nc}->($self);
1630     }
1631    
1632     redo A;
1633     } elsif ($self->{nc} == 0x0022) { # "
1634    
1635     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1636    
1637     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1638     $self->{line_prev} = $self->{line};
1639     $self->{column_prev} = $self->{column};
1640     $self->{column}++;
1641     $self->{nc}
1642     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1643     } else {
1644     $self->{set_nc}->($self);
1645     }
1646    
1647     redo A;
1648     } elsif ($self->{nc} == 0x0026) { # &
1649    
1650     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1651     ## reconsume
1652     redo A;
1653     } elsif ($self->{nc} == 0x0027) { # '
1654    
1655     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1656    
1657     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1658     $self->{line_prev} = $self->{line};
1659     $self->{column_prev} = $self->{column};
1660     $self->{column}++;
1661     $self->{nc}
1662     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1663     } else {
1664     $self->{set_nc}->($self);
1665     }
1666    
1667     redo A;
1668     } elsif ($self->{nc} == 0x003E) { # >
1669     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1670     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1671    
1672     $self->{last_stag_name} = $self->{ct}->{tag_name};
1673     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1674     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1675     if ($self->{ct}->{attributes}) {
1676    
1677     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1678     } else {
1679     ## NOTE: This state should never be reached.
1680    
1681     }
1682     } else {
1683     die "$0: $self->{ct}->{type}: Unknown token type";
1684     }
1685     $self->{state} = DATA_STATE;
1686 wakaba 1.5 $self->{s_kwd} = '';
1687 wakaba 1.1
1688     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1689     $self->{line_prev} = $self->{line};
1690     $self->{column_prev} = $self->{column};
1691     $self->{column}++;
1692     $self->{nc}
1693     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1694     } else {
1695     $self->{set_nc}->($self);
1696     }
1697    
1698    
1699     return ($self->{ct}); # start tag or end tag
1700    
1701     redo A;
1702     } elsif ($self->{nc} == -1) {
1703     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1704     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1705    
1706     $self->{last_stag_name} = $self->{ct}->{tag_name};
1707     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1708     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1709     if ($self->{ct}->{attributes}) {
1710    
1711     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1712     } else {
1713     ## NOTE: This state should never be reached.
1714    
1715     }
1716     } else {
1717     die "$0: $self->{ct}->{type}: Unknown token type";
1718     }
1719     $self->{state} = DATA_STATE;
1720 wakaba 1.5 $self->{s_kwd} = '';
1721 wakaba 1.1 ## reconsume
1722    
1723     return ($self->{ct}); # start tag or end tag
1724    
1725     redo A;
1726     } else {
1727     if ($self->{nc} == 0x003D) { # =
1728    
1729 wakaba 1.11 ## XML5: Not a parse error.
1730 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1731 wakaba 1.11 } elsif ($self->{is_xml}) {
1732    
1733     ## XML5: No parse error.
1734     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1735 wakaba 1.1 } else {
1736    
1737     }
1738     $self->{ca}->{value} .= chr ($self->{nc});
1739     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1740    
1741     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1742     $self->{line_prev} = $self->{line};
1743     $self->{column_prev} = $self->{column};
1744     $self->{column}++;
1745     $self->{nc}
1746     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1747     } else {
1748     $self->{set_nc}->($self);
1749     }
1750    
1751     redo A;
1752     }
1753     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1754 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1755     ## ATTLIST attribute value double quoted state".
1756 wakaba 1.11
1757 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1758 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1759    
1760     ## XML5: "DOCTYPE ATTLIST name after state".
1761     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1762     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1763     } else {
1764    
1765     ## XML5: "Tag attribute name before state".
1766     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1767     }
1768 wakaba 1.1
1769     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1770     $self->{line_prev} = $self->{line};
1771     $self->{column_prev} = $self->{column};
1772     $self->{column}++;
1773     $self->{nc}
1774     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1775     } else {
1776     $self->{set_nc}->($self);
1777     }
1778    
1779     redo A;
1780     } elsif ($self->{nc} == 0x0026) { # &
1781    
1782 wakaba 1.11 ## XML5: Not defined yet.
1783    
1784 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1785     ## "entity in attribute value state". In this implementation, the
1786     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1787     ## implementation of the "consume a character reference" algorithm.
1788     $self->{prev_state} = $self->{state};
1789     $self->{entity_add} = 0x0022; # "
1790     $self->{state} = ENTITY_STATE;
1791    
1792     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1793     $self->{line_prev} = $self->{line};
1794     $self->{column_prev} = $self->{column};
1795     $self->{column}++;
1796     $self->{nc}
1797     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1798     } else {
1799     $self->{set_nc}->($self);
1800     }
1801    
1802     redo A;
1803     } elsif ($self->{nc} == -1) {
1804     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1805     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1806    
1807     $self->{last_stag_name} = $self->{ct}->{tag_name};
1808 wakaba 1.15
1809     $self->{state} = DATA_STATE;
1810     $self->{s_kwd} = '';
1811     ## reconsume
1812     return ($self->{ct}); # start tag
1813     redo A;
1814 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1815     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1816     if ($self->{ct}->{attributes}) {
1817    
1818     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1819     } else {
1820     ## NOTE: This state should never be reached.
1821    
1822     }
1823 wakaba 1.15
1824     $self->{state} = DATA_STATE;
1825     $self->{s_kwd} = '';
1826     ## reconsume
1827     return ($self->{ct}); # end tag
1828     redo A;
1829     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1830     ## XML5: No parse error above; not defined yet.
1831     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1832     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1833     ## Reconsume.
1834     return ($self->{ct}); # ATTLIST
1835     redo A;
1836 wakaba 1.1 } else {
1837     die "$0: $self->{ct}->{type}: Unknown token type";
1838     }
1839     } else {
1840 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1841 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1842    
1843     ## XML5: Not a parse error.
1844     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1845     } else {
1846    
1847     }
1848 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1849     $self->{read_until}->($self->{ca}->{value},
1850 wakaba 1.11 q["&<],
1851 wakaba 1.1 length $self->{ca}->{value});
1852    
1853     ## Stay in the state
1854    
1855     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1856     $self->{line_prev} = $self->{line};
1857     $self->{column_prev} = $self->{column};
1858     $self->{column}++;
1859     $self->{nc}
1860     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1861     } else {
1862     $self->{set_nc}->($self);
1863     }
1864    
1865     redo A;
1866     }
1867     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1868 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1869     ## ATTLIST attribute value single quoted state".
1870 wakaba 1.11
1871 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1872 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1873    
1874     ## XML5: "DOCTYPE ATTLIST name after state".
1875     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1876     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1877     } else {
1878    
1879     ## XML5: "Before attribute name state" (sic).
1880     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1881     }
1882 wakaba 1.1
1883     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1884     $self->{line_prev} = $self->{line};
1885     $self->{column_prev} = $self->{column};
1886     $self->{column}++;
1887     $self->{nc}
1888     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1889     } else {
1890     $self->{set_nc}->($self);
1891     }
1892    
1893     redo A;
1894     } elsif ($self->{nc} == 0x0026) { # &
1895    
1896 wakaba 1.11 ## XML5: Not defined yet.
1897    
1898 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1899     ## "entity in attribute value state". In this implementation, the
1900     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1901     ## implementation of the "consume a character reference" algorithm.
1902     $self->{entity_add} = 0x0027; # '
1903     $self->{prev_state} = $self->{state};
1904     $self->{state} = ENTITY_STATE;
1905    
1906     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1907     $self->{line_prev} = $self->{line};
1908     $self->{column_prev} = $self->{column};
1909     $self->{column}++;
1910     $self->{nc}
1911     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1912     } else {
1913     $self->{set_nc}->($self);
1914     }
1915    
1916     redo A;
1917     } elsif ($self->{nc} == -1) {
1918     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1919     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1920    
1921     $self->{last_stag_name} = $self->{ct}->{tag_name};
1922 wakaba 1.15
1923     $self->{state} = DATA_STATE;
1924     $self->{s_kwd} = '';
1925     ## reconsume
1926     return ($self->{ct}); # start tag
1927     redo A;
1928 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1929     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1930     if ($self->{ct}->{attributes}) {
1931    
1932     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1933     } else {
1934     ## NOTE: This state should never be reached.
1935    
1936     }
1937 wakaba 1.15
1938     $self->{state} = DATA_STATE;
1939     $self->{s_kwd} = '';
1940     ## reconsume
1941     return ($self->{ct}); # end tag
1942     redo A;
1943     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1944     ## XML5: No parse error above; not defined yet.
1945     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1946     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1947     ## Reconsume.
1948     return ($self->{ct}); # ATTLIST
1949     redo A;
1950 wakaba 1.1 } else {
1951     die "$0: $self->{ct}->{type}: Unknown token type";
1952     }
1953     } else {
1954 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1955 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1956    
1957     ## XML5: Not a parse error.
1958     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1959     } else {
1960    
1961     }
1962 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1963     $self->{read_until}->($self->{ca}->{value},
1964 wakaba 1.11 q['&<],
1965 wakaba 1.1 length $self->{ca}->{value});
1966    
1967     ## Stay in the state
1968    
1969     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1970     $self->{line_prev} = $self->{line};
1971     $self->{column_prev} = $self->{column};
1972     $self->{column}++;
1973     $self->{nc}
1974     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1975     } else {
1976     $self->{set_nc}->($self);
1977     }
1978    
1979     redo A;
1980     }
1981     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1982 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1983    
1984 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1985 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1986    
1987     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1988     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1989     } else {
1990    
1991     ## XML5: "Tag attribute name before state".
1992     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1993     }
1994 wakaba 1.1
1995     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1996     $self->{line_prev} = $self->{line};
1997     $self->{column_prev} = $self->{column};
1998     $self->{column}++;
1999     $self->{nc}
2000     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2001     } else {
2002     $self->{set_nc}->($self);
2003     }
2004    
2005     redo A;
2006     } elsif ($self->{nc} == 0x0026) { # &
2007    
2008 wakaba 1.11
2009     ## XML5: Not defined yet.
2010    
2011 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2012     ## "entity in attribute value state". In this implementation, the
2013     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2014     ## implementation of the "consume a character reference" algorithm.
2015     $self->{entity_add} = -1;
2016     $self->{prev_state} = $self->{state};
2017     $self->{state} = ENTITY_STATE;
2018    
2019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2020     $self->{line_prev} = $self->{line};
2021     $self->{column_prev} = $self->{column};
2022     $self->{column}++;
2023     $self->{nc}
2024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2025     } else {
2026     $self->{set_nc}->($self);
2027     }
2028    
2029     redo A;
2030     } elsif ($self->{nc} == 0x003E) { # >
2031     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2032    
2033     $self->{last_stag_name} = $self->{ct}->{tag_name};
2034 wakaba 1.15
2035     $self->{state} = DATA_STATE;
2036     $self->{s_kwd} = '';
2037    
2038     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2039     $self->{line_prev} = $self->{line};
2040     $self->{column_prev} = $self->{column};
2041     $self->{column}++;
2042     $self->{nc}
2043     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2044     } else {
2045     $self->{set_nc}->($self);
2046     }
2047    
2048     return ($self->{ct}); # start tag
2049     redo A;
2050 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2051     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2052     if ($self->{ct}->{attributes}) {
2053    
2054     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2055     } else {
2056     ## NOTE: This state should never be reached.
2057    
2058     }
2059 wakaba 1.15
2060     $self->{state} = DATA_STATE;
2061     $self->{s_kwd} = '';
2062    
2063     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2064     $self->{line_prev} = $self->{line};
2065     $self->{column_prev} = $self->{column};
2066     $self->{column}++;
2067     $self->{nc}
2068     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2069     } else {
2070     $self->{set_nc}->($self);
2071     }
2072    
2073     return ($self->{ct}); # end tag
2074     redo A;
2075     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2076     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2077     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2078    
2079 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2080     $self->{line_prev} = $self->{line};
2081     $self->{column_prev} = $self->{column};
2082     $self->{column}++;
2083     $self->{nc}
2084     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2085     } else {
2086     $self->{set_nc}->($self);
2087     }
2088    
2089 wakaba 1.15 return ($self->{ct}); # ATTLIST
2090     redo A;
2091     } else {
2092     die "$0: $self->{ct}->{type}: Unknown token type";
2093     }
2094 wakaba 1.1 } elsif ($self->{nc} == -1) {
2095     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2096    
2097 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2098 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2099 wakaba 1.15
2100     $self->{state} = DATA_STATE;
2101     $self->{s_kwd} = '';
2102     ## reconsume
2103     return ($self->{ct}); # start tag
2104     redo A;
2105 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2106 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2107 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2108     if ($self->{ct}->{attributes}) {
2109    
2110     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2111     } else {
2112     ## NOTE: This state should never be reached.
2113    
2114     }
2115 wakaba 1.15
2116     $self->{state} = DATA_STATE;
2117     $self->{s_kwd} = '';
2118     ## reconsume
2119     return ($self->{ct}); # end tag
2120     redo A;
2121     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2122     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2123     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2124     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2125     ## Reconsume.
2126     return ($self->{ct}); # ATTLIST
2127     redo A;
2128 wakaba 1.1 } else {
2129     die "$0: $self->{ct}->{type}: Unknown token type";
2130     }
2131     } else {
2132     if ({
2133     0x0022 => 1, # "
2134     0x0027 => 1, # '
2135     0x003D => 1, # =
2136     }->{$self->{nc}}) {
2137    
2138 wakaba 1.11 ## XML5: Not a parse error.
2139 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2140     } else {
2141    
2142     }
2143     $self->{ca}->{value} .= chr ($self->{nc});
2144     $self->{read_until}->($self->{ca}->{value},
2145     q["'=& >],
2146     length $self->{ca}->{value});
2147    
2148     ## Stay in the state
2149    
2150     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2151     $self->{line_prev} = $self->{line};
2152     $self->{column_prev} = $self->{column};
2153     $self->{column}++;
2154     $self->{nc}
2155     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2156     } else {
2157     $self->{set_nc}->($self);
2158     }
2159    
2160     redo A;
2161     }
2162     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2163     if ($is_space->{$self->{nc}}) {
2164    
2165     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2166    
2167     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2168     $self->{line_prev} = $self->{line};
2169     $self->{column_prev} = $self->{column};
2170     $self->{column}++;
2171     $self->{nc}
2172     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2173     } else {
2174     $self->{set_nc}->($self);
2175     }
2176    
2177     redo A;
2178     } elsif ($self->{nc} == 0x003E) { # >
2179     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2180    
2181     $self->{last_stag_name} = $self->{ct}->{tag_name};
2182     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2183     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2184     if ($self->{ct}->{attributes}) {
2185    
2186     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2187     } else {
2188     ## NOTE: This state should never be reached.
2189    
2190     }
2191     } else {
2192     die "$0: $self->{ct}->{type}: Unknown token type";
2193     }
2194     $self->{state} = DATA_STATE;
2195 wakaba 1.5 $self->{s_kwd} = '';
2196 wakaba 1.1
2197     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2198     $self->{line_prev} = $self->{line};
2199     $self->{column_prev} = $self->{column};
2200     $self->{column}++;
2201     $self->{nc}
2202     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2203     } else {
2204     $self->{set_nc}->($self);
2205     }
2206    
2207    
2208     return ($self->{ct}); # start tag or end tag
2209    
2210     redo A;
2211     } elsif ($self->{nc} == 0x002F) { # /
2212    
2213     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2214    
2215     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2216     $self->{line_prev} = $self->{line};
2217     $self->{column_prev} = $self->{column};
2218     $self->{column}++;
2219     $self->{nc}
2220     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2221     } else {
2222     $self->{set_nc}->($self);
2223     }
2224    
2225     redo A;
2226     } elsif ($self->{nc} == -1) {
2227     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2228     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2229    
2230     $self->{last_stag_name} = $self->{ct}->{tag_name};
2231     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2232     if ($self->{ct}->{attributes}) {
2233    
2234     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2235     } else {
2236     ## NOTE: This state should never be reached.
2237    
2238     }
2239     } else {
2240     die "$0: $self->{ct}->{type}: Unknown token type";
2241     }
2242     $self->{state} = DATA_STATE;
2243 wakaba 1.5 $self->{s_kwd} = '';
2244 wakaba 1.1 ## Reconsume.
2245     return ($self->{ct}); # start tag or end tag
2246     redo A;
2247     } else {
2248    
2249     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2250     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2251     ## reconsume
2252     redo A;
2253     }
2254     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2255 wakaba 1.11 ## XML5: "Empty tag state".
2256    
2257 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2258     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2259    
2260     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2261     ## TODO: Different type than slash in start tag
2262     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2263     if ($self->{ct}->{attributes}) {
2264    
2265     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2266     } else {
2267    
2268     }
2269     ## TODO: Test |<title></title/>|
2270     } else {
2271    
2272     $self->{self_closing} = 1;
2273     }
2274    
2275     $self->{state} = DATA_STATE;
2276 wakaba 1.5 $self->{s_kwd} = '';
2277 wakaba 1.1
2278     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2279     $self->{line_prev} = $self->{line};
2280     $self->{column_prev} = $self->{column};
2281     $self->{column}++;
2282     $self->{nc}
2283     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2284     } else {
2285     $self->{set_nc}->($self);
2286     }
2287    
2288    
2289     return ($self->{ct}); # start tag or end tag
2290    
2291     redo A;
2292     } elsif ($self->{nc} == -1) {
2293     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2294     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2295    
2296     $self->{last_stag_name} = $self->{ct}->{tag_name};
2297     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2298     if ($self->{ct}->{attributes}) {
2299    
2300     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2301     } else {
2302     ## NOTE: This state should never be reached.
2303    
2304     }
2305     } else {
2306     die "$0: $self->{ct}->{type}: Unknown token type";
2307     }
2308 wakaba 1.11 ## XML5: "Tag attribute name before state".
2309 wakaba 1.1 $self->{state} = DATA_STATE;
2310 wakaba 1.5 $self->{s_kwd} = '';
2311 wakaba 1.1 ## Reconsume.
2312     return ($self->{ct}); # start tag or end tag
2313     redo A;
2314     } else {
2315    
2316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2317     ## TODO: This error type is wrong.
2318     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2319     ## Reconsume.
2320     redo A;
2321     }
2322     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2323 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2324    
2325 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
2326     ## consumes characters on a one-by-one basis.
2327    
2328     if ($self->{nc} == 0x003E) { # >
2329 wakaba 1.13 if ($self->{in_subset}) {
2330    
2331     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2332     } else {
2333    
2334     $self->{state} = DATA_STATE;
2335     $self->{s_kwd} = '';
2336     }
2337 wakaba 1.1
2338     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2339     $self->{line_prev} = $self->{line};
2340     $self->{column_prev} = $self->{column};
2341     $self->{column}++;
2342     $self->{nc}
2343     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2344     } else {
2345     $self->{set_nc}->($self);
2346     }
2347    
2348    
2349     return ($self->{ct}); # comment
2350     redo A;
2351     } elsif ($self->{nc} == -1) {
2352 wakaba 1.13 if ($self->{in_subset}) {
2353    
2354     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2355     } else {
2356    
2357     $self->{state} = DATA_STATE;
2358     $self->{s_kwd} = '';
2359     }
2360 wakaba 1.1 ## reconsume
2361    
2362     return ($self->{ct}); # comment
2363     redo A;
2364     } else {
2365    
2366     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2367     $self->{read_until}->($self->{ct}->{data},
2368     q[>],
2369     length $self->{ct}->{data});
2370    
2371     ## Stay in the state.
2372    
2373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2374     $self->{line_prev} = $self->{line};
2375     $self->{column_prev} = $self->{column};
2376     $self->{column}++;
2377     $self->{nc}
2378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2379     } else {
2380     $self->{set_nc}->($self);
2381     }
2382    
2383     redo A;
2384     }
2385     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2386 wakaba 1.14 ## XML5: "Markup declaration state".
2387 wakaba 1.1
2388     if ($self->{nc} == 0x002D) { # -
2389    
2390     $self->{state} = MD_HYPHEN_STATE;
2391    
2392     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2393     $self->{line_prev} = $self->{line};
2394     $self->{column_prev} = $self->{column};
2395     $self->{column}++;
2396     $self->{nc}
2397     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2398     } else {
2399     $self->{set_nc}->($self);
2400     }
2401    
2402     redo A;
2403     } elsif ($self->{nc} == 0x0044 or # D
2404     $self->{nc} == 0x0064) { # d
2405     ## ASCII case-insensitive.
2406    
2407     $self->{state} = MD_DOCTYPE_STATE;
2408 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2409 wakaba 1.1
2410     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2411     $self->{line_prev} = $self->{line};
2412     $self->{column_prev} = $self->{column};
2413     $self->{column}++;
2414     $self->{nc}
2415     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2416     } else {
2417     $self->{set_nc}->($self);
2418     }
2419    
2420     redo A;
2421 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2422     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2423     $self->{is_xml}) and
2424 wakaba 1.1 $self->{nc} == 0x005B) { # [
2425    
2426     $self->{state} = MD_CDATA_STATE;
2427 wakaba 1.12 $self->{kwd} = '[';
2428 wakaba 1.1
2429     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2430     $self->{line_prev} = $self->{line};
2431     $self->{column_prev} = $self->{column};
2432     $self->{column}++;
2433     $self->{nc}
2434     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2435     } else {
2436     $self->{set_nc}->($self);
2437     }
2438    
2439     redo A;
2440     } else {
2441    
2442     }
2443    
2444     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2445     line => $self->{line_prev},
2446     column => $self->{column_prev} - 1);
2447     ## Reconsume.
2448     $self->{state} = BOGUS_COMMENT_STATE;
2449     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2450     line => $self->{line_prev},
2451     column => $self->{column_prev} - 1,
2452     };
2453     redo A;
2454     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2455     if ($self->{nc} == 0x002D) { # -
2456    
2457     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2458     line => $self->{line_prev},
2459     column => $self->{column_prev} - 2,
2460     };
2461 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2462 wakaba 1.1
2463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2464     $self->{line_prev} = $self->{line};
2465     $self->{column_prev} = $self->{column};
2466     $self->{column}++;
2467     $self->{nc}
2468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2469     } else {
2470     $self->{set_nc}->($self);
2471     }
2472    
2473     redo A;
2474     } else {
2475    
2476     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2477     line => $self->{line_prev},
2478     column => $self->{column_prev} - 2);
2479     $self->{state} = BOGUS_COMMENT_STATE;
2480     ## Reconsume.
2481     $self->{ct} = {type => COMMENT_TOKEN,
2482     data => '-',
2483     line => $self->{line_prev},
2484     column => $self->{column_prev} - 2,
2485     };
2486     redo A;
2487     }
2488     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2489     ## ASCII case-insensitive.
2490     if ($self->{nc} == [
2491     undef,
2492     0x004F, # O
2493     0x0043, # C
2494     0x0054, # T
2495     0x0059, # Y
2496     0x0050, # P
2497 wakaba 1.12 ]->[length $self->{kwd}] or
2498 wakaba 1.1 $self->{nc} == [
2499     undef,
2500     0x006F, # o
2501     0x0063, # c
2502     0x0074, # t
2503     0x0079, # y
2504     0x0070, # p
2505 wakaba 1.12 ]->[length $self->{kwd}]) {
2506 wakaba 1.1
2507     ## Stay in the state.
2508 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2509 wakaba 1.1
2510     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2511     $self->{line_prev} = $self->{line};
2512     $self->{column_prev} = $self->{column};
2513     $self->{column}++;
2514     $self->{nc}
2515     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2516     } else {
2517     $self->{set_nc}->($self);
2518     }
2519    
2520     redo A;
2521 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2522 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2523     $self->{nc} == 0x0065)) { # e
2524 wakaba 1.12 if ($self->{is_xml} and
2525     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2526 wakaba 1.10
2527     ## XML5: case-sensitive.
2528     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2529     text => 'DOCTYPE',
2530     line => $self->{line_prev},
2531     column => $self->{column_prev} - 5);
2532     } else {
2533    
2534     }
2535 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2536     $self->{ct} = {type => DOCTYPE_TOKEN,
2537     quirks => 1,
2538     line => $self->{line_prev},
2539     column => $self->{column_prev} - 7,
2540     };
2541    
2542     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2543     $self->{line_prev} = $self->{line};
2544     $self->{column_prev} = $self->{column};
2545     $self->{column}++;
2546     $self->{nc}
2547     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2548     } else {
2549     $self->{set_nc}->($self);
2550     }
2551    
2552     redo A;
2553     } else {
2554    
2555     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2556     line => $self->{line_prev},
2557 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2558 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2559     ## Reconsume.
2560     $self->{ct} = {type => COMMENT_TOKEN,
2561 wakaba 1.12 data => $self->{kwd},
2562 wakaba 1.1 line => $self->{line_prev},
2563 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2564 wakaba 1.1 };
2565     redo A;
2566     }
2567     } elsif ($self->{state} == MD_CDATA_STATE) {
2568     if ($self->{nc} == {
2569     '[' => 0x0043, # C
2570     '[C' => 0x0044, # D
2571     '[CD' => 0x0041, # A
2572     '[CDA' => 0x0054, # T
2573     '[CDAT' => 0x0041, # A
2574 wakaba 1.12 }->{$self->{kwd}}) {
2575 wakaba 1.1
2576     ## Stay in the state.
2577 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2578 wakaba 1.1
2579     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2580     $self->{line_prev} = $self->{line};
2581     $self->{column_prev} = $self->{column};
2582     $self->{column}++;
2583     $self->{nc}
2584     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2585     } else {
2586     $self->{set_nc}->($self);
2587     }
2588    
2589     redo A;
2590 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2591 wakaba 1.1 $self->{nc} == 0x005B) { # [
2592 wakaba 1.6 if ($self->{is_xml} and
2593     not $self->{tainted} and
2594     @{$self->{open_elements} or []} == 0) {
2595 wakaba 1.8
2596 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2597     line => $self->{line_prev},
2598     column => $self->{column_prev} - 7);
2599     $self->{tainted} = 1;
2600 wakaba 1.8 } else {
2601    
2602 wakaba 1.6 }
2603    
2604 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2605     data => '',
2606     line => $self->{line_prev},
2607     column => $self->{column_prev} - 7};
2608     $self->{state} = CDATA_SECTION_STATE;
2609    
2610     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2611     $self->{line_prev} = $self->{line};
2612     $self->{column_prev} = $self->{column};
2613     $self->{column}++;
2614     $self->{nc}
2615     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2616     } else {
2617     $self->{set_nc}->($self);
2618     }
2619    
2620     redo A;
2621     } else {
2622    
2623     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2624     line => $self->{line_prev},
2625 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2626 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2627     ## Reconsume.
2628     $self->{ct} = {type => COMMENT_TOKEN,
2629 wakaba 1.12 data => $self->{kwd},
2630 wakaba 1.1 line => $self->{line_prev},
2631 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2632 wakaba 1.1 };
2633     redo A;
2634     }
2635     } elsif ($self->{state} == COMMENT_START_STATE) {
2636     if ($self->{nc} == 0x002D) { # -
2637    
2638     $self->{state} = COMMENT_START_DASH_STATE;
2639    
2640     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2641     $self->{line_prev} = $self->{line};
2642     $self->{column_prev} = $self->{column};
2643     $self->{column}++;
2644     $self->{nc}
2645     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2646     } else {
2647     $self->{set_nc}->($self);
2648     }
2649    
2650     redo A;
2651     } elsif ($self->{nc} == 0x003E) { # >
2652     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2653 wakaba 1.13 if ($self->{in_subset}) {
2654    
2655     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2656     } else {
2657    
2658     $self->{state} = DATA_STATE;
2659     $self->{s_kwd} = '';
2660     }
2661 wakaba 1.1
2662     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2663     $self->{line_prev} = $self->{line};
2664     $self->{column_prev} = $self->{column};
2665     $self->{column}++;
2666     $self->{nc}
2667     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2668     } else {
2669     $self->{set_nc}->($self);
2670     }
2671    
2672    
2673     return ($self->{ct}); # comment
2674    
2675     redo A;
2676     } elsif ($self->{nc} == -1) {
2677     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2678 wakaba 1.13 if ($self->{in_subset}) {
2679    
2680     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2681     } else {
2682    
2683     $self->{state} = DATA_STATE;
2684     $self->{s_kwd} = '';
2685     }
2686 wakaba 1.1 ## reconsume
2687    
2688     return ($self->{ct}); # comment
2689    
2690     redo A;
2691     } else {
2692    
2693     $self->{ct}->{data} # comment
2694     .= chr ($self->{nc});
2695     $self->{state} = COMMENT_STATE;
2696    
2697     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2698     $self->{line_prev} = $self->{line};
2699     $self->{column_prev} = $self->{column};
2700     $self->{column}++;
2701     $self->{nc}
2702     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2703     } else {
2704     $self->{set_nc}->($self);
2705     }
2706    
2707     redo A;
2708     }
2709     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2710     if ($self->{nc} == 0x002D) { # -
2711    
2712     $self->{state} = COMMENT_END_STATE;
2713    
2714     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2715     $self->{line_prev} = $self->{line};
2716     $self->{column_prev} = $self->{column};
2717     $self->{column}++;
2718     $self->{nc}
2719     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2720     } else {
2721     $self->{set_nc}->($self);
2722     }
2723    
2724     redo A;
2725     } elsif ($self->{nc} == 0x003E) { # >
2726     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2727 wakaba 1.13 if ($self->{in_subset}) {
2728    
2729     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2730     } else {
2731    
2732     $self->{state} = DATA_STATE;
2733     $self->{s_kwd} = '';
2734     }
2735 wakaba 1.1
2736     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2737     $self->{line_prev} = $self->{line};
2738     $self->{column_prev} = $self->{column};
2739     $self->{column}++;
2740     $self->{nc}
2741     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2742     } else {
2743     $self->{set_nc}->($self);
2744     }
2745    
2746    
2747     return ($self->{ct}); # comment
2748    
2749     redo A;
2750     } elsif ($self->{nc} == -1) {
2751     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2752 wakaba 1.13 if ($self->{in_subset}) {
2753    
2754     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2755     } else {
2756    
2757     $self->{state} = DATA_STATE;
2758     $self->{s_kwd} = '';
2759     }
2760 wakaba 1.1 ## reconsume
2761    
2762     return ($self->{ct}); # comment
2763    
2764     redo A;
2765     } else {
2766    
2767     $self->{ct}->{data} # comment
2768     .= '-' . chr ($self->{nc});
2769     $self->{state} = COMMENT_STATE;
2770    
2771     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2772     $self->{line_prev} = $self->{line};
2773     $self->{column_prev} = $self->{column};
2774     $self->{column}++;
2775     $self->{nc}
2776     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2777     } else {
2778     $self->{set_nc}->($self);
2779     }
2780    
2781     redo A;
2782     }
2783     } elsif ($self->{state} == COMMENT_STATE) {
2784 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2785    
2786 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2787    
2788     $self->{state} = COMMENT_END_DASH_STATE;
2789    
2790     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2791     $self->{line_prev} = $self->{line};
2792     $self->{column_prev} = $self->{column};
2793     $self->{column}++;
2794     $self->{nc}
2795     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2796     } else {
2797     $self->{set_nc}->($self);
2798     }
2799    
2800     redo A;
2801     } elsif ($self->{nc} == -1) {
2802     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2803 wakaba 1.13 if ($self->{in_subset}) {
2804    
2805     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2806     } else {
2807    
2808     $self->{state} = DATA_STATE;
2809     $self->{s_kwd} = '';
2810     }
2811 wakaba 1.1 ## reconsume
2812    
2813     return ($self->{ct}); # comment
2814    
2815     redo A;
2816     } else {
2817    
2818     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2819     $self->{read_until}->($self->{ct}->{data},
2820     q[-],
2821     length $self->{ct}->{data});
2822    
2823     ## Stay in the state
2824    
2825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2826     $self->{line_prev} = $self->{line};
2827     $self->{column_prev} = $self->{column};
2828     $self->{column}++;
2829     $self->{nc}
2830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2831     } else {
2832     $self->{set_nc}->($self);
2833     }
2834    
2835     redo A;
2836     }
2837     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2838 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2839 wakaba 1.10
2840 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2841    
2842     $self->{state} = COMMENT_END_STATE;
2843    
2844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2845     $self->{line_prev} = $self->{line};
2846     $self->{column_prev} = $self->{column};
2847     $self->{column}++;
2848     $self->{nc}
2849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2850     } else {
2851     $self->{set_nc}->($self);
2852     }
2853    
2854     redo A;
2855     } elsif ($self->{nc} == -1) {
2856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2857 wakaba 1.13 if ($self->{in_subset}) {
2858    
2859     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2860     } else {
2861    
2862     $self->{state} = DATA_STATE;
2863     $self->{s_kwd} = '';
2864     }
2865 wakaba 1.1 ## reconsume
2866    
2867     return ($self->{ct}); # comment
2868    
2869     redo A;
2870     } else {
2871    
2872     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2873     $self->{state} = COMMENT_STATE;
2874    
2875     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2876     $self->{line_prev} = $self->{line};
2877     $self->{column_prev} = $self->{column};
2878     $self->{column}++;
2879     $self->{nc}
2880     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2881     } else {
2882     $self->{set_nc}->($self);
2883     }
2884    
2885     redo A;
2886     }
2887     } elsif ($self->{state} == COMMENT_END_STATE) {
2888 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2889    
2890 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2891 wakaba 1.13 if ($self->{in_subset}) {
2892    
2893     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2894     } else {
2895    
2896     $self->{state} = DATA_STATE;
2897     $self->{s_kwd} = '';
2898     }
2899 wakaba 1.1
2900     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2901     $self->{line_prev} = $self->{line};
2902     $self->{column_prev} = $self->{column};
2903     $self->{column}++;
2904     $self->{nc}
2905     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2906     } else {
2907     $self->{set_nc}->($self);
2908     }
2909    
2910    
2911     return ($self->{ct}); # comment
2912    
2913     redo A;
2914     } elsif ($self->{nc} == 0x002D) { # -
2915    
2916 wakaba 1.10 ## XML5: Not a parse error.
2917 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2918     line => $self->{line_prev},
2919     column => $self->{column_prev});
2920     $self->{ct}->{data} .= '-'; # comment
2921     ## Stay in the state
2922    
2923     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2924     $self->{line_prev} = $self->{line};
2925     $self->{column_prev} = $self->{column};
2926     $self->{column}++;
2927     $self->{nc}
2928     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2929     } else {
2930     $self->{set_nc}->($self);
2931     }
2932    
2933     redo A;
2934     } elsif ($self->{nc} == -1) {
2935     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2936 wakaba 1.13 if ($self->{in_subset}) {
2937    
2938     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2939     } else {
2940    
2941     $self->{state} = DATA_STATE;
2942     $self->{s_kwd} = '';
2943     }
2944 wakaba 1.1 ## reconsume
2945    
2946     return ($self->{ct}); # comment
2947    
2948     redo A;
2949     } else {
2950    
2951 wakaba 1.10 ## XML5: Not a parse error.
2952 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2953     line => $self->{line_prev},
2954     column => $self->{column_prev});
2955     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2956     $self->{state} = COMMENT_STATE;
2957    
2958     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2959     $self->{line_prev} = $self->{line};
2960     $self->{column_prev} = $self->{column};
2961     $self->{column}++;
2962     $self->{nc}
2963     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2964     } else {
2965     $self->{set_nc}->($self);
2966     }
2967    
2968     redo A;
2969     }
2970     } elsif ($self->{state} == DOCTYPE_STATE) {
2971     if ($is_space->{$self->{nc}}) {
2972    
2973     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2974    
2975     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2976     $self->{line_prev} = $self->{line};
2977     $self->{column_prev} = $self->{column};
2978     $self->{column}++;
2979     $self->{nc}
2980     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2981     } else {
2982     $self->{set_nc}->($self);
2983     }
2984    
2985     redo A;
2986     } else {
2987    
2988 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2989 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2990     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2991     ## reconsume
2992     redo A;
2993     }
2994     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2995 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2996    
2997 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2998    
2999     ## Stay in the state
3000    
3001     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3002     $self->{line_prev} = $self->{line};
3003     $self->{column_prev} = $self->{column};
3004     $self->{column}++;
3005     $self->{nc}
3006     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3007     } else {
3008     $self->{set_nc}->($self);
3009     }
3010    
3011     redo A;
3012     } elsif ($self->{nc} == 0x003E) { # >
3013    
3014 wakaba 1.12 ## XML5: No parse error.
3015 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3016     $self->{state} = DATA_STATE;
3017 wakaba 1.5 $self->{s_kwd} = '';
3018 wakaba 1.1
3019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3020     $self->{line_prev} = $self->{line};
3021     $self->{column_prev} = $self->{column};
3022     $self->{column}++;
3023     $self->{nc}
3024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3025     } else {
3026     $self->{set_nc}->($self);
3027     }
3028    
3029    
3030     return ($self->{ct}); # DOCTYPE (quirks)
3031    
3032     redo A;
3033     } elsif ($self->{nc} == -1) {
3034    
3035     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3036     $self->{state} = DATA_STATE;
3037 wakaba 1.5 $self->{s_kwd} = '';
3038 wakaba 1.1 ## reconsume
3039    
3040     return ($self->{ct}); # DOCTYPE (quirks)
3041    
3042     redo A;
3043 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3044    
3045     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3046     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3047 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3048     $self->{in_subset} = 1;
3049 wakaba 1.12
3050     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3051     $self->{line_prev} = $self->{line};
3052     $self->{column_prev} = $self->{column};
3053     $self->{column}++;
3054     $self->{nc}
3055     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3056     } else {
3057     $self->{set_nc}->($self);
3058     }
3059    
3060 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3061 wakaba 1.12 redo A;
3062 wakaba 1.1 } else {
3063    
3064     $self->{ct}->{name} = chr $self->{nc};
3065     delete $self->{ct}->{quirks};
3066     $self->{state} = DOCTYPE_NAME_STATE;
3067    
3068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3069     $self->{line_prev} = $self->{line};
3070     $self->{column_prev} = $self->{column};
3071     $self->{column}++;
3072     $self->{nc}
3073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3074     } else {
3075     $self->{set_nc}->($self);
3076     }
3077    
3078     redo A;
3079     }
3080     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3081 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3082    
3083     ## ISSUE: Redundant "First," in the spec.
3084    
3085 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3086    
3087     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3088    
3089     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3090     $self->{line_prev} = $self->{line};
3091     $self->{column_prev} = $self->{column};
3092     $self->{column}++;
3093     $self->{nc}
3094     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3095     } else {
3096     $self->{set_nc}->($self);
3097     }
3098    
3099     redo A;
3100     } elsif ($self->{nc} == 0x003E) { # >
3101    
3102     $self->{state} = DATA_STATE;
3103 wakaba 1.5 $self->{s_kwd} = '';
3104 wakaba 1.1
3105     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106     $self->{line_prev} = $self->{line};
3107     $self->{column_prev} = $self->{column};
3108     $self->{column}++;
3109     $self->{nc}
3110     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111     } else {
3112     $self->{set_nc}->($self);
3113     }
3114    
3115    
3116     return ($self->{ct}); # DOCTYPE
3117    
3118     redo A;
3119     } elsif ($self->{nc} == -1) {
3120    
3121     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3122     $self->{state} = DATA_STATE;
3123 wakaba 1.5 $self->{s_kwd} = '';
3124 wakaba 1.1 ## reconsume
3125    
3126     $self->{ct}->{quirks} = 1;
3127     return ($self->{ct}); # DOCTYPE
3128    
3129     redo A;
3130 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3131    
3132     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3133 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3134     $self->{in_subset} = 1;
3135 wakaba 1.12
3136     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137     $self->{line_prev} = $self->{line};
3138     $self->{column_prev} = $self->{column};
3139     $self->{column}++;
3140     $self->{nc}
3141     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142     } else {
3143     $self->{set_nc}->($self);
3144     }
3145    
3146 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3147 wakaba 1.12 redo A;
3148 wakaba 1.1 } else {
3149    
3150     $self->{ct}->{name}
3151     .= chr ($self->{nc}); # DOCTYPE
3152     ## Stay in the state
3153    
3154     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3155     $self->{line_prev} = $self->{line};
3156     $self->{column_prev} = $self->{column};
3157     $self->{column}++;
3158     $self->{nc}
3159     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3160     } else {
3161     $self->{set_nc}->($self);
3162     }
3163    
3164     redo A;
3165     }
3166     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3167 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3168     ## state", but implemented differently.
3169    
3170 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3171    
3172     ## Stay in the state
3173    
3174     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3175     $self->{line_prev} = $self->{line};
3176     $self->{column_prev} = $self->{column};
3177     $self->{column}++;
3178     $self->{nc}
3179     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3180     } else {
3181     $self->{set_nc}->($self);
3182     }
3183    
3184     redo A;
3185     } elsif ($self->{nc} == 0x003E) { # >
3186 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3187    
3188     $self->{state} = DATA_STATE;
3189     $self->{s_kwd} = '';
3190     } else {
3191    
3192     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3193     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3194     }
3195 wakaba 1.1
3196    
3197     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3198     $self->{line_prev} = $self->{line};
3199     $self->{column_prev} = $self->{column};
3200     $self->{column}++;
3201     $self->{nc}
3202     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3203     } else {
3204     $self->{set_nc}->($self);
3205     }
3206    
3207 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3208 wakaba 1.1 redo A;
3209     } elsif ($self->{nc} == -1) {
3210 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3211    
3212     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3213     $self->{state} = DATA_STATE;
3214     $self->{s_kwd} = '';
3215     $self->{ct}->{quirks} = 1;
3216     } else {
3217    
3218     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3219     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3220     }
3221 wakaba 1.1
3222 wakaba 1.16 ## Reconsume.
3223     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3224 wakaba 1.1 redo A;
3225     } elsif ($self->{nc} == 0x0050 or # P
3226     $self->{nc} == 0x0070) { # p
3227 wakaba 1.12
3228 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3229 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3230 wakaba 1.1
3231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3232     $self->{line_prev} = $self->{line};
3233     $self->{column_prev} = $self->{column};
3234     $self->{column}++;
3235     $self->{nc}
3236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3237     } else {
3238     $self->{set_nc}->($self);
3239     }
3240    
3241     redo A;
3242     } elsif ($self->{nc} == 0x0053 or # S
3243     $self->{nc} == 0x0073) { # s
3244 wakaba 1.12
3245 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3246 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3247    
3248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3249     $self->{line_prev} = $self->{line};
3250     $self->{column_prev} = $self->{column};
3251     $self->{column}++;
3252     $self->{nc}
3253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3254     } else {
3255     $self->{set_nc}->($self);
3256     }
3257    
3258     redo A;
3259 wakaba 1.16 ## TODO: " and ' for ENTITY
3260     } elsif ($self->{is_xml} and
3261     $self->{ct}->{type} == DOCTYPE_TOKEN and
3262     $self->{nc} == 0x005B) { # [
3263 wakaba 1.12
3264     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3265     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3266 wakaba 1.13 $self->{in_subset} = 1;
3267 wakaba 1.1
3268     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3269     $self->{line_prev} = $self->{line};
3270     $self->{column_prev} = $self->{column};
3271     $self->{column}++;
3272     $self->{nc}
3273     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3274     } else {
3275     $self->{set_nc}->($self);
3276     }
3277    
3278 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3279 wakaba 1.1 redo A;
3280     } else {
3281 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3282    
3283     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3284    
3285     $self->{ct}->{quirks} = 1;
3286     $self->{state} = BOGUS_DOCTYPE_STATE;
3287     } else {
3288    
3289     $self->{state} = BOGUS_MD_STATE;
3290     }
3291 wakaba 1.1
3292    
3293     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3294     $self->{line_prev} = $self->{line};
3295     $self->{column_prev} = $self->{column};
3296     $self->{column}++;
3297     $self->{nc}
3298     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3299     } else {
3300     $self->{set_nc}->($self);
3301     }
3302    
3303     redo A;
3304     }
3305     } elsif ($self->{state} == PUBLIC_STATE) {
3306     ## ASCII case-insensitive
3307     if ($self->{nc} == [
3308     undef,
3309     0x0055, # U
3310     0x0042, # B
3311     0x004C, # L
3312     0x0049, # I
3313 wakaba 1.12 ]->[length $self->{kwd}] or
3314 wakaba 1.1 $self->{nc} == [
3315     undef,
3316     0x0075, # u
3317     0x0062, # b
3318     0x006C, # l
3319     0x0069, # i
3320 wakaba 1.12 ]->[length $self->{kwd}]) {
3321 wakaba 1.1
3322     ## Stay in the state.
3323 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3324 wakaba 1.1
3325     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3326     $self->{line_prev} = $self->{line};
3327     $self->{column_prev} = $self->{column};
3328     $self->{column}++;
3329     $self->{nc}
3330     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3331     } else {
3332     $self->{set_nc}->($self);
3333     }
3334    
3335     redo A;
3336 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3337 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3338     $self->{nc} == 0x0063)) { # c
3339 wakaba 1.12 if ($self->{is_xml} and
3340     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3341    
3342     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3343     text => 'PUBLIC',
3344     line => $self->{line_prev},
3345     column => $self->{column_prev} - 4);
3346     } else {
3347    
3348     }
3349 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3350    
3351     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3352     $self->{line_prev} = $self->{line};
3353     $self->{column_prev} = $self->{column};
3354     $self->{column}++;
3355     $self->{nc}
3356     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3357     } else {
3358     $self->{set_nc}->($self);
3359     }
3360    
3361     redo A;
3362     } else {
3363 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3364 wakaba 1.1 line => $self->{line_prev},
3365 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3366 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3367    
3368     $self->{ct}->{quirks} = 1;
3369     $self->{state} = BOGUS_DOCTYPE_STATE;
3370     } else {
3371    
3372     $self->{state} = BOGUS_MD_STATE;
3373     }
3374 wakaba 1.1 ## Reconsume.
3375     redo A;
3376     }
3377     } elsif ($self->{state} == SYSTEM_STATE) {
3378     ## ASCII case-insensitive
3379     if ($self->{nc} == [
3380     undef,
3381     0x0059, # Y
3382     0x0053, # S
3383     0x0054, # T
3384     0x0045, # E
3385 wakaba 1.12 ]->[length $self->{kwd}] or
3386 wakaba 1.1 $self->{nc} == [
3387     undef,
3388     0x0079, # y
3389     0x0073, # s
3390     0x0074, # t
3391     0x0065, # e
3392 wakaba 1.12 ]->[length $self->{kwd}]) {
3393 wakaba 1.1
3394     ## Stay in the state.
3395 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3396 wakaba 1.1
3397     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3398     $self->{line_prev} = $self->{line};
3399     $self->{column_prev} = $self->{column};
3400     $self->{column}++;
3401     $self->{nc}
3402     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3403     } else {
3404     $self->{set_nc}->($self);
3405     }
3406    
3407     redo A;
3408 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3409 wakaba 1.1 ($self->{nc} == 0x004D or # M
3410     $self->{nc} == 0x006D)) { # m
3411 wakaba 1.12 if ($self->{is_xml} and
3412     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3413    
3414     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3415     text => 'SYSTEM',
3416     line => $self->{line_prev},
3417     column => $self->{column_prev} - 4);
3418     } else {
3419    
3420     }
3421 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3422    
3423     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3424     $self->{line_prev} = $self->{line};
3425     $self->{column_prev} = $self->{column};
3426     $self->{column}++;
3427     $self->{nc}
3428     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3429     } else {
3430     $self->{set_nc}->($self);
3431     }
3432    
3433     redo A;
3434     } else {
3435 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3436 wakaba 1.1 line => $self->{line_prev},
3437 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3438 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3439    
3440     $self->{ct}->{quirks} = 1;
3441     $self->{state} = BOGUS_DOCTYPE_STATE;
3442     } else {
3443    
3444     $self->{state} = BOGUS_MD_STATE;
3445     }
3446 wakaba 1.1 ## Reconsume.
3447     redo A;
3448     }
3449     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3450     if ($is_space->{$self->{nc}}) {
3451    
3452     ## Stay in the state
3453    
3454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3455     $self->{line_prev} = $self->{line};
3456     $self->{column_prev} = $self->{column};
3457     $self->{column}++;
3458     $self->{nc}
3459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3460     } else {
3461     $self->{set_nc}->($self);
3462     }
3463    
3464     redo A;
3465     } elsif ($self->{nc} eq 0x0022) { # "
3466    
3467     $self->{ct}->{pubid} = ''; # DOCTYPE
3468     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3469    
3470     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3471     $self->{line_prev} = $self->{line};
3472     $self->{column_prev} = $self->{column};
3473     $self->{column}++;
3474     $self->{nc}
3475     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3476     } else {
3477     $self->{set_nc}->($self);
3478     }
3479    
3480     redo A;
3481     } elsif ($self->{nc} eq 0x0027) { # '
3482    
3483     $self->{ct}->{pubid} = ''; # DOCTYPE
3484     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3485    
3486     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3487     $self->{line_prev} = $self->{line};
3488     $self->{column_prev} = $self->{column};
3489     $self->{column}++;
3490     $self->{nc}
3491     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3492     } else {
3493     $self->{set_nc}->($self);
3494     }
3495    
3496     redo A;
3497     } elsif ($self->{nc} eq 0x003E) { # >
3498 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3499    
3500     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3501    
3502     $self->{state} = DATA_STATE;
3503     $self->{s_kwd} = '';
3504     $self->{ct}->{quirks} = 1;
3505     } else {
3506    
3507     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3508     }
3509 wakaba 1.1
3510    
3511     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3512     $self->{line_prev} = $self->{line};
3513     $self->{column_prev} = $self->{column};
3514     $self->{column}++;
3515     $self->{nc}
3516     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3517     } else {
3518     $self->{set_nc}->($self);
3519     }
3520    
3521 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3522 wakaba 1.1 redo A;
3523     } elsif ($self->{nc} == -1) {
3524 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3525    
3526     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3527     $self->{state} = DATA_STATE;
3528     $self->{s_kwd} = '';
3529     $self->{ct}->{quirks} = 1;
3530     } else {
3531    
3532     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3533     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3534     }
3535 wakaba 1.1
3536     ## reconsume
3537     return ($self->{ct}); # DOCTYPE
3538     redo A;
3539 wakaba 1.16 } elsif ($self->{is_xml} and
3540     $self->{ct}->{type} == DOCTYPE_TOKEN and
3541     $self->{nc} == 0x005B) { # [
3542 wakaba 1.12
3543     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3544     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3545     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3546 wakaba 1.13 $self->{in_subset} = 1;
3547 wakaba 1.12
3548     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3549     $self->{line_prev} = $self->{line};
3550     $self->{column_prev} = $self->{column};
3551     $self->{column}++;
3552     $self->{nc}
3553     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3554     } else {
3555     $self->{set_nc}->($self);
3556     }
3557    
3558 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3559 wakaba 1.12 redo A;
3560 wakaba 1.1 } else {
3561     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3562    
3563 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3564    
3565     $self->{ct}->{quirks} = 1;
3566     $self->{state} = BOGUS_DOCTYPE_STATE;
3567     } else {
3568    
3569     $self->{state} = BOGUS_MD_STATE;
3570     }
3571    
3572 wakaba 1.1
3573     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3574     $self->{line_prev} = $self->{line};
3575     $self->{column_prev} = $self->{column};
3576     $self->{column}++;
3577     $self->{nc}
3578     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3579     } else {
3580     $self->{set_nc}->($self);
3581     }
3582    
3583     redo A;
3584     }
3585     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3586     if ($self->{nc} == 0x0022) { # "
3587    
3588     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3589    
3590     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3591     $self->{line_prev} = $self->{line};
3592     $self->{column_prev} = $self->{column};
3593     $self->{column}++;
3594     $self->{nc}
3595     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3596     } else {
3597     $self->{set_nc}->($self);
3598     }
3599    
3600     redo A;
3601     } elsif ($self->{nc} == 0x003E) { # >
3602     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3603    
3604 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3605    
3606     $self->{state} = DATA_STATE;
3607     $self->{s_kwd} = '';
3608     $self->{ct}->{quirks} = 1;
3609     } else {
3610    
3611     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3612     }
3613    
3614 wakaba 1.1
3615     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3616     $self->{line_prev} = $self->{line};
3617     $self->{column_prev} = $self->{column};
3618     $self->{column}++;
3619     $self->{nc}
3620     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3621     } else {
3622     $self->{set_nc}->($self);
3623     }
3624    
3625 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3626 wakaba 1.1 redo A;
3627     } elsif ($self->{nc} == -1) {
3628     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3629    
3630 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3631    
3632     $self->{state} = DATA_STATE;
3633     $self->{s_kwd} = '';
3634     $self->{ct}->{quirks} = 1;
3635     } else {
3636    
3637     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3638     }
3639    
3640     ## Reconsume.
3641 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3642     redo A;
3643     } else {
3644    
3645 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3646 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3647     length $self->{ct}->{pubid});
3648    
3649     ## Stay in the state
3650    
3651     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3652     $self->{line_prev} = $self->{line};
3653     $self->{column_prev} = $self->{column};
3654     $self->{column}++;
3655     $self->{nc}
3656     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3657     } else {
3658     $self->{set_nc}->($self);
3659     }
3660    
3661     redo A;
3662     }
3663     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3664     if ($self->{nc} == 0x0027) { # '
3665    
3666     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3667    
3668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3669     $self->{line_prev} = $self->{line};
3670     $self->{column_prev} = $self->{column};
3671     $self->{column}++;
3672     $self->{nc}
3673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3674     } else {
3675     $self->{set_nc}->($self);
3676     }
3677    
3678     redo A;
3679     } elsif ($self->{nc} == 0x003E) { # >
3680     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3681    
3682 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3683    
3684     $self->{state} = DATA_STATE;
3685     $self->{s_kwd} = '';
3686     $self->{ct}->{quirks} = 1;
3687     } else {
3688    
3689     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3690     }
3691    
3692 wakaba 1.1
3693     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3694     $self->{line_prev} = $self->{line};
3695     $self->{column_prev} = $self->{column};
3696     $self->{column}++;
3697     $self->{nc}
3698     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3699     } else {
3700     $self->{set_nc}->($self);
3701     }
3702    
3703 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3704 wakaba 1.1 redo A;
3705     } elsif ($self->{nc} == -1) {
3706     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3707    
3708 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3709    
3710     $self->{state} = DATA_STATE;
3711     $self->{s_kwd} = '';
3712     $self->{ct}->{quirks} = 1;
3713     } else {
3714    
3715     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3716     }
3717    
3718 wakaba 1.1 ## reconsume
3719 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3720 wakaba 1.1 redo A;
3721     } else {
3722    
3723 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3724 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3725     length $self->{ct}->{pubid});
3726    
3727     ## Stay in the state
3728    
3729     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3730     $self->{line_prev} = $self->{line};
3731     $self->{column_prev} = $self->{column};
3732     $self->{column}++;
3733     $self->{nc}
3734     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3735     } else {
3736     $self->{set_nc}->($self);
3737     }
3738    
3739     redo A;
3740     }
3741     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3742     if ($is_space->{$self->{nc}}) {
3743    
3744     ## Stay in the state
3745    
3746     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3747     $self->{line_prev} = $self->{line};
3748     $self->{column_prev} = $self->{column};
3749     $self->{column}++;
3750     $self->{nc}
3751     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3752     } else {
3753     $self->{set_nc}->($self);
3754     }
3755    
3756     redo A;
3757     } elsif ($self->{nc} == 0x0022) { # "
3758    
3759 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3760 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3761    
3762     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3763     $self->{line_prev} = $self->{line};
3764     $self->{column_prev} = $self->{column};
3765     $self->{column}++;
3766     $self->{nc}
3767     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3768     } else {
3769     $self->{set_nc}->($self);
3770     }
3771    
3772     redo A;
3773     } elsif ($self->{nc} == 0x0027) { # '
3774    
3775 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3776 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3777    
3778     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3779     $self->{line_prev} = $self->{line};
3780     $self->{column_prev} = $self->{column};
3781     $self->{column}++;
3782     $self->{nc}
3783     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3784     } else {
3785     $self->{set_nc}->($self);
3786     }
3787    
3788     redo A;
3789     } elsif ($self->{nc} == 0x003E) { # >
3790 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3791     if ($self->{is_xml}) {
3792    
3793     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3794     } else {
3795    
3796     }
3797     $self->{state} = DATA_STATE;
3798     $self->{s_kwd} = '';
3799 wakaba 1.12 } else {
3800 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3801    
3802     } else {
3803    
3804     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3805     }
3806     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3807 wakaba 1.12 }
3808 wakaba 1.16
3809 wakaba 1.1
3810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3811     $self->{line_prev} = $self->{line};
3812     $self->{column_prev} = $self->{column};
3813     $self->{column}++;
3814     $self->{nc}
3815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3816     } else {
3817     $self->{set_nc}->($self);
3818     }
3819    
3820 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3821 wakaba 1.1 redo A;
3822     } elsif ($self->{nc} == -1) {
3823 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3824    
3825     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3826    
3827     $self->{state} = DATA_STATE;
3828     $self->{s_kwd} = '';
3829     $self->{ct}->{quirks} = 1;
3830     } else {
3831     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3832     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3833     }
3834 wakaba 1.1
3835     ## reconsume
3836 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3837 wakaba 1.1 redo A;
3838 wakaba 1.16 } elsif ($self->{is_xml} and
3839     $self->{ct}->{type} == DOCTYPE_TOKEN and
3840     $self->{nc} == 0x005B) { # [
3841 wakaba 1.12
3842     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3843     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3844     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3845 wakaba 1.13 $self->{in_subset} = 1;
3846 wakaba 1.12
3847     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3848     $self->{line_prev} = $self->{line};
3849     $self->{column_prev} = $self->{column};
3850     $self->{column}++;
3851     $self->{nc}
3852     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3853     } else {
3854     $self->{set_nc}->($self);
3855     }
3856    
3857 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3858 wakaba 1.12 redo A;
3859 wakaba 1.1 } else {
3860     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3861    
3862 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3863    
3864     $self->{ct}->{quirks} = 1;
3865     $self->{state} = BOGUS_DOCTYPE_STATE;
3866     } else {
3867    
3868     $self->{state} = BOGUS_MD_STATE;
3869     }
3870    
3871 wakaba 1.1
3872     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3873     $self->{line_prev} = $self->{line};
3874     $self->{column_prev} = $self->{column};
3875     $self->{column}++;
3876     $self->{nc}
3877     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3878     } else {
3879     $self->{set_nc}->($self);
3880     }
3881    
3882     redo A;
3883     }
3884     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3885     if ($is_space->{$self->{nc}}) {
3886    
3887     ## Stay in the state
3888    
3889     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3890     $self->{line_prev} = $self->{line};
3891     $self->{column_prev} = $self->{column};
3892     $self->{column}++;
3893     $self->{nc}
3894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3895     } else {
3896     $self->{set_nc}->($self);
3897     }
3898    
3899     redo A;
3900     } elsif ($self->{nc} == 0x0022) { # "
3901    
3902     $self->{ct}->{sysid} = ''; # DOCTYPE
3903     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3904    
3905     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3906     $self->{line_prev} = $self->{line};
3907     $self->{column_prev} = $self->{column};
3908     $self->{column}++;
3909     $self->{nc}
3910     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3911     } else {
3912     $self->{set_nc}->($self);
3913     }
3914    
3915     redo A;
3916     } elsif ($self->{nc} == 0x0027) { # '
3917    
3918     $self->{ct}->{sysid} = ''; # DOCTYPE
3919     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3920    
3921     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3922     $self->{line_prev} = $self->{line};
3923     $self->{column_prev} = $self->{column};
3924     $self->{column}++;
3925     $self->{nc}
3926     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3927     } else {
3928     $self->{set_nc}->($self);
3929     }
3930    
3931     redo A;
3932     } elsif ($self->{nc} == 0x003E) { # >
3933     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3934    
3935     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3936     $self->{line_prev} = $self->{line};
3937     $self->{column_prev} = $self->{column};
3938     $self->{column}++;
3939     $self->{nc}
3940     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3941     } else {
3942     $self->{set_nc}->($self);
3943     }
3944    
3945    
3946 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3947    
3948     $self->{state} = DATA_STATE;
3949     $self->{s_kwd} = '';
3950     $self->{ct}->{quirks} = 1;
3951     } else {
3952    
3953     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3954     }
3955 wakaba 1.1
3956 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3957 wakaba 1.1 redo A;
3958     } elsif ($self->{nc} == -1) {
3959 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3960    
3961     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3962     $self->{state} = DATA_STATE;
3963     $self->{s_kwd} = '';
3964     $self->{ct}->{quirks} = 1;
3965     } else {
3966    
3967     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3968     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3969     }
3970 wakaba 1.1
3971     ## reconsume
3972 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3973 wakaba 1.1 redo A;
3974 wakaba 1.16 } elsif ($self->{is_xml} and
3975     $self->{ct}->{type} == DOCTYPE_TOKEN and
3976     $self->{nc} == 0x005B) { # [
3977 wakaba 1.12
3978     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3979    
3980     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3981     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3982 wakaba 1.13 $self->{in_subset} = 1;
3983 wakaba 1.12
3984     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3985     $self->{line_prev} = $self->{line};
3986     $self->{column_prev} = $self->{column};
3987     $self->{column}++;
3988     $self->{nc}
3989     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3990     } else {
3991     $self->{set_nc}->($self);
3992     }
3993    
3994 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3995 wakaba 1.12 redo A;
3996 wakaba 1.1 } else {
3997     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3998    
3999 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4000    
4001     $self->{ct}->{quirks} = 1;
4002     $self->{state} = BOGUS_DOCTYPE_STATE;
4003     } else {
4004    
4005     $self->{state} = BOGUS_MD_STATE;
4006     }
4007    
4008 wakaba 1.1
4009     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4010     $self->{line_prev} = $self->{line};
4011     $self->{column_prev} = $self->{column};
4012     $self->{column}++;
4013     $self->{nc}
4014     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4015     } else {
4016     $self->{set_nc}->($self);
4017     }
4018    
4019     redo A;
4020     }
4021     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4022     if ($self->{nc} == 0x0022) { # "
4023    
4024     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4025    
4026     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4027     $self->{line_prev} = $self->{line};
4028     $self->{column_prev} = $self->{column};
4029     $self->{column}++;
4030     $self->{nc}
4031     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4032     } else {
4033     $self->{set_nc}->($self);
4034     }
4035    
4036     redo A;
4037 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4038 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4039    
4040 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4041    
4042     $self->{state} = DATA_STATE;
4043     $self->{s_kwd} = '';
4044     $self->{ct}->{quirks} = 1;
4045     } else {
4046    
4047     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4048     }
4049    
4050 wakaba 1.1
4051     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4052     $self->{line_prev} = $self->{line};
4053     $self->{column_prev} = $self->{column};
4054     $self->{column}++;
4055     $self->{nc}
4056     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4057     } else {
4058     $self->{set_nc}->($self);
4059     }
4060    
4061 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4062 wakaba 1.1 redo A;
4063     } elsif ($self->{nc} == -1) {
4064     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4065    
4066 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4067    
4068     $self->{state} = DATA_STATE;
4069     $self->{s_kwd} = '';
4070     $self->{ct}->{quirks} = 1;
4071     } else {
4072    
4073     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4074     }
4075    
4076 wakaba 1.1 ## reconsume
4077 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4078 wakaba 1.1 redo A;
4079     } else {
4080    
4081 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4082 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4083     length $self->{ct}->{sysid});
4084    
4085     ## Stay in the state
4086    
4087     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4088     $self->{line_prev} = $self->{line};
4089     $self->{column_prev} = $self->{column};
4090     $self->{column}++;
4091     $self->{nc}
4092     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4093     } else {
4094     $self->{set_nc}->($self);
4095     }
4096    
4097     redo A;
4098     }
4099     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4100     if ($self->{nc} == 0x0027) { # '
4101    
4102     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4103    
4104     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4105     $self->{line_prev} = $self->{line};
4106     $self->{column_prev} = $self->{column};
4107     $self->{column}++;
4108     $self->{nc}
4109     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4110     } else {
4111     $self->{set_nc}->($self);
4112     }
4113    
4114     redo A;
4115 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4116 wakaba 1.1
4117     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4118    
4119     $self->{state} = DATA_STATE;
4120 wakaba 1.5 $self->{s_kwd} = '';
4121 wakaba 1.1
4122     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4123     $self->{line_prev} = $self->{line};
4124     $self->{column_prev} = $self->{column};
4125     $self->{column}++;
4126     $self->{nc}
4127     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4128     } else {
4129     $self->{set_nc}->($self);
4130     }
4131    
4132    
4133     $self->{ct}->{quirks} = 1;
4134     return ($self->{ct}); # DOCTYPE
4135    
4136     redo A;
4137     } elsif ($self->{nc} == -1) {
4138     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4139    
4140 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4141    
4142     $self->{state} = DATA_STATE;
4143     $self->{s_kwd} = '';
4144     $self->{ct}->{quirks} = 1;
4145     } else {
4146    
4147     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4148     }
4149    
4150 wakaba 1.1 ## reconsume
4151 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4152 wakaba 1.1 redo A;
4153     } else {
4154    
4155 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4156 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4157     length $self->{ct}->{sysid});
4158    
4159     ## Stay in the state
4160    
4161     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4162     $self->{line_prev} = $self->{line};
4163     $self->{column_prev} = $self->{column};
4164     $self->{column}++;
4165     $self->{nc}
4166     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4167     } else {
4168     $self->{set_nc}->($self);
4169     }
4170    
4171     redo A;
4172     }
4173     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4174     if ($is_space->{$self->{nc}}) {
4175    
4176     ## Stay in the state
4177    
4178     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4179     $self->{line_prev} = $self->{line};
4180     $self->{column_prev} = $self->{column};
4181     $self->{column}++;
4182     $self->{nc}
4183     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4184     } else {
4185     $self->{set_nc}->($self);
4186     }
4187    
4188     redo A;
4189     } elsif ($self->{nc} == 0x003E) { # >
4190 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4191    
4192     $self->{state} = DATA_STATE;
4193     $self->{s_kwd} = '';
4194     } else {
4195    
4196     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4197     }
4198    
4199 wakaba 1.1
4200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4201     $self->{line_prev} = $self->{line};
4202     $self->{column_prev} = $self->{column};
4203     $self->{column}++;
4204     $self->{nc}
4205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4206     } else {
4207     $self->{set_nc}->($self);
4208     }
4209    
4210 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4211 wakaba 1.1 redo A;
4212 wakaba 1.16 ## TODO: "NDATA"
4213 wakaba 1.1 } elsif ($self->{nc} == -1) {
4214 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4215    
4216     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4217     $self->{state} = DATA_STATE;
4218     $self->{s_kwd} = '';
4219     $self->{ct}->{quirks} = 1;
4220     } else {
4221    
4222     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4223     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4224     }
4225    
4226 wakaba 1.1 ## reconsume
4227 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4228 wakaba 1.1 redo A;
4229 wakaba 1.16 } elsif ($self->{is_xml} and
4230     $self->{ct}->{type} == DOCTYPE_TOKEN and
4231     $self->{nc} == 0x005B) { # [
4232 wakaba 1.12
4233     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4234     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4235 wakaba 1.13 $self->{in_subset} = 1;
4236 wakaba 1.12
4237     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4238     $self->{line_prev} = $self->{line};
4239     $self->{column_prev} = $self->{column};
4240     $self->{column}++;
4241     $self->{nc}
4242     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4243     } else {
4244     $self->{set_nc}->($self);
4245     }
4246    
4247 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4248 wakaba 1.12 redo A;
4249 wakaba 1.1 } else {
4250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4251    
4252 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4253    
4254     #$self->{ct}->{quirks} = 1;
4255     $self->{state} = BOGUS_DOCTYPE_STATE;
4256     } else {
4257    
4258     $self->{state} = BOGUS_MD_STATE;
4259     }
4260    
4261 wakaba 1.1
4262     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4263     $self->{line_prev} = $self->{line};
4264     $self->{column_prev} = $self->{column};
4265     $self->{column}++;
4266     $self->{nc}
4267     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4268     } else {
4269     $self->{set_nc}->($self);
4270     }
4271    
4272     redo A;
4273     }
4274     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4275     if ($self->{nc} == 0x003E) { # >
4276    
4277     $self->{state} = DATA_STATE;
4278 wakaba 1.5 $self->{s_kwd} = '';
4279 wakaba 1.1
4280     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4281     $self->{line_prev} = $self->{line};
4282     $self->{column_prev} = $self->{column};
4283     $self->{column}++;
4284     $self->{nc}
4285     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4286     } else {
4287     $self->{set_nc}->($self);
4288     }
4289    
4290    
4291     return ($self->{ct}); # DOCTYPE
4292    
4293     redo A;
4294 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4295 wakaba 1.13
4296     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4297     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4298     $self->{in_subset} = 1;
4299    
4300 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4301     $self->{line_prev} = $self->{line};
4302     $self->{column_prev} = $self->{column};
4303     $self->{column}++;
4304     $self->{nc}
4305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4306     } else {
4307     $self->{set_nc}->($self);
4308     }
4309    
4310 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4311     redo A;
4312 wakaba 1.1 } elsif ($self->{nc} == -1) {
4313    
4314     $self->{state} = DATA_STATE;
4315 wakaba 1.5 $self->{s_kwd} = '';
4316 wakaba 1.1 ## reconsume
4317    
4318     return ($self->{ct}); # DOCTYPE
4319    
4320     redo A;
4321     } else {
4322    
4323     my $s = '';
4324 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4325 wakaba 1.1
4326     ## Stay in the state
4327    
4328     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4329     $self->{line_prev} = $self->{line};
4330     $self->{column_prev} = $self->{column};
4331     $self->{column}++;
4332     $self->{nc}
4333     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4334     } else {
4335     $self->{set_nc}->($self);
4336     }
4337    
4338     redo A;
4339     }
4340     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4341     ## NOTE: "CDATA section state" in the spec is jointly implemented
4342     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4343     ## and |CDATA_SECTION_MSE2_STATE|.
4344 wakaba 1.10
4345     ## XML5: "CDATA state".
4346 wakaba 1.1
4347     if ($self->{nc} == 0x005D) { # ]
4348    
4349     $self->{state} = CDATA_SECTION_MSE1_STATE;
4350    
4351     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4352     $self->{line_prev} = $self->{line};
4353     $self->{column_prev} = $self->{column};
4354     $self->{column}++;
4355     $self->{nc}
4356     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4357     } else {
4358     $self->{set_nc}->($self);
4359     }
4360    
4361     redo A;
4362     } elsif ($self->{nc} == -1) {
4363 wakaba 1.6 if ($self->{is_xml}) {
4364 wakaba 1.8
4365 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4366 wakaba 1.8 } else {
4367    
4368 wakaba 1.6 }
4369    
4370 wakaba 1.1 $self->{state} = DATA_STATE;
4371 wakaba 1.5 $self->{s_kwd} = '';
4372 wakaba 1.10 ## Reconsume.
4373 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4374    
4375     return ($self->{ct}); # character
4376     } else {
4377    
4378     ## No token to emit. $self->{ct} is discarded.
4379     }
4380     redo A;
4381     } else {
4382    
4383     $self->{ct}->{data} .= chr $self->{nc};
4384     $self->{read_until}->($self->{ct}->{data},
4385     q<]>,
4386     length $self->{ct}->{data});
4387    
4388     ## Stay in the state.
4389    
4390     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4391     $self->{line_prev} = $self->{line};
4392     $self->{column_prev} = $self->{column};
4393     $self->{column}++;
4394     $self->{nc}
4395     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4396     } else {
4397     $self->{set_nc}->($self);
4398     }
4399    
4400     redo A;
4401     }
4402    
4403     ## ISSUE: "text tokens" in spec.
4404     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4405 wakaba 1.10 ## XML5: "CDATA bracket state".
4406    
4407 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4408    
4409     $self->{state} = CDATA_SECTION_MSE2_STATE;
4410    
4411     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4412     $self->{line_prev} = $self->{line};
4413     $self->{column_prev} = $self->{column};
4414     $self->{column}++;
4415     $self->{nc}
4416     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4417     } else {
4418     $self->{set_nc}->($self);
4419     }
4420    
4421     redo A;
4422     } else {
4423    
4424 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4425 wakaba 1.1 $self->{ct}->{data} .= ']';
4426 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4427 wakaba 1.1 ## Reconsume.
4428     redo A;
4429     }
4430     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4431 wakaba 1.10 ## XML5: "CDATA end state".
4432    
4433 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4434     $self->{state} = DATA_STATE;
4435 wakaba 1.5 $self->{s_kwd} = '';
4436 wakaba 1.1
4437     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4438     $self->{line_prev} = $self->{line};
4439     $self->{column_prev} = $self->{column};
4440     $self->{column}++;
4441     $self->{nc}
4442     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4443     } else {
4444     $self->{set_nc}->($self);
4445     }
4446    
4447     if (length $self->{ct}->{data}) { # character
4448    
4449     return ($self->{ct}); # character
4450     } else {
4451    
4452     ## No token to emit. $self->{ct} is discarded.
4453     }
4454     redo A;
4455     } elsif ($self->{nc} == 0x005D) { # ]
4456     # character
4457     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4458     ## Stay in the state.
4459    
4460     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4461     $self->{line_prev} = $self->{line};
4462     $self->{column_prev} = $self->{column};
4463     $self->{column}++;
4464     $self->{nc}
4465     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4466     } else {
4467     $self->{set_nc}->($self);
4468     }
4469    
4470     redo A;
4471     } else {
4472    
4473     $self->{ct}->{data} .= ']]'; # character
4474     $self->{state} = CDATA_SECTION_STATE;
4475 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4476 wakaba 1.1 redo A;
4477     }
4478     } elsif ($self->{state} == ENTITY_STATE) {
4479     if ($is_space->{$self->{nc}} or
4480     {
4481     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4482     $self->{entity_add} => 1,
4483     }->{$self->{nc}}) {
4484    
4485     ## Don't consume
4486     ## No error
4487     ## Return nothing.
4488     #
4489     } elsif ($self->{nc} == 0x0023) { # #
4490    
4491     $self->{state} = ENTITY_HASH_STATE;
4492 wakaba 1.12 $self->{kwd} = '#';
4493 wakaba 1.1
4494     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4495     $self->{line_prev} = $self->{line};
4496     $self->{column_prev} = $self->{column};
4497     $self->{column}++;
4498     $self->{nc}
4499     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4500     } else {
4501     $self->{set_nc}->($self);
4502     }
4503    
4504     redo A;
4505     } elsif ((0x0041 <= $self->{nc} and
4506     $self->{nc} <= 0x005A) or # A..Z
4507     (0x0061 <= $self->{nc} and
4508     $self->{nc} <= 0x007A)) { # a..z
4509    
4510     require Whatpm::_NamedEntityList;
4511     $self->{state} = ENTITY_NAME_STATE;
4512 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4513     $self->{entity__value} = $self->{kwd};
4514 wakaba 1.1 $self->{entity__match} = 0;
4515    
4516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4517     $self->{line_prev} = $self->{line};
4518     $self->{column_prev} = $self->{column};
4519     $self->{column}++;
4520     $self->{nc}
4521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4522     } else {
4523     $self->{set_nc}->($self);
4524     }
4525    
4526     redo A;
4527     } else {
4528    
4529     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4530     ## Return nothing.
4531     #
4532     }
4533    
4534     ## NOTE: No character is consumed by the "consume a character
4535     ## reference" algorithm. In other words, there is an "&" character
4536     ## that does not introduce a character reference, which will be
4537     ## appended to the parent element or the attribute value in later
4538     ## processing by the tokenizer.
4539    
4540     if ($self->{prev_state} == DATA_STATE) {
4541    
4542     $self->{state} = $self->{prev_state};
4543 wakaba 1.5 $self->{s_kwd} = '';
4544 wakaba 1.1 ## Reconsume.
4545     return ({type => CHARACTER_TOKEN, data => '&',
4546     line => $self->{line_prev},
4547     column => $self->{column_prev},
4548     });
4549     redo A;
4550     } else {
4551    
4552     $self->{ca}->{value} .= '&';
4553     $self->{state} = $self->{prev_state};
4554 wakaba 1.5 $self->{s_kwd} = '';
4555 wakaba 1.1 ## Reconsume.
4556     redo A;
4557     }
4558     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4559     if ($self->{nc} == 0x0078 or # x
4560     $self->{nc} == 0x0058) { # X
4561    
4562     $self->{state} = HEXREF_X_STATE;
4563 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4564 wakaba 1.1
4565     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4566     $self->{line_prev} = $self->{line};
4567     $self->{column_prev} = $self->{column};
4568     $self->{column}++;
4569     $self->{nc}
4570     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4571     } else {
4572     $self->{set_nc}->($self);
4573     }
4574    
4575     redo A;
4576     } elsif (0x0030 <= $self->{nc} and
4577     $self->{nc} <= 0x0039) { # 0..9
4578    
4579     $self->{state} = NCR_NUM_STATE;
4580 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4581 wakaba 1.1
4582     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4583     $self->{line_prev} = $self->{line};
4584     $self->{column_prev} = $self->{column};
4585     $self->{column}++;
4586     $self->{nc}
4587     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4588     } else {
4589     $self->{set_nc}->($self);
4590     }
4591    
4592     redo A;
4593     } else {
4594     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4595     line => $self->{line_prev},
4596     column => $self->{column_prev} - 1);
4597    
4598     ## NOTE: According to the spec algorithm, nothing is returned,
4599     ## and then "&#" is appended to the parent element or the attribute
4600     ## value in the later processing.
4601    
4602     if ($self->{prev_state} == DATA_STATE) {
4603    
4604     $self->{state} = $self->{prev_state};
4605 wakaba 1.5 $self->{s_kwd} = '';
4606 wakaba 1.1 ## Reconsume.
4607     return ({type => CHARACTER_TOKEN,
4608     data => '&#',
4609     line => $self->{line_prev},
4610     column => $self->{column_prev} - 1,
4611     });
4612     redo A;
4613     } else {
4614    
4615     $self->{ca}->{value} .= '&#';
4616     $self->{state} = $self->{prev_state};
4617 wakaba 1.5 $self->{s_kwd} = '';
4618 wakaba 1.1 ## Reconsume.
4619     redo A;
4620     }
4621     }
4622     } elsif ($self->{state} == NCR_NUM_STATE) {
4623     if (0x0030 <= $self->{nc} and
4624     $self->{nc} <= 0x0039) { # 0..9
4625    
4626 wakaba 1.12 $self->{kwd} *= 10;
4627     $self->{kwd} += $self->{nc} - 0x0030;
4628 wakaba 1.1
4629     ## Stay in the state.
4630    
4631     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4632     $self->{line_prev} = $self->{line};
4633     $self->{column_prev} = $self->{column};
4634     $self->{column}++;
4635     $self->{nc}
4636     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4637     } else {
4638     $self->{set_nc}->($self);
4639     }
4640    
4641     redo A;
4642     } elsif ($self->{nc} == 0x003B) { # ;
4643    
4644    
4645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4646     $self->{line_prev} = $self->{line};
4647     $self->{column_prev} = $self->{column};
4648     $self->{column}++;
4649     $self->{nc}
4650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4651     } else {
4652     $self->{set_nc}->($self);
4653     }
4654    
4655     #
4656     } else {
4657    
4658     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4659     ## Reconsume.
4660     #
4661     }
4662    
4663 wakaba 1.12 my $code = $self->{kwd};
4664 wakaba 1.1 my $l = $self->{line_prev};
4665     my $c = $self->{column_prev};
4666     if ($charref_map->{$code}) {
4667    
4668     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4669     text => (sprintf 'U+%04X', $code),
4670     line => $l, column => $c);
4671     $code = $charref_map->{$code};
4672     } elsif ($code > 0x10FFFF) {
4673    
4674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4675     text => (sprintf 'U-%08X', $code),
4676     line => $l, column => $c);
4677     $code = 0xFFFD;
4678     }
4679    
4680     if ($self->{prev_state} == DATA_STATE) {
4681    
4682     $self->{state} = $self->{prev_state};
4683 wakaba 1.5 $self->{s_kwd} = '';
4684 wakaba 1.1 ## Reconsume.
4685     return ({type => CHARACTER_TOKEN, data => chr $code,
4686 wakaba 1.7 has_reference => 1,
4687 wakaba 1.1 line => $l, column => $c,
4688     });
4689     redo A;
4690     } else {
4691    
4692     $self->{ca}->{value} .= chr $code;
4693     $self->{ca}->{has_reference} = 1;
4694     $self->{state} = $self->{prev_state};
4695 wakaba 1.5 $self->{s_kwd} = '';
4696 wakaba 1.1 ## Reconsume.
4697     redo A;
4698     }
4699     } elsif ($self->{state} == HEXREF_X_STATE) {
4700     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4701     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4702     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4703     # 0..9, A..F, a..f
4704    
4705     $self->{state} = HEXREF_HEX_STATE;
4706 wakaba 1.12 $self->{kwd} = 0;
4707 wakaba 1.1 ## Reconsume.
4708     redo A;
4709     } else {
4710     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4711     line => $self->{line_prev},
4712     column => $self->{column_prev} - 2);
4713    
4714     ## NOTE: According to the spec algorithm, nothing is returned,
4715     ## and then "&#" followed by "X" or "x" is appended to the parent
4716     ## element or the attribute value in the later processing.
4717    
4718     if ($self->{prev_state} == DATA_STATE) {
4719    
4720     $self->{state} = $self->{prev_state};
4721 wakaba 1.5 $self->{s_kwd} = '';
4722 wakaba 1.1 ## Reconsume.
4723     return ({type => CHARACTER_TOKEN,
4724 wakaba 1.12 data => '&' . $self->{kwd},
4725 wakaba 1.1 line => $self->{line_prev},
4726 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4727 wakaba 1.1 });
4728     redo A;
4729     } else {
4730    
4731 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4732 wakaba 1.1 $self->{state} = $self->{prev_state};
4733 wakaba 1.5 $self->{s_kwd} = '';
4734 wakaba 1.1 ## Reconsume.
4735     redo A;
4736     }
4737     }
4738     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4739     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4740     # 0..9
4741    
4742 wakaba 1.12 $self->{kwd} *= 0x10;
4743     $self->{kwd} += $self->{nc} - 0x0030;
4744 wakaba 1.1 ## Stay in the state.
4745    
4746     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4747     $self->{line_prev} = $self->{line};
4748     $self->{column_prev} = $self->{column};
4749     $self->{column}++;
4750     $self->{nc}
4751     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4752     } else {
4753     $self->{set_nc}->($self);
4754     }
4755    
4756     redo A;
4757     } elsif (0x0061 <= $self->{nc} and
4758     $self->{nc} <= 0x0066) { # a..f
4759    
4760 wakaba 1.12 $self->{kwd} *= 0x10;
4761     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4762 wakaba 1.1 ## Stay in the state.
4763    
4764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4765     $self->{line_prev} = $self->{line};
4766     $self->{column_prev} = $self->{column};
4767     $self->{column}++;
4768     $self->{nc}
4769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4770     } else {
4771     $self->{set_nc}->($self);
4772     }
4773    
4774     redo A;
4775     } elsif (0x0041 <= $self->{nc} and
4776     $self->{nc} <= 0x0046) { # A..F
4777    
4778 wakaba 1.12 $self->{kwd} *= 0x10;
4779     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4780 wakaba 1.1 ## Stay in the state.
4781    
4782     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4783     $self->{line_prev} = $self->{line};
4784     $self->{column_prev} = $self->{column};
4785     $self->{column}++;
4786     $self->{nc}
4787     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4788     } else {
4789     $self->{set_nc}->($self);
4790     }
4791    
4792     redo A;
4793     } elsif ($self->{nc} == 0x003B) { # ;
4794    
4795    
4796     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4797     $self->{line_prev} = $self->{line};
4798     $self->{column_prev} = $self->{column};
4799     $self->{column}++;
4800     $self->{nc}
4801     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4802     } else {
4803     $self->{set_nc}->($self);
4804     }
4805    
4806     #
4807     } else {
4808    
4809     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4810     line => $self->{line},
4811     column => $self->{column});
4812     ## Reconsume.
4813     #
4814     }
4815    
4816 wakaba 1.12 my $code = $self->{kwd};
4817 wakaba 1.1 my $l = $self->{line_prev};
4818     my $c = $self->{column_prev};
4819     if ($charref_map->{$code}) {
4820    
4821     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4822     text => (sprintf 'U+%04X', $code),
4823     line => $l, column => $c);
4824     $code = $charref_map->{$code};
4825     } elsif ($code > 0x10FFFF) {
4826    
4827     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4828     text => (sprintf 'U-%08X', $code),
4829     line => $l, column => $c);
4830     $code = 0xFFFD;
4831     }
4832    
4833     if ($self->{prev_state} == DATA_STATE) {
4834    
4835     $self->{state} = $self->{prev_state};
4836 wakaba 1.5 $self->{s_kwd} = '';
4837 wakaba 1.1 ## Reconsume.
4838     return ({type => CHARACTER_TOKEN, data => chr $code,
4839 wakaba 1.7 has_reference => 1,
4840 wakaba 1.1 line => $l, column => $c,
4841     });
4842     redo A;
4843     } else {
4844    
4845     $self->{ca}->{value} .= chr $code;
4846     $self->{ca}->{has_reference} = 1;
4847     $self->{state} = $self->{prev_state};
4848 wakaba 1.5 $self->{s_kwd} = '';
4849 wakaba 1.1 ## Reconsume.
4850     redo A;
4851     }
4852     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4853 wakaba 1.12 if (length $self->{kwd} < 30 and
4854 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
4855     ((0x0041 <= $self->{nc} and # a
4856     $self->{nc} <= 0x005A) or # x
4857     (0x0061 <= $self->{nc} and # a
4858     $self->{nc} <= 0x007A) or # z
4859     (0x0030 <= $self->{nc} and # 0
4860     $self->{nc} <= 0x0039) or # 9
4861     $self->{nc} == 0x003B)) { # ;
4862     our $EntityChar;
4863 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4864     if (defined $EntityChar->{$self->{kwd}}) {
4865 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
4866    
4867 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4868 wakaba 1.1 $self->{entity__match} = 1;
4869    
4870     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4871     $self->{line_prev} = $self->{line};
4872     $self->{column_prev} = $self->{column};
4873     $self->{column}++;
4874     $self->{nc}
4875     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4876     } else {
4877     $self->{set_nc}->($self);
4878     }
4879    
4880     #
4881     } else {
4882    
4883 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4884 wakaba 1.1 $self->{entity__match} = -1;
4885     ## Stay in the state.
4886    
4887     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4888     $self->{line_prev} = $self->{line};
4889     $self->{column_prev} = $self->{column};
4890     $self->{column}++;
4891     $self->{nc}
4892     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4893     } else {
4894     $self->{set_nc}->($self);
4895     }
4896    
4897     redo A;
4898     }
4899     } else {
4900    
4901     $self->{entity__value} .= chr $self->{nc};
4902     $self->{entity__match} *= 2;
4903     ## Stay in the state.
4904    
4905     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4906     $self->{line_prev} = $self->{line};
4907     $self->{column_prev} = $self->{column};
4908     $self->{column}++;
4909     $self->{nc}
4910     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4911     } else {
4912     $self->{set_nc}->($self);
4913     }
4914    
4915     redo A;
4916     }
4917     }
4918    
4919     my $data;
4920     my $has_ref;
4921     if ($self->{entity__match} > 0) {
4922    
4923     $data = $self->{entity__value};
4924     $has_ref = 1;
4925     #
4926     } elsif ($self->{entity__match} < 0) {
4927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4928     if ($self->{prev_state} != DATA_STATE and # in attribute
4929     $self->{entity__match} < -1) {
4930    
4931 wakaba 1.12 $data = '&' . $self->{kwd};
4932 wakaba 1.1 #
4933     } else {
4934    
4935     $data = $self->{entity__value};
4936     $has_ref = 1;
4937     #
4938     }
4939     } else {
4940    
4941     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4942     line => $self->{line_prev},
4943 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
4944     $data = '&' . $self->{kwd};
4945 wakaba 1.1 #
4946     }
4947    
4948     ## NOTE: In these cases, when a character reference is found,
4949     ## it is consumed and a character token is returned, or, otherwise,
4950     ## nothing is consumed and returned, according to the spec algorithm.
4951     ## In this implementation, anything that has been examined by the
4952     ## tokenizer is appended to the parent element or the attribute value
4953     ## as string, either literal string when no character reference or
4954     ## entity-replaced string otherwise, in this stage, since any characters
4955     ## that would not be consumed are appended in the data state or in an
4956     ## appropriate attribute value state anyway.
4957    
4958     if ($self->{prev_state} == DATA_STATE) {
4959    
4960     $self->{state} = $self->{prev_state};
4961 wakaba 1.5 $self->{s_kwd} = '';
4962 wakaba 1.1 ## Reconsume.
4963     return ({type => CHARACTER_TOKEN,
4964     data => $data,
4965 wakaba 1.7 has_reference => $has_ref,
4966 wakaba 1.1 line => $self->{line_prev},
4967 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
4968 wakaba 1.1 });
4969     redo A;
4970     } else {
4971    
4972     $self->{ca}->{value} .= $data;
4973     $self->{ca}->{has_reference} = 1 if $has_ref;
4974     $self->{state} = $self->{prev_state};
4975 wakaba 1.5 $self->{s_kwd} = '';
4976 wakaba 1.1 ## Reconsume.
4977     redo A;
4978     }
4979 wakaba 1.8
4980     ## XML-only states
4981    
4982     } elsif ($self->{state} == PI_STATE) {
4983 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
4984    
4985 wakaba 1.8 if ($is_space->{$self->{nc}} or
4986 wakaba 1.14 $self->{nc} == 0x003F or # ?
4987 wakaba 1.8 $self->{nc} == -1) {
4988 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
4989     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
4990     ## "DOCTYPE pi state": Parse error, switch to the "data
4991     ## state".
4992 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4993     line => $self->{line_prev},
4994     column => $self->{column_prev}
4995     - 1 * ($self->{nc} != -1));
4996     $self->{state} = BOGUS_COMMENT_STATE;
4997     ## Reconsume.
4998     $self->{ct} = {type => COMMENT_TOKEN,
4999     data => '?',
5000     line => $self->{line_prev},
5001     column => $self->{column_prev}
5002     - 1 * ($self->{nc} != -1),
5003     };
5004     redo A;
5005     } else {
5006 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5007 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5008     target => chr $self->{nc},
5009     data => '',
5010     line => $self->{line_prev},
5011     column => $self->{column_prev} - 1,
5012     };
5013     $self->{state} = PI_TARGET_STATE;
5014    
5015     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5016     $self->{line_prev} = $self->{line};
5017     $self->{column_prev} = $self->{column};
5018     $self->{column}++;
5019     $self->{nc}
5020     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5021     } else {
5022     $self->{set_nc}->($self);
5023     }
5024    
5025     redo A;
5026     }
5027     } elsif ($self->{state} == PI_TARGET_STATE) {
5028     if ($is_space->{$self->{nc}}) {
5029     $self->{state} = PI_TARGET_AFTER_STATE;
5030    
5031     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5032     $self->{line_prev} = $self->{line};
5033     $self->{column_prev} = $self->{column};
5034     $self->{column}++;
5035     $self->{nc}
5036     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5037     } else {
5038     $self->{set_nc}->($self);
5039     }
5040    
5041     redo A;
5042     } elsif ($self->{nc} == -1) {
5043     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5044 wakaba 1.13 if ($self->{in_subset}) {
5045     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5046     } else {
5047     $self->{state} = DATA_STATE;
5048     $self->{s_kwd} = '';
5049     }
5050 wakaba 1.8 ## Reconsume.
5051     return ($self->{ct}); # pi
5052     redo A;
5053     } elsif ($self->{nc} == 0x003F) { # ?
5054     $self->{state} = PI_AFTER_STATE;
5055    
5056     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5057     $self->{line_prev} = $self->{line};
5058     $self->{column_prev} = $self->{column};
5059     $self->{column}++;
5060     $self->{nc}
5061     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5062     } else {
5063     $self->{set_nc}->($self);
5064     }
5065    
5066     redo A;
5067     } else {
5068     ## XML5: typo ("tag name" -> "target")
5069     $self->{ct}->{target} .= chr $self->{nc}; # pi
5070    
5071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5072     $self->{line_prev} = $self->{line};
5073     $self->{column_prev} = $self->{column};
5074     $self->{column}++;
5075     $self->{nc}
5076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5077     } else {
5078     $self->{set_nc}->($self);
5079     }
5080    
5081     redo A;
5082     }
5083     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5084     if ($is_space->{$self->{nc}}) {
5085     ## Stay in the state.
5086    
5087     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5088     $self->{line_prev} = $self->{line};
5089     $self->{column_prev} = $self->{column};
5090     $self->{column}++;
5091     $self->{nc}
5092     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5093     } else {
5094     $self->{set_nc}->($self);
5095     }
5096    
5097     redo A;
5098     } else {
5099     $self->{state} = PI_DATA_STATE;
5100     ## Reprocess.
5101     redo A;
5102     }
5103     } elsif ($self->{state} == PI_DATA_STATE) {
5104     if ($self->{nc} == 0x003F) { # ?
5105     $self->{state} = PI_DATA_AFTER_STATE;
5106    
5107     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5108     $self->{line_prev} = $self->{line};
5109     $self->{column_prev} = $self->{column};
5110     $self->{column}++;
5111     $self->{nc}
5112     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5113     } else {
5114     $self->{set_nc}->($self);
5115     }
5116    
5117     redo A;
5118     } elsif ($self->{nc} == -1) {
5119     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5120 wakaba 1.13 if ($self->{in_subset}) {
5121 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5122 wakaba 1.13 } else {
5123     $self->{state} = DATA_STATE;
5124     $self->{s_kwd} = '';
5125     }
5126 wakaba 1.8 ## Reprocess.
5127     return ($self->{ct}); # pi
5128     redo A;
5129     } else {
5130     $self->{ct}->{data} .= chr $self->{nc}; # pi
5131     $self->{read_until}->($self->{ct}->{data}, q[?],
5132     length $self->{ct}->{data});
5133     ## Stay in the state.
5134    
5135     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5136     $self->{line_prev} = $self->{line};
5137     $self->{column_prev} = $self->{column};
5138     $self->{column}++;
5139     $self->{nc}
5140     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5141     } else {
5142     $self->{set_nc}->($self);
5143     }
5144    
5145     ## Reprocess.
5146     redo A;
5147     }
5148     } elsif ($self->{state} == PI_AFTER_STATE) {
5149 wakaba 1.14 ## XML5: Part of "Pi after state".
5150    
5151 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5152 wakaba 1.13 if ($self->{in_subset}) {
5153     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5154     } else {
5155     $self->{state} = DATA_STATE;
5156     $self->{s_kwd} = '';
5157     }
5158 wakaba 1.8
5159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5160     $self->{line_prev} = $self->{line};
5161     $self->{column_prev} = $self->{column};
5162     $self->{column}++;
5163     $self->{nc}
5164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5165     } else {
5166     $self->{set_nc}->($self);
5167     }
5168    
5169     return ($self->{ct}); # pi
5170     redo A;
5171     } elsif ($self->{nc} == 0x003F) { # ?
5172     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5173     line => $self->{line_prev},
5174     column => $self->{column_prev}); ## XML5: no error
5175     $self->{ct}->{data} .= '?';
5176     $self->{state} = PI_DATA_AFTER_STATE;
5177    
5178     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5179     $self->{line_prev} = $self->{line};
5180     $self->{column_prev} = $self->{column};
5181     $self->{column}++;
5182     $self->{nc}
5183     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5184     } else {
5185     $self->{set_nc}->($self);
5186     }
5187    
5188     redo A;
5189     } else {
5190     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5191     line => $self->{line_prev},
5192     column => $self->{column_prev}
5193     + 1 * ($self->{nc} == -1)); ## XML5: no error
5194     $self->{ct}->{data} .= '?'; ## XML5: not appended
5195     $self->{state} = PI_DATA_STATE;
5196     ## Reprocess.
5197     redo A;
5198     }
5199     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5200 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5201    
5202 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5203 wakaba 1.13 if ($self->{in_subset}) {
5204     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5205     } else {
5206     $self->{state} = DATA_STATE;
5207     $self->{s_kwd} = '';
5208     }
5209 wakaba 1.8
5210     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5211     $self->{line_prev} = $self->{line};
5212     $self->{column_prev} = $self->{column};
5213     $self->{column}++;
5214     $self->{nc}
5215     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5216     } else {
5217     $self->{set_nc}->($self);
5218     }
5219    
5220     return ($self->{ct}); # pi
5221     redo A;
5222     } elsif ($self->{nc} == 0x003F) { # ?
5223     $self->{ct}->{data} .= '?';
5224     ## Stay in the state.
5225    
5226     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5227     $self->{line_prev} = $self->{line};
5228     $self->{column_prev} = $self->{column};
5229     $self->{column}++;
5230     $self->{nc}
5231     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5232     } else {
5233     $self->{set_nc}->($self);
5234     }
5235    
5236     redo A;
5237     } else {
5238     $self->{ct}->{data} .= '?'; ## XML5: not appended
5239     $self->{state} = PI_DATA_STATE;
5240     ## Reprocess.
5241     redo A;
5242     }
5243 wakaba 1.12
5244     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5245     if ($self->{nc} == 0x003C) { # <
5246 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5247 wakaba 1.12
5248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5249     $self->{line_prev} = $self->{line};
5250     $self->{column_prev} = $self->{column};
5251     $self->{column}++;
5252     $self->{nc}
5253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5254     } else {
5255     $self->{set_nc}->($self);
5256     }
5257    
5258     redo A;
5259     } elsif ($self->{nc} == 0x0025) { # %
5260     ## XML5: Not defined yet.
5261    
5262     ## TODO:
5263    
5264     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5265     $self->{line_prev} = $self->{line};
5266     $self->{column_prev} = $self->{column};
5267     $self->{column}++;
5268     $self->{nc}
5269     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5270     } else {
5271     $self->{set_nc}->($self);
5272     }
5273    
5274     redo A;
5275     } elsif ($self->{nc} == 0x005D) { # ]
5276 wakaba 1.13 delete $self->{in_subset};
5277 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5278    
5279     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5280     $self->{line_prev} = $self->{line};
5281     $self->{column_prev} = $self->{column};
5282     $self->{column}++;
5283     $self->{nc}
5284     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5285     } else {
5286     $self->{set_nc}->($self);
5287     }
5288    
5289     redo A;
5290     } elsif ($is_space->{$self->{nc}}) {
5291     ## Stay in the state.
5292    
5293     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5294     $self->{line_prev} = $self->{line};
5295     $self->{column_prev} = $self->{column};
5296     $self->{column}++;
5297     $self->{nc}
5298     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5299     } else {
5300     $self->{set_nc}->($self);
5301     }
5302    
5303     redo A;
5304     } elsif ($self->{nc} == -1) {
5305     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5306 wakaba 1.13 delete $self->{in_subset};
5307 wakaba 1.12 $self->{state} = DATA_STATE;
5308     $self->{s_kwd} = '';
5309     ## Reconsume.
5310 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5311 wakaba 1.12 redo A;
5312     } else {
5313     unless ($self->{internal_subset_tainted}) {
5314     ## XML5: No parse error.
5315     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5316     $self->{internal_subset_tainted} = 1;
5317     }
5318     ## Stay in the state.
5319    
5320     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5321     $self->{line_prev} = $self->{line};
5322     $self->{column_prev} = $self->{column};
5323     $self->{column}++;
5324     $self->{nc}
5325     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5326     } else {
5327     $self->{set_nc}->($self);
5328     }
5329    
5330     redo A;
5331     }
5332     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5333     if ($self->{nc} == 0x003E) { # >
5334     $self->{state} = DATA_STATE;
5335     $self->{s_kwd} = '';
5336    
5337     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5338     $self->{line_prev} = $self->{line};
5339     $self->{column_prev} = $self->{column};
5340     $self->{column}++;
5341     $self->{nc}
5342     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5343     } else {
5344     $self->{set_nc}->($self);
5345     }
5346    
5347 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5348 wakaba 1.12 redo A;
5349     } elsif ($self->{nc} == -1) {
5350     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5351     $self->{state} = DATA_STATE;
5352     $self->{s_kwd} = '';
5353     ## Reconsume.
5354 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5355 wakaba 1.12 redo A;
5356     } else {
5357     ## XML5: No parse error and stay in the state.
5358     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5359    
5360 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5361    
5362     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5363     $self->{line_prev} = $self->{line};
5364     $self->{column_prev} = $self->{column};
5365     $self->{column}++;
5366     $self->{nc}
5367     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5368     } else {
5369     $self->{set_nc}->($self);
5370     }
5371    
5372     redo A;
5373     }
5374     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5375     if ($self->{nc} == 0x003E) { # >
5376     $self->{state} = DATA_STATE;
5377     $self->{s_kwd} = '';
5378    
5379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5380     $self->{line_prev} = $self->{line};
5381     $self->{column_prev} = $self->{column};
5382     $self->{column}++;
5383     $self->{nc}
5384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5385     } else {
5386     $self->{set_nc}->($self);
5387     }
5388    
5389     return ({type => END_OF_DOCTYPE_TOKEN});
5390     redo A;
5391     } elsif ($self->{nc} == -1) {
5392     $self->{state} = DATA_STATE;
5393     $self->{s_kwd} = '';
5394     ## Reconsume.
5395     return ({type => END_OF_DOCTYPE_TOKEN});
5396     redo A;
5397     } else {
5398     ## Stay in the state.
5399    
5400     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5401     $self->{line_prev} = $self->{line};
5402     $self->{column_prev} = $self->{column};
5403     $self->{column}++;
5404     $self->{nc}
5405     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5406     } else {
5407     $self->{set_nc}->($self);
5408     }
5409    
5410     redo A;
5411     }
5412     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5413     if ($self->{nc} == 0x0021) { # !
5414 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5415 wakaba 1.13
5416     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5417     $self->{line_prev} = $self->{line};
5418     $self->{column_prev} = $self->{column};
5419     $self->{column}++;
5420     $self->{nc}
5421     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5422     } else {
5423     $self->{set_nc}->($self);
5424     }
5425    
5426     redo A;
5427     } elsif ($self->{nc} == 0x003F) { # ?
5428     $self->{state} = PI_STATE;
5429    
5430     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5431     $self->{line_prev} = $self->{line};
5432     $self->{column_prev} = $self->{column};
5433     $self->{column}++;
5434     $self->{nc}
5435     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5436     } else {
5437     $self->{set_nc}->($self);
5438     }
5439    
5440     redo A;
5441     } elsif ($self->{nc} == -1) {
5442     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5443     $self->{state} = DATA_STATE;
5444     $self->{s_kwd} = '';
5445     ## Reconsume.
5446     redo A;
5447     } else {
5448     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5449     line => $self->{line_prev},
5450     column => $self->{column_prev});
5451     $self->{state} = BOGUS_COMMENT_STATE;
5452     $self->{ct} = {type => COMMENT_TOKEN,
5453     data => '',
5454     }; ## NOTE: Will be discarded.
5455 wakaba 1.12
5456     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5457     $self->{line_prev} = $self->{line};
5458     $self->{column_prev} = $self->{column};
5459     $self->{column}++;
5460     $self->{nc}
5461     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5462     } else {
5463     $self->{set_nc}->($self);
5464     }
5465    
5466     redo A;
5467     }
5468 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5469     ## XML5: "DOCTYPE markup declaration state".
5470    
5471     if ($self->{nc} == 0x002D) { # -
5472     $self->{state} = MD_HYPHEN_STATE;
5473    
5474     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5475     $self->{line_prev} = $self->{line};
5476     $self->{column_prev} = $self->{column};
5477     $self->{column}++;
5478     $self->{nc}
5479     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5480     } else {
5481     $self->{set_nc}->($self);
5482     }
5483    
5484     redo A;
5485     } elsif ($self->{nc} == 0x0045) { # E
5486     $self->{state} = MD_E_STATE;
5487     $self->{kwd} = chr $self->{nc};
5488    
5489     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5490     $self->{line_prev} = $self->{line};
5491     $self->{column_prev} = $self->{column};
5492     $self->{column}++;
5493     $self->{nc}
5494     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5495     } else {
5496     $self->{set_nc}->($self);
5497     }
5498    
5499     redo A;
5500     } elsif ($self->{nc} == 0x0041) { # A
5501     $self->{state} = MD_ATTLIST_STATE;
5502     $self->{kwd} = chr $self->{nc};
5503    
5504     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5505     $self->{line_prev} = $self->{line};
5506     $self->{column_prev} = $self->{column};
5507     $self->{column}++;
5508     $self->{nc}
5509     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5510     } else {
5511     $self->{set_nc}->($self);
5512     }
5513    
5514     redo A;
5515     } elsif ($self->{nc} == 0x004E) { # N
5516     $self->{state} = MD_NOTATION_STATE;
5517     $self->{kwd} = chr $self->{nc};
5518    
5519     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5520     $self->{line_prev} = $self->{line};
5521     $self->{column_prev} = $self->{column};
5522     $self->{column}++;
5523     $self->{nc}
5524     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5525     } else {
5526     $self->{set_nc}->($self);
5527     }
5528    
5529     redo A;
5530     } else {
5531     #
5532     }
5533    
5534     ## XML5: No parse error.
5535     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5536     line => $self->{line_prev},
5537     column => $self->{column_prev} - 1);
5538     ## Reconsume.
5539     $self->{state} = BOGUS_COMMENT_STATE;
5540     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5541     redo A;
5542     } elsif ($self->{state} == MD_E_STATE) {
5543     if ($self->{nc} == 0x004E) { # N
5544     $self->{state} = MD_ENTITY_STATE;
5545     $self->{kwd} .= chr $self->{nc};
5546    
5547     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5548     $self->{line_prev} = $self->{line};
5549     $self->{column_prev} = $self->{column};
5550     $self->{column}++;
5551     $self->{nc}
5552     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5553     } else {
5554     $self->{set_nc}->($self);
5555     }
5556    
5557     redo A;
5558     } elsif ($self->{nc} == 0x004C) { # L
5559     ## XML5: <!ELEMENT> not supported.
5560     $self->{state} = MD_ELEMENT_STATE;
5561     $self->{kwd} .= chr $self->{nc};
5562    
5563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5564     $self->{line_prev} = $self->{line};
5565     $self->{column_prev} = $self->{column};
5566     $self->{column}++;
5567     $self->{nc}
5568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5569     } else {
5570     $self->{set_nc}->($self);
5571     }
5572    
5573     redo A;
5574     } else {
5575     ## XML5: No parse error.
5576     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5577     line => $self->{line_prev},
5578     column => $self->{column_prev} - 2
5579     + 1 * ($self->{nc} == -1));
5580     ## Reconsume.
5581     $self->{state} = BOGUS_COMMENT_STATE;
5582     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5583     redo A;
5584     }
5585     } elsif ($self->{state} == MD_ENTITY_STATE) {
5586     if ($self->{nc} == {
5587     'EN' => 0x0054, # T
5588     'ENT' => 0x0049, # I
5589     'ENTI' => 0x0054, # T
5590     }->{$self->{kwd}}) {
5591     ## Stay in the state.
5592     $self->{kwd} .= chr $self->{nc};
5593    
5594     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5595     $self->{line_prev} = $self->{line};
5596     $self->{column_prev} = $self->{column};
5597     $self->{column}++;
5598     $self->{nc}
5599     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5600     } else {
5601     $self->{set_nc}->($self);
5602     }
5603    
5604     redo A;
5605     } elsif ($self->{kwd} eq 'ENTIT' and
5606     $self->{nc} == 0x0059) { # Y
5607     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',
5608     line => $self->{line_prev},
5609     column => $self->{column_prev} - 6};
5610     $self->{state} = DOCTYPE_MD_STATE;
5611    
5612     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5613     $self->{line_prev} = $self->{line};
5614     $self->{column_prev} = $self->{column};
5615     $self->{column}++;
5616     $self->{nc}
5617     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5618     } else {
5619     $self->{set_nc}->($self);
5620     }
5621    
5622     redo A;
5623     } else {
5624     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5625     line => $self->{line_prev},
5626     column => $self->{column_prev} - 1
5627     - (length $self->{kwd})
5628     + 1 * ($self->{nc} == -1));
5629     $self->{state} = BOGUS_COMMENT_STATE;
5630     ## Reconsume.
5631     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5632     redo A;
5633     }
5634     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5635     if ($self->{nc} == {
5636     'EL' => 0x0045, # E
5637     'ELE' => 0x004D, # M
5638     'ELEM' => 0x0045, # E
5639     'ELEME' => 0x004E, # N
5640     }->{$self->{kwd}}) {
5641     ## Stay in the state.
5642     $self->{kwd} .= chr $self->{nc};
5643    
5644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5645     $self->{line_prev} = $self->{line};
5646     $self->{column_prev} = $self->{column};
5647     $self->{column}++;
5648     $self->{nc}
5649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5650     } else {
5651     $self->{set_nc}->($self);
5652     }
5653    
5654     redo A;
5655     } elsif ($self->{kwd} eq 'ELEMEN' and
5656     $self->{nc} == 0x0054) { # T
5657     $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5658     line => $self->{line_prev},
5659     column => $self->{column_prev} - 6};
5660     $self->{state} = DOCTYPE_MD_STATE;
5661    
5662     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5663     $self->{line_prev} = $self->{line};
5664     $self->{column_prev} = $self->{column};
5665     $self->{column}++;
5666     $self->{nc}
5667     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5668     } else {
5669     $self->{set_nc}->($self);
5670     }
5671    
5672     redo A;
5673     } else {
5674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5675     line => $self->{line_prev},
5676     column => $self->{column_prev} - 1
5677     - (length $self->{kwd})
5678     + 1 * ($self->{nc} == -1));
5679     $self->{state} = BOGUS_COMMENT_STATE;
5680     ## Reconsume.
5681     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5682     redo A;
5683     }
5684     } elsif ($self->{state} == MD_ATTLIST_STATE) {
5685     if ($self->{nc} == {
5686     'A' => 0x0054, # T
5687     'AT' => 0x0054, # T
5688     'ATT' => 0x004C, # L
5689     'ATTL' => 0x0049, # I
5690     'ATTLI' => 0x0053, # S
5691     }->{$self->{kwd}}) {
5692     ## Stay in the state.
5693     $self->{kwd} .= chr $self->{nc};
5694    
5695     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5696     $self->{line_prev} = $self->{line};
5697     $self->{column_prev} = $self->{column};
5698     $self->{column}++;
5699     $self->{nc}
5700     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5701     } else {
5702     $self->{set_nc}->($self);
5703     }
5704    
5705     redo A;
5706     } elsif ($self->{kwd} eq 'ATTLIS' and
5707     $self->{nc} == 0x0054) { # T
5708     $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5709 wakaba 1.15 attrdefs => [],
5710 wakaba 1.14 line => $self->{line_prev},
5711     column => $self->{column_prev} - 6};
5712     $self->{state} = DOCTYPE_MD_STATE;
5713    
5714     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5715     $self->{line_prev} = $self->{line};
5716     $self->{column_prev} = $self->{column};
5717     $self->{column}++;
5718     $self->{nc}
5719     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5720     } else {
5721     $self->{set_nc}->($self);
5722     }
5723    
5724     redo A;
5725     } else {
5726     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5727     line => $self->{line_prev},
5728     column => $self->{column_prev} - 1
5729     - (length $self->{kwd})
5730     + 1 * ($self->{nc} == -1));
5731     $self->{state} = BOGUS_COMMENT_STATE;
5732     ## Reconsume.
5733     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5734     redo A;
5735     }
5736     } elsif ($self->{state} == MD_NOTATION_STATE) {
5737     if ($self->{nc} == {
5738     'N' => 0x004F, # O
5739     'NO' => 0x0054, # T
5740     'NOT' => 0x0041, # A
5741     'NOTA' => 0x0054, # T
5742     'NOTAT' => 0x0049, # I
5743     'NOTATI' => 0x004F, # O
5744     }->{$self->{kwd}}) {
5745     ## Stay in the state.
5746     $self->{kwd} .= chr $self->{nc};
5747    
5748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5749     $self->{line_prev} = $self->{line};
5750     $self->{column_prev} = $self->{column};
5751     $self->{column}++;
5752     $self->{nc}
5753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5754     } else {
5755     $self->{set_nc}->($self);
5756     }
5757    
5758     redo A;
5759     } elsif ($self->{kwd} eq 'NOTATIO' and
5760     $self->{nc} == 0x004E) { # N
5761     $self->{ct} = {type => NOTATION_TOKEN, name => '',
5762     line => $self->{line_prev},
5763     column => $self->{column_prev} - 6};
5764     $self->{state} = DOCTYPE_MD_STATE;
5765    
5766     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5767     $self->{line_prev} = $self->{line};
5768     $self->{column_prev} = $self->{column};
5769     $self->{column}++;
5770     $self->{nc}
5771     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5772     } else {
5773     $self->{set_nc}->($self);
5774     }
5775    
5776     redo A;
5777     } else {
5778     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5779     line => $self->{line_prev},
5780     column => $self->{column_prev} - 1
5781     - (length $self->{kwd})
5782     + 1 * ($self->{nc} == -1));
5783     $self->{state} = BOGUS_COMMENT_STATE;
5784     ## Reconsume.
5785     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5786     redo A;
5787     }
5788     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
5789     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
5790     ## "DOCTYPE NOTATION state".
5791    
5792     if ($is_space->{$self->{nc}}) {
5793     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
5794     $self->{state} = BEFORE_MD_NAME_STATE;
5795    
5796     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5797     $self->{line_prev} = $self->{line};
5798     $self->{column_prev} = $self->{column};
5799     $self->{column}++;
5800     $self->{nc}
5801     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5802     } else {
5803     $self->{set_nc}->($self);
5804     }
5805    
5806     redo A;
5807     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5808     $self->{nc} == 0x0025) { # %
5809     ## XML5: Switch to the "DOCTYPE bogus comment state".
5810     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5811     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5812    
5813     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5814     $self->{line_prev} = $self->{line};
5815     $self->{column_prev} = $self->{column};
5816     $self->{column}++;
5817     $self->{nc}
5818     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5819     } else {
5820     $self->{set_nc}->($self);
5821     }
5822    
5823     redo A;
5824     } elsif ($self->{nc} == -1) {
5825     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5826     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5827     ## Reconsume.
5828     redo A;
5829     } elsif ($self->{nc} == 0x003E) { # >
5830     ## XML5: Switch to the "DOCTYPE bogus comment state".
5831     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5832     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5833    
5834     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5835     $self->{line_prev} = $self->{line};
5836     $self->{column_prev} = $self->{column};
5837     $self->{column}++;
5838     $self->{nc}
5839     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5840     } else {
5841     $self->{set_nc}->($self);
5842     }
5843    
5844     redo A;
5845     } else {
5846     ## XML5: Switch to the "DOCTYPE bogus comment state".
5847     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5848     $self->{state} = BEFORE_MD_NAME_STATE;
5849     redo A;
5850     }
5851     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
5852     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
5853     ## before state", "DOCTYPE ATTLIST name before state".
5854    
5855     if ($is_space->{$self->{nc}}) {
5856     ## Stay in the state.
5857    
5858     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5859     $self->{line_prev} = $self->{line};
5860     $self->{column_prev} = $self->{column};
5861     $self->{column}++;
5862     $self->{nc}
5863     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5864     } else {
5865     $self->{set_nc}->($self);
5866     }
5867    
5868     redo A;
5869     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5870     $self->{nc} == 0x0025) { # %
5871     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5872    
5873     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5874     $self->{line_prev} = $self->{line};
5875     $self->{column_prev} = $self->{column};
5876     $self->{column}++;
5877     $self->{nc}
5878     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5879     } else {
5880     $self->{set_nc}->($self);
5881     }
5882    
5883     redo A;
5884     } elsif ($self->{nc} == 0x003E) { # >
5885     ## XML5: Same as "Anything else".
5886     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5887     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5888    
5889     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5890     $self->{line_prev} = $self->{line};
5891     $self->{column_prev} = $self->{column};
5892     $self->{column}++;
5893     $self->{nc}
5894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5895     } else {
5896     $self->{set_nc}->($self);
5897     }
5898    
5899     redo A;
5900     } elsif ($self->{nc} == -1) {
5901     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5902     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5903     ## Reconsume.
5904     redo A;
5905     } else {
5906     ## XML5: [ATTLIST] Not defined yet.
5907     $self->{ct}->{name} .= chr $self->{nc};
5908     $self->{state} = MD_NAME_STATE;
5909    
5910     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5911     $self->{line_prev} = $self->{line};
5912     $self->{column_prev} = $self->{column};
5913     $self->{column}++;
5914     $self->{nc}
5915     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5916     } else {
5917     $self->{set_nc}->($self);
5918     }
5919    
5920     redo A;
5921     }
5922     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
5923     if ($is_space->{$self->{nc}}) {
5924     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
5925     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
5926     $self->{state} = BEFORE_MD_NAME_STATE;
5927 wakaba 1.8
5928 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5929     $self->{line_prev} = $self->{line};
5930     $self->{column_prev} = $self->{column};
5931     $self->{column}++;
5932     $self->{nc}
5933     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5934     } else {
5935     $self->{set_nc}->($self);
5936     }
5937    
5938     redo A;
5939     } elsif ($self->{nc} == 0x003E) { # >
5940     ## XML5: Same as "Anything else".
5941     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5942     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5943    
5944     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5945     $self->{line_prev} = $self->{line};
5946     $self->{column_prev} = $self->{column};
5947     $self->{column}++;
5948     $self->{nc}
5949     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5950     } else {
5951     $self->{set_nc}->($self);
5952     }
5953    
5954     redo A;
5955     } elsif ($self->{nc} == -1) {
5956     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
5957     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5958     ## Reconsume.
5959     redo A;
5960     } else {
5961     ## XML5: No parse error.
5962     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
5963     $self->{state} = BOGUS_COMMENT_STATE;
5964     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5965     ## Reconsume.
5966     redo A;
5967     }
5968     } elsif ($self->{state} == MD_NAME_STATE) {
5969     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
5970    
5971     if ($is_space->{$self->{nc}}) {
5972 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
5973     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
5974     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
5975     ## TODO: ...
5976     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
5977     } else { # ENTITY/NOTATION
5978     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
5979     }
5980 wakaba 1.14
5981     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5982     $self->{line_prev} = $self->{line};
5983     $self->{column_prev} = $self->{column};
5984     $self->{column}++;
5985     $self->{nc}
5986     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5987     } else {
5988     $self->{set_nc}->($self);
5989     }
5990    
5991     redo A;
5992     } elsif ($self->{nc} == 0x003E) { # >
5993     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
5994     #
5995     } else {
5996 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
5997 wakaba 1.14 }
5998     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5999    
6000     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6001     $self->{line_prev} = $self->{line};
6002     $self->{column_prev} = $self->{column};
6003     $self->{column}++;
6004     $self->{nc}
6005     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6006     } else {
6007     $self->{set_nc}->($self);
6008     }
6009    
6010     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6011     redo A;
6012     } elsif ($self->{nc} == -1) {
6013     ## XML5: [ATTLIST] No parse error.
6014     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6015     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6016     ## Reconsume.
6017     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6018     redo A;
6019     } else {
6020     ## XML5: [ATTLIST] Not defined yet.
6021     $self->{ct}->{name} .= chr $self->{nc};
6022     ## Stay in the state.
6023    
6024     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6025     $self->{line_prev} = $self->{line};
6026     $self->{column_prev} = $self->{column};
6027     $self->{column}++;
6028     $self->{nc}
6029     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6030     } else {
6031     $self->{set_nc}->($self);
6032     }
6033    
6034     redo A;
6035     }
6036     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6037     if ($is_space->{$self->{nc}}) {
6038     ## Stay in the state.
6039    
6040     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6041     $self->{line_prev} = $self->{line};
6042     $self->{column_prev} = $self->{column};
6043     $self->{column}++;
6044     $self->{nc}
6045     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6046     } else {
6047     $self->{set_nc}->($self);
6048     }
6049    
6050     redo A;
6051     } elsif ($self->{nc} == 0x003E) { # >
6052     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6053    
6054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6055     $self->{line_prev} = $self->{line};
6056     $self->{column_prev} = $self->{column};
6057     $self->{column}++;
6058     $self->{nc}
6059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6060     } else {
6061     $self->{set_nc}->($self);
6062     }
6063    
6064     return ($self->{ct}); # ATTLIST
6065     redo A;
6066     } elsif ($self->{nc} == -1) {
6067     ## XML5: No parse error.
6068     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6069     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6070 wakaba 1.15 return ($self->{ct});
6071 wakaba 1.14 redo A;
6072     } else {
6073     ## XML5: Not defined yet.
6074 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6075     tokens => [],
6076     line => $self->{line}, column => $self->{column}};
6077     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6078    
6079     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6080     $self->{line_prev} = $self->{line};
6081     $self->{column_prev} = $self->{column};
6082     $self->{column}++;
6083     $self->{nc}
6084     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6085     } else {
6086     $self->{set_nc}->($self);
6087     }
6088    
6089     redo A;
6090     }
6091     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6092     if ($is_space->{$self->{nc}}) {
6093     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6094    
6095     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6096     $self->{line_prev} = $self->{line};
6097     $self->{column_prev} = $self->{column};
6098     $self->{column}++;
6099     $self->{nc}
6100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6101     } else {
6102     $self->{set_nc}->($self);
6103     }
6104    
6105     redo A;
6106     } elsif ($self->{nc} == 0x003E) { # >
6107     ## XML5: Same as "anything else".
6108     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6109     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6110    
6111     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6112     $self->{line_prev} = $self->{line};
6113     $self->{column_prev} = $self->{column};
6114     $self->{column}++;
6115     $self->{nc}
6116     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6117     } else {
6118     $self->{set_nc}->($self);
6119     }
6120    
6121     return ($self->{ct}); # ATTLIST
6122     redo A;
6123     } elsif ($self->{nc} == 0x0028) { # (
6124     ## XML5: Same as "anything else".
6125     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6126     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6127    
6128     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6129     $self->{line_prev} = $self->{line};
6130     $self->{column_prev} = $self->{column};
6131     $self->{column}++;
6132     $self->{nc}
6133     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6134     } else {
6135     $self->{set_nc}->($self);
6136     }
6137    
6138     redo A;
6139     } elsif ($self->{nc} == -1) {
6140     ## XML5: No parse error.
6141     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6142     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6143    
6144     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6145     $self->{line_prev} = $self->{line};
6146     $self->{column_prev} = $self->{column};
6147     $self->{column}++;
6148     $self->{nc}
6149     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6150     } else {
6151     $self->{set_nc}->($self);
6152     }
6153    
6154     return ($self->{ct}); # ATTLIST
6155     redo A;
6156     } else {
6157     ## XML5: Not defined yet.
6158     $self->{ca}->{name} .= chr $self->{nc};
6159     ## Stay in the state.
6160    
6161     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6162     $self->{line_prev} = $self->{line};
6163     $self->{column_prev} = $self->{column};
6164     $self->{column}++;
6165     $self->{nc}
6166     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6167     } else {
6168     $self->{set_nc}->($self);
6169     }
6170    
6171 wakaba 1.14 redo A;
6172     }
6173 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6174     if ($is_space->{$self->{nc}}) {
6175     ## Stay in the state.
6176    
6177     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6178     $self->{line_prev} = $self->{line};
6179     $self->{column_prev} = $self->{column};
6180     $self->{column}++;
6181     $self->{nc}
6182     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6183     } else {
6184     $self->{set_nc}->($self);
6185     }
6186    
6187     redo A;
6188     } elsif ($self->{nc} == 0x003E) { # >
6189     ## XML5: Same as "anything else".
6190     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6191     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6192    
6193     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6194     $self->{line_prev} = $self->{line};
6195     $self->{column_prev} = $self->{column};
6196     $self->{column}++;
6197     $self->{nc}
6198     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6199     } else {
6200     $self->{set_nc}->($self);
6201     }
6202    
6203     return ($self->{ct}); # ATTLIST
6204     redo A;
6205     } elsif ($self->{nc} == 0x0028) { # (
6206     ## XML5: Same as "anything else".
6207     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6208    
6209     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6210     $self->{line_prev} = $self->{line};
6211     $self->{column_prev} = $self->{column};
6212     $self->{column}++;
6213     $self->{nc}
6214     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6215     } else {
6216     $self->{set_nc}->($self);
6217     }
6218    
6219     redo A;
6220     } elsif ($self->{nc} == -1) {
6221     ## XML5: No parse error.
6222     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6223     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6224    
6225     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6226     $self->{line_prev} = $self->{line};
6227     $self->{column_prev} = $self->{column};
6228     $self->{column}++;
6229     $self->{nc}
6230     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6231     } else {
6232     $self->{set_nc}->($self);
6233     }
6234    
6235     return ($self->{ct});
6236     redo A;
6237     } else {
6238     ## XML5: Not defined yet.
6239     $self->{ca}->{type} = chr $self->{nc};
6240     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6241    
6242     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6243     $self->{line_prev} = $self->{line};
6244     $self->{column_prev} = $self->{column};
6245     $self->{column}++;
6246     $self->{nc}
6247     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6248     } else {
6249     $self->{set_nc}->($self);
6250     }
6251    
6252     redo A;
6253     }
6254     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6255     if ($is_space->{$self->{nc}}) {
6256     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6257    
6258     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6259     $self->{line_prev} = $self->{line};
6260     $self->{column_prev} = $self->{column};
6261     $self->{column}++;
6262     $self->{nc}
6263     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6264     } else {
6265     $self->{set_nc}->($self);
6266     }
6267    
6268     redo A;
6269     } elsif ($self->{nc} == 0x0023) { # #
6270     ## XML5: Same as "anything else".
6271     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6272     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6273    
6274     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6275     $self->{line_prev} = $self->{line};
6276     $self->{column_prev} = $self->{column};
6277     $self->{column}++;
6278     $self->{nc}
6279     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6280     } else {
6281     $self->{set_nc}->($self);
6282     }
6283    
6284     redo A;
6285     } elsif ($self->{nc} == 0x0022) { # "
6286     ## XML5: Same as "anything else".
6287     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6288     $self->{ca}->{value} = '';
6289     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6290    
6291     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6292     $self->{line_prev} = $self->{line};
6293     $self->{column_prev} = $self->{column};
6294     $self->{column}++;
6295     $self->{nc}
6296     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6297     } else {
6298     $self->{set_nc}->($self);
6299     }
6300    
6301     redo A;
6302     } elsif ($self->{nc} == 0x0027) { # '
6303     ## XML5: Same as "anything else".
6304     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6305     $self->{ca}->{value} = '';
6306     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6307    
6308     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6309     $self->{line_prev} = $self->{line};
6310     $self->{column_prev} = $self->{column};
6311     $self->{column}++;
6312     $self->{nc}
6313     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6314     } else {
6315     $self->{set_nc}->($self);
6316     }
6317    
6318     redo A;
6319     } elsif ($self->{nc} == 0x003E) { # >
6320     ## XML5: Same as "anything else".
6321     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6322     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6323    
6324     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6325     $self->{line_prev} = $self->{line};
6326     $self->{column_prev} = $self->{column};
6327     $self->{column}++;
6328     $self->{nc}
6329     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6330     } else {
6331     $self->{set_nc}->($self);
6332     }
6333    
6334     return ($self->{ct}); # ATTLIST
6335     redo A;
6336     } elsif ($self->{nc} == 0x0028) { # (
6337     ## XML5: Same as "anything else".
6338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6339     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6340    
6341     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6342     $self->{line_prev} = $self->{line};
6343     $self->{column_prev} = $self->{column};
6344     $self->{column}++;
6345     $self->{nc}
6346     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6347     } else {
6348     $self->{set_nc}->($self);
6349     }
6350    
6351     redo A;
6352     } elsif ($self->{nc} == -1) {
6353     ## XML5: No parse error.
6354     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6355     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6356    
6357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6358     $self->{line_prev} = $self->{line};
6359     $self->{column_prev} = $self->{column};
6360     $self->{column}++;
6361     $self->{nc}
6362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6363     } else {
6364     $self->{set_nc}->($self);
6365     }
6366    
6367     return ($self->{ct});
6368     redo A;
6369     } else {
6370     ## XML5: Not defined yet.
6371     $self->{ca}->{type} .= chr $self->{nc};
6372     ## Stay in the state.
6373    
6374     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6375     $self->{line_prev} = $self->{line};
6376     $self->{column_prev} = $self->{column};
6377     $self->{column}++;
6378     $self->{nc}
6379     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6380     } else {
6381     $self->{set_nc}->($self);
6382     }
6383    
6384     redo A;
6385     }
6386     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6387     if ($is_space->{$self->{nc}}) {
6388     ## Stay in the state.
6389    
6390     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6391     $self->{line_prev} = $self->{line};
6392     $self->{column_prev} = $self->{column};
6393     $self->{column}++;
6394     $self->{nc}
6395     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6396     } else {
6397     $self->{set_nc}->($self);
6398     }
6399    
6400     redo A;
6401     } elsif ($self->{nc} == 0x0028) { # (
6402     ## XML5: Same as "anything else".
6403     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6404    
6405     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6406     $self->{line_prev} = $self->{line};
6407     $self->{column_prev} = $self->{column};
6408     $self->{column}++;
6409     $self->{nc}
6410     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6411     } else {
6412     $self->{set_nc}->($self);
6413     }
6414    
6415     redo A;
6416     } elsif ($self->{nc} == 0x0023) { # #
6417     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6418    
6419     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6420     $self->{line_prev} = $self->{line};
6421     $self->{column_prev} = $self->{column};
6422     $self->{column}++;
6423     $self->{nc}
6424     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6425     } else {
6426     $self->{set_nc}->($self);
6427     }
6428    
6429     redo A;
6430     } elsif ($self->{nc} == 0x0022) { # "
6431     ## XML5: Same as "anything else".
6432     $self->{ca}->{value} = '';
6433     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6434    
6435     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6436     $self->{line_prev} = $self->{line};
6437     $self->{column_prev} = $self->{column};
6438     $self->{column}++;
6439     $self->{nc}
6440     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6441     } else {
6442     $self->{set_nc}->($self);
6443     }
6444    
6445     redo A;
6446     } elsif ($self->{nc} == 0x0027) { # '
6447     ## XML5: Same as "anything else".
6448     $self->{ca}->{value} = '';
6449     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6450    
6451     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6452     $self->{line_prev} = $self->{line};
6453     $self->{column_prev} = $self->{column};
6454     $self->{column}++;
6455     $self->{nc}
6456     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6457     } else {
6458     $self->{set_nc}->($self);
6459     }
6460    
6461     redo A;
6462     } elsif ($self->{nc} == 0x003E) { # >
6463     ## XML5: Same as "anything else".
6464     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6465     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6466    
6467     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6468     $self->{line_prev} = $self->{line};
6469     $self->{column_prev} = $self->{column};
6470     $self->{column}++;
6471     $self->{nc}
6472     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6473     } else {
6474     $self->{set_nc}->($self);
6475     }
6476    
6477     return ($self->{ct}); # ATTLIST
6478     redo A;
6479     } elsif ($self->{nc} == -1) {
6480     ## XML5: No parse error.
6481     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6482     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6483    
6484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6485     $self->{line_prev} = $self->{line};
6486     $self->{column_prev} = $self->{column};
6487     $self->{column}++;
6488     $self->{nc}
6489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6490     } else {
6491     $self->{set_nc}->($self);
6492     }
6493    
6494     return ($self->{ct});
6495     redo A;
6496     } else {
6497     ## XML5: Switch to the "DOCTYPE bogus comment state".
6498     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6499     $self->{ca}->{value} = '';
6500     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6501     ## Reconsume.
6502     redo A;
6503     }
6504     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6505     if ($is_space->{$self->{nc}}) {
6506     ## Stay in the state.
6507    
6508     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6509     $self->{line_prev} = $self->{line};
6510     $self->{column_prev} = $self->{column};
6511     $self->{column}++;
6512     $self->{nc}
6513     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6514     } else {
6515     $self->{set_nc}->($self);
6516     }
6517    
6518     redo A;
6519     } elsif ($self->{nc} == 0x007C) { # |
6520     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6521     ## Stay in the state.
6522    
6523     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6524     $self->{line_prev} = $self->{line};
6525     $self->{column_prev} = $self->{column};
6526     $self->{column}++;
6527     $self->{nc}
6528     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6529     } else {
6530     $self->{set_nc}->($self);
6531     }
6532    
6533     redo A;
6534     } elsif ($self->{nc} == 0x0029) { # )
6535     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6536     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6537    
6538     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6539     $self->{line_prev} = $self->{line};
6540     $self->{column_prev} = $self->{column};
6541     $self->{column}++;
6542     $self->{nc}
6543     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6544     } else {
6545     $self->{set_nc}->($self);
6546     }
6547    
6548     redo A;
6549     } elsif ($self->{nc} == 0x003E) { # >
6550     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6551     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6552    
6553     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6554     $self->{line_prev} = $self->{line};
6555     $self->{column_prev} = $self->{column};
6556     $self->{column}++;
6557     $self->{nc}
6558     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6559     } else {
6560     $self->{set_nc}->($self);
6561     }
6562    
6563     return ($self->{ct}); # ATTLIST
6564     redo A;
6565     } elsif ($self->{nc} == -1) {
6566     ## XML5: No parse error.
6567     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6568     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6569    
6570     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6571     $self->{line_prev} = $self->{line};
6572     $self->{column_prev} = $self->{column};
6573     $self->{column}++;
6574     $self->{nc}
6575     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6576     } else {
6577     $self->{set_nc}->($self);
6578     }
6579    
6580     return ($self->{ct});
6581     redo A;
6582     } else {
6583     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6584     $self->{state} = ALLOWED_TOKEN_STATE;
6585    
6586     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6587     $self->{line_prev} = $self->{line};
6588     $self->{column_prev} = $self->{column};
6589     $self->{column}++;
6590     $self->{nc}
6591     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6592     } else {
6593     $self->{set_nc}->($self);
6594     }
6595    
6596     redo A;
6597     }
6598     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6599     if ($is_space->{$self->{nc}}) {
6600     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6601    
6602     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6603     $self->{line_prev} = $self->{line};
6604     $self->{column_prev} = $self->{column};
6605     $self->{column}++;
6606     $self->{nc}
6607     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6608     } else {
6609     $self->{set_nc}->($self);
6610     }
6611    
6612     redo A;
6613     } elsif ($self->{nc} == 0x007C) { # |
6614     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6615    
6616     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6617     $self->{line_prev} = $self->{line};
6618     $self->{column_prev} = $self->{column};
6619     $self->{column}++;
6620     $self->{nc}
6621     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6622     } else {
6623     $self->{set_nc}->($self);
6624     }
6625    
6626     redo A;
6627     } elsif ($self->{nc} == 0x0029) { # )
6628     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6629    
6630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6631     $self->{line_prev} = $self->{line};
6632     $self->{column_prev} = $self->{column};
6633     $self->{column}++;
6634     $self->{nc}
6635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6636     } else {
6637     $self->{set_nc}->($self);
6638     }
6639    
6640     redo A;
6641     } elsif ($self->{nc} == 0x003E) { # >
6642     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6643     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6644    
6645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6646     $self->{line_prev} = $self->{line};
6647     $self->{column_prev} = $self->{column};
6648     $self->{column}++;
6649     $self->{nc}
6650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6651     } else {
6652     $self->{set_nc}->($self);
6653     }
6654    
6655     return ($self->{ct}); # ATTLIST
6656     redo A;
6657     } elsif ($self->{nc} == -1) {
6658     ## XML5: No parse error.
6659     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6660     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6661    
6662     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6663     $self->{line_prev} = $self->{line};
6664     $self->{column_prev} = $self->{column};
6665     $self->{column}++;
6666     $self->{nc}
6667     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6668     } else {
6669     $self->{set_nc}->($self);
6670     }
6671    
6672     return ($self->{ct});
6673     redo A;
6674     } else {
6675     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6676     ## Stay in the state.
6677    
6678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6679     $self->{line_prev} = $self->{line};
6680     $self->{column_prev} = $self->{column};
6681     $self->{column}++;
6682     $self->{nc}
6683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6684     } else {
6685     $self->{set_nc}->($self);
6686     }
6687    
6688     redo A;
6689     }
6690     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6691     if ($is_space->{$self->{nc}}) {
6692     ## Stay in the state.
6693    
6694     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6695     $self->{line_prev} = $self->{line};
6696     $self->{column_prev} = $self->{column};
6697     $self->{column}++;
6698     $self->{nc}
6699     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6700     } else {
6701     $self->{set_nc}->($self);
6702     }
6703    
6704     redo A;
6705     } elsif ($self->{nc} == 0x007C) { # |
6706     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6707    
6708     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6709     $self->{line_prev} = $self->{line};
6710     $self->{column_prev} = $self->{column};
6711     $self->{column}++;
6712     $self->{nc}
6713     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6714     } else {
6715     $self->{set_nc}->($self);
6716     }
6717    
6718     redo A;
6719     } elsif ($self->{nc} == 0x0029) { # )
6720     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6721    
6722     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6723     $self->{line_prev} = $self->{line};
6724     $self->{column_prev} = $self->{column};
6725     $self->{column}++;
6726     $self->{nc}
6727     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6728     } else {
6729     $self->{set_nc}->($self);
6730     }
6731    
6732     redo A;
6733     } elsif ($self->{nc} == 0x003E) { # >
6734     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6735     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6736    
6737     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6738     $self->{line_prev} = $self->{line};
6739     $self->{column_prev} = $self->{column};
6740     $self->{column}++;
6741     $self->{nc}
6742     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6743     } else {
6744     $self->{set_nc}->($self);
6745     }
6746    
6747     return ($self->{ct}); # ATTLIST
6748     redo A;
6749     } elsif ($self->{nc} == -1) {
6750     ## XML5: No parse error.
6751     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6752     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6753    
6754     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6755     $self->{line_prev} = $self->{line};
6756     $self->{column_prev} = $self->{column};
6757     $self->{column}++;
6758     $self->{nc}
6759     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6760     } else {
6761     $self->{set_nc}->($self);
6762     }
6763    
6764     return ($self->{ct});
6765     redo A;
6766     } else {
6767     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
6768     line => $self->{line_prev},
6769     column => $self->{column_prev});
6770     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
6771     $self->{state} = ALLOWED_TOKEN_STATE;
6772    
6773     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6774     $self->{line_prev} = $self->{line};
6775     $self->{column_prev} = $self->{column};
6776     $self->{column}++;
6777     $self->{nc}
6778     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6779     } else {
6780     $self->{set_nc}->($self);
6781     }
6782    
6783     redo A;
6784     }
6785     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
6786     if ($is_space->{$self->{nc}}) {
6787     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
6788    
6789     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6790     $self->{line_prev} = $self->{line};
6791     $self->{column_prev} = $self->{column};
6792     $self->{column}++;
6793     $self->{nc}
6794     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6795     } else {
6796     $self->{set_nc}->($self);
6797     }
6798    
6799     redo A;
6800     } elsif ($self->{nc} == 0x0023) { # #
6801     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6802     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6803    
6804     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6805     $self->{line_prev} = $self->{line};
6806     $self->{column_prev} = $self->{column};
6807     $self->{column}++;
6808     $self->{nc}
6809     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6810     } else {
6811     $self->{set_nc}->($self);
6812     }
6813    
6814     redo A;
6815     } elsif ($self->{nc} == 0x0022) { # "
6816     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6817     $self->{ca}->{value} = '';
6818     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6819    
6820     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6821     $self->{line_prev} = $self->{line};
6822     $self->{column_prev} = $self->{column};
6823     $self->{column}++;
6824     $self->{nc}
6825     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6826     } else {
6827     $self->{set_nc}->($self);
6828     }
6829    
6830     redo A;
6831     } elsif ($self->{nc} == 0x0027) { # '
6832     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6833     $self->{ca}->{value} = '';
6834     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6835    
6836     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6837     $self->{line_prev} = $self->{line};
6838     $self->{column_prev} = $self->{column};
6839     $self->{column}++;
6840     $self->{nc}
6841     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6842     } else {
6843     $self->{set_nc}->($self);
6844     }
6845    
6846     redo A;
6847     } elsif ($self->{nc} == 0x003E) { # >
6848     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6849     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6850    
6851     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6852     $self->{line_prev} = $self->{line};
6853     $self->{column_prev} = $self->{column};
6854     $self->{column}++;
6855     $self->{nc}
6856     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6857     } else {
6858     $self->{set_nc}->($self);
6859     }
6860    
6861     return ($self->{ct}); # ATTLIST
6862     redo A;
6863     } elsif ($self->{nc} == -1) {
6864     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6865     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6866    
6867     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6868     $self->{line_prev} = $self->{line};
6869     $self->{column_prev} = $self->{column};
6870     $self->{column}++;
6871     $self->{nc}
6872     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6873     } else {
6874     $self->{set_nc}->($self);
6875     }
6876    
6877     return ($self->{ct});
6878     redo A;
6879     } else {
6880     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6881     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6882     ## Reconsume.
6883     redo A;
6884     }
6885     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
6886     if ($is_space->{$self->{nc}}) {
6887     ## Stay in the state.
6888    
6889     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6890     $self->{line_prev} = $self->{line};
6891     $self->{column_prev} = $self->{column};
6892     $self->{column}++;
6893     $self->{nc}
6894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6895     } else {
6896     $self->{set_nc}->($self);
6897     }
6898    
6899     redo A;
6900     } elsif ($self->{nc} == 0x0023) { # #
6901     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6902    
6903     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6904     $self->{line_prev} = $self->{line};
6905     $self->{column_prev} = $self->{column};
6906     $self->{column}++;
6907     $self->{nc}
6908     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6909     } else {
6910     $self->{set_nc}->($self);
6911     }
6912    
6913     redo A;
6914     } elsif ($self->{nc} == 0x0022) { # "
6915     $self->{ca}->{value} = '';
6916     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6917    
6918     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6919     $self->{line_prev} = $self->{line};
6920     $self->{column_prev} = $self->{column};
6921     $self->{column}++;
6922     $self->{nc}
6923     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6924     } else {
6925     $self->{set_nc}->($self);
6926     }
6927    
6928     redo A;
6929     } elsif ($self->{nc} == 0x0027) { # '
6930     $self->{ca}->{value} = '';
6931     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6932    
6933     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6934     $self->{line_prev} = $self->{line};
6935     $self->{column_prev} = $self->{column};
6936     $self->{column}++;
6937     $self->{nc}
6938     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6939     } else {
6940     $self->{set_nc}->($self);
6941     }
6942    
6943     redo A;
6944     } elsif ($self->{nc} == 0x003E) { # >
6945     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6946     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6947    
6948     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6949     $self->{line_prev} = $self->{line};
6950     $self->{column_prev} = $self->{column};
6951     $self->{column}++;
6952     $self->{nc}
6953     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6954     } else {
6955     $self->{set_nc}->($self);
6956     }
6957    
6958     return ($self->{ct}); # ATTLIST
6959     redo A;
6960     } elsif ($self->{nc} == -1) {
6961     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6962     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6963    
6964     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6965     $self->{line_prev} = $self->{line};
6966     $self->{column_prev} = $self->{column};
6967     $self->{column}++;
6968     $self->{nc}
6969     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6970     } else {
6971     $self->{set_nc}->($self);
6972     }
6973    
6974     return ($self->{ct});
6975     redo A;
6976     } else {
6977     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6978     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6979     ## Reconsume.
6980     redo A;
6981     }
6982     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
6983     if ($is_space->{$self->{nc}}) {
6984     ## XML5: No parse error.
6985     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
6986 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
6987 wakaba 1.15 ## Reconsume.
6988     redo A;
6989     } elsif ($self->{nc} == 0x0022) { # "
6990     ## XML5: Same as "anything else".
6991     $self->{ca}->{value} = '';
6992     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6993    
6994     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6995     $self->{line_prev} = $self->{line};
6996     $self->{column_prev} = $self->{column};
6997     $self->{column}++;
6998     $self->{nc}
6999     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7000     } else {
7001     $self->{set_nc}->($self);
7002     }
7003    
7004     redo A;
7005     } elsif ($self->{nc} == 0x0027) { # '
7006     ## XML5: Same as "anything else".
7007     $self->{ca}->{value} = '';
7008     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7009    
7010     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7011     $self->{line_prev} = $self->{line};
7012     $self->{column_prev} = $self->{column};
7013     $self->{column}++;
7014     $self->{nc}
7015     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7016     } else {
7017     $self->{set_nc}->($self);
7018     }
7019    
7020     redo A;
7021     } elsif ($self->{nc} == 0x003E) { # >
7022     ## XML5: Same as "anything else".
7023     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7024     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7025    
7026     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7027     $self->{line_prev} = $self->{line};
7028     $self->{column_prev} = $self->{column};
7029     $self->{column}++;
7030     $self->{nc}
7031     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7032     } else {
7033     $self->{set_nc}->($self);
7034     }
7035    
7036     return ($self->{ct}); # ATTLIST
7037     redo A;
7038     } elsif ($self->{nc} == -1) {
7039     ## XML5: No parse error.
7040     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7041     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7042    
7043     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7044     $self->{line_prev} = $self->{line};
7045     $self->{column_prev} = $self->{column};
7046     $self->{column}++;
7047     $self->{nc}
7048     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7049     } else {
7050     $self->{set_nc}->($self);
7051     }
7052    
7053     return ($self->{ct});
7054     redo A;
7055     } else {
7056     $self->{ca}->{default} = chr $self->{nc};
7057     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7058    
7059     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7060     $self->{line_prev} = $self->{line};
7061     $self->{column_prev} = $self->{column};
7062     $self->{column}++;
7063     $self->{nc}
7064     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7065     } else {
7066     $self->{set_nc}->($self);
7067     }
7068    
7069     redo A;
7070     }
7071     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7072     if ($is_space->{$self->{nc}}) {
7073     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7074    
7075     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7076     $self->{line_prev} = $self->{line};
7077     $self->{column_prev} = $self->{column};
7078     $self->{column}++;
7079     $self->{nc}
7080     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7081     } else {
7082     $self->{set_nc}->($self);
7083     }
7084    
7085     redo A;
7086     } elsif ($self->{nc} == 0x0022) { # "
7087     ## XML5: Same as "anything else".
7088     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7089     $self->{ca}->{value} = '';
7090     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7091    
7092     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7093     $self->{line_prev} = $self->{line};
7094     $self->{column_prev} = $self->{column};
7095     $self->{column}++;
7096     $self->{nc}
7097     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7098     } else {
7099     $self->{set_nc}->($self);
7100     }
7101    
7102     redo A;
7103     } elsif ($self->{nc} == 0x0027) { # '
7104     ## XML5: Same as "anything else".
7105     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7106     $self->{ca}->{value} = '';
7107     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7108    
7109     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7110     $self->{line_prev} = $self->{line};
7111     $self->{column_prev} = $self->{column};
7112     $self->{column}++;
7113     $self->{nc}
7114     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7115     } else {
7116     $self->{set_nc}->($self);
7117     }
7118    
7119     redo A;
7120     } elsif ($self->{nc} == 0x003E) { # >
7121     ## XML5: Same as "anything else".
7122     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7123     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7124    
7125     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7126     $self->{line_prev} = $self->{line};
7127     $self->{column_prev} = $self->{column};
7128     $self->{column}++;
7129     $self->{nc}
7130     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7131     } else {
7132     $self->{set_nc}->($self);
7133     }
7134    
7135     return ($self->{ct}); # ATTLIST
7136     redo A;
7137     } elsif ($self->{nc} == -1) {
7138     ## XML5: No parse error.
7139     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7140     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7141     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7142    
7143     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7144     $self->{line_prev} = $self->{line};
7145     $self->{column_prev} = $self->{column};
7146     $self->{column}++;
7147     $self->{nc}
7148     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7149     } else {
7150     $self->{set_nc}->($self);
7151     }
7152    
7153     return ($self->{ct});
7154     redo A;
7155     } else {
7156     $self->{ca}->{default} .= chr $self->{nc};
7157     ## Stay in the state.
7158    
7159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7160     $self->{line_prev} = $self->{line};
7161     $self->{column_prev} = $self->{column};
7162     $self->{column}++;
7163     $self->{nc}
7164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7165     } else {
7166     $self->{set_nc}->($self);
7167     }
7168    
7169     redo A;
7170     }
7171     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7172     if ($is_space->{$self->{nc}}) {
7173     ## Stay in the state.
7174    
7175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7176     $self->{line_prev} = $self->{line};
7177     $self->{column_prev} = $self->{column};
7178     $self->{column}++;
7179     $self->{nc}
7180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7181     } else {
7182     $self->{set_nc}->($self);
7183     }
7184    
7185     redo A;
7186     } elsif ($self->{nc} == 0x0022) { # "
7187     $self->{ca}->{value} = '';
7188     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7189    
7190     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7191     $self->{line_prev} = $self->{line};
7192     $self->{column_prev} = $self->{column};
7193     $self->{column}++;
7194     $self->{nc}
7195     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7196     } else {
7197     $self->{set_nc}->($self);
7198     }
7199    
7200     redo A;
7201     } elsif ($self->{nc} == 0x0027) { # '
7202     $self->{ca}->{value} = '';
7203     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7204    
7205     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7206     $self->{line_prev} = $self->{line};
7207     $self->{column_prev} = $self->{column};
7208     $self->{column}++;
7209     $self->{nc}
7210     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7211     } else {
7212     $self->{set_nc}->($self);
7213     }
7214    
7215     redo A;
7216     } elsif ($self->{nc} == 0x003E) { # >
7217     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7218     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7219    
7220     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7221     $self->{line_prev} = $self->{line};
7222     $self->{column_prev} = $self->{column};
7223     $self->{column}++;
7224     $self->{nc}
7225     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7226     } else {
7227     $self->{set_nc}->($self);
7228     }
7229    
7230     return ($self->{ct}); # ATTLIST
7231     redo A;
7232     } elsif ($self->{nc} == -1) {
7233     ## XML5: No parse error.
7234     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7235     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7236     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7237    
7238     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7239     $self->{line_prev} = $self->{line};
7240     $self->{column_prev} = $self->{column};
7241     $self->{column}++;
7242     $self->{nc}
7243     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7244     } else {
7245     $self->{set_nc}->($self);
7246     }
7247    
7248     return ($self->{ct});
7249     redo A;
7250     } else {
7251     ## XML5: Not defined yet.
7252     if ($self->{ca}->{default} eq 'FIXED') {
7253     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7254     } else {
7255     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7256     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7257     }
7258     ## Reconsume.
7259     redo A;
7260     }
7261     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7262     if ($is_space->{$self->{nc}} or
7263     $self->{nc} == -1 or
7264     $self->{nc} == 0x003E) { # >
7265     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7266     ## Reconsume.
7267     redo A;
7268     } else {
7269     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7270     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7271     ## Reconsume.
7272     redo A;
7273 wakaba 1.16 }
7274    
7275     } elsif ($self->{state} == BOGUS_MD_STATE) {
7276     if ($self->{nc} == 0x003E) { # >
7277     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7278    
7279     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7280     $self->{line_prev} = $self->{line};
7281     $self->{column_prev} = $self->{column};
7282     $self->{column}++;
7283     $self->{nc}
7284     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7285     } else {
7286     $self->{set_nc}->($self);
7287     }
7288    
7289     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7290     redo A;
7291     } elsif ($self->{nc} == -1) {
7292     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7293     ## Reconsume.
7294     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7295     redo A;
7296     } else {
7297     ## Stay in the state.
7298    
7299     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7300     $self->{line_prev} = $self->{line};
7301     $self->{column_prev} = $self->{column};
7302     $self->{column}++;
7303     $self->{nc}
7304     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7305     } else {
7306     $self->{set_nc}->($self);
7307     }
7308    
7309     redo A;
7310     }
7311 wakaba 1.1 } else {
7312     die "$0: $self->{state}: Unknown state";
7313     }
7314     } # A
7315    
7316     die "$0: _get_next_token: unexpected case";
7317     } # _get_next_token
7318    
7319     1; ## A module file must end with a true value so that loading it via require/use succeeds.
7320 wakaba 1.16 ## $Date: 2008/10/18 08:05:29 $
7321 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24