/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12 - (hide annotations) (download)
Wed Oct 15 12:49:49 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.11: +409 -82 lines
++ whatpm/t/ChangeLog	15 Oct 2008 12:49:07 -0000
	* XML-Parser.t: "xml/doctypes-2.dat" added.

	* tokenizer-test-1.test: Keyword case-sensitivility tests added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	15 Oct 2008 12:49:41 -0000
	* doctypes-1.dat: A keyword case-sensitivility test added.

	* doctypes-2.dat: New test data file.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 12:46:53 -0000
	* Tokenizer.pm.src: $self->{s_kwd} for non-DATA_STATE states are
	renamed as $self->{kwd} to avoid confliction.  Don't raise
	case-sensitivity error for the keyword "DOCTYPE" in HTML mode.
	Support for internal subsets (internal subset itself only; no
	declaration in them is supported yet).  Raise a parse error for
	non-uppercase keywords "PUBLIC" and "SYSTEM" in XML mode.  Raise a
	parse error if no system identifier is specified for a DOCTYPE
	declaration with a public identifier.  Don't close the DOCTYPE
	declaration by a ">" character in the system declaration in XML
	mode.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 12:48:30 -0000
	* Parser.pm.src: Typo fixed.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.12 our $VERSION=do{my @r=(q$Revision: 1.11 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
35    
36 wakaba 1.2 ## Token types
37    
38 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
39 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
40     sub START_TAG_TOKEN () { 3 }
41     sub END_TAG_TOKEN () { 4 }
42     sub END_OF_FILE_TOKEN () { 5 }
43     sub CHARACTER_TOKEN () { 6 }
44 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
45     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
46    
47     ## XML5: XML5 has "empty tag token". In this implementation, it is
48     ## represented as a start tag token with $self->{self_closing} flag
49     ## set to true.
50    
51     ## XML5: XML5 has "short end tag token". In this implementation, it
52     ## is represented as an end tag token with $token->{tag_name} flag set
53     ## to an empty string.
54 wakaba 1.1
55     package Whatpm::HTML;
56    
57 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
58    
59 wakaba 1.1 ## Content model flags
60    
61     sub CM_ENTITY () { 0b001 } # & markup in data
62     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
63     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
64    
65     sub PLAINTEXT_CONTENT_MODEL () { 0 }
66     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
67     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
68     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
69    
70     ## Tokenizer states
71    
72     sub DATA_STATE () { 0 }
73     #sub ENTITY_DATA_STATE () { 1 }
74     sub TAG_OPEN_STATE () { 2 }
75     sub CLOSE_TAG_OPEN_STATE () { 3 }
76     sub TAG_NAME_STATE () { 4 }
77     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
78     sub ATTRIBUTE_NAME_STATE () { 6 }
79     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
80     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
81     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
82     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
83     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
84     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
85     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
86     sub COMMENT_START_STATE () { 14 }
87     sub COMMENT_START_DASH_STATE () { 15 }
88     sub COMMENT_STATE () { 16 }
89     sub COMMENT_END_STATE () { 17 }
90     sub COMMENT_END_DASH_STATE () { 18 }
91     sub BOGUS_COMMENT_STATE () { 19 }
92     sub DOCTYPE_STATE () { 20 }
93     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
94     sub DOCTYPE_NAME_STATE () { 22 }
95     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
96     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
97     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
98     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
99     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
100     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
101     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
102     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
103     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
104     sub BOGUS_DOCTYPE_STATE () { 32 }
105     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
106     sub SELF_CLOSING_START_TAG_STATE () { 34 }
107     sub CDATA_SECTION_STATE () { 35 }
108     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
109     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
110     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
111     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
112     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
113     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
114     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
115     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
116     ## NOTE: "Entity data state", "entity in attribute value state", and
117     ## "consume a character reference" algorithm are jointly implemented
118     ## using the following six states:
119     sub ENTITY_STATE () { 44 }
120     sub ENTITY_HASH_STATE () { 45 }
121     sub NCR_NUM_STATE () { 46 }
122     sub HEXREF_X_STATE () { 47 }
123     sub HEXREF_HEX_STATE () { 48 }
124     sub ENTITY_NAME_STATE () { 49 }
125     sub PCDATA_STATE () { 50 } # "data state" in the spec
126    
127 wakaba 1.12 ## XML-only states
128 wakaba 1.8 sub PI_STATE () { 51 }
129     sub PI_TARGET_STATE () { 52 }
130     sub PI_TARGET_AFTER_STATE () { 53 }
131     sub PI_DATA_STATE () { 54 }
132     sub PI_AFTER_STATE () { 55 }
133     sub PI_DATA_AFTER_STATE () { 56 }
134 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
135     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
136 wakaba 1.8
137 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
138     ## list and descriptions)
139    
140     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
141     sub FOREIGN_EL () { 0b1_00000000000 }
142    
143     ## Character reference mappings
144    
145     my $charref_map = {
146     0x0D => 0x000A,
147     0x80 => 0x20AC,
148     0x81 => 0xFFFD,
149     0x82 => 0x201A,
150     0x83 => 0x0192,
151     0x84 => 0x201E,
152     0x85 => 0x2026,
153     0x86 => 0x2020,
154     0x87 => 0x2021,
155     0x88 => 0x02C6,
156     0x89 => 0x2030,
157     0x8A => 0x0160,
158     0x8B => 0x2039,
159     0x8C => 0x0152,
160     0x8D => 0xFFFD,
161     0x8E => 0x017D,
162     0x8F => 0xFFFD,
163     0x90 => 0xFFFD,
164     0x91 => 0x2018,
165     0x92 => 0x2019,
166     0x93 => 0x201C,
167     0x94 => 0x201D,
168     0x95 => 0x2022,
169     0x96 => 0x2013,
170     0x97 => 0x2014,
171     0x98 => 0x02DC,
172     0x99 => 0x2122,
173     0x9A => 0x0161,
174     0x9B => 0x203A,
175     0x9C => 0x0153,
176     0x9D => 0xFFFD,
177     0x9E => 0x017E,
178     0x9F => 0x0178,
179     }; # $charref_map
180     $charref_map->{$_} = 0xFFFD
181     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
182     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
183     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
184     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
185     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
186     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
187     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
188    
189     ## Implementations MUST act as if state machine in the spec
190    
191     sub _initialize_tokenizer ($) {
192     my $self = shift;
193    
194     ## NOTE: Fields set by |new| constructor:
195     #$self->{level}
196     #$self->{set_nc}
197     #$self->{parse_error}
198 wakaba 1.3 #$self->{is_xml} (if XML)
199 wakaba 1.1
200     $self->{state} = DATA_STATE; # MUST
201 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
202     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
203 wakaba 1.1 #$self->{entity__value}; # initialized when used
204     #$self->{entity__match}; # initialized when used
205     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
206     undef $self->{ct}; # current token
207     undef $self->{ca}; # current attribute
208     undef $self->{last_stag_name}; # last emitted start tag name
209     #$self->{prev_state}; # initialized when used
210     delete $self->{self_closing};
211     $self->{char_buffer} = '';
212     $self->{char_buffer_pos} = 0;
213     $self->{nc} = -1; # next input character
214     #$self->{next_nc}
215    
216     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
217     $self->{line_prev} = $self->{line};
218     $self->{column_prev} = $self->{column};
219     $self->{column}++;
220     $self->{nc}
221     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
222     } else {
223     $self->{set_nc}->($self);
224     }
225    
226     $self->{token} = [];
227     # $self->{escape}
228     } # _initialize_tokenizer
229    
230     ## A token has:
231     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
232 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
233 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
234     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
235 wakaba 1.11 ## ->{target} (PI_TOKEN)
236 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
237     ## ->{sysid} (DOCTYPE_TOKEN)
238     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
239     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
240     ## ->{name}
241     ## ->{value}
242     ## ->{has_reference} == 1 or 0
243 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
244     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
245 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
246 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
247 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
248    
249 wakaba 1.1 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
250     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
251     ## while the token is pushed back to the stack.
252    
253     ## Emitted token MUST immediately be handled by the tree construction state.
254    
255     ## Before each step, UA MAY check to see if either one of the scripts in
256     ## "list of scripts that will execute as soon as possible" or the first
257     ## script in the "list of scripts that will execute asynchronously",
258     ## has completed loading. If one has, then it MUST be executed
259     ## and removed from the list.
260    
261     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
262     ## (This requirement was dropped from HTML5 spec, unfortunately.)
263    
264     my $is_space = {
265     0x0009 => 1, # CHARACTER TABULATION (HT)
266     0x000A => 1, # LINE FEED (LF)
267     #0x000B => 0, # LINE TABULATION (VT)
268 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
269 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
270     0x0020 => 1, # SPACE (SP)
271     };
272    
273     sub _get_next_token ($) {
274     my $self = shift;
275    
276     if ($self->{self_closing}) {
277     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
278     ## NOTE: The |self_closing| flag is only set by start tag token.
279     ## In addition, when a start tag token is emitted, it is always set to
280     ## |ct|.
281     delete $self->{self_closing};
282     }
283    
284     if (@{$self->{token}}) {
285     $self->{self_closing} = $self->{token}->[0]->{self_closing};
286     return shift @{$self->{token}};
287     }
288    
289     A: {
290     if ($self->{state} == PCDATA_STATE) {
291     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
292    
293     if ($self->{nc} == 0x0026) { # &
294    
295     ## NOTE: In the spec, the tokenizer is switched to the
296     ## "entity data state". In this implementation, the tokenizer
297     ## is switched to the |ENTITY_STATE|, which is an implementation
298     ## of the "consume a character reference" algorithm.
299     $self->{entity_add} = -1;
300     $self->{prev_state} = DATA_STATE;
301     $self->{state} = ENTITY_STATE;
302    
303     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
304     $self->{line_prev} = $self->{line};
305     $self->{column_prev} = $self->{column};
306     $self->{column}++;
307     $self->{nc}
308     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
309     } else {
310     $self->{set_nc}->($self);
311     }
312    
313     redo A;
314     } elsif ($self->{nc} == 0x003C) { # <
315    
316     $self->{state} = TAG_OPEN_STATE;
317    
318     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
319     $self->{line_prev} = $self->{line};
320     $self->{column_prev} = $self->{column};
321     $self->{column}++;
322     $self->{nc}
323     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
324     } else {
325     $self->{set_nc}->($self);
326     }
327    
328     redo A;
329     } elsif ($self->{nc} == -1) {
330    
331     return ({type => END_OF_FILE_TOKEN,
332     line => $self->{line}, column => $self->{column}});
333     last A; ## TODO: ok?
334     } else {
335    
336     #
337     }
338    
339     # Anything else
340     my $token = {type => CHARACTER_TOKEN,
341     data => chr $self->{nc},
342     line => $self->{line}, column => $self->{column},
343     };
344     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
345    
346     ## Stay in the state.
347    
348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
349     $self->{line_prev} = $self->{line};
350     $self->{column_prev} = $self->{column};
351     $self->{column}++;
352     $self->{nc}
353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
354     } else {
355     $self->{set_nc}->($self);
356     }
357    
358     return ($token);
359     redo A;
360     } elsif ($self->{state} == DATA_STATE) {
361     $self->{s_kwd} = '' unless defined $self->{s_kwd};
362     if ($self->{nc} == 0x0026) { # &
363     $self->{s_kwd} = '';
364     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
365     not $self->{escape}) {
366    
367     ## NOTE: In the spec, the tokenizer is switched to the
368     ## "entity data state". In this implementation, the tokenizer
369     ## is switched to the |ENTITY_STATE|, which is an implementation
370     ## of the "consume a character reference" algorithm.
371     $self->{entity_add} = -1;
372     $self->{prev_state} = DATA_STATE;
373     $self->{state} = ENTITY_STATE;
374    
375     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
376     $self->{line_prev} = $self->{line};
377     $self->{column_prev} = $self->{column};
378     $self->{column}++;
379     $self->{nc}
380     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
381     } else {
382     $self->{set_nc}->($self);
383     }
384    
385     redo A;
386     } else {
387    
388     #
389     }
390     } elsif ($self->{nc} == 0x002D) { # -
391     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
392 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
393 wakaba 1.1
394     $self->{escape} = 1; # unless $self->{escape};
395     $self->{s_kwd} = '--';
396     #
397 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
398 wakaba 1.1
399     $self->{s_kwd} = '--';
400     #
401 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
402    
403     $self->{s_kwd} .= '-';
404     #
405 wakaba 1.1 } else {
406    
407 wakaba 1.5 $self->{s_kwd} = '-';
408 wakaba 1.1 #
409     }
410     }
411    
412     #
413     } elsif ($self->{nc} == 0x0021) { # !
414     if (length $self->{s_kwd}) {
415    
416     $self->{s_kwd} .= '!';
417     #
418     } else {
419    
420     #$self->{s_kwd} = '';
421     #
422     }
423     #
424     } elsif ($self->{nc} == 0x003C) { # <
425     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
426     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
427     not $self->{escape})) {
428    
429     $self->{state} = TAG_OPEN_STATE;
430    
431     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
432     $self->{line_prev} = $self->{line};
433     $self->{column_prev} = $self->{column};
434     $self->{column}++;
435     $self->{nc}
436     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
437     } else {
438     $self->{set_nc}->($self);
439     }
440    
441     redo A;
442     } else {
443    
444     $self->{s_kwd} = '';
445     #
446     }
447     } elsif ($self->{nc} == 0x003E) { # >
448     if ($self->{escape} and
449     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
450     if ($self->{s_kwd} eq '--') {
451    
452     delete $self->{escape};
453 wakaba 1.5 #
454 wakaba 1.1 } else {
455    
456 wakaba 1.5 #
457 wakaba 1.1 }
458 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
459    
460     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
461     line => $self->{line_prev},
462     column => $self->{column_prev} - 1);
463     #
464 wakaba 1.1 } else {
465    
466 wakaba 1.5 #
467 wakaba 1.1 }
468    
469     $self->{s_kwd} = '';
470     #
471 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
472     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
473    
474     $self->{s_kwd} .= ']';
475     } elsif ($self->{s_kwd} eq ']]') {
476    
477     #
478     } else {
479    
480     $self->{s_kwd} = '';
481     }
482     #
483 wakaba 1.1 } elsif ($self->{nc} == -1) {
484    
485     $self->{s_kwd} = '';
486     return ({type => END_OF_FILE_TOKEN,
487     line => $self->{line}, column => $self->{column}});
488     last A; ## TODO: ok?
489     } else {
490    
491     $self->{s_kwd} = '';
492     #
493     }
494    
495     # Anything else
496     my $token = {type => CHARACTER_TOKEN,
497     data => chr $self->{nc},
498     line => $self->{line}, column => $self->{column},
499     };
500 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
501 wakaba 1.1 length $token->{data})) {
502     $self->{s_kwd} = '';
503     }
504    
505     ## Stay in the data state.
506 wakaba 1.5 if (not $self->{is_xml} and
507     $self->{content_model} == PCDATA_CONTENT_MODEL) {
508 wakaba 1.1
509     $self->{state} = PCDATA_STATE;
510     } else {
511    
512     ## Stay in the state.
513     }
514    
515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
516     $self->{line_prev} = $self->{line};
517     $self->{column_prev} = $self->{column};
518     $self->{column}++;
519     $self->{nc}
520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
521     } else {
522     $self->{set_nc}->($self);
523     }
524    
525     return ($token);
526     redo A;
527     } elsif ($self->{state} == TAG_OPEN_STATE) {
528 wakaba 1.10 ## XML5: "tag state".
529    
530 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
531     if ($self->{nc} == 0x002F) { # /
532    
533    
534     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
535     $self->{line_prev} = $self->{line};
536     $self->{column_prev} = $self->{column};
537     $self->{column}++;
538     $self->{nc}
539     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
540     } else {
541     $self->{set_nc}->($self);
542     }
543    
544     $self->{state} = CLOSE_TAG_OPEN_STATE;
545     redo A;
546     } elsif ($self->{nc} == 0x0021) { # !
547    
548 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
549 wakaba 1.1 #
550     } else {
551    
552 wakaba 1.12 $self->{s_kwd} = '';
553 wakaba 1.1 #
554     }
555    
556     ## reconsume
557     $self->{state} = DATA_STATE;
558     return ({type => CHARACTER_TOKEN, data => '<',
559     line => $self->{line_prev},
560     column => $self->{column_prev},
561     });
562     redo A;
563     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
564     if ($self->{nc} == 0x0021) { # !
565    
566     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
567    
568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
569     $self->{line_prev} = $self->{line};
570     $self->{column_prev} = $self->{column};
571     $self->{column}++;
572     $self->{nc}
573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
574     } else {
575     $self->{set_nc}->($self);
576     }
577    
578     redo A;
579     } elsif ($self->{nc} == 0x002F) { # /
580    
581     $self->{state} = CLOSE_TAG_OPEN_STATE;
582    
583     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
584     $self->{line_prev} = $self->{line};
585     $self->{column_prev} = $self->{column};
586     $self->{column}++;
587     $self->{nc}
588     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
589     } else {
590     $self->{set_nc}->($self);
591     }
592    
593     redo A;
594     } elsif (0x0041 <= $self->{nc} and
595     $self->{nc} <= 0x005A) { # A..Z
596    
597     $self->{ct}
598     = {type => START_TAG_TOKEN,
599 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
600 wakaba 1.1 line => $self->{line_prev},
601     column => $self->{column_prev}};
602     $self->{state} = TAG_NAME_STATE;
603    
604     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
605     $self->{line_prev} = $self->{line};
606     $self->{column_prev} = $self->{column};
607     $self->{column}++;
608     $self->{nc}
609     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
610     } else {
611     $self->{set_nc}->($self);
612     }
613    
614     redo A;
615     } elsif (0x0061 <= $self->{nc} and
616     $self->{nc} <= 0x007A) { # a..z
617    
618     $self->{ct} = {type => START_TAG_TOKEN,
619     tag_name => chr ($self->{nc}),
620     line => $self->{line_prev},
621     column => $self->{column_prev}};
622     $self->{state} = TAG_NAME_STATE;
623    
624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
625     $self->{line_prev} = $self->{line};
626     $self->{column_prev} = $self->{column};
627     $self->{column}++;
628     $self->{nc}
629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
630     } else {
631     $self->{set_nc}->($self);
632     }
633    
634     redo A;
635     } elsif ($self->{nc} == 0x003E) { # >
636    
637     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
638     line => $self->{line_prev},
639     column => $self->{column_prev});
640     $self->{state} = DATA_STATE;
641 wakaba 1.5 $self->{s_kwd} = '';
642 wakaba 1.1
643     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
644     $self->{line_prev} = $self->{line};
645     $self->{column_prev} = $self->{column};
646     $self->{column}++;
647     $self->{nc}
648     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
649     } else {
650     $self->{set_nc}->($self);
651     }
652    
653    
654     return ({type => CHARACTER_TOKEN, data => '<>',
655     line => $self->{line_prev},
656     column => $self->{column_prev},
657     });
658    
659     redo A;
660     } elsif ($self->{nc} == 0x003F) { # ?
661 wakaba 1.8 if ($self->{is_xml}) {
662    
663     $self->{state} = PI_STATE;
664    
665     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666     $self->{line_prev} = $self->{line};
667     $self->{column_prev} = $self->{column};
668     $self->{column}++;
669     $self->{nc}
670     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671     } else {
672     $self->{set_nc}->($self);
673     }
674    
675     redo A;
676     } else {
677    
678     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
679     line => $self->{line_prev},
680     column => $self->{column_prev});
681     $self->{state} = BOGUS_COMMENT_STATE;
682     $self->{ct} = {type => COMMENT_TOKEN, data => '',
683     line => $self->{line_prev},
684     column => $self->{column_prev},
685     };
686     ## $self->{nc} is intentionally left as is
687     redo A;
688     }
689 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
690 wakaba 1.1
691     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
692     line => $self->{line_prev},
693     column => $self->{column_prev});
694     $self->{state} = DATA_STATE;
695 wakaba 1.5 $self->{s_kwd} = '';
696 wakaba 1.1 ## reconsume
697    
698     return ({type => CHARACTER_TOKEN, data => '<',
699     line => $self->{line_prev},
700     column => $self->{column_prev},
701     });
702    
703     redo A;
704 wakaba 1.9 } else {
705     ## XML5: "<:" is a parse error.
706    
707     $self->{ct} = {type => START_TAG_TOKEN,
708     tag_name => chr ($self->{nc}),
709     line => $self->{line_prev},
710     column => $self->{column_prev}};
711     $self->{state} = TAG_NAME_STATE;
712    
713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
714     $self->{line_prev} = $self->{line};
715     $self->{column_prev} = $self->{column};
716     $self->{column}++;
717     $self->{nc}
718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
719     } else {
720     $self->{set_nc}->($self);
721     }
722    
723     redo A;
724 wakaba 1.1 }
725     } else {
726     die "$0: $self->{content_model} in tag open";
727     }
728     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
729     ## NOTE: The "close tag open state" in the spec is implemented as
730     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
731    
732 wakaba 1.10 ## XML5: "end tag state".
733    
734 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
735     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
736     if (defined $self->{last_stag_name}) {
737     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
738 wakaba 1.12 $self->{kwd} = '';
739 wakaba 1.1 ## Reconsume.
740     redo A;
741     } else {
742     ## No start tag token has ever been emitted
743     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
744    
745     $self->{state} = DATA_STATE;
746 wakaba 1.5 $self->{s_kwd} = '';
747 wakaba 1.1 ## Reconsume.
748     return ({type => CHARACTER_TOKEN, data => '</',
749     line => $l, column => $c,
750     });
751     redo A;
752     }
753     }
754    
755     if (0x0041 <= $self->{nc} and
756     $self->{nc} <= 0x005A) { # A..Z
757    
758     $self->{ct}
759     = {type => END_TAG_TOKEN,
760 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
761 wakaba 1.1 line => $l, column => $c};
762     $self->{state} = TAG_NAME_STATE;
763    
764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
765     $self->{line_prev} = $self->{line};
766     $self->{column_prev} = $self->{column};
767     $self->{column}++;
768     $self->{nc}
769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
770     } else {
771     $self->{set_nc}->($self);
772     }
773    
774     redo A;
775     } elsif (0x0061 <= $self->{nc} and
776     $self->{nc} <= 0x007A) { # a..z
777    
778     $self->{ct} = {type => END_TAG_TOKEN,
779     tag_name => chr ($self->{nc}),
780     line => $l, column => $c};
781     $self->{state} = TAG_NAME_STATE;
782    
783     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
784     $self->{line_prev} = $self->{line};
785     $self->{column_prev} = $self->{column};
786     $self->{column}++;
787     $self->{nc}
788     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
789     } else {
790     $self->{set_nc}->($self);
791     }
792    
793     redo A;
794     } elsif ($self->{nc} == 0x003E) { # >
795     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
796     line => $self->{line_prev}, ## "<" in "</>"
797     column => $self->{column_prev} - 1);
798     $self->{state} = DATA_STATE;
799 wakaba 1.5 $self->{s_kwd} = '';
800 wakaba 1.10 if ($self->{is_xml}) {
801    
802     ## XML5: No parse error.
803    
804     ## NOTE: This parser raises a parse error, since it supports
805     ## XML1, not XML5.
806    
807     ## NOTE: A short end tag token.
808     my $ct = {type => END_TAG_TOKEN,
809     tag_name => '',
810     line => $self->{line_prev},
811     column => $self->{column_prev} - 1,
812     };
813    
814     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
815     $self->{line_prev} = $self->{line};
816     $self->{column_prev} = $self->{column};
817     $self->{column}++;
818     $self->{nc}
819     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
820     } else {
821     $self->{set_nc}->($self);
822     }
823    
824     return ($ct);
825     } else {
826    
827    
828 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
829     $self->{line_prev} = $self->{line};
830     $self->{column_prev} = $self->{column};
831     $self->{column}++;
832     $self->{nc}
833     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
834     } else {
835     $self->{set_nc}->($self);
836     }
837    
838 wakaba 1.10 }
839 wakaba 1.1 redo A;
840     } elsif ($self->{nc} == -1) {
841    
842     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
843 wakaba 1.5 $self->{s_kwd} = '';
844 wakaba 1.1 $self->{state} = DATA_STATE;
845     # reconsume
846    
847     return ({type => CHARACTER_TOKEN, data => '</',
848     line => $l, column => $c,
849     });
850    
851     redo A;
852 wakaba 1.10 } elsif (not $self->{is_xml} or
853     $is_space->{$self->{nc}}) {
854 wakaba 1.1
855 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
856     line => $self->{line_prev}, # "<" of "</"
857     column => $self->{column_prev} - 1);
858 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
859     $self->{ct} = {type => COMMENT_TOKEN, data => '',
860     line => $self->{line_prev}, # "<" of "</"
861     column => $self->{column_prev} - 1,
862     };
863     ## NOTE: $self->{nc} is intentionally left as is.
864     ## Although the "anything else" case of the spec not explicitly
865     ## states that the next input character is to be reconsumed,
866     ## it will be included to the |data| of the comment token
867     ## generated from the bogus end tag, as defined in the
868     ## "bogus comment state" entry.
869     redo A;
870 wakaba 1.10 } else {
871     ## XML5: "</:" is a parse error.
872    
873     $self->{ct} = {type => END_TAG_TOKEN,
874     tag_name => chr ($self->{nc}),
875     line => $l, column => $c};
876     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
877    
878     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
879     $self->{line_prev} = $self->{line};
880     $self->{column_prev} = $self->{column};
881     $self->{column}++;
882     $self->{nc}
883     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
884     } else {
885     $self->{set_nc}->($self);
886     }
887    
888     redo A;
889 wakaba 1.1 }
890     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
891 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
892 wakaba 1.1 if (length $ch) {
893     my $CH = $ch;
894     $ch =~ tr/a-z/A-Z/;
895     my $nch = chr $self->{nc};
896     if ($nch eq $ch or $nch eq $CH) {
897    
898     ## Stay in the state.
899 wakaba 1.12 $self->{kwd} .= $nch;
900 wakaba 1.1
901     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
902     $self->{line_prev} = $self->{line};
903     $self->{column_prev} = $self->{column};
904     $self->{column}++;
905     $self->{nc}
906     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
907     } else {
908     $self->{set_nc}->($self);
909     }
910    
911     redo A;
912     } else {
913    
914     $self->{state} = DATA_STATE;
915 wakaba 1.5 $self->{s_kwd} = '';
916 wakaba 1.1 ## Reconsume.
917     return ({type => CHARACTER_TOKEN,
918 wakaba 1.12 data => '</' . $self->{kwd},
919 wakaba 1.1 line => $self->{line_prev},
920 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
921 wakaba 1.1 });
922     redo A;
923     }
924     } else { # after "<{tag-name}"
925     unless ($is_space->{$self->{nc}} or
926     {
927     0x003E => 1, # >
928     0x002F => 1, # /
929     -1 => 1, # EOF
930     }->{$self->{nc}}) {
931    
932     ## Reconsume.
933     $self->{state} = DATA_STATE;
934 wakaba 1.5 $self->{s_kwd} = '';
935 wakaba 1.1 return ({type => CHARACTER_TOKEN,
936 wakaba 1.12 data => '</' . $self->{kwd},
937 wakaba 1.1 line => $self->{line_prev},
938 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
939 wakaba 1.1 });
940     redo A;
941     } else {
942    
943     $self->{ct}
944     = {type => END_TAG_TOKEN,
945     tag_name => $self->{last_stag_name},
946     line => $self->{line_prev},
947 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
948 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
949     ## Reconsume.
950     redo A;
951     }
952     }
953     } elsif ($self->{state} == TAG_NAME_STATE) {
954     if ($is_space->{$self->{nc}}) {
955    
956     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
957    
958     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
959     $self->{line_prev} = $self->{line};
960     $self->{column_prev} = $self->{column};
961     $self->{column}++;
962     $self->{nc}
963     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
964     } else {
965     $self->{set_nc}->($self);
966     }
967    
968     redo A;
969     } elsif ($self->{nc} == 0x003E) { # >
970     if ($self->{ct}->{type} == START_TAG_TOKEN) {
971    
972     $self->{last_stag_name} = $self->{ct}->{tag_name};
973     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
974     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
975     #if ($self->{ct}->{attributes}) {
976     # ## NOTE: This should never be reached.
977     # !!! cp (36);
978     # !!! parse-error (type => 'end tag attribute');
979     #} else {
980    
981     #}
982     } else {
983     die "$0: $self->{ct}->{type}: Unknown token type";
984     }
985     $self->{state} = DATA_STATE;
986 wakaba 1.5 $self->{s_kwd} = '';
987 wakaba 1.1
988     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
989     $self->{line_prev} = $self->{line};
990     $self->{column_prev} = $self->{column};
991     $self->{column}++;
992     $self->{nc}
993     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
994     } else {
995     $self->{set_nc}->($self);
996     }
997    
998    
999     return ($self->{ct}); # start tag or end tag
1000    
1001     redo A;
1002     } elsif (0x0041 <= $self->{nc} and
1003     $self->{nc} <= 0x005A) { # A..Z
1004    
1005 wakaba 1.4 $self->{ct}->{tag_name}
1006     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1007 wakaba 1.1 # start tag or end tag
1008     ## Stay in this state
1009    
1010     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1011     $self->{line_prev} = $self->{line};
1012     $self->{column_prev} = $self->{column};
1013     $self->{column}++;
1014     $self->{nc}
1015     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1016     } else {
1017     $self->{set_nc}->($self);
1018     }
1019    
1020     redo A;
1021     } elsif ($self->{nc} == -1) {
1022     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1023     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1024    
1025     $self->{last_stag_name} = $self->{ct}->{tag_name};
1026     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1027     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1028     #if ($self->{ct}->{attributes}) {
1029     # ## NOTE: This state should never be reached.
1030     # !!! cp (40);
1031     # !!! parse-error (type => 'end tag attribute');
1032     #} else {
1033    
1034     #}
1035     } else {
1036     die "$0: $self->{ct}->{type}: Unknown token type";
1037     }
1038     $self->{state} = DATA_STATE;
1039 wakaba 1.5 $self->{s_kwd} = '';
1040 wakaba 1.1 # reconsume
1041    
1042     return ($self->{ct}); # start tag or end tag
1043    
1044     redo A;
1045     } elsif ($self->{nc} == 0x002F) { # /
1046    
1047     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1048    
1049     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050     $self->{line_prev} = $self->{line};
1051     $self->{column_prev} = $self->{column};
1052     $self->{column}++;
1053     $self->{nc}
1054     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055     } else {
1056     $self->{set_nc}->($self);
1057     }
1058    
1059     redo A;
1060     } else {
1061    
1062     $self->{ct}->{tag_name} .= chr $self->{nc};
1063     # start tag or end tag
1064     ## Stay in the state
1065    
1066     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1067     $self->{line_prev} = $self->{line};
1068     $self->{column_prev} = $self->{column};
1069     $self->{column}++;
1070     $self->{nc}
1071     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1072     } else {
1073     $self->{set_nc}->($self);
1074     }
1075    
1076     redo A;
1077     }
1078     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1079 wakaba 1.11 ## XML5: "Tag attribute name before state".
1080    
1081 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1082    
1083     ## Stay in the state
1084    
1085     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1086     $self->{line_prev} = $self->{line};
1087     $self->{column_prev} = $self->{column};
1088     $self->{column}++;
1089     $self->{nc}
1090     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1091     } else {
1092     $self->{set_nc}->($self);
1093     }
1094    
1095     redo A;
1096     } elsif ($self->{nc} == 0x003E) { # >
1097     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1098    
1099     $self->{last_stag_name} = $self->{ct}->{tag_name};
1100     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1101     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1102     if ($self->{ct}->{attributes}) {
1103    
1104     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1105     } else {
1106    
1107     }
1108     } else {
1109     die "$0: $self->{ct}->{type}: Unknown token type";
1110     }
1111     $self->{state} = DATA_STATE;
1112 wakaba 1.5 $self->{s_kwd} = '';
1113 wakaba 1.1
1114     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1115     $self->{line_prev} = $self->{line};
1116     $self->{column_prev} = $self->{column};
1117     $self->{column}++;
1118     $self->{nc}
1119     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1120     } else {
1121     $self->{set_nc}->($self);
1122     }
1123    
1124    
1125     return ($self->{ct}); # start tag or end tag
1126    
1127     redo A;
1128     } elsif (0x0041 <= $self->{nc} and
1129     $self->{nc} <= 0x005A) { # A..Z
1130    
1131     $self->{ca}
1132 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1133 wakaba 1.1 value => '',
1134     line => $self->{line}, column => $self->{column}};
1135     $self->{state} = ATTRIBUTE_NAME_STATE;
1136    
1137     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1138     $self->{line_prev} = $self->{line};
1139     $self->{column_prev} = $self->{column};
1140     $self->{column}++;
1141     $self->{nc}
1142     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1143     } else {
1144     $self->{set_nc}->($self);
1145     }
1146    
1147     redo A;
1148     } elsif ($self->{nc} == 0x002F) { # /
1149    
1150     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1151    
1152     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1153     $self->{line_prev} = $self->{line};
1154     $self->{column_prev} = $self->{column};
1155     $self->{column}++;
1156     $self->{nc}
1157     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1158     } else {
1159     $self->{set_nc}->($self);
1160     }
1161    
1162     redo A;
1163     } elsif ($self->{nc} == -1) {
1164     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1165     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1166    
1167     $self->{last_stag_name} = $self->{ct}->{tag_name};
1168     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1169     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1170     if ($self->{ct}->{attributes}) {
1171    
1172     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1173     } else {
1174    
1175     }
1176     } else {
1177     die "$0: $self->{ct}->{type}: Unknown token type";
1178     }
1179     $self->{state} = DATA_STATE;
1180 wakaba 1.5 $self->{s_kwd} = '';
1181 wakaba 1.1 # reconsume
1182    
1183     return ($self->{ct}); # start tag or end tag
1184    
1185     redo A;
1186     } else {
1187     if ({
1188     0x0022 => 1, # "
1189     0x0027 => 1, # '
1190     0x003D => 1, # =
1191     }->{$self->{nc}}) {
1192    
1193 wakaba 1.11 ## XML5: Not a parse error.
1194 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1195     } else {
1196    
1197 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1198 wakaba 1.1 }
1199     $self->{ca}
1200     = {name => chr ($self->{nc}),
1201     value => '',
1202     line => $self->{line}, column => $self->{column}};
1203     $self->{state} = ATTRIBUTE_NAME_STATE;
1204    
1205     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1206     $self->{line_prev} = $self->{line};
1207     $self->{column_prev} = $self->{column};
1208     $self->{column}++;
1209     $self->{nc}
1210     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1211     } else {
1212     $self->{set_nc}->($self);
1213     }
1214    
1215     redo A;
1216     }
1217     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1218 wakaba 1.11 ## XML5: "Tag attribute name state".
1219    
1220 wakaba 1.1 my $before_leave = sub {
1221     if (exists $self->{ct}->{attributes} # start tag or end tag
1222     ->{$self->{ca}->{name}}) { # MUST
1223    
1224     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1225     ## Discard $self->{ca} # MUST
1226     } else {
1227    
1228     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1229     = $self->{ca};
1230 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1231 wakaba 1.1 }
1232     }; # $before_leave
1233    
1234     if ($is_space->{$self->{nc}}) {
1235    
1236     $before_leave->();
1237     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1238    
1239     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1240     $self->{line_prev} = $self->{line};
1241     $self->{column_prev} = $self->{column};
1242     $self->{column}++;
1243     $self->{nc}
1244     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1245     } else {
1246     $self->{set_nc}->($self);
1247     }
1248    
1249     redo A;
1250     } elsif ($self->{nc} == 0x003D) { # =
1251    
1252     $before_leave->();
1253     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1254    
1255     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1256     $self->{line_prev} = $self->{line};
1257     $self->{column_prev} = $self->{column};
1258     $self->{column}++;
1259     $self->{nc}
1260     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1261     } else {
1262     $self->{set_nc}->($self);
1263     }
1264    
1265     redo A;
1266     } elsif ($self->{nc} == 0x003E) { # >
1267 wakaba 1.11 if ($self->{is_xml}) {
1268    
1269     ## XML5: Not a parse error.
1270     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1271     } else {
1272    
1273     }
1274    
1275 wakaba 1.1 $before_leave->();
1276     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1277    
1278     $self->{last_stag_name} = $self->{ct}->{tag_name};
1279     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1280    
1281     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1282     if ($self->{ct}->{attributes}) {
1283     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1284     }
1285     } else {
1286     die "$0: $self->{ct}->{type}: Unknown token type";
1287     }
1288     $self->{state} = DATA_STATE;
1289 wakaba 1.5 $self->{s_kwd} = '';
1290 wakaba 1.1
1291     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1292     $self->{line_prev} = $self->{line};
1293     $self->{column_prev} = $self->{column};
1294     $self->{column}++;
1295     $self->{nc}
1296     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1297     } else {
1298     $self->{set_nc}->($self);
1299     }
1300    
1301    
1302     return ($self->{ct}); # start tag or end tag
1303    
1304     redo A;
1305     } elsif (0x0041 <= $self->{nc} and
1306     $self->{nc} <= 0x005A) { # A..Z
1307    
1308 wakaba 1.4 $self->{ca}->{name}
1309     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1310 wakaba 1.1 ## Stay in the state
1311    
1312     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1313     $self->{line_prev} = $self->{line};
1314     $self->{column_prev} = $self->{column};
1315     $self->{column}++;
1316     $self->{nc}
1317     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1318     } else {
1319     $self->{set_nc}->($self);
1320     }
1321    
1322     redo A;
1323     } elsif ($self->{nc} == 0x002F) { # /
1324 wakaba 1.11 if ($self->{is_xml}) {
1325    
1326     ## XML5: Not a parse error.
1327     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1328     } else {
1329    
1330     }
1331 wakaba 1.1
1332     $before_leave->();
1333     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1334    
1335     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1336     $self->{line_prev} = $self->{line};
1337     $self->{column_prev} = $self->{column};
1338     $self->{column}++;
1339     $self->{nc}
1340     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1341     } else {
1342     $self->{set_nc}->($self);
1343     }
1344    
1345     redo A;
1346     } elsif ($self->{nc} == -1) {
1347     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1348     $before_leave->();
1349     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1350    
1351     $self->{last_stag_name} = $self->{ct}->{tag_name};
1352     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1353     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1354     if ($self->{ct}->{attributes}) {
1355    
1356     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1357     } else {
1358     ## NOTE: This state should never be reached.
1359    
1360     }
1361     } else {
1362     die "$0: $self->{ct}->{type}: Unknown token type";
1363     }
1364     $self->{state} = DATA_STATE;
1365 wakaba 1.5 $self->{s_kwd} = '';
1366 wakaba 1.1 # reconsume
1367    
1368     return ($self->{ct}); # start tag or end tag
1369    
1370     redo A;
1371     } else {
1372     if ($self->{nc} == 0x0022 or # "
1373     $self->{nc} == 0x0027) { # '
1374    
1375 wakaba 1.11 ## XML5: Not a parse error.
1376 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1377     } else {
1378    
1379     }
1380     $self->{ca}->{name} .= chr ($self->{nc});
1381     ## Stay in the state
1382    
1383     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1384     $self->{line_prev} = $self->{line};
1385     $self->{column_prev} = $self->{column};
1386     $self->{column}++;
1387     $self->{nc}
1388     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1389     } else {
1390     $self->{set_nc}->($self);
1391     }
1392    
1393     redo A;
1394     }
1395     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1396 wakaba 1.11 ## XML5: "Tag attribute name after state".
1397    
1398 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1399    
1400     ## Stay in the state
1401    
1402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1403     $self->{line_prev} = $self->{line};
1404     $self->{column_prev} = $self->{column};
1405     $self->{column}++;
1406     $self->{nc}
1407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1408     } else {
1409     $self->{set_nc}->($self);
1410     }
1411    
1412     redo A;
1413     } elsif ($self->{nc} == 0x003D) { # =
1414    
1415     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1416    
1417     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1418     $self->{line_prev} = $self->{line};
1419     $self->{column_prev} = $self->{column};
1420     $self->{column}++;
1421     $self->{nc}
1422     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1423     } else {
1424     $self->{set_nc}->($self);
1425     }
1426    
1427     redo A;
1428     } elsif ($self->{nc} == 0x003E) { # >
1429 wakaba 1.11 if ($self->{is_xml}) {
1430    
1431     ## XML5: Not a parse error.
1432     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1433     } else {
1434    
1435     }
1436    
1437 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1438    
1439     $self->{last_stag_name} = $self->{ct}->{tag_name};
1440     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1441     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1442     if ($self->{ct}->{attributes}) {
1443    
1444     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1445     } else {
1446     ## NOTE: This state should never be reached.
1447    
1448     }
1449     } else {
1450     die "$0: $self->{ct}->{type}: Unknown token type";
1451     }
1452     $self->{state} = DATA_STATE;
1453 wakaba 1.5 $self->{s_kwd} = '';
1454 wakaba 1.1
1455     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1456     $self->{line_prev} = $self->{line};
1457     $self->{column_prev} = $self->{column};
1458     $self->{column}++;
1459     $self->{nc}
1460     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1461     } else {
1462     $self->{set_nc}->($self);
1463     }
1464    
1465    
1466     return ($self->{ct}); # start tag or end tag
1467    
1468     redo A;
1469     } elsif (0x0041 <= $self->{nc} and
1470     $self->{nc} <= 0x005A) { # A..Z
1471    
1472     $self->{ca}
1473 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1474 wakaba 1.1 value => '',
1475     line => $self->{line}, column => $self->{column}};
1476     $self->{state} = ATTRIBUTE_NAME_STATE;
1477    
1478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479     $self->{line_prev} = $self->{line};
1480     $self->{column_prev} = $self->{column};
1481     $self->{column}++;
1482     $self->{nc}
1483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484     } else {
1485     $self->{set_nc}->($self);
1486     }
1487    
1488     redo A;
1489     } elsif ($self->{nc} == 0x002F) { # /
1490 wakaba 1.11 if ($self->{is_xml}) {
1491    
1492     ## XML5: Not a parse error.
1493     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494     } else {
1495    
1496     }
1497 wakaba 1.1
1498     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1499    
1500     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1501     $self->{line_prev} = $self->{line};
1502     $self->{column_prev} = $self->{column};
1503     $self->{column}++;
1504     $self->{nc}
1505     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1506     } else {
1507     $self->{set_nc}->($self);
1508     }
1509    
1510     redo A;
1511     } elsif ($self->{nc} == -1) {
1512     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1513     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1514    
1515     $self->{last_stag_name} = $self->{ct}->{tag_name};
1516     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1517     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1518     if ($self->{ct}->{attributes}) {
1519    
1520     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1521     } else {
1522     ## NOTE: This state should never be reached.
1523    
1524     }
1525     } else {
1526     die "$0: $self->{ct}->{type}: Unknown token type";
1527     }
1528 wakaba 1.5 $self->{s_kwd} = '';
1529 wakaba 1.1 $self->{state} = DATA_STATE;
1530     # reconsume
1531    
1532     return ($self->{ct}); # start tag or end tag
1533    
1534     redo A;
1535     } else {
1536 wakaba 1.11 if ($self->{is_xml}) {
1537    
1538     ## XML5: Not a parse error.
1539     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1540     } else {
1541    
1542     }
1543    
1544 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1545     $self->{nc} == 0x0027) { # '
1546    
1547 wakaba 1.11 ## XML5: Not a parse error.
1548 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1549     } else {
1550    
1551     }
1552     $self->{ca}
1553     = {name => chr ($self->{nc}),
1554     value => '',
1555     line => $self->{line}, column => $self->{column}};
1556     $self->{state} = ATTRIBUTE_NAME_STATE;
1557    
1558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1559     $self->{line_prev} = $self->{line};
1560     $self->{column_prev} = $self->{column};
1561     $self->{column}++;
1562     $self->{nc}
1563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1564     } else {
1565     $self->{set_nc}->($self);
1566     }
1567    
1568     redo A;
1569     }
1570     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1571 wakaba 1.11 ## XML5: "Tag attribute value before state".
1572    
1573 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1574    
1575     ## Stay in the state
1576    
1577     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1578     $self->{line_prev} = $self->{line};
1579     $self->{column_prev} = $self->{column};
1580     $self->{column}++;
1581     $self->{nc}
1582     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1583     } else {
1584     $self->{set_nc}->($self);
1585     }
1586    
1587     redo A;
1588     } elsif ($self->{nc} == 0x0022) { # "
1589    
1590     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1591    
1592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1593     $self->{line_prev} = $self->{line};
1594     $self->{column_prev} = $self->{column};
1595     $self->{column}++;
1596     $self->{nc}
1597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1598     } else {
1599     $self->{set_nc}->($self);
1600     }
1601    
1602     redo A;
1603     } elsif ($self->{nc} == 0x0026) { # &
1604    
1605     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1606     ## reconsume
1607     redo A;
1608     } elsif ($self->{nc} == 0x0027) { # '
1609    
1610     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1611    
1612     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1613     $self->{line_prev} = $self->{line};
1614     $self->{column_prev} = $self->{column};
1615     $self->{column}++;
1616     $self->{nc}
1617     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1618     } else {
1619     $self->{set_nc}->($self);
1620     }
1621    
1622     redo A;
1623     } elsif ($self->{nc} == 0x003E) { # >
1624     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1625     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1626    
1627     $self->{last_stag_name} = $self->{ct}->{tag_name};
1628     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1629     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1630     if ($self->{ct}->{attributes}) {
1631    
1632     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1633     } else {
1634     ## NOTE: This state should never be reached.
1635    
1636     }
1637     } else {
1638     die "$0: $self->{ct}->{type}: Unknown token type";
1639     }
1640     $self->{state} = DATA_STATE;
1641 wakaba 1.5 $self->{s_kwd} = '';
1642 wakaba 1.1
1643     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1644     $self->{line_prev} = $self->{line};
1645     $self->{column_prev} = $self->{column};
1646     $self->{column}++;
1647     $self->{nc}
1648     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1649     } else {
1650     $self->{set_nc}->($self);
1651     }
1652    
1653    
1654     return ($self->{ct}); # start tag or end tag
1655    
1656     redo A;
1657     } elsif ($self->{nc} == -1) {
1658     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1659     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1660    
1661     $self->{last_stag_name} = $self->{ct}->{tag_name};
1662     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1663     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1664     if ($self->{ct}->{attributes}) {
1665    
1666     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1667     } else {
1668     ## NOTE: This state should never be reached.
1669    
1670     }
1671     } else {
1672     die "$0: $self->{ct}->{type}: Unknown token type";
1673     }
1674     $self->{state} = DATA_STATE;
1675 wakaba 1.5 $self->{s_kwd} = '';
1676 wakaba 1.1 ## reconsume
1677    
1678     return ($self->{ct}); # start tag or end tag
1679    
1680     redo A;
1681     } else {
1682     if ($self->{nc} == 0x003D) { # =
1683    
1684 wakaba 1.11 ## XML5: Not a parse error.
1685 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1686 wakaba 1.11 } elsif ($self->{is_xml}) {
1687    
1688     ## XML5: No parse error.
1689     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1690 wakaba 1.1 } else {
1691    
1692     }
1693     $self->{ca}->{value} .= chr ($self->{nc});
1694     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1695    
1696     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1697     $self->{line_prev} = $self->{line};
1698     $self->{column_prev} = $self->{column};
1699     $self->{column}++;
1700     $self->{nc}
1701     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1702     } else {
1703     $self->{set_nc}->($self);
1704     }
1705    
1706     redo A;
1707     }
1708     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1709 wakaba 1.11 ## XML5: "Tag attribute value double quoted state".
1710    
1711 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1712    
1713 wakaba 1.11 ## XML5: "Tag attribute name before state".
1714 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1715    
1716     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1717     $self->{line_prev} = $self->{line};
1718     $self->{column_prev} = $self->{column};
1719     $self->{column}++;
1720     $self->{nc}
1721     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1722     } else {
1723     $self->{set_nc}->($self);
1724     }
1725    
1726     redo A;
1727     } elsif ($self->{nc} == 0x0026) { # &
1728    
1729 wakaba 1.11 ## XML5: Not defined yet.
1730    
1731 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1732     ## "entity in attribute value state". In this implementation, the
1733     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1734     ## implementation of the "consume a character reference" algorithm.
1735     $self->{prev_state} = $self->{state};
1736     $self->{entity_add} = 0x0022; # "
1737     $self->{state} = ENTITY_STATE;
1738    
1739     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1740     $self->{line_prev} = $self->{line};
1741     $self->{column_prev} = $self->{column};
1742     $self->{column}++;
1743     $self->{nc}
1744     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1745     } else {
1746     $self->{set_nc}->($self);
1747     }
1748    
1749     redo A;
1750     } elsif ($self->{nc} == -1) {
1751     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1752     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1753    
1754     $self->{last_stag_name} = $self->{ct}->{tag_name};
1755     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1756     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1757     if ($self->{ct}->{attributes}) {
1758    
1759     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1760     } else {
1761     ## NOTE: This state should never be reached.
1762    
1763     }
1764     } else {
1765     die "$0: $self->{ct}->{type}: Unknown token type";
1766     }
1767     $self->{state} = DATA_STATE;
1768 wakaba 1.5 $self->{s_kwd} = '';
1769 wakaba 1.1 ## reconsume
1770    
1771     return ($self->{ct}); # start tag or end tag
1772    
1773     redo A;
1774     } else {
1775 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1776    
1777     ## XML5: Not a parse error.
1778     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1779     } else {
1780    
1781     }
1782 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1783     $self->{read_until}->($self->{ca}->{value},
1784 wakaba 1.11 q["&<],
1785 wakaba 1.1 length $self->{ca}->{value});
1786    
1787     ## Stay in the state
1788    
1789     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1790     $self->{line_prev} = $self->{line};
1791     $self->{column_prev} = $self->{column};
1792     $self->{column}++;
1793     $self->{nc}
1794     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1795     } else {
1796     $self->{set_nc}->($self);
1797     }
1798    
1799     redo A;
1800     }
1801     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1802 wakaba 1.11 ## XML5: "Tag attribute value single quoted state".
1803    
1804 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1805    
1806 wakaba 1.11 ## XML5: "Before attribute name state" (sic).
1807 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1808    
1809     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1810     $self->{line_prev} = $self->{line};
1811     $self->{column_prev} = $self->{column};
1812     $self->{column}++;
1813     $self->{nc}
1814     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1815     } else {
1816     $self->{set_nc}->($self);
1817     }
1818    
1819     redo A;
1820     } elsif ($self->{nc} == 0x0026) { # &
1821    
1822 wakaba 1.11 ## XML5: Not defined yet.
1823    
1824 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1825     ## "entity in attribute value state". In this implementation, the
1826     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1827     ## implementation of the "consume a character reference" algorithm.
1828     $self->{entity_add} = 0x0027; # '
1829     $self->{prev_state} = $self->{state};
1830     $self->{state} = ENTITY_STATE;
1831    
1832     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1833     $self->{line_prev} = $self->{line};
1834     $self->{column_prev} = $self->{column};
1835     $self->{column}++;
1836     $self->{nc}
1837     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1838     } else {
1839     $self->{set_nc}->($self);
1840     }
1841    
1842     redo A;
1843     } elsif ($self->{nc} == -1) {
1844     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1845     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1846    
1847     $self->{last_stag_name} = $self->{ct}->{tag_name};
1848     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1849     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1850     if ($self->{ct}->{attributes}) {
1851    
1852     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1853     } else {
1854     ## NOTE: This state should never be reached.
1855    
1856     }
1857     } else {
1858     die "$0: $self->{ct}->{type}: Unknown token type";
1859     }
1860     $self->{state} = DATA_STATE;
1861 wakaba 1.5 $self->{s_kwd} = '';
1862 wakaba 1.1 ## reconsume
1863    
1864     return ($self->{ct}); # start tag or end tag
1865    
1866     redo A;
1867     } else {
1868 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1869    
1870     ## XML5: Not a parse error.
1871     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1872     } else {
1873    
1874     }
1875 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1876     $self->{read_until}->($self->{ca}->{value},
1877 wakaba 1.11 q['&<],
1878 wakaba 1.1 length $self->{ca}->{value});
1879    
1880     ## Stay in the state
1881    
1882     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1883     $self->{line_prev} = $self->{line};
1884     $self->{column_prev} = $self->{column};
1885     $self->{column}++;
1886     $self->{nc}
1887     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1888     } else {
1889     $self->{set_nc}->($self);
1890     }
1891    
1892     redo A;
1893     }
1894     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1895 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1896    
1897 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1898    
1899 wakaba 1.11 ## XML5: "Tag attribute name before state".
1900 wakaba 1.1 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1901    
1902     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1903     $self->{line_prev} = $self->{line};
1904     $self->{column_prev} = $self->{column};
1905     $self->{column}++;
1906     $self->{nc}
1907     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1908     } else {
1909     $self->{set_nc}->($self);
1910     }
1911    
1912     redo A;
1913     } elsif ($self->{nc} == 0x0026) { # &
1914    
1915 wakaba 1.11
1916     ## XML5: Not defined yet.
1917    
1918 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1919     ## "entity in attribute value state". In this implementation, the
1920     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1921     ## implementation of the "consume a character reference" algorithm.
1922     $self->{entity_add} = -1;
1923     $self->{prev_state} = $self->{state};
1924     $self->{state} = ENTITY_STATE;
1925    
1926     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1927     $self->{line_prev} = $self->{line};
1928     $self->{column_prev} = $self->{column};
1929     $self->{column}++;
1930     $self->{nc}
1931     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1932     } else {
1933     $self->{set_nc}->($self);
1934     }
1935    
1936     redo A;
1937     } elsif ($self->{nc} == 0x003E) { # >
1938     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1939    
1940     $self->{last_stag_name} = $self->{ct}->{tag_name};
1941     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1942     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1943     if ($self->{ct}->{attributes}) {
1944    
1945     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1946     } else {
1947     ## NOTE: This state should never be reached.
1948    
1949     }
1950     } else {
1951     die "$0: $self->{ct}->{type}: Unknown token type";
1952     }
1953     $self->{state} = DATA_STATE;
1954 wakaba 1.5 $self->{s_kwd} = '';
1955 wakaba 1.1
1956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1957     $self->{line_prev} = $self->{line};
1958     $self->{column_prev} = $self->{column};
1959     $self->{column}++;
1960     $self->{nc}
1961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1962     } else {
1963     $self->{set_nc}->($self);
1964     }
1965    
1966    
1967     return ($self->{ct}); # start tag or end tag
1968    
1969     redo A;
1970     } elsif ($self->{nc} == -1) {
1971     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1972     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1973    
1974     $self->{last_stag_name} = $self->{ct}->{tag_name};
1975     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1976     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1977     if ($self->{ct}->{attributes}) {
1978    
1979     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1980     } else {
1981     ## NOTE: This state should never be reached.
1982    
1983     }
1984     } else {
1985     die "$0: $self->{ct}->{type}: Unknown token type";
1986     }
1987     $self->{state} = DATA_STATE;
1988 wakaba 1.5 $self->{s_kwd} = '';
1989 wakaba 1.1 ## reconsume
1990    
1991     return ($self->{ct}); # start tag or end tag
1992    
1993     redo A;
1994     } else {
1995     if ({
1996     0x0022 => 1, # "
1997     0x0027 => 1, # '
1998     0x003D => 1, # =
1999     }->{$self->{nc}}) {
2000    
2001 wakaba 1.11 ## XML5: Not a parse error.
2002 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2003     } else {
2004    
2005     }
2006     $self->{ca}->{value} .= chr ($self->{nc});
2007     $self->{read_until}->($self->{ca}->{value},
2008     q["'=& >],
2009     length $self->{ca}->{value});
2010    
2011     ## Stay in the state
2012    
2013     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2014     $self->{line_prev} = $self->{line};
2015     $self->{column_prev} = $self->{column};
2016     $self->{column}++;
2017     $self->{nc}
2018     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2019     } else {
2020     $self->{set_nc}->($self);
2021     }
2022    
2023     redo A;
2024     }
2025     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2026     if ($is_space->{$self->{nc}}) {
2027    
2028     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2029    
2030     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2031     $self->{line_prev} = $self->{line};
2032     $self->{column_prev} = $self->{column};
2033     $self->{column}++;
2034     $self->{nc}
2035     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2036     } else {
2037     $self->{set_nc}->($self);
2038     }
2039    
2040     redo A;
2041     } elsif ($self->{nc} == 0x003E) { # >
2042     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2043    
2044     $self->{last_stag_name} = $self->{ct}->{tag_name};
2045     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2046     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2047     if ($self->{ct}->{attributes}) {
2048    
2049     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2050     } else {
2051     ## NOTE: This state should never be reached.
2052    
2053     }
2054     } else {
2055     die "$0: $self->{ct}->{type}: Unknown token type";
2056     }
2057     $self->{state} = DATA_STATE;
2058 wakaba 1.5 $self->{s_kwd} = '';
2059 wakaba 1.1
2060     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2061     $self->{line_prev} = $self->{line};
2062     $self->{column_prev} = $self->{column};
2063     $self->{column}++;
2064     $self->{nc}
2065     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2066     } else {
2067     $self->{set_nc}->($self);
2068     }
2069    
2070    
2071     return ($self->{ct}); # start tag or end tag
2072    
2073     redo A;
2074     } elsif ($self->{nc} == 0x002F) { # /
2075    
2076     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2077    
2078     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2079     $self->{line_prev} = $self->{line};
2080     $self->{column_prev} = $self->{column};
2081     $self->{column}++;
2082     $self->{nc}
2083     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2084     } else {
2085     $self->{set_nc}->($self);
2086     }
2087    
2088     redo A;
2089     } elsif ($self->{nc} == -1) {
2090     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2091     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2092    
2093     $self->{last_stag_name} = $self->{ct}->{tag_name};
2094     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2095     if ($self->{ct}->{attributes}) {
2096    
2097     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2098     } else {
2099     ## NOTE: This state should never be reached.
2100    
2101     }
2102     } else {
2103     die "$0: $self->{ct}->{type}: Unknown token type";
2104     }
2105     $self->{state} = DATA_STATE;
2106 wakaba 1.5 $self->{s_kwd} = '';
2107 wakaba 1.1 ## Reconsume.
2108     return ($self->{ct}); # start tag or end tag
2109     redo A;
2110     } else {
2111    
2112     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2113     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2114     ## reconsume
2115     redo A;
2116     }
2117     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2118 wakaba 1.11 ## XML5: "Empty tag state".
2119    
2120 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2121     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2122    
2123     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2124     ## TODO: Different type than slash in start tag
2125     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2126     if ($self->{ct}->{attributes}) {
2127    
2128     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2129     } else {
2130    
2131     }
2132     ## TODO: Test |<title></title/>|
2133     } else {
2134    
2135     $self->{self_closing} = 1;
2136     }
2137    
2138     $self->{state} = DATA_STATE;
2139 wakaba 1.5 $self->{s_kwd} = '';
2140 wakaba 1.1
2141     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2142     $self->{line_prev} = $self->{line};
2143     $self->{column_prev} = $self->{column};
2144     $self->{column}++;
2145     $self->{nc}
2146     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2147     } else {
2148     $self->{set_nc}->($self);
2149     }
2150    
2151    
2152     return ($self->{ct}); # start tag or end tag
2153    
2154     redo A;
2155     } elsif ($self->{nc} == -1) {
2156     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2158    
2159     $self->{last_stag_name} = $self->{ct}->{tag_name};
2160     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2161     if ($self->{ct}->{attributes}) {
2162    
2163     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2164     } else {
2165     ## NOTE: This state should never be reached.
2166    
2167     }
2168     } else {
2169     die "$0: $self->{ct}->{type}: Unknown token type";
2170     }
2171 wakaba 1.11 ## XML5: "Tag attribute name before state".
2172 wakaba 1.1 $self->{state} = DATA_STATE;
2173 wakaba 1.5 $self->{s_kwd} = '';
2174 wakaba 1.1 ## Reconsume.
2175     return ($self->{ct}); # start tag or end tag
2176     redo A;
2177     } else {
2178    
2179     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2180     ## TODO: This error type is wrong.
2181     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2182     ## Reconsume.
2183     redo A;
2184     }
2185     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2186     ## (only happen if PCDATA state)
2187    
2188     ## NOTE: Unlike spec's "bogus comment state", this implementation
2189     ## consumes characters one-by-one basis.
2190    
2191     if ($self->{nc} == 0x003E) { # >
2192    
2193     $self->{state} = DATA_STATE;
2194 wakaba 1.5 $self->{s_kwd} = '';
2195 wakaba 1.1
2196     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2197     $self->{line_prev} = $self->{line};
2198     $self->{column_prev} = $self->{column};
2199     $self->{column}++;
2200     $self->{nc}
2201     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2202     } else {
2203     $self->{set_nc}->($self);
2204     }
2205    
2206    
2207     return ($self->{ct}); # comment
2208     redo A;
2209     } elsif ($self->{nc} == -1) {
2210    
2211     $self->{state} = DATA_STATE;
2212 wakaba 1.5 $self->{s_kwd} = '';
2213 wakaba 1.1 ## reconsume
2214    
2215     return ($self->{ct}); # comment
2216     redo A;
2217     } else {
2218    
2219     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2220     $self->{read_until}->($self->{ct}->{data},
2221     q[>],
2222     length $self->{ct}->{data});
2223    
2224     ## Stay in the state.
2225    
2226     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2227     $self->{line_prev} = $self->{line};
2228     $self->{column_prev} = $self->{column};
2229     $self->{column}++;
2230     $self->{nc}
2231     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2232     } else {
2233     $self->{set_nc}->($self);
2234     }
2235    
2236     redo A;
2237     }
2238     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2239     ## (only happen if PCDATA state)
2240    
2241     if ($self->{nc} == 0x002D) { # -
2242    
2243     $self->{state} = MD_HYPHEN_STATE;
2244    
2245     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2246     $self->{line_prev} = $self->{line};
2247     $self->{column_prev} = $self->{column};
2248     $self->{column}++;
2249     $self->{nc}
2250     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2251     } else {
2252     $self->{set_nc}->($self);
2253     }
2254    
2255     redo A;
2256     } elsif ($self->{nc} == 0x0044 or # D
2257     $self->{nc} == 0x0064) { # d
2258     ## ASCII case-insensitive.
2259    
2260     $self->{state} = MD_DOCTYPE_STATE;
2261 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2262 wakaba 1.1
2263     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2264     $self->{line_prev} = $self->{line};
2265     $self->{column_prev} = $self->{column};
2266     $self->{column}++;
2267     $self->{nc}
2268     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2269     } else {
2270     $self->{set_nc}->($self);
2271     }
2272    
2273     redo A;
2274 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2275     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2276     $self->{is_xml}) and
2277 wakaba 1.1 $self->{nc} == 0x005B) { # [
2278    
2279     $self->{state} = MD_CDATA_STATE;
2280 wakaba 1.12 $self->{kwd} = '[';
2281 wakaba 1.1
2282     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2283     $self->{line_prev} = $self->{line};
2284     $self->{column_prev} = $self->{column};
2285     $self->{column}++;
2286     $self->{nc}
2287     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2288     } else {
2289     $self->{set_nc}->($self);
2290     }
2291    
2292     redo A;
2293     } else {
2294    
2295     }
2296    
2297     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2298     line => $self->{line_prev},
2299     column => $self->{column_prev} - 1);
2300     ## Reconsume.
2301     $self->{state} = BOGUS_COMMENT_STATE;
2302     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2303     line => $self->{line_prev},
2304     column => $self->{column_prev} - 1,
2305     };
2306     redo A;
2307     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2308     if ($self->{nc} == 0x002D) { # -
2309    
2310     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2311     line => $self->{line_prev},
2312     column => $self->{column_prev} - 2,
2313     };
2314 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2315 wakaba 1.1
2316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2317     $self->{line_prev} = $self->{line};
2318     $self->{column_prev} = $self->{column};
2319     $self->{column}++;
2320     $self->{nc}
2321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2322     } else {
2323     $self->{set_nc}->($self);
2324     }
2325    
2326     redo A;
2327     } else {
2328    
2329     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2330     line => $self->{line_prev},
2331     column => $self->{column_prev} - 2);
2332     $self->{state} = BOGUS_COMMENT_STATE;
2333     ## Reconsume.
2334     $self->{ct} = {type => COMMENT_TOKEN,
2335     data => '-',
2336     line => $self->{line_prev},
2337     column => $self->{column_prev} - 2,
2338     };
2339     redo A;
2340     }
2341     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2342     ## ASCII case-insensitive.
2343     if ($self->{nc} == [
2344     undef,
2345     0x004F, # O
2346     0x0043, # C
2347     0x0054, # T
2348     0x0059, # Y
2349     0x0050, # P
2350 wakaba 1.12 ]->[length $self->{kwd}] or
2351 wakaba 1.1 $self->{nc} == [
2352     undef,
2353     0x006F, # o
2354     0x0063, # c
2355     0x0074, # t
2356     0x0079, # y
2357     0x0070, # p
2358 wakaba 1.12 ]->[length $self->{kwd}]) {
2359 wakaba 1.1
2360     ## Stay in the state.
2361 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2362 wakaba 1.1
2363     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2364     $self->{line_prev} = $self->{line};
2365     $self->{column_prev} = $self->{column};
2366     $self->{column}++;
2367     $self->{nc}
2368     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2369     } else {
2370     $self->{set_nc}->($self);
2371     }
2372    
2373     redo A;
2374 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2375 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2376     $self->{nc} == 0x0065)) { # e
2377 wakaba 1.12 if ($self->{is_xml} and
2378     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2379 wakaba 1.10
2380     ## XML5: case-sensitive.
2381     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2382     text => 'DOCTYPE',
2383     line => $self->{line_prev},
2384     column => $self->{column_prev} - 5);
2385     } else {
2386    
2387     }
2388 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2389     $self->{ct} = {type => DOCTYPE_TOKEN,
2390     quirks => 1,
2391     line => $self->{line_prev},
2392     column => $self->{column_prev} - 7,
2393     };
2394    
2395     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2396     $self->{line_prev} = $self->{line};
2397     $self->{column_prev} = $self->{column};
2398     $self->{column}++;
2399     $self->{nc}
2400     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2401     } else {
2402     $self->{set_nc}->($self);
2403     }
2404    
2405     redo A;
2406     } else {
2407    
2408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2409     line => $self->{line_prev},
2410 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2411 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2412     ## Reconsume.
2413     $self->{ct} = {type => COMMENT_TOKEN,
2414 wakaba 1.12 data => $self->{kwd},
2415 wakaba 1.1 line => $self->{line_prev},
2416 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2417 wakaba 1.1 };
2418     redo A;
2419     }
2420     } elsif ($self->{state} == MD_CDATA_STATE) {
2421     if ($self->{nc} == {
2422     '[' => 0x0043, # C
2423     '[C' => 0x0044, # D
2424     '[CD' => 0x0041, # A
2425     '[CDA' => 0x0054, # T
2426     '[CDAT' => 0x0041, # A
2427 wakaba 1.12 }->{$self->{kwd}}) {
2428 wakaba 1.1
2429     ## Stay in the state.
2430 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2431 wakaba 1.1
2432     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2433     $self->{line_prev} = $self->{line};
2434     $self->{column_prev} = $self->{column};
2435     $self->{column}++;
2436     $self->{nc}
2437     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2438     } else {
2439     $self->{set_nc}->($self);
2440     }
2441    
2442     redo A;
2443 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2444 wakaba 1.1 $self->{nc} == 0x005B) { # [
2445 wakaba 1.6 if ($self->{is_xml} and
2446     not $self->{tainted} and
2447     @{$self->{open_elements} or []} == 0) {
2448 wakaba 1.8
2449 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2450     line => $self->{line_prev},
2451     column => $self->{column_prev} - 7);
2452     $self->{tainted} = 1;
2453 wakaba 1.8 } else {
2454    
2455 wakaba 1.6 }
2456    
2457 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2458     data => '',
2459     line => $self->{line_prev},
2460     column => $self->{column_prev} - 7};
2461     $self->{state} = CDATA_SECTION_STATE;
2462    
2463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2464     $self->{line_prev} = $self->{line};
2465     $self->{column_prev} = $self->{column};
2466     $self->{column}++;
2467     $self->{nc}
2468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2469     } else {
2470     $self->{set_nc}->($self);
2471     }
2472    
2473     redo A;
2474     } else {
2475    
2476     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2477     line => $self->{line_prev},
2478 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2479 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2480     ## Reconsume.
2481     $self->{ct} = {type => COMMENT_TOKEN,
2482 wakaba 1.12 data => $self->{kwd},
2483 wakaba 1.1 line => $self->{line_prev},
2484 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2485 wakaba 1.1 };
2486     redo A;
2487     }
2488     } elsif ($self->{state} == COMMENT_START_STATE) {
2489     if ($self->{nc} == 0x002D) { # -
2490    
2491     $self->{state} = COMMENT_START_DASH_STATE;
2492    
2493     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2494     $self->{line_prev} = $self->{line};
2495     $self->{column_prev} = $self->{column};
2496     $self->{column}++;
2497     $self->{nc}
2498     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2499     } else {
2500     $self->{set_nc}->($self);
2501     }
2502    
2503     redo A;
2504     } elsif ($self->{nc} == 0x003E) { # >
2505    
2506     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2507     $self->{state} = DATA_STATE;
2508 wakaba 1.5 $self->{s_kwd} = '';
2509 wakaba 1.1
2510     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2511     $self->{line_prev} = $self->{line};
2512     $self->{column_prev} = $self->{column};
2513     $self->{column}++;
2514     $self->{nc}
2515     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2516     } else {
2517     $self->{set_nc}->($self);
2518     }
2519    
2520    
2521     return ($self->{ct}); # comment
2522    
2523     redo A;
2524     } elsif ($self->{nc} == -1) {
2525    
2526     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2527     $self->{state} = DATA_STATE;
2528 wakaba 1.5 $self->{s_kwd} = '';
2529 wakaba 1.1 ## reconsume
2530    
2531     return ($self->{ct}); # comment
2532    
2533     redo A;
2534     } else {
2535    
2536     $self->{ct}->{data} # comment
2537     .= chr ($self->{nc});
2538     $self->{state} = COMMENT_STATE;
2539    
2540     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2541     $self->{line_prev} = $self->{line};
2542     $self->{column_prev} = $self->{column};
2543     $self->{column}++;
2544     $self->{nc}
2545     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2546     } else {
2547     $self->{set_nc}->($self);
2548     }
2549    
2550     redo A;
2551     }
2552     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2553     if ($self->{nc} == 0x002D) { # -
2554    
2555     $self->{state} = COMMENT_END_STATE;
2556    
2557     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2558     $self->{line_prev} = $self->{line};
2559     $self->{column_prev} = $self->{column};
2560     $self->{column}++;
2561     $self->{nc}
2562     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2563     } else {
2564     $self->{set_nc}->($self);
2565     }
2566    
2567     redo A;
2568     } elsif ($self->{nc} == 0x003E) { # >
2569    
2570     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2571     $self->{state} = DATA_STATE;
2572 wakaba 1.5 $self->{s_kwd} = '';
2573 wakaba 1.1
2574     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2575     $self->{line_prev} = $self->{line};
2576     $self->{column_prev} = $self->{column};
2577     $self->{column}++;
2578     $self->{nc}
2579     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2580     } else {
2581     $self->{set_nc}->($self);
2582     }
2583    
2584    
2585     return ($self->{ct}); # comment
2586    
2587     redo A;
2588     } elsif ($self->{nc} == -1) {
2589    
2590     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2591     $self->{state} = DATA_STATE;
2592 wakaba 1.5 $self->{s_kwd} = '';
2593 wakaba 1.1 ## reconsume
2594    
2595     return ($self->{ct}); # comment
2596    
2597     redo A;
2598     } else {
2599    
2600     $self->{ct}->{data} # comment
2601     .= '-' . chr ($self->{nc});
2602     $self->{state} = COMMENT_STATE;
2603    
2604     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2605     $self->{line_prev} = $self->{line};
2606     $self->{column_prev} = $self->{column};
2607     $self->{column}++;
2608     $self->{nc}
2609     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2610     } else {
2611     $self->{set_nc}->($self);
2612     }
2613    
2614     redo A;
2615     }
2616     } elsif ($self->{state} == COMMENT_STATE) {
2617     if ($self->{nc} == 0x002D) { # -
2618    
2619     $self->{state} = COMMENT_END_DASH_STATE;
2620    
2621     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2622     $self->{line_prev} = $self->{line};
2623     $self->{column_prev} = $self->{column};
2624     $self->{column}++;
2625     $self->{nc}
2626     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2627     } else {
2628     $self->{set_nc}->($self);
2629     }
2630    
2631     redo A;
2632     } elsif ($self->{nc} == -1) {
2633    
2634     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2635     $self->{state} = DATA_STATE;
2636 wakaba 1.5 $self->{s_kwd} = '';
2637 wakaba 1.1 ## reconsume
2638    
2639     return ($self->{ct}); # comment
2640    
2641     redo A;
2642     } else {
2643    
2644     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2645     $self->{read_until}->($self->{ct}->{data},
2646     q[-],
2647     length $self->{ct}->{data});
2648    
2649     ## Stay in the state
2650    
2651     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2652     $self->{line_prev} = $self->{line};
2653     $self->{column_prev} = $self->{column};
2654     $self->{column}++;
2655     $self->{nc}
2656     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2657     } else {
2658     $self->{set_nc}->($self);
2659     }
2660    
2661     redo A;
2662     }
2663     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2664 wakaba 1.10 ## XML5: "comment dash state".
2665    
2666 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2667    
2668     $self->{state} = COMMENT_END_STATE;
2669    
2670     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2671     $self->{line_prev} = $self->{line};
2672     $self->{column_prev} = $self->{column};
2673     $self->{column}++;
2674     $self->{nc}
2675     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2676     } else {
2677     $self->{set_nc}->($self);
2678     }
2679    
2680     redo A;
2681     } elsif ($self->{nc} == -1) {
2682    
2683     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2684     $self->{state} = DATA_STATE;
2685 wakaba 1.5 $self->{s_kwd} = '';
2686 wakaba 1.1 ## reconsume
2687    
2688     return ($self->{ct}); # comment
2689    
2690     redo A;
2691     } else {
2692    
2693     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2694     $self->{state} = COMMENT_STATE;
2695    
2696     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2697     $self->{line_prev} = $self->{line};
2698     $self->{column_prev} = $self->{column};
2699     $self->{column}++;
2700     $self->{nc}
2701     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2702     } else {
2703     $self->{set_nc}->($self);
2704     }
2705    
2706     redo A;
2707     }
2708     } elsif ($self->{state} == COMMENT_END_STATE) {
2709     if ($self->{nc} == 0x003E) { # >
2710    
2711     $self->{state} = DATA_STATE;
2712 wakaba 1.5 $self->{s_kwd} = '';
2713 wakaba 1.1
2714     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2715     $self->{line_prev} = $self->{line};
2716     $self->{column_prev} = $self->{column};
2717     $self->{column}++;
2718     $self->{nc}
2719     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2720     } else {
2721     $self->{set_nc}->($self);
2722     }
2723    
2724    
2725     return ($self->{ct}); # comment
2726    
2727     redo A;
2728     } elsif ($self->{nc} == 0x002D) { # -
2729    
2730 wakaba 1.10 ## XML5: Not a parse error.
2731 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2732     line => $self->{line_prev},
2733     column => $self->{column_prev});
2734     $self->{ct}->{data} .= '-'; # comment
2735     ## Stay in the state
2736    
2737     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2738     $self->{line_prev} = $self->{line};
2739     $self->{column_prev} = $self->{column};
2740     $self->{column}++;
2741     $self->{nc}
2742     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2743     } else {
2744     $self->{set_nc}->($self);
2745     }
2746    
2747     redo A;
2748     } elsif ($self->{nc} == -1) {
2749    
2750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2751     $self->{state} = DATA_STATE;
2752 wakaba 1.5 $self->{s_kwd} = '';
2753 wakaba 1.1 ## reconsume
2754    
2755     return ($self->{ct}); # comment
2756    
2757     redo A;
2758     } else {
2759    
2760 wakaba 1.10 ## XML5: Not a parse error.
2761 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2762     line => $self->{line_prev},
2763     column => $self->{column_prev});
2764     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2765     $self->{state} = COMMENT_STATE;
2766    
2767     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2768     $self->{line_prev} = $self->{line};
2769     $self->{column_prev} = $self->{column};
2770     $self->{column}++;
2771     $self->{nc}
2772     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2773     } else {
2774     $self->{set_nc}->($self);
2775     }
2776    
2777     redo A;
2778     }
2779     } elsif ($self->{state} == DOCTYPE_STATE) {
2780     if ($is_space->{$self->{nc}}) {
2781    
2782     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2783    
2784     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2785     $self->{line_prev} = $self->{line};
2786     $self->{column_prev} = $self->{column};
2787     $self->{column}++;
2788     $self->{nc}
2789     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2790     } else {
2791     $self->{set_nc}->($self);
2792     }
2793    
2794     redo A;
2795     } else {
2796    
2797 wakaba 1.12 ## XML5: Unless EOF, swith to the bogus comment state.
2798 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2799     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2800     ## reconsume
2801     redo A;
2802     }
2803     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2804 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2805    
2806 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2807    
2808     ## Stay in the state
2809    
2810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2811     $self->{line_prev} = $self->{line};
2812     $self->{column_prev} = $self->{column};
2813     $self->{column}++;
2814     $self->{nc}
2815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2816     } else {
2817     $self->{set_nc}->($self);
2818     }
2819    
2820     redo A;
2821     } elsif ($self->{nc} == 0x003E) { # >
2822    
2823 wakaba 1.12 ## XML5: No parse error.
2824 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2825     $self->{state} = DATA_STATE;
2826 wakaba 1.5 $self->{s_kwd} = '';
2827 wakaba 1.1
2828     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2829     $self->{line_prev} = $self->{line};
2830     $self->{column_prev} = $self->{column};
2831     $self->{column}++;
2832     $self->{nc}
2833     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2834     } else {
2835     $self->{set_nc}->($self);
2836     }
2837    
2838    
2839     return ($self->{ct}); # DOCTYPE (quirks)
2840    
2841     redo A;
2842     } elsif ($self->{nc} == -1) {
2843    
2844     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2845     $self->{state} = DATA_STATE;
2846 wakaba 1.5 $self->{s_kwd} = '';
2847 wakaba 1.1 ## reconsume
2848    
2849     return ($self->{ct}); # DOCTYPE (quirks)
2850    
2851     redo A;
2852 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2853    
2854     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2855     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2856    
2857     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2858     $self->{line_prev} = $self->{line};
2859     $self->{column_prev} = $self->{column};
2860     $self->{column}++;
2861     $self->{nc}
2862     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2863     } else {
2864     $self->{set_nc}->($self);
2865     }
2866    
2867     redo A;
2868 wakaba 1.1 } else {
2869    
2870     $self->{ct}->{name} = chr $self->{nc};
2871     delete $self->{ct}->{quirks};
2872     $self->{state} = DOCTYPE_NAME_STATE;
2873    
2874     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2875     $self->{line_prev} = $self->{line};
2876     $self->{column_prev} = $self->{column};
2877     $self->{column}++;
2878     $self->{nc}
2879     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2880     } else {
2881     $self->{set_nc}->($self);
2882     }
2883    
2884     redo A;
2885     }
2886     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2887 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2888    
2889     ## ISSUE: Redundant "First," in the spec.
2890    
2891 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2892    
2893     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2894    
2895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2896     $self->{line_prev} = $self->{line};
2897     $self->{column_prev} = $self->{column};
2898     $self->{column}++;
2899     $self->{nc}
2900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2901     } else {
2902     $self->{set_nc}->($self);
2903     }
2904    
2905     redo A;
2906     } elsif ($self->{nc} == 0x003E) { # >
2907    
2908     $self->{state} = DATA_STATE;
2909 wakaba 1.5 $self->{s_kwd} = '';
2910 wakaba 1.1
2911     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2912     $self->{line_prev} = $self->{line};
2913     $self->{column_prev} = $self->{column};
2914     $self->{column}++;
2915     $self->{nc}
2916     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2917     } else {
2918     $self->{set_nc}->($self);
2919     }
2920    
2921    
2922     return ($self->{ct}); # DOCTYPE
2923    
2924     redo A;
2925     } elsif ($self->{nc} == -1) {
2926    
2927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2928     $self->{state} = DATA_STATE;
2929 wakaba 1.5 $self->{s_kwd} = '';
2930 wakaba 1.1 ## reconsume
2931    
2932     $self->{ct}->{quirks} = 1;
2933     return ($self->{ct}); # DOCTYPE
2934    
2935     redo A;
2936 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2937    
2938     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2939    
2940     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2941     $self->{line_prev} = $self->{line};
2942     $self->{column_prev} = $self->{column};
2943     $self->{column}++;
2944     $self->{nc}
2945     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2946     } else {
2947     $self->{set_nc}->($self);
2948     }
2949    
2950     redo A;
2951 wakaba 1.1 } else {
2952    
2953     $self->{ct}->{name}
2954     .= chr ($self->{nc}); # DOCTYPE
2955     ## Stay in the state
2956    
2957     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2958     $self->{line_prev} = $self->{line};
2959     $self->{column_prev} = $self->{column};
2960     $self->{column}++;
2961     $self->{nc}
2962     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2963     } else {
2964     $self->{set_nc}->($self);
2965     }
2966    
2967     redo A;
2968     }
2969     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2970 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2971     ## state", but implemented differently.
2972    
2973 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2974    
2975     ## Stay in the state
2976    
2977     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2978     $self->{line_prev} = $self->{line};
2979     $self->{column_prev} = $self->{column};
2980     $self->{column}++;
2981     $self->{nc}
2982     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2983     } else {
2984     $self->{set_nc}->($self);
2985     }
2986    
2987     redo A;
2988     } elsif ($self->{nc} == 0x003E) { # >
2989    
2990     $self->{state} = DATA_STATE;
2991 wakaba 1.5 $self->{s_kwd} = '';
2992 wakaba 1.1
2993     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2994     $self->{line_prev} = $self->{line};
2995     $self->{column_prev} = $self->{column};
2996     $self->{column}++;
2997     $self->{nc}
2998     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2999     } else {
3000     $self->{set_nc}->($self);
3001     }
3002    
3003    
3004     return ($self->{ct}); # DOCTYPE
3005    
3006     redo A;
3007     } elsif ($self->{nc} == -1) {
3008    
3009     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3010     $self->{state} = DATA_STATE;
3011 wakaba 1.5 $self->{s_kwd} = '';
3012 wakaba 1.1 ## reconsume
3013    
3014     $self->{ct}->{quirks} = 1;
3015     return ($self->{ct}); # DOCTYPE
3016    
3017     redo A;
3018     } elsif ($self->{nc} == 0x0050 or # P
3019     $self->{nc} == 0x0070) { # p
3020 wakaba 1.12
3021 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3022 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3023 wakaba 1.1
3024     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3025     $self->{line_prev} = $self->{line};
3026     $self->{column_prev} = $self->{column};
3027     $self->{column}++;
3028     $self->{nc}
3029     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3030     } else {
3031     $self->{set_nc}->($self);
3032     }
3033    
3034     redo A;
3035     } elsif ($self->{nc} == 0x0053 or # S
3036     $self->{nc} == 0x0073) { # s
3037 wakaba 1.12
3038 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3039 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3040    
3041     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3042     $self->{line_prev} = $self->{line};
3043     $self->{column_prev} = $self->{column};
3044     $self->{column}++;
3045     $self->{nc}
3046     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3047     } else {
3048     $self->{set_nc}->($self);
3049     }
3050    
3051     redo A;
3052     } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3053    
3054     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3055     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3056 wakaba 1.1
3057     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3058     $self->{line_prev} = $self->{line};
3059     $self->{column_prev} = $self->{column};
3060     $self->{column}++;
3061     $self->{nc}
3062     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3063     } else {
3064     $self->{set_nc}->($self);
3065     }
3066    
3067     redo A;
3068     } else {
3069    
3070     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
3071     $self->{ct}->{quirks} = 1;
3072    
3073     $self->{state} = BOGUS_DOCTYPE_STATE;
3074    
3075     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3076     $self->{line_prev} = $self->{line};
3077     $self->{column_prev} = $self->{column};
3078     $self->{column}++;
3079     $self->{nc}
3080     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3081     } else {
3082     $self->{set_nc}->($self);
3083     }
3084    
3085     redo A;
3086     }
3087     } elsif ($self->{state} == PUBLIC_STATE) {
3088     ## ASCII case-insensitive
3089     if ($self->{nc} == [
3090     undef,
3091     0x0055, # U
3092     0x0042, # B
3093     0x004C, # L
3094     0x0049, # I
3095 wakaba 1.12 ]->[length $self->{kwd}] or
3096 wakaba 1.1 $self->{nc} == [
3097     undef,
3098     0x0075, # u
3099     0x0062, # b
3100     0x006C, # l
3101     0x0069, # i
3102 wakaba 1.12 ]->[length $self->{kwd}]) {
3103 wakaba 1.1
3104     ## Stay in the state.
3105 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3106 wakaba 1.1
3107     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3108     $self->{line_prev} = $self->{line};
3109     $self->{column_prev} = $self->{column};
3110     $self->{column}++;
3111     $self->{nc}
3112     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3113     } else {
3114     $self->{set_nc}->($self);
3115     }
3116    
3117     redo A;
3118 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3119 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3120     $self->{nc} == 0x0063)) { # c
3121 wakaba 1.12 if ($self->{is_xml} and
3122     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3123    
3124     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3125     text => 'PUBLIC',
3126     line => $self->{line_prev},
3127     column => $self->{column_prev} - 4);
3128     } else {
3129    
3130     }
3131 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3132    
3133     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3134     $self->{line_prev} = $self->{line};
3135     $self->{column_prev} = $self->{column};
3136     $self->{column}++;
3137     $self->{nc}
3138     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3139     } else {
3140     $self->{set_nc}->($self);
3141     }
3142    
3143     redo A;
3144     } else {
3145    
3146     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3147     line => $self->{line_prev},
3148 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3149 wakaba 1.1 $self->{ct}->{quirks} = 1;
3150    
3151     $self->{state} = BOGUS_DOCTYPE_STATE;
3152     ## Reconsume.
3153     redo A;
3154     }
3155     } elsif ($self->{state} == SYSTEM_STATE) {
3156     ## ASCII case-insensitive
3157     if ($self->{nc} == [
3158     undef,
3159     0x0059, # Y
3160     0x0053, # S
3161     0x0054, # T
3162     0x0045, # E
3163 wakaba 1.12 ]->[length $self->{kwd}] or
3164 wakaba 1.1 $self->{nc} == [
3165     undef,
3166     0x0079, # y
3167     0x0073, # s
3168     0x0074, # t
3169     0x0065, # e
3170 wakaba 1.12 ]->[length $self->{kwd}]) {
3171 wakaba 1.1
3172     ## Stay in the state.
3173 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3174 wakaba 1.1
3175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3176     $self->{line_prev} = $self->{line};
3177     $self->{column_prev} = $self->{column};
3178     $self->{column}++;
3179     $self->{nc}
3180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3181     } else {
3182     $self->{set_nc}->($self);
3183     }
3184    
3185     redo A;
3186 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3187 wakaba 1.1 ($self->{nc} == 0x004D or # M
3188     $self->{nc} == 0x006D)) { # m
3189 wakaba 1.12 if ($self->{is_xml} and
3190     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3191    
3192     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3193     text => 'SYSTEM',
3194     line => $self->{line_prev},
3195     column => $self->{column_prev} - 4);
3196     } else {
3197    
3198     }
3199 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3200    
3201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3202     $self->{line_prev} = $self->{line};
3203     $self->{column_prev} = $self->{column};
3204     $self->{column}++;
3205     $self->{nc}
3206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3207     } else {
3208     $self->{set_nc}->($self);
3209     }
3210    
3211     redo A;
3212     } else {
3213    
3214     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3215     line => $self->{line_prev},
3216 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3217 wakaba 1.1 $self->{ct}->{quirks} = 1;
3218    
3219     $self->{state} = BOGUS_DOCTYPE_STATE;
3220     ## Reconsume.
3221     redo A;
3222     }
3223     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3224     if ($is_space->{$self->{nc}}) {
3225    
3226     ## Stay in the state
3227    
3228     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3229     $self->{line_prev} = $self->{line};
3230     $self->{column_prev} = $self->{column};
3231     $self->{column}++;
3232     $self->{nc}
3233     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3234     } else {
3235     $self->{set_nc}->($self);
3236     }
3237    
3238     redo A;
3239     } elsif ($self->{nc} eq 0x0022) { # "
3240    
3241     $self->{ct}->{pubid} = ''; # DOCTYPE
3242     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3243    
3244     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3245     $self->{line_prev} = $self->{line};
3246     $self->{column_prev} = $self->{column};
3247     $self->{column}++;
3248     $self->{nc}
3249     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3250     } else {
3251     $self->{set_nc}->($self);
3252     }
3253    
3254     redo A;
3255     } elsif ($self->{nc} eq 0x0027) { # '
3256    
3257     $self->{ct}->{pubid} = ''; # DOCTYPE
3258     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3259    
3260     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3261     $self->{line_prev} = $self->{line};
3262     $self->{column_prev} = $self->{column};
3263     $self->{column}++;
3264     $self->{nc}
3265     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3266     } else {
3267     $self->{set_nc}->($self);
3268     }
3269    
3270     redo A;
3271     } elsif ($self->{nc} eq 0x003E) { # >
3272    
3273     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3274    
3275     $self->{state} = DATA_STATE;
3276 wakaba 1.5 $self->{s_kwd} = '';
3277 wakaba 1.1
3278     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3279     $self->{line_prev} = $self->{line};
3280     $self->{column_prev} = $self->{column};
3281     $self->{column}++;
3282     $self->{nc}
3283     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3284     } else {
3285     $self->{set_nc}->($self);
3286     }
3287    
3288    
3289     $self->{ct}->{quirks} = 1;
3290     return ($self->{ct}); # DOCTYPE
3291    
3292     redo A;
3293     } elsif ($self->{nc} == -1) {
3294    
3295     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3296    
3297     $self->{state} = DATA_STATE;
3298 wakaba 1.5 $self->{s_kwd} = '';
3299 wakaba 1.1 ## reconsume
3300    
3301     $self->{ct}->{quirks} = 1;
3302     return ($self->{ct}); # DOCTYPE
3303    
3304     redo A;
3305 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3306    
3307     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3308     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3309     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3310    
3311     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3312     $self->{line_prev} = $self->{line};
3313     $self->{column_prev} = $self->{column};
3314     $self->{column}++;
3315     $self->{nc}
3316     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3317     } else {
3318     $self->{set_nc}->($self);
3319     }
3320    
3321     redo A;
3322 wakaba 1.1 } else {
3323    
3324     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3325     $self->{ct}->{quirks} = 1;
3326    
3327     $self->{state} = BOGUS_DOCTYPE_STATE;
3328    
3329     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3330     $self->{line_prev} = $self->{line};
3331     $self->{column_prev} = $self->{column};
3332     $self->{column}++;
3333     $self->{nc}
3334     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3335     } else {
3336     $self->{set_nc}->($self);
3337     }
3338    
3339     redo A;
3340     }
3341     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3342     if ($self->{nc} == 0x0022) { # "
3343    
3344     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3345    
3346     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3347     $self->{line_prev} = $self->{line};
3348     $self->{column_prev} = $self->{column};
3349     $self->{column}++;
3350     $self->{nc}
3351     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3352     } else {
3353     $self->{set_nc}->($self);
3354     }
3355    
3356     redo A;
3357     } elsif ($self->{nc} == 0x003E) { # >
3358    
3359     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3360    
3361     $self->{state} = DATA_STATE;
3362 wakaba 1.5 $self->{s_kwd} = '';
3363 wakaba 1.1
3364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3365     $self->{line_prev} = $self->{line};
3366     $self->{column_prev} = $self->{column};
3367     $self->{column}++;
3368     $self->{nc}
3369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3370     } else {
3371     $self->{set_nc}->($self);
3372     }
3373    
3374    
3375     $self->{ct}->{quirks} = 1;
3376     return ($self->{ct}); # DOCTYPE
3377    
3378     redo A;
3379     } elsif ($self->{nc} == -1) {
3380    
3381     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3382    
3383     $self->{state} = DATA_STATE;
3384 wakaba 1.5 $self->{s_kwd} = '';
3385 wakaba 1.1 ## reconsume
3386    
3387     $self->{ct}->{quirks} = 1;
3388     return ($self->{ct}); # DOCTYPE
3389    
3390     redo A;
3391     } else {
3392    
3393     $self->{ct}->{pubid} # DOCTYPE
3394     .= chr $self->{nc};
3395     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3396     length $self->{ct}->{pubid});
3397    
3398     ## Stay in the state
3399    
3400     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3401     $self->{line_prev} = $self->{line};
3402     $self->{column_prev} = $self->{column};
3403     $self->{column}++;
3404     $self->{nc}
3405     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3406     } else {
3407     $self->{set_nc}->($self);
3408     }
3409    
3410     redo A;
3411     }
3412     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3413     if ($self->{nc} == 0x0027) { # '
3414    
3415     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3416    
3417     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3418     $self->{line_prev} = $self->{line};
3419     $self->{column_prev} = $self->{column};
3420     $self->{column}++;
3421     $self->{nc}
3422     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3423     } else {
3424     $self->{set_nc}->($self);
3425     }
3426    
3427     redo A;
3428     } elsif ($self->{nc} == 0x003E) { # >
3429    
3430     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3431    
3432     $self->{state} = DATA_STATE;
3433 wakaba 1.5 $self->{s_kwd} = '';
3434 wakaba 1.1
3435     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3436     $self->{line_prev} = $self->{line};
3437     $self->{column_prev} = $self->{column};
3438     $self->{column}++;
3439     $self->{nc}
3440     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3441     } else {
3442     $self->{set_nc}->($self);
3443     }
3444    
3445    
3446     $self->{ct}->{quirks} = 1;
3447     return ($self->{ct}); # DOCTYPE
3448    
3449     redo A;
3450     } elsif ($self->{nc} == -1) {
3451    
3452     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3453    
3454     $self->{state} = DATA_STATE;
3455 wakaba 1.5 $self->{s_kwd} = '';
3456 wakaba 1.1 ## reconsume
3457    
3458     $self->{ct}->{quirks} = 1;
3459     return ($self->{ct}); # DOCTYPE
3460    
3461     redo A;
3462     } else {
3463    
3464     $self->{ct}->{pubid} # DOCTYPE
3465     .= chr $self->{nc};
3466     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3467     length $self->{ct}->{pubid});
3468    
3469     ## Stay in the state
3470    
3471     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3472     $self->{line_prev} = $self->{line};
3473     $self->{column_prev} = $self->{column};
3474     $self->{column}++;
3475     $self->{nc}
3476     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3477     } else {
3478     $self->{set_nc}->($self);
3479     }
3480    
3481     redo A;
3482     }
3483     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3484     if ($is_space->{$self->{nc}}) {
3485    
3486     ## Stay in the state
3487    
3488     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3489     $self->{line_prev} = $self->{line};
3490     $self->{column_prev} = $self->{column};
3491     $self->{column}++;
3492     $self->{nc}
3493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3494     } else {
3495     $self->{set_nc}->($self);
3496     }
3497    
3498     redo A;
3499     } elsif ($self->{nc} == 0x0022) { # "
3500    
3501     $self->{ct}->{sysid} = ''; # DOCTYPE
3502     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3503    
3504     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3505     $self->{line_prev} = $self->{line};
3506     $self->{column_prev} = $self->{column};
3507     $self->{column}++;
3508     $self->{nc}
3509     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3510     } else {
3511     $self->{set_nc}->($self);
3512     }
3513    
3514     redo A;
3515     } elsif ($self->{nc} == 0x0027) { # '
3516    
3517     $self->{ct}->{sysid} = ''; # DOCTYPE
3518     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3519    
3520     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3521     $self->{line_prev} = $self->{line};
3522     $self->{column_prev} = $self->{column};
3523     $self->{column}++;
3524     $self->{nc}
3525     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3526     } else {
3527     $self->{set_nc}->($self);
3528     }
3529    
3530     redo A;
3531     } elsif ($self->{nc} == 0x003E) { # >
3532 wakaba 1.12 if ($self->{is_xml}) {
3533    
3534     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3535     } else {
3536    
3537     }
3538 wakaba 1.1 $self->{state} = DATA_STATE;
3539 wakaba 1.5 $self->{s_kwd} = '';
3540 wakaba 1.1
3541     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3542     $self->{line_prev} = $self->{line};
3543     $self->{column_prev} = $self->{column};
3544     $self->{column}++;
3545     $self->{nc}
3546     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3547     } else {
3548     $self->{set_nc}->($self);
3549     }
3550    
3551    
3552     return ($self->{ct}); # DOCTYPE
3553    
3554     redo A;
3555     } elsif ($self->{nc} == -1) {
3556    
3557     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3558    
3559     $self->{state} = DATA_STATE;
3560 wakaba 1.5 $self->{s_kwd} = '';
3561 wakaba 1.1 ## reconsume
3562    
3563     $self->{ct}->{quirks} = 1;
3564     return ($self->{ct}); # DOCTYPE
3565    
3566     redo A;
3567 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3568    
3569     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3570     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3571     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3572    
3573     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3574     $self->{line_prev} = $self->{line};
3575     $self->{column_prev} = $self->{column};
3576     $self->{column}++;
3577     $self->{nc}
3578     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3579     } else {
3580     $self->{set_nc}->($self);
3581     }
3582    
3583     redo A;
3584 wakaba 1.1 } else {
3585    
3586     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3587     $self->{ct}->{quirks} = 1;
3588    
3589     $self->{state} = BOGUS_DOCTYPE_STATE;
3590    
3591     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3592     $self->{line_prev} = $self->{line};
3593     $self->{column_prev} = $self->{column};
3594     $self->{column}++;
3595     $self->{nc}
3596     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3597     } else {
3598     $self->{set_nc}->($self);
3599     }
3600    
3601     redo A;
3602     }
3603     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3604     if ($is_space->{$self->{nc}}) {
3605    
3606     ## Stay in the state
3607    
3608     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3609     $self->{line_prev} = $self->{line};
3610     $self->{column_prev} = $self->{column};
3611     $self->{column}++;
3612     $self->{nc}
3613     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3614     } else {
3615     $self->{set_nc}->($self);
3616     }
3617    
3618     redo A;
3619     } elsif ($self->{nc} == 0x0022) { # "
3620    
3621     $self->{ct}->{sysid} = ''; # DOCTYPE
3622     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3623    
3624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3625     $self->{line_prev} = $self->{line};
3626     $self->{column_prev} = $self->{column};
3627     $self->{column}++;
3628     $self->{nc}
3629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3630     } else {
3631     $self->{set_nc}->($self);
3632     }
3633    
3634     redo A;
3635     } elsif ($self->{nc} == 0x0027) { # '
3636    
3637     $self->{ct}->{sysid} = ''; # DOCTYPE
3638     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3639    
3640     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3641     $self->{line_prev} = $self->{line};
3642     $self->{column_prev} = $self->{column};
3643     $self->{column}++;
3644     $self->{nc}
3645     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3646     } else {
3647     $self->{set_nc}->($self);
3648     }
3649    
3650     redo A;
3651     } elsif ($self->{nc} == 0x003E) { # >
3652    
3653     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3654     $self->{state} = DATA_STATE;
3655 wakaba 1.5 $self->{s_kwd} = '';
3656 wakaba 1.1
3657     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3658     $self->{line_prev} = $self->{line};
3659     $self->{column_prev} = $self->{column};
3660     $self->{column}++;
3661     $self->{nc}
3662     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3663     } else {
3664     $self->{set_nc}->($self);
3665     }
3666    
3667    
3668     $self->{ct}->{quirks} = 1;
3669     return ($self->{ct}); # DOCTYPE
3670    
3671     redo A;
3672     } elsif ($self->{nc} == -1) {
3673    
3674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3675    
3676     $self->{state} = DATA_STATE;
3677 wakaba 1.5 $self->{s_kwd} = '';
3678 wakaba 1.1 ## reconsume
3679    
3680     $self->{ct}->{quirks} = 1;
3681     return ($self->{ct}); # DOCTYPE
3682    
3683     redo A;
3684 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3685    
3686     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3687    
3688     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3689     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3690    
3691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3692     $self->{line_prev} = $self->{line};
3693     $self->{column_prev} = $self->{column};
3694     $self->{column}++;
3695     $self->{nc}
3696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3697     } else {
3698     $self->{set_nc}->($self);
3699     }
3700    
3701     redo A;
3702 wakaba 1.1 } else {
3703    
3704     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3705     $self->{ct}->{quirks} = 1;
3706    
3707     $self->{state} = BOGUS_DOCTYPE_STATE;
3708    
3709     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3710     $self->{line_prev} = $self->{line};
3711     $self->{column_prev} = $self->{column};
3712     $self->{column}++;
3713     $self->{nc}
3714     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3715     } else {
3716     $self->{set_nc}->($self);
3717     }
3718    
3719     redo A;
3720     }
3721     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3722     if ($self->{nc} == 0x0022) { # "
3723    
3724     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3725    
3726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3727     $self->{line_prev} = $self->{line};
3728     $self->{column_prev} = $self->{column};
3729     $self->{column}++;
3730     $self->{nc}
3731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3732     } else {
3733     $self->{set_nc}->($self);
3734     }
3735    
3736     redo A;
3737 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3738 wakaba 1.1
3739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3740    
3741     $self->{state} = DATA_STATE;
3742 wakaba 1.5 $self->{s_kwd} = '';
3743 wakaba 1.1
3744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745     $self->{line_prev} = $self->{line};
3746     $self->{column_prev} = $self->{column};
3747     $self->{column}++;
3748     $self->{nc}
3749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750     } else {
3751     $self->{set_nc}->($self);
3752     }
3753    
3754    
3755     $self->{ct}->{quirks} = 1;
3756     return ($self->{ct}); # DOCTYPE
3757    
3758     redo A;
3759     } elsif ($self->{nc} == -1) {
3760    
3761     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3762    
3763     $self->{state} = DATA_STATE;
3764 wakaba 1.5 $self->{s_kwd} = '';
3765 wakaba 1.1 ## reconsume
3766    
3767     $self->{ct}->{quirks} = 1;
3768     return ($self->{ct}); # DOCTYPE
3769    
3770     redo A;
3771     } else {
3772    
3773     $self->{ct}->{sysid} # DOCTYPE
3774     .= chr $self->{nc};
3775     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3776     length $self->{ct}->{sysid});
3777    
3778     ## Stay in the state
3779    
3780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3781     $self->{line_prev} = $self->{line};
3782     $self->{column_prev} = $self->{column};
3783     $self->{column}++;
3784     $self->{nc}
3785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3786     } else {
3787     $self->{set_nc}->($self);
3788     }
3789    
3790     redo A;
3791     }
3792     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3793     if ($self->{nc} == 0x0027) { # '
3794    
3795     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3796    
3797     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3798     $self->{line_prev} = $self->{line};
3799     $self->{column_prev} = $self->{column};
3800     $self->{column}++;
3801     $self->{nc}
3802     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3803     } else {
3804     $self->{set_nc}->($self);
3805     }
3806    
3807     redo A;
3808 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3809 wakaba 1.1
3810     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3811    
3812     $self->{state} = DATA_STATE;
3813 wakaba 1.5 $self->{s_kwd} = '';
3814 wakaba 1.1
3815     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3816     $self->{line_prev} = $self->{line};
3817     $self->{column_prev} = $self->{column};
3818     $self->{column}++;
3819     $self->{nc}
3820     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3821     } else {
3822     $self->{set_nc}->($self);
3823     }
3824    
3825    
3826     $self->{ct}->{quirks} = 1;
3827     return ($self->{ct}); # DOCTYPE
3828    
3829     redo A;
3830     } elsif ($self->{nc} == -1) {
3831    
3832     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3833    
3834     $self->{state} = DATA_STATE;
3835 wakaba 1.5 $self->{s_kwd} = '';
3836 wakaba 1.1 ## reconsume
3837    
3838     $self->{ct}->{quirks} = 1;
3839     return ($self->{ct}); # DOCTYPE
3840    
3841     redo A;
3842     } else {
3843    
3844     $self->{ct}->{sysid} # DOCTYPE
3845     .= chr $self->{nc};
3846     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3847     length $self->{ct}->{sysid});
3848    
3849     ## Stay in the state
3850    
3851     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3852     $self->{line_prev} = $self->{line};
3853     $self->{column_prev} = $self->{column};
3854     $self->{column}++;
3855     $self->{nc}
3856     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3857     } else {
3858     $self->{set_nc}->($self);
3859     }
3860    
3861     redo A;
3862     }
3863     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3864     if ($is_space->{$self->{nc}}) {
3865    
3866     ## Stay in the state
3867    
3868     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3869     $self->{line_prev} = $self->{line};
3870     $self->{column_prev} = $self->{column};
3871     $self->{column}++;
3872     $self->{nc}
3873     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3874     } else {
3875     $self->{set_nc}->($self);
3876     }
3877    
3878     redo A;
3879     } elsif ($self->{nc} == 0x003E) { # >
3880    
3881     $self->{state} = DATA_STATE;
3882 wakaba 1.5 $self->{s_kwd} = '';
3883 wakaba 1.1
3884     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3885     $self->{line_prev} = $self->{line};
3886     $self->{column_prev} = $self->{column};
3887     $self->{column}++;
3888     $self->{nc}
3889     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3890     } else {
3891     $self->{set_nc}->($self);
3892     }
3893    
3894    
3895     return ($self->{ct}); # DOCTYPE
3896    
3897     redo A;
3898     } elsif ($self->{nc} == -1) {
3899    
3900     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3901     $self->{state} = DATA_STATE;
3902 wakaba 1.5 $self->{s_kwd} = '';
3903 wakaba 1.1 ## reconsume
3904    
3905     $self->{ct}->{quirks} = 1;
3906     return ($self->{ct}); # DOCTYPE
3907    
3908     redo A;
3909 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3910    
3911     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3912     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3913    
3914     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3915     $self->{line_prev} = $self->{line};
3916     $self->{column_prev} = $self->{column};
3917     $self->{column}++;
3918     $self->{nc}
3919     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3920     } else {
3921     $self->{set_nc}->($self);
3922     }
3923    
3924     redo A;
3925 wakaba 1.1 } else {
3926    
3927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3928     #$self->{ct}->{quirks} = 1;
3929    
3930     $self->{state} = BOGUS_DOCTYPE_STATE;
3931    
3932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3933     $self->{line_prev} = $self->{line};
3934     $self->{column_prev} = $self->{column};
3935     $self->{column}++;
3936     $self->{nc}
3937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3938     } else {
3939     $self->{set_nc}->($self);
3940     }
3941    
3942     redo A;
3943     }
3944     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3945     if ($self->{nc} == 0x003E) { # >
3946    
3947     $self->{state} = DATA_STATE;
3948 wakaba 1.5 $self->{s_kwd} = '';
3949 wakaba 1.1
3950     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3951     $self->{line_prev} = $self->{line};
3952     $self->{column_prev} = $self->{column};
3953     $self->{column}++;
3954     $self->{nc}
3955     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3956     } else {
3957     $self->{set_nc}->($self);
3958     }
3959    
3960    
3961     return ($self->{ct}); # DOCTYPE
3962    
3963     redo A;
3964 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3965     if ($self->{ct}->{has_internal_subset}) { # DOCTYPE
3966    
3967     ## Stay in the state.
3968    
3969     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3970     $self->{line_prev} = $self->{line};
3971     $self->{column_prev} = $self->{column};
3972     $self->{column}++;
3973     $self->{nc}
3974     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3975     } else {
3976     $self->{set_nc}->($self);
3977     }
3978    
3979     redo A;
3980     } else {
3981    
3982     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3983     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3984    
3985     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3986     $self->{line_prev} = $self->{line};
3987     $self->{column_prev} = $self->{column};
3988     $self->{column}++;
3989     $self->{nc}
3990     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3991     } else {
3992     $self->{set_nc}->($self);
3993     }
3994    
3995     redo A;
3996     }
3997 wakaba 1.1 } elsif ($self->{nc} == -1) {
3998    
3999     $self->{state} = DATA_STATE;
4000 wakaba 1.5 $self->{s_kwd} = '';
4001 wakaba 1.1 ## reconsume
4002    
4003     return ($self->{ct}); # DOCTYPE
4004    
4005     redo A;
4006     } else {
4007    
4008     my $s = '';
4009 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4010 wakaba 1.1
4011     ## Stay in the state
4012    
4013     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4014     $self->{line_prev} = $self->{line};
4015     $self->{column_prev} = $self->{column};
4016     $self->{column}++;
4017     $self->{nc}
4018     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4019     } else {
4020     $self->{set_nc}->($self);
4021     }
4022    
4023     redo A;
4024     }
4025     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4026     ## NOTE: "CDATA section state" in the state is jointly implemented
4027     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4028     ## and |CDATA_SECTION_MSE2_STATE|.
4029 wakaba 1.10
4030     ## XML5: "CDATA state".
4031 wakaba 1.1
4032     if ($self->{nc} == 0x005D) { # ]
4033    
4034     $self->{state} = CDATA_SECTION_MSE1_STATE;
4035    
4036     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4037     $self->{line_prev} = $self->{line};
4038     $self->{column_prev} = $self->{column};
4039     $self->{column}++;
4040     $self->{nc}
4041     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4042     } else {
4043     $self->{set_nc}->($self);
4044     }
4045    
4046     redo A;
4047     } elsif ($self->{nc} == -1) {
4048 wakaba 1.6 if ($self->{is_xml}) {
4049 wakaba 1.8
4050 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4051 wakaba 1.8 } else {
4052    
4053 wakaba 1.6 }
4054    
4055 wakaba 1.1 $self->{state} = DATA_STATE;
4056 wakaba 1.5 $self->{s_kwd} = '';
4057 wakaba 1.10 ## Reconsume.
4058 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4059    
4060     return ($self->{ct}); # character
4061     } else {
4062    
4063     ## No token to emit. $self->{ct} is discarded.
4064     }
4065     redo A;
4066     } else {
4067    
4068     $self->{ct}->{data} .= chr $self->{nc};
4069     $self->{read_until}->($self->{ct}->{data},
4070     q<]>,
4071     length $self->{ct}->{data});
4072    
4073     ## Stay in the state.
4074    
4075     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4076     $self->{line_prev} = $self->{line};
4077     $self->{column_prev} = $self->{column};
4078     $self->{column}++;
4079     $self->{nc}
4080     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4081     } else {
4082     $self->{set_nc}->($self);
4083     }
4084    
4085     redo A;
4086     }
4087    
4088     ## ISSUE: "text tokens" in spec.
4089     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4090 wakaba 1.10 ## XML5: "CDATA bracket state".
4091    
4092 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4093    
4094     $self->{state} = CDATA_SECTION_MSE2_STATE;
4095    
4096     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4097     $self->{line_prev} = $self->{line};
4098     $self->{column_prev} = $self->{column};
4099     $self->{column}++;
4100     $self->{nc}
4101     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4102     } else {
4103     $self->{set_nc}->($self);
4104     }
4105    
4106     redo A;
4107     } else {
4108    
4109 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4110 wakaba 1.1 $self->{ct}->{data} .= ']';
4111 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4112 wakaba 1.1 ## Reconsume.
4113     redo A;
4114     }
4115     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4116 wakaba 1.10 ## XML5: "CDATA end state".
4117    
4118 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4119     $self->{state} = DATA_STATE;
4120 wakaba 1.5 $self->{s_kwd} = '';
4121 wakaba 1.1
4122     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4123     $self->{line_prev} = $self->{line};
4124     $self->{column_prev} = $self->{column};
4125     $self->{column}++;
4126     $self->{nc}
4127     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4128     } else {
4129     $self->{set_nc}->($self);
4130     }
4131    
4132     if (length $self->{ct}->{data}) { # character
4133    
4134     return ($self->{ct}); # character
4135     } else {
4136    
4137     ## No token to emit. $self->{ct} is discarded.
4138     }
4139     redo A;
4140     } elsif ($self->{nc} == 0x005D) { # ]
4141     # character
4142     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4143     ## Stay in the state.
4144    
4145     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4146     $self->{line_prev} = $self->{line};
4147     $self->{column_prev} = $self->{column};
4148     $self->{column}++;
4149     $self->{nc}
4150     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4151     } else {
4152     $self->{set_nc}->($self);
4153     }
4154    
4155     redo A;
4156     } else {
4157    
4158     $self->{ct}->{data} .= ']]'; # character
4159     $self->{state} = CDATA_SECTION_STATE;
4160 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4161 wakaba 1.1 redo A;
4162     }
4163     } elsif ($self->{state} == ENTITY_STATE) {
4164     if ($is_space->{$self->{nc}} or
4165     {
4166     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4167     $self->{entity_add} => 1,
4168     }->{$self->{nc}}) {
4169    
4170     ## Don't consume
4171     ## No error
4172     ## Return nothing.
4173     #
4174     } elsif ($self->{nc} == 0x0023) { # #
4175    
4176     $self->{state} = ENTITY_HASH_STATE;
4177 wakaba 1.12 $self->{kwd} = '#';
4178 wakaba 1.1
4179     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4180     $self->{line_prev} = $self->{line};
4181     $self->{column_prev} = $self->{column};
4182     $self->{column}++;
4183     $self->{nc}
4184     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4185     } else {
4186     $self->{set_nc}->($self);
4187     }
4188    
4189     redo A;
4190     } elsif ((0x0041 <= $self->{nc} and
4191     $self->{nc} <= 0x005A) or # A..Z
4192     (0x0061 <= $self->{nc} and
4193     $self->{nc} <= 0x007A)) { # a..z
4194    
4195     require Whatpm::_NamedEntityList;
4196     $self->{state} = ENTITY_NAME_STATE;
4197 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4198     $self->{entity__value} = $self->{kwd};
4199 wakaba 1.1 $self->{entity__match} = 0;
4200    
4201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4202     $self->{line_prev} = $self->{line};
4203     $self->{column_prev} = $self->{column};
4204     $self->{column}++;
4205     $self->{nc}
4206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4207     } else {
4208     $self->{set_nc}->($self);
4209     }
4210    
4211     redo A;
4212     } else {
4213    
4214     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4215     ## Return nothing.
4216     #
4217     }
4218    
4219     ## NOTE: No character is consumed by the "consume a character
4220     ## reference" algorithm. In other word, there is an "&" character
4221     ## that does not introduce a character reference, which would be
4222     ## appended to the parent element or the attribute value in later
4223     ## process of the tokenizer.
4224    
4225     if ($self->{prev_state} == DATA_STATE) {
4226    
4227     $self->{state} = $self->{prev_state};
4228 wakaba 1.5 $self->{s_kwd} = '';
4229 wakaba 1.1 ## Reconsume.
4230     return ({type => CHARACTER_TOKEN, data => '&',
4231     line => $self->{line_prev},
4232     column => $self->{column_prev},
4233     });
4234     redo A;
4235     } else {
4236    
4237     $self->{ca}->{value} .= '&';
4238     $self->{state} = $self->{prev_state};
4239 wakaba 1.5 $self->{s_kwd} = '';
4240 wakaba 1.1 ## Reconsume.
4241     redo A;
4242     }
4243     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4244     if ($self->{nc} == 0x0078 or # x
4245     $self->{nc} == 0x0058) { # X
4246    
4247     $self->{state} = HEXREF_X_STATE;
4248 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4249 wakaba 1.1
4250     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4251     $self->{line_prev} = $self->{line};
4252     $self->{column_prev} = $self->{column};
4253     $self->{column}++;
4254     $self->{nc}
4255     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4256     } else {
4257     $self->{set_nc}->($self);
4258     }
4259    
4260     redo A;
4261     } elsif (0x0030 <= $self->{nc} and
4262     $self->{nc} <= 0x0039) { # 0..9
4263    
4264     $self->{state} = NCR_NUM_STATE;
4265 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4266 wakaba 1.1
4267     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4268     $self->{line_prev} = $self->{line};
4269     $self->{column_prev} = $self->{column};
4270     $self->{column}++;
4271     $self->{nc}
4272     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4273     } else {
4274     $self->{set_nc}->($self);
4275     }
4276    
4277     redo A;
4278     } else {
4279     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4280     line => $self->{line_prev},
4281     column => $self->{column_prev} - 1);
4282    
4283     ## NOTE: According to the spec algorithm, nothing is returned,
4284     ## and then "&#" is appended to the parent element or the attribute
4285     ## value in the later processing.
4286    
4287     if ($self->{prev_state} == DATA_STATE) {
4288    
4289     $self->{state} = $self->{prev_state};
4290 wakaba 1.5 $self->{s_kwd} = '';
4291 wakaba 1.1 ## Reconsume.
4292     return ({type => CHARACTER_TOKEN,
4293     data => '&#',
4294     line => $self->{line_prev},
4295     column => $self->{column_prev} - 1,
4296     });
4297     redo A;
4298     } else {
4299    
4300     $self->{ca}->{value} .= '&#';
4301     $self->{state} = $self->{prev_state};
4302 wakaba 1.5 $self->{s_kwd} = '';
4303 wakaba 1.1 ## Reconsume.
4304     redo A;
4305     }
4306     }
4307     } elsif ($self->{state} == NCR_NUM_STATE) {
4308     if (0x0030 <= $self->{nc} and
4309     $self->{nc} <= 0x0039) { # 0..9
4310    
4311 wakaba 1.12 $self->{kwd} *= 10;
4312     $self->{kwd} += $self->{nc} - 0x0030;
4313 wakaba 1.1
4314     ## Stay in the state.
4315    
4316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4317     $self->{line_prev} = $self->{line};
4318     $self->{column_prev} = $self->{column};
4319     $self->{column}++;
4320     $self->{nc}
4321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4322     } else {
4323     $self->{set_nc}->($self);
4324     }
4325    
4326     redo A;
4327     } elsif ($self->{nc} == 0x003B) { # ;
4328    
4329    
4330     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4331     $self->{line_prev} = $self->{line};
4332     $self->{column_prev} = $self->{column};
4333     $self->{column}++;
4334     $self->{nc}
4335     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4336     } else {
4337     $self->{set_nc}->($self);
4338     }
4339    
4340     #
4341     } else {
4342    
4343     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4344     ## Reconsume.
4345     #
4346     }
4347    
4348 wakaba 1.12 my $code = $self->{kwd};
4349 wakaba 1.1 my $l = $self->{line_prev};
4350     my $c = $self->{column_prev};
4351     if ($charref_map->{$code}) {
4352    
4353     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4354     text => (sprintf 'U+%04X', $code),
4355     line => $l, column => $c);
4356     $code = $charref_map->{$code};
4357     } elsif ($code > 0x10FFFF) {
4358    
4359     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4360     text => (sprintf 'U-%08X', $code),
4361     line => $l, column => $c);
4362     $code = 0xFFFD;
4363     }
4364    
4365     if ($self->{prev_state} == DATA_STATE) {
4366    
4367     $self->{state} = $self->{prev_state};
4368 wakaba 1.5 $self->{s_kwd} = '';
4369 wakaba 1.1 ## Reconsume.
4370     return ({type => CHARACTER_TOKEN, data => chr $code,
4371 wakaba 1.7 has_reference => 1,
4372 wakaba 1.1 line => $l, column => $c,
4373     });
4374     redo A;
4375     } else {
4376    
4377     $self->{ca}->{value} .= chr $code;
4378     $self->{ca}->{has_reference} = 1;
4379     $self->{state} = $self->{prev_state};
4380 wakaba 1.5 $self->{s_kwd} = '';
4381 wakaba 1.1 ## Reconsume.
4382     redo A;
4383     }
4384     } elsif ($self->{state} == HEXREF_X_STATE) {
4385     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4386     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4387     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4388     # 0..9, A..F, a..f
4389    
4390     $self->{state} = HEXREF_HEX_STATE;
4391 wakaba 1.12 $self->{kwd} = 0;
4392 wakaba 1.1 ## Reconsume.
4393     redo A;
4394     } else {
4395     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4396     line => $self->{line_prev},
4397     column => $self->{column_prev} - 2);
4398    
4399     ## NOTE: According to the spec algorithm, nothing is returned,
4400     ## and then "&#" followed by "X" or "x" is appended to the parent
4401     ## element or the attribute value in the later processing.
4402    
4403     if ($self->{prev_state} == DATA_STATE) {
4404    
4405     $self->{state} = $self->{prev_state};
4406 wakaba 1.5 $self->{s_kwd} = '';
4407 wakaba 1.1 ## Reconsume.
4408     return ({type => CHARACTER_TOKEN,
4409 wakaba 1.12 data => '&' . $self->{kwd},
4410 wakaba 1.1 line => $self->{line_prev},
4411 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4412 wakaba 1.1 });
4413     redo A;
4414     } else {
4415    
4416 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4417 wakaba 1.1 $self->{state} = $self->{prev_state};
4418 wakaba 1.5 $self->{s_kwd} = '';
4419 wakaba 1.1 ## Reconsume.
4420     redo A;
4421     }
4422     }
4423     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4424     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4425     # 0..9
4426    
4427 wakaba 1.12 $self->{kwd} *= 0x10;
4428     $self->{kwd} += $self->{nc} - 0x0030;
4429 wakaba 1.1 ## Stay in the state.
4430    
4431     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4432     $self->{line_prev} = $self->{line};
4433     $self->{column_prev} = $self->{column};
4434     $self->{column}++;
4435     $self->{nc}
4436     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4437     } else {
4438     $self->{set_nc}->($self);
4439     }
4440    
4441     redo A;
4442     } elsif (0x0061 <= $self->{nc} and
4443     $self->{nc} <= 0x0066) { # a..f
4444    
4445 wakaba 1.12 $self->{kwd} *= 0x10;
4446     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4447 wakaba 1.1 ## Stay in the state.
4448    
4449     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4450     $self->{line_prev} = $self->{line};
4451     $self->{column_prev} = $self->{column};
4452     $self->{column}++;
4453     $self->{nc}
4454     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4455     } else {
4456     $self->{set_nc}->($self);
4457     }
4458    
4459     redo A;
4460     } elsif (0x0041 <= $self->{nc} and
4461     $self->{nc} <= 0x0046) { # A..F
4462    
4463 wakaba 1.12 $self->{kwd} *= 0x10;
4464     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4465 wakaba 1.1 ## Stay in the state.
4466    
4467     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4468     $self->{line_prev} = $self->{line};
4469     $self->{column_prev} = $self->{column};
4470     $self->{column}++;
4471     $self->{nc}
4472     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4473     } else {
4474     $self->{set_nc}->($self);
4475     }
4476    
4477     redo A;
4478     } elsif ($self->{nc} == 0x003B) { # ;
4479    
4480    
4481     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4482     $self->{line_prev} = $self->{line};
4483     $self->{column_prev} = $self->{column};
4484     $self->{column}++;
4485     $self->{nc}
4486     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4487     } else {
4488     $self->{set_nc}->($self);
4489     }
4490    
4491     #
4492     } else {
4493    
4494     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4495     line => $self->{line},
4496     column => $self->{column});
4497     ## Reconsume.
4498     #
4499     }
4500    
4501 wakaba 1.12 my $code = $self->{kwd};
4502 wakaba 1.1 my $l = $self->{line_prev};
4503     my $c = $self->{column_prev};
4504     if ($charref_map->{$code}) {
4505    
4506     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4507     text => (sprintf 'U+%04X', $code),
4508     line => $l, column => $c);
4509     $code = $charref_map->{$code};
4510     } elsif ($code > 0x10FFFF) {
4511    
4512     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4513     text => (sprintf 'U-%08X', $code),
4514     line => $l, column => $c);
4515     $code = 0xFFFD;
4516     }
4517    
4518     if ($self->{prev_state} == DATA_STATE) {
4519    
4520     $self->{state} = $self->{prev_state};
4521 wakaba 1.5 $self->{s_kwd} = '';
4522 wakaba 1.1 ## Reconsume.
4523     return ({type => CHARACTER_TOKEN, data => chr $code,
4524 wakaba 1.7 has_reference => 1,
4525 wakaba 1.1 line => $l, column => $c,
4526     });
4527     redo A;
4528     } else {
4529    
4530     $self->{ca}->{value} .= chr $code;
4531     $self->{ca}->{has_reference} = 1;
4532     $self->{state} = $self->{prev_state};
4533 wakaba 1.5 $self->{s_kwd} = '';
4534 wakaba 1.1 ## Reconsume.
4535     redo A;
4536     }
4537     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4538 wakaba 1.12 if (length $self->{kwd} < 30 and
4539 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
4540     ((0x0041 <= $self->{nc} and # a
4541     $self->{nc} <= 0x005A) or # x
4542     (0x0061 <= $self->{nc} and # a
4543     $self->{nc} <= 0x007A) or # z
4544     (0x0030 <= $self->{nc} and # 0
4545     $self->{nc} <= 0x0039) or # 9
4546     $self->{nc} == 0x003B)) { # ;
4547     our $EntityChar;
4548 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4549     if (defined $EntityChar->{$self->{kwd}}) {
4550 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
4551    
4552 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4553 wakaba 1.1 $self->{entity__match} = 1;
4554    
4555     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4556     $self->{line_prev} = $self->{line};
4557     $self->{column_prev} = $self->{column};
4558     $self->{column}++;
4559     $self->{nc}
4560     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4561     } else {
4562     $self->{set_nc}->($self);
4563     }
4564    
4565     #
4566     } else {
4567    
4568 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4569 wakaba 1.1 $self->{entity__match} = -1;
4570     ## Stay in the state.
4571    
4572     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4573     $self->{line_prev} = $self->{line};
4574     $self->{column_prev} = $self->{column};
4575     $self->{column}++;
4576     $self->{nc}
4577     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4578     } else {
4579     $self->{set_nc}->($self);
4580     }
4581    
4582     redo A;
4583     }
4584     } else {
4585    
4586     $self->{entity__value} .= chr $self->{nc};
4587     $self->{entity__match} *= 2;
4588     ## Stay in the state.
4589    
4590     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4591     $self->{line_prev} = $self->{line};
4592     $self->{column_prev} = $self->{column};
4593     $self->{column}++;
4594     $self->{nc}
4595     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4596     } else {
4597     $self->{set_nc}->($self);
4598     }
4599    
4600     redo A;
4601     }
4602     }
4603    
4604     my $data;
4605     my $has_ref;
4606     if ($self->{entity__match} > 0) {
4607    
4608     $data = $self->{entity__value};
4609     $has_ref = 1;
4610     #
4611     } elsif ($self->{entity__match} < 0) {
4612     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4613     if ($self->{prev_state} != DATA_STATE and # in attribute
4614     $self->{entity__match} < -1) {
4615    
4616 wakaba 1.12 $data = '&' . $self->{kwd};
4617 wakaba 1.1 #
4618     } else {
4619    
4620     $data = $self->{entity__value};
4621     $has_ref = 1;
4622     #
4623     }
4624     } else {
4625    
4626     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4627     line => $self->{line_prev},
4628 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
4629     $data = '&' . $self->{kwd};
4630 wakaba 1.1 #
4631     }
4632    
4633     ## NOTE: In these cases, when a character reference is found,
4634     ## it is consumed and a character token is returned, or, otherwise,
4635     ## nothing is consumed and returned, according to the spec algorithm.
4636     ## In this implementation, anything that has been examined by the
4637     ## tokenizer is appended to the parent element or the attribute value
4638     ## as string, either literal string when no character reference or
4639     ## entity-replaced string otherwise, in this stage, since any characters
4640     ## that would not be consumed are appended in the data state or in an
4641     ## appropriate attribute value state anyway.
4642    
4643     if ($self->{prev_state} == DATA_STATE) {
4644    
4645     $self->{state} = $self->{prev_state};
4646 wakaba 1.5 $self->{s_kwd} = '';
4647 wakaba 1.1 ## Reconsume.
4648     return ({type => CHARACTER_TOKEN,
4649     data => $data,
4650 wakaba 1.7 has_reference => $has_ref,
4651 wakaba 1.1 line => $self->{line_prev},
4652 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
4653 wakaba 1.1 });
4654     redo A;
4655     } else {
4656    
4657     $self->{ca}->{value} .= $data;
4658     $self->{ca}->{has_reference} = 1 if $has_ref;
4659     $self->{state} = $self->{prev_state};
4660 wakaba 1.5 $self->{s_kwd} = '';
4661 wakaba 1.1 ## Reconsume.
4662     redo A;
4663     }
4664 wakaba 1.8
4665     ## XML-only states
4666    
4667     } elsif ($self->{state} == PI_STATE) {
4668     if ($is_space->{$self->{nc}} or
4669     $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4670     $self->{nc} == -1) {
4671     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4672     line => $self->{line_prev},
4673     column => $self->{column_prev}
4674     - 1 * ($self->{nc} != -1));
4675     $self->{state} = BOGUS_COMMENT_STATE;
4676     ## Reconsume.
4677     $self->{ct} = {type => COMMENT_TOKEN,
4678     data => '?',
4679     line => $self->{line_prev},
4680     column => $self->{column_prev}
4681     - 1 * ($self->{nc} != -1),
4682     };
4683     redo A;
4684     } else {
4685     $self->{ct} = {type => PI_TOKEN,
4686     target => chr $self->{nc},
4687     data => '',
4688     line => $self->{line_prev},
4689     column => $self->{column_prev} - 1,
4690     };
4691     $self->{state} = PI_TARGET_STATE;
4692    
4693     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4694     $self->{line_prev} = $self->{line};
4695     $self->{column_prev} = $self->{column};
4696     $self->{column}++;
4697     $self->{nc}
4698     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4699     } else {
4700     $self->{set_nc}->($self);
4701     }
4702    
4703     redo A;
4704     }
4705     } elsif ($self->{state} == PI_TARGET_STATE) {
4706     if ($is_space->{$self->{nc}}) {
4707     $self->{state} = PI_TARGET_AFTER_STATE;
4708    
4709     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4710     $self->{line_prev} = $self->{line};
4711     $self->{column_prev} = $self->{column};
4712     $self->{column}++;
4713     $self->{nc}
4714     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4715     } else {
4716     $self->{set_nc}->($self);
4717     }
4718    
4719     redo A;
4720     } elsif ($self->{nc} == -1) {
4721     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4722     $self->{state} = DATA_STATE;
4723     $self->{s_kwd} = '';
4724     ## Reconsume.
4725     return ($self->{ct}); # pi
4726     redo A;
4727     } elsif ($self->{nc} == 0x003F) { # ?
4728     $self->{state} = PI_AFTER_STATE;
4729    
4730     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4731     $self->{line_prev} = $self->{line};
4732     $self->{column_prev} = $self->{column};
4733     $self->{column}++;
4734     $self->{nc}
4735     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4736     } else {
4737     $self->{set_nc}->($self);
4738     }
4739    
4740     redo A;
4741     } else {
4742     ## XML5: typo ("tag name" -> "target")
4743     $self->{ct}->{target} .= chr $self->{nc}; # pi
4744    
4745     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4746     $self->{line_prev} = $self->{line};
4747     $self->{column_prev} = $self->{column};
4748     $self->{column}++;
4749     $self->{nc}
4750     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4751     } else {
4752     $self->{set_nc}->($self);
4753     }
4754    
4755     redo A;
4756     }
4757     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4758     if ($is_space->{$self->{nc}}) {
4759     ## Stay in the state.
4760    
4761     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4762     $self->{line_prev} = $self->{line};
4763     $self->{column_prev} = $self->{column};
4764     $self->{column}++;
4765     $self->{nc}
4766     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4767     } else {
4768     $self->{set_nc}->($self);
4769     }
4770    
4771     redo A;
4772     } else {
4773     $self->{state} = PI_DATA_STATE;
4774     ## Reprocess.
4775     redo A;
4776     }
4777     } elsif ($self->{state} == PI_DATA_STATE) {
4778     if ($self->{nc} == 0x003F) { # ?
4779     $self->{state} = PI_DATA_AFTER_STATE;
4780    
4781     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4782     $self->{line_prev} = $self->{line};
4783     $self->{column_prev} = $self->{column};
4784     $self->{column}++;
4785     $self->{nc}
4786     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4787     } else {
4788     $self->{set_nc}->($self);
4789     }
4790    
4791     redo A;
4792     } elsif ($self->{nc} == -1) {
4793     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4794     $self->{state} = DATA_STATE;
4795     $self->{s_kwd} = '';
4796     ## Reprocess.
4797     return ($self->{ct}); # pi
4798     redo A;
4799     } else {
4800     $self->{ct}->{data} .= chr $self->{nc}; # pi
4801     $self->{read_until}->($self->{ct}->{data}, q[?],
4802     length $self->{ct}->{data});
4803     ## Stay in the state.
4804    
4805     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4806     $self->{line_prev} = $self->{line};
4807     $self->{column_prev} = $self->{column};
4808     $self->{column}++;
4809     $self->{nc}
4810     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4811     } else {
4812     $self->{set_nc}->($self);
4813     }
4814    
4815     ## Reprocess.
4816     redo A;
4817     }
4818     } elsif ($self->{state} == PI_AFTER_STATE) {
4819     if ($self->{nc} == 0x003E) { # >
4820     $self->{state} = DATA_STATE;
4821     $self->{s_kwd} = '';
4822    
4823     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4824     $self->{line_prev} = $self->{line};
4825     $self->{column_prev} = $self->{column};
4826     $self->{column}++;
4827     $self->{nc}
4828     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4829     } else {
4830     $self->{set_nc}->($self);
4831     }
4832    
4833     return ($self->{ct}); # pi
4834     redo A;
4835     } elsif ($self->{nc} == 0x003F) { # ?
4836     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4837     line => $self->{line_prev},
4838     column => $self->{column_prev}); ## XML5: no error
4839     $self->{ct}->{data} .= '?';
4840     $self->{state} = PI_DATA_AFTER_STATE;
4841    
4842     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4843     $self->{line_prev} = $self->{line};
4844     $self->{column_prev} = $self->{column};
4845     $self->{column}++;
4846     $self->{nc}
4847     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4848     } else {
4849     $self->{set_nc}->($self);
4850     }
4851    
4852     redo A;
4853     } else {
4854     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4855     line => $self->{line_prev},
4856     column => $self->{column_prev}
4857     + 1 * ($self->{nc} == -1)); ## XML5: no error
4858     $self->{ct}->{data} .= '?'; ## XML5: not appended
4859     $self->{state} = PI_DATA_STATE;
4860     ## Reprocess.
4861     redo A;
4862     }
4863     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4864     ## XML5: Same as "pi after state" in XML5
4865     if ($self->{nc} == 0x003E) { # >
4866     $self->{state} = DATA_STATE;
4867     $self->{s_kwd} = '';
4868    
4869     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4870     $self->{line_prev} = $self->{line};
4871     $self->{column_prev} = $self->{column};
4872     $self->{column}++;
4873     $self->{nc}
4874     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4875     } else {
4876     $self->{set_nc}->($self);
4877     }
4878    
4879     return ($self->{ct}); # pi
4880     redo A;
4881     } elsif ($self->{nc} == 0x003F) { # ?
4882     $self->{ct}->{data} .= '?';
4883     ## Stay in the state.
4884    
4885     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4886     $self->{line_prev} = $self->{line};
4887     $self->{column_prev} = $self->{column};
4888     $self->{column}++;
4889     $self->{nc}
4890     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4891     } else {
4892     $self->{set_nc}->($self);
4893     }
4894    
4895     redo A;
4896     } else {
4897     $self->{ct}->{data} .= '?'; ## XML5: not appended
4898     $self->{state} = PI_DATA_STATE;
4899     ## Reprocess.
4900     redo A;
4901     }
4902 wakaba 1.12
4903     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
4904     if ($self->{nc} == 0x003C) { # <
4905     ## TODO:
4906    
4907     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4908     $self->{line_prev} = $self->{line};
4909     $self->{column_prev} = $self->{column};
4910     $self->{column}++;
4911     $self->{nc}
4912     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4913     } else {
4914     $self->{set_nc}->($self);
4915     }
4916    
4917     redo A;
4918     } elsif ($self->{nc} == 0x0025) { # %
4919     ## XML5: Not defined yet.
4920    
4921     ## TODO:
4922    
4923     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4924     $self->{line_prev} = $self->{line};
4925     $self->{column_prev} = $self->{column};
4926     $self->{column}++;
4927     $self->{nc}
4928     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4929     } else {
4930     $self->{set_nc}->($self);
4931     }
4932    
4933     redo A;
4934     } elsif ($self->{nc} == 0x005D) { # ]
4935     $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
4936    
4937     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4938     $self->{line_prev} = $self->{line};
4939     $self->{column_prev} = $self->{column};
4940     $self->{column}++;
4941     $self->{nc}
4942     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4943     } else {
4944     $self->{set_nc}->($self);
4945     }
4946    
4947     redo A;
4948     } elsif ($is_space->{$self->{nc}}) {
4949     ## Stay in the state.
4950    
4951     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4952     $self->{line_prev} = $self->{line};
4953     $self->{column_prev} = $self->{column};
4954     $self->{column}++;
4955     $self->{nc}
4956     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4957     } else {
4958     $self->{set_nc}->($self);
4959     }
4960    
4961     redo A;
4962     } elsif ($self->{nc} == -1) {
4963     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
4964     $self->{state} = DATA_STATE;
4965     $self->{s_kwd} = '';
4966     ## Reconsume.
4967     return ($self->{ct}); # DOCTYPE
4968     redo A;
4969     } else {
4970     unless ($self->{internal_subset_tainted}) {
4971     ## XML5: No parse error.
4972     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
4973     $self->{internal_subset_tainted} = 1;
4974     }
4975     ## Stay in the state.
4976    
4977     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4978     $self->{line_prev} = $self->{line};
4979     $self->{column_prev} = $self->{column};
4980     $self->{column}++;
4981     $self->{nc}
4982     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4983     } else {
4984     $self->{set_nc}->($self);
4985     }
4986    
4987     redo A;
4988     }
4989     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
4990     if ($self->{nc} == 0x003E) { # >
4991     $self->{state} = DATA_STATE;
4992     $self->{s_kwd} = '';
4993    
4994     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4995     $self->{line_prev} = $self->{line};
4996     $self->{column_prev} = $self->{column};
4997     $self->{column}++;
4998     $self->{nc}
4999     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5000     } else {
5001     $self->{set_nc}->($self);
5002     }
5003    
5004     return ($self->{ct}); # DOCTYPE
5005     redo A;
5006     } elsif ($self->{nc} == -1) {
5007     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5008     $self->{state} = DATA_STATE;
5009     $self->{s_kwd} = '';
5010     ## Reconsume.
5011     return ($self->{ct}); # DOCTYPE
5012     redo A;
5013     } else {
5014     ## XML5: No parse error and stay in the state.
5015     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5016    
5017     $self->{state} = BOGUS_DOCTYPE_STATE;
5018    
5019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5020     $self->{line_prev} = $self->{line};
5021     $self->{column_prev} = $self->{column};
5022     $self->{column}++;
5023     $self->{nc}
5024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5025     } else {
5026     $self->{set_nc}->($self);
5027     }
5028    
5029     redo A;
5030     }
5031 wakaba 1.8
5032 wakaba 1.1 } else {
5033     die "$0: $self->{state}: Unknown state";
5034     }
5035     } # A
5036    
5037     die "$0: _get_next_token: unexpected case";
5038     } # _get_next_token
5039    
5040     1;
5041 wakaba 1.12 ## $Date: 2008/10/15 10:50:38 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24