/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (hide annotations) (download)
Thu Oct 16 03:39:57 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.12: +237 -70 lines
++ whatpm/t/ChangeLog	16 Oct 2008 03:39:39 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/pis-2.dat" and "xml/comments-2.dat" are added.

++ whatpm/t/xml/ChangeLog	16 Oct 2008 03:39:53 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* doctypes-2.dat: New test added.

	* comments-2.dat, pis-2.dat: New test data files.

++ whatpm/Whatpm/HTML/ChangeLog	16 Oct 2008 03:36:51 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New token type END_OF_DOCTYPE_TOKEN added.
	New states DOCTYPE_TAG_STATE and
	BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE are added.  (Bogus
	string after the internal subset, which was handled by the state
	BOGUS_DOCTYPE_STATE, are now handled by the new state.)  Support
	for comments, bogus comments, and processing instructions in the
	internal subset.  If there is the internal subset, then emit the
	doctype token before the internal subset (with its
	$token->{has_internal_subset} flag set) and an
	END_OF_DOCTYPE_TOKEN after the internal subset.

++ whatpm/Whatpm/XML/ChangeLog	16 Oct 2008 03:39:19 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Insertion mode IN_SUBSET_IM added.  In the
	"initial" insertion mode, if the DOCTYPE token's "has internal
	subset" flag is set, then switch to the "in subset" insertion
	mode.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.13 our $VERSION=do{my @r=(q$Revision: 1.12 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.2 );
20    
21     our %EXPORT_TAGS = (
22     token => [qw(
23     DOCTYPE_TOKEN
24     COMMENT_TOKEN
25     START_TAG_TOKEN
26     END_TAG_TOKEN
27     END_OF_FILE_TOKEN
28     CHARACTER_TOKEN
29     PI_TOKEN
30     ABORT_TOKEN
31 wakaba 1.13 END_OF_DOCTYPE_TOKEN
32 wakaba 1.2 )],
33     );
34     }
35    
36 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
37    
38 wakaba 1.2 ## Token types
39    
40 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
41 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
42     sub START_TAG_TOKEN () { 3 }
43     sub END_TAG_TOKEN () { 4 }
44     sub END_OF_FILE_TOKEN () { 5 }
45     sub CHARACTER_TOKEN () { 6 }
46 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
47     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
48 wakaba 1.13 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only
49 wakaba 1.12
50     ## XML5: XML5 has "empty tag token". In this implementation, it is
51     ## represented as a start tag token with $self->{self_closing} flag
52     ## set to true.
53    
54     ## XML5: XML5 has "short end tag token". In this implementation, it
55     ## is represented as an end tag token with $token->{tag_name} flag set
56     ## to an empty string.
57 wakaba 1.1
58     package Whatpm::HTML;
59    
60 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
61    
62 wakaba 1.1 ## Content model flags
63    
64     sub CM_ENTITY () { 0b001 } # & markup in data
65     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
66     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
67    
68     sub PLAINTEXT_CONTENT_MODEL () { 0 }
69     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
70     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
71     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
72    
73     ## Tokenizer states
74    
75     sub DATA_STATE () { 0 }
76     #sub ENTITY_DATA_STATE () { 1 }
77     sub TAG_OPEN_STATE () { 2 }
78     sub CLOSE_TAG_OPEN_STATE () { 3 }
79     sub TAG_NAME_STATE () { 4 }
80     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
81     sub ATTRIBUTE_NAME_STATE () { 6 }
82     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
83     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
84     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
85     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
86     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
87     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
88     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
89     sub COMMENT_START_STATE () { 14 }
90     sub COMMENT_START_DASH_STATE () { 15 }
91     sub COMMENT_STATE () { 16 }
92     sub COMMENT_END_STATE () { 17 }
93     sub COMMENT_END_DASH_STATE () { 18 }
94     sub BOGUS_COMMENT_STATE () { 19 }
95     sub DOCTYPE_STATE () { 20 }
96     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
97     sub DOCTYPE_NAME_STATE () { 22 }
98     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
99     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
100     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
101     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
102     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
103     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
104     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
105     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
106     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
107     sub BOGUS_DOCTYPE_STATE () { 32 }
108     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
109     sub SELF_CLOSING_START_TAG_STATE () { 34 }
110     sub CDATA_SECTION_STATE () { 35 }
111     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
112     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
113     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
114     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
115     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
116     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
117     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
118     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
119     ## NOTE: "Entity data state", "entity in attribute value state", and
120     ## "consume a character reference" algorithm are jointly implemented
121     ## using the following six states:
122     sub ENTITY_STATE () { 44 }
123     sub ENTITY_HASH_STATE () { 45 }
124     sub NCR_NUM_STATE () { 46 }
125     sub HEXREF_X_STATE () { 47 }
126     sub HEXREF_HEX_STATE () { 48 }
127     sub ENTITY_NAME_STATE () { 49 }
128     sub PCDATA_STATE () { 50 } # "data state" in the spec
129    
130 wakaba 1.12 ## XML-only states
131 wakaba 1.8 sub PI_STATE () { 51 }
132     sub PI_TARGET_STATE () { 52 }
133     sub PI_TARGET_AFTER_STATE () { 53 }
134     sub PI_DATA_STATE () { 54 }
135     sub PI_AFTER_STATE () { 55 }
136     sub PI_DATA_AFTER_STATE () { 56 }
137 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
138     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
139 wakaba 1.13 sub DOCTYPE_TAG_STATE () { 59 }
140     sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 60 }
141 wakaba 1.8
142 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
143     ## list and descriptions)
144    
145     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
146     sub FOREIGN_EL () { 0b1_00000000000 }
147    
148     ## Character reference mappings
149    
150     my $charref_map = {
151     0x0D => 0x000A,
152     0x80 => 0x20AC,
153     0x81 => 0xFFFD,
154     0x82 => 0x201A,
155     0x83 => 0x0192,
156     0x84 => 0x201E,
157     0x85 => 0x2026,
158     0x86 => 0x2020,
159     0x87 => 0x2021,
160     0x88 => 0x02C6,
161     0x89 => 0x2030,
162     0x8A => 0x0160,
163     0x8B => 0x2039,
164     0x8C => 0x0152,
165     0x8D => 0xFFFD,
166     0x8E => 0x017D,
167     0x8F => 0xFFFD,
168     0x90 => 0xFFFD,
169     0x91 => 0x2018,
170     0x92 => 0x2019,
171     0x93 => 0x201C,
172     0x94 => 0x201D,
173     0x95 => 0x2022,
174     0x96 => 0x2013,
175     0x97 => 0x2014,
176     0x98 => 0x02DC,
177     0x99 => 0x2122,
178     0x9A => 0x0161,
179     0x9B => 0x203A,
180     0x9C => 0x0153,
181     0x9D => 0xFFFD,
182     0x9E => 0x017E,
183     0x9F => 0x0178,
184     }; # $charref_map
185     $charref_map->{$_} = 0xFFFD
186     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
187     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
188     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
189     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
190     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
191     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
192     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
193    
194     ## Implementations MUST act as if state machine in the spec
195    
196     sub _initialize_tokenizer ($) {
197     my $self = shift;
198    
199     ## NOTE: Fields set by |new| constructor:
200     #$self->{level}
201     #$self->{set_nc}
202     #$self->{parse_error}
203 wakaba 1.3 #$self->{is_xml} (if XML)
204 wakaba 1.1
205     $self->{state} = DATA_STATE; # MUST
206 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
207     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
208 wakaba 1.1 #$self->{entity__value}; # initialized when used
209     #$self->{entity__match}; # initialized when used
210     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
211     undef $self->{ct}; # current token
212     undef $self->{ca}; # current attribute
213     undef $self->{last_stag_name}; # last emitted start tag name
214     #$self->{prev_state}; # initialized when used
215     delete $self->{self_closing};
216     $self->{char_buffer} = '';
217     $self->{char_buffer_pos} = 0;
218     $self->{nc} = -1; # next input character
219     #$self->{next_nc}
220    
221     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
222     $self->{line_prev} = $self->{line};
223     $self->{column_prev} = $self->{column};
224     $self->{column}++;
225     $self->{nc}
226     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
227     } else {
228     $self->{set_nc}->($self);
229     }
230    
231     $self->{token} = [];
232     # $self->{escape}
233     } # _initialize_tokenizer
234    
235     ## A token has:
236     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
237 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
238 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
239     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
240 wakaba 1.11 ## ->{target} (PI_TOKEN)
241 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
242     ## ->{sysid} (DOCTYPE_TOKEN)
243     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
244     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
245     ## ->{name}
246     ## ->{value}
247     ## ->{has_reference} == 1 or 0
248 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
249     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
250 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
251 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
252 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
253    
254 wakaba 1.1 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
255     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
256     ## while the token is pushed back to the stack.
257    
258     ## Emitted token MUST immediately be handled by the tree construction state.
259    
260     ## Before each step, UA MAY check to see if either one of the scripts in
261     ## "list of scripts that will execute as soon as possible" or the first
262     ## script in the "list of scripts that will execute asynchronously",
263     ## has completed loading. If one has, then it MUST be executed
264     ## and removed from the list.
265    
266     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
267     ## (This requirement was dropped from HTML5 spec, unfortunately.)
268    
269     my $is_space = {
270     0x0009 => 1, # CHARACTER TABULATION (HT)
271     0x000A => 1, # LINE FEED (LF)
272     #0x000B => 0, # LINE TABULATION (VT)
273 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
274 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
275     0x0020 => 1, # SPACE (SP)
276     };
277    
278     sub _get_next_token ($) {
279     my $self = shift;
280    
281     if ($self->{self_closing}) {
282     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
283     ## NOTE: The |self_closing| flag is only set by start tag token.
284     ## In addition, when a start tag token is emitted, it is always set to
285     ## |ct|.
286     delete $self->{self_closing};
287     }
288    
289     if (@{$self->{token}}) {
290     $self->{self_closing} = $self->{token}->[0]->{self_closing};
291     return shift @{$self->{token}};
292     }
293    
294     A: {
295     if ($self->{state} == PCDATA_STATE) {
296     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
297    
298     if ($self->{nc} == 0x0026) { # &
299    
300     ## NOTE: In the spec, the tokenizer is switched to the
301     ## "entity data state". In this implementation, the tokenizer
302     ## is switched to the |ENTITY_STATE|, which is an implementation
303     ## of the "consume a character reference" algorithm.
304     $self->{entity_add} = -1;
305     $self->{prev_state} = DATA_STATE;
306     $self->{state} = ENTITY_STATE;
307    
308     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
309     $self->{line_prev} = $self->{line};
310     $self->{column_prev} = $self->{column};
311     $self->{column}++;
312     $self->{nc}
313     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
314     } else {
315     $self->{set_nc}->($self);
316     }
317    
318     redo A;
319     } elsif ($self->{nc} == 0x003C) { # <
320    
321     $self->{state} = TAG_OPEN_STATE;
322    
323     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
324     $self->{line_prev} = $self->{line};
325     $self->{column_prev} = $self->{column};
326     $self->{column}++;
327     $self->{nc}
328     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
329     } else {
330     $self->{set_nc}->($self);
331     }
332    
333     redo A;
334     } elsif ($self->{nc} == -1) {
335    
336     return ({type => END_OF_FILE_TOKEN,
337     line => $self->{line}, column => $self->{column}});
338     last A; ## TODO: ok?
339     } else {
340    
341     #
342     }
343    
344     # Anything else
345     my $token = {type => CHARACTER_TOKEN,
346     data => chr $self->{nc},
347     line => $self->{line}, column => $self->{column},
348     };
349     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
350    
351     ## Stay in the state.
352    
353     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
354     $self->{line_prev} = $self->{line};
355     $self->{column_prev} = $self->{column};
356     $self->{column}++;
357     $self->{nc}
358     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
359     } else {
360     $self->{set_nc}->($self);
361     }
362    
363     return ($token);
364     redo A;
365     } elsif ($self->{state} == DATA_STATE) {
366     $self->{s_kwd} = '' unless defined $self->{s_kwd};
367     if ($self->{nc} == 0x0026) { # &
368     $self->{s_kwd} = '';
369     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
370     not $self->{escape}) {
371    
372     ## NOTE: In the spec, the tokenizer is switched to the
373     ## "entity data state". In this implementation, the tokenizer
374     ## is switched to the |ENTITY_STATE|, which is an implementation
375     ## of the "consume a character reference" algorithm.
376     $self->{entity_add} = -1;
377     $self->{prev_state} = DATA_STATE;
378     $self->{state} = ENTITY_STATE;
379    
380     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
381     $self->{line_prev} = $self->{line};
382     $self->{column_prev} = $self->{column};
383     $self->{column}++;
384     $self->{nc}
385     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
386     } else {
387     $self->{set_nc}->($self);
388     }
389    
390     redo A;
391     } else {
392    
393     #
394     }
395     } elsif ($self->{nc} == 0x002D) { # -
396     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
397 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
398 wakaba 1.1
399     $self->{escape} = 1; # unless $self->{escape};
400     $self->{s_kwd} = '--';
401     #
402 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
403 wakaba 1.1
404     $self->{s_kwd} = '--';
405     #
406 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
407    
408     $self->{s_kwd} .= '-';
409     #
410 wakaba 1.1 } else {
411    
412 wakaba 1.5 $self->{s_kwd} = '-';
413 wakaba 1.1 #
414     }
415     }
416    
417     #
418     } elsif ($self->{nc} == 0x0021) { # !
419     if (length $self->{s_kwd}) {
420    
421     $self->{s_kwd} .= '!';
422     #
423     } else {
424    
425     #$self->{s_kwd} = '';
426     #
427     }
428     #
429     } elsif ($self->{nc} == 0x003C) { # <
430     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
431     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
432     not $self->{escape})) {
433    
434     $self->{state} = TAG_OPEN_STATE;
435    
436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437     $self->{line_prev} = $self->{line};
438     $self->{column_prev} = $self->{column};
439     $self->{column}++;
440     $self->{nc}
441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442     } else {
443     $self->{set_nc}->($self);
444     }
445    
446     redo A;
447     } else {
448    
449     $self->{s_kwd} = '';
450     #
451     }
452     } elsif ($self->{nc} == 0x003E) { # >
453     if ($self->{escape} and
454     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
455     if ($self->{s_kwd} eq '--') {
456    
457     delete $self->{escape};
458 wakaba 1.5 #
459 wakaba 1.1 } else {
460    
461 wakaba 1.5 #
462 wakaba 1.1 }
463 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
464    
465     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
466     line => $self->{line_prev},
467     column => $self->{column_prev} - 1);
468     #
469 wakaba 1.1 } else {
470    
471 wakaba 1.5 #
472 wakaba 1.1 }
473    
474     $self->{s_kwd} = '';
475     #
476 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
477     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
478    
479     $self->{s_kwd} .= ']';
480     } elsif ($self->{s_kwd} eq ']]') {
481    
482     #
483     } else {
484    
485     $self->{s_kwd} = '';
486     }
487     #
488 wakaba 1.1 } elsif ($self->{nc} == -1) {
489    
490     $self->{s_kwd} = '';
491     return ({type => END_OF_FILE_TOKEN,
492     line => $self->{line}, column => $self->{column}});
493     last A; ## TODO: ok?
494     } else {
495    
496     $self->{s_kwd} = '';
497     #
498     }
499    
500     # Anything else
501     my $token = {type => CHARACTER_TOKEN,
502     data => chr $self->{nc},
503     line => $self->{line}, column => $self->{column},
504     };
505 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
506 wakaba 1.1 length $token->{data})) {
507     $self->{s_kwd} = '';
508     }
509    
510     ## Stay in the data state.
511 wakaba 1.5 if (not $self->{is_xml} and
512     $self->{content_model} == PCDATA_CONTENT_MODEL) {
513 wakaba 1.1
514     $self->{state} = PCDATA_STATE;
515     } else {
516    
517     ## Stay in the state.
518     }
519    
520     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
521     $self->{line_prev} = $self->{line};
522     $self->{column_prev} = $self->{column};
523     $self->{column}++;
524     $self->{nc}
525     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
526     } else {
527     $self->{set_nc}->($self);
528     }
529    
530     return ($token);
531     redo A;
532     } elsif ($self->{state} == TAG_OPEN_STATE) {
533 wakaba 1.10 ## XML5: "tag state".
534    
535 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
536     if ($self->{nc} == 0x002F) { # /
537    
538    
539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
540     $self->{line_prev} = $self->{line};
541     $self->{column_prev} = $self->{column};
542     $self->{column}++;
543     $self->{nc}
544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
545     } else {
546     $self->{set_nc}->($self);
547     }
548    
549     $self->{state} = CLOSE_TAG_OPEN_STATE;
550     redo A;
551     } elsif ($self->{nc} == 0x0021) { # !
552    
553 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
554 wakaba 1.1 #
555     } else {
556    
557 wakaba 1.12 $self->{s_kwd} = '';
558 wakaba 1.1 #
559     }
560    
561     ## reconsume
562     $self->{state} = DATA_STATE;
563     return ({type => CHARACTER_TOKEN, data => '<',
564     line => $self->{line_prev},
565     column => $self->{column_prev},
566     });
567     redo A;
568     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
569     if ($self->{nc} == 0x0021) { # !
570    
571     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
572    
573     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
574     $self->{line_prev} = $self->{line};
575     $self->{column_prev} = $self->{column};
576     $self->{column}++;
577     $self->{nc}
578     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
579     } else {
580     $self->{set_nc}->($self);
581     }
582    
583     redo A;
584     } elsif ($self->{nc} == 0x002F) { # /
585    
586     $self->{state} = CLOSE_TAG_OPEN_STATE;
587    
588     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
589     $self->{line_prev} = $self->{line};
590     $self->{column_prev} = $self->{column};
591     $self->{column}++;
592     $self->{nc}
593     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
594     } else {
595     $self->{set_nc}->($self);
596     }
597    
598     redo A;
599     } elsif (0x0041 <= $self->{nc} and
600     $self->{nc} <= 0x005A) { # A..Z
601    
602     $self->{ct}
603     = {type => START_TAG_TOKEN,
604 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
605 wakaba 1.1 line => $self->{line_prev},
606     column => $self->{column_prev}};
607     $self->{state} = TAG_NAME_STATE;
608    
609     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
610     $self->{line_prev} = $self->{line};
611     $self->{column_prev} = $self->{column};
612     $self->{column}++;
613     $self->{nc}
614     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
615     } else {
616     $self->{set_nc}->($self);
617     }
618    
619     redo A;
620     } elsif (0x0061 <= $self->{nc} and
621     $self->{nc} <= 0x007A) { # a..z
622    
623     $self->{ct} = {type => START_TAG_TOKEN,
624     tag_name => chr ($self->{nc}),
625     line => $self->{line_prev},
626     column => $self->{column_prev}};
627     $self->{state} = TAG_NAME_STATE;
628    
629     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630     $self->{line_prev} = $self->{line};
631     $self->{column_prev} = $self->{column};
632     $self->{column}++;
633     $self->{nc}
634     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635     } else {
636     $self->{set_nc}->($self);
637     }
638    
639     redo A;
640     } elsif ($self->{nc} == 0x003E) { # >
641    
642     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
643     line => $self->{line_prev},
644     column => $self->{column_prev});
645     $self->{state} = DATA_STATE;
646 wakaba 1.5 $self->{s_kwd} = '';
647 wakaba 1.1
648     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
649     $self->{line_prev} = $self->{line};
650     $self->{column_prev} = $self->{column};
651     $self->{column}++;
652     $self->{nc}
653     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
654     } else {
655     $self->{set_nc}->($self);
656     }
657    
658    
659     return ({type => CHARACTER_TOKEN, data => '<>',
660     line => $self->{line_prev},
661     column => $self->{column_prev},
662     });
663    
664     redo A;
665     } elsif ($self->{nc} == 0x003F) { # ?
666 wakaba 1.8 if ($self->{is_xml}) {
667    
668     $self->{state} = PI_STATE;
669    
670     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
671     $self->{line_prev} = $self->{line};
672     $self->{column_prev} = $self->{column};
673     $self->{column}++;
674     $self->{nc}
675     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
676     } else {
677     $self->{set_nc}->($self);
678     }
679    
680     redo A;
681     } else {
682    
683     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
684     line => $self->{line_prev},
685     column => $self->{column_prev});
686     $self->{state} = BOGUS_COMMENT_STATE;
687     $self->{ct} = {type => COMMENT_TOKEN, data => '',
688     line => $self->{line_prev},
689     column => $self->{column_prev},
690     };
691     ## $self->{nc} is intentionally left as is
692     redo A;
693     }
694 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
695 wakaba 1.1
696     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
697     line => $self->{line_prev},
698     column => $self->{column_prev});
699     $self->{state} = DATA_STATE;
700 wakaba 1.5 $self->{s_kwd} = '';
701 wakaba 1.1 ## reconsume
702    
703     return ({type => CHARACTER_TOKEN, data => '<',
704     line => $self->{line_prev},
705     column => $self->{column_prev},
706     });
707    
708     redo A;
709 wakaba 1.9 } else {
710     ## XML5: "<:" is a parse error.
711    
712     $self->{ct} = {type => START_TAG_TOKEN,
713     tag_name => chr ($self->{nc}),
714     line => $self->{line_prev},
715     column => $self->{column_prev}};
716     $self->{state} = TAG_NAME_STATE;
717    
718     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
719     $self->{line_prev} = $self->{line};
720     $self->{column_prev} = $self->{column};
721     $self->{column}++;
722     $self->{nc}
723     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
724     } else {
725     $self->{set_nc}->($self);
726     }
727    
728     redo A;
729 wakaba 1.1 }
730     } else {
731     die "$0: $self->{content_model} in tag open";
732     }
733     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
734     ## NOTE: The "close tag open state" in the spec is implemented as
735     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
736    
737 wakaba 1.10 ## XML5: "end tag state".
738    
739 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
740     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
741     if (defined $self->{last_stag_name}) {
742     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
743 wakaba 1.12 $self->{kwd} = '';
744 wakaba 1.1 ## Reconsume.
745     redo A;
746     } else {
747     ## No start tag token has ever been emitted
748     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
749    
750     $self->{state} = DATA_STATE;
751 wakaba 1.5 $self->{s_kwd} = '';
752 wakaba 1.1 ## Reconsume.
753     return ({type => CHARACTER_TOKEN, data => '</',
754     line => $l, column => $c,
755     });
756     redo A;
757     }
758     }
759    
760     if (0x0041 <= $self->{nc} and
761     $self->{nc} <= 0x005A) { # A..Z
762    
763     $self->{ct}
764     = {type => END_TAG_TOKEN,
765 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
766 wakaba 1.1 line => $l, column => $c};
767     $self->{state} = TAG_NAME_STATE;
768    
769     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
770     $self->{line_prev} = $self->{line};
771     $self->{column_prev} = $self->{column};
772     $self->{column}++;
773     $self->{nc}
774     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
775     } else {
776     $self->{set_nc}->($self);
777     }
778    
779     redo A;
780     } elsif (0x0061 <= $self->{nc} and
781     $self->{nc} <= 0x007A) { # a..z
782    
783     $self->{ct} = {type => END_TAG_TOKEN,
784     tag_name => chr ($self->{nc}),
785     line => $l, column => $c};
786     $self->{state} = TAG_NAME_STATE;
787    
788     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
789     $self->{line_prev} = $self->{line};
790     $self->{column_prev} = $self->{column};
791     $self->{column}++;
792     $self->{nc}
793     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
794     } else {
795     $self->{set_nc}->($self);
796     }
797    
798     redo A;
799     } elsif ($self->{nc} == 0x003E) { # >
800     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
801     line => $self->{line_prev}, ## "<" in "</>"
802     column => $self->{column_prev} - 1);
803     $self->{state} = DATA_STATE;
804 wakaba 1.5 $self->{s_kwd} = '';
805 wakaba 1.10 if ($self->{is_xml}) {
806    
807     ## XML5: No parse error.
808    
809     ## NOTE: This parser raises a parse error, since it supports
810     ## XML1, not XML5.
811    
812     ## NOTE: A short end tag token.
813     my $ct = {type => END_TAG_TOKEN,
814     tag_name => '',
815     line => $self->{line_prev},
816     column => $self->{column_prev} - 1,
817     };
818    
819     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
820     $self->{line_prev} = $self->{line};
821     $self->{column_prev} = $self->{column};
822     $self->{column}++;
823     $self->{nc}
824     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
825     } else {
826     $self->{set_nc}->($self);
827     }
828    
829     return ($ct);
830     } else {
831    
832    
833 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
834     $self->{line_prev} = $self->{line};
835     $self->{column_prev} = $self->{column};
836     $self->{column}++;
837     $self->{nc}
838     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
839     } else {
840     $self->{set_nc}->($self);
841     }
842    
843 wakaba 1.10 }
844 wakaba 1.1 redo A;
845     } elsif ($self->{nc} == -1) {
846    
847     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
848 wakaba 1.5 $self->{s_kwd} = '';
849 wakaba 1.1 $self->{state} = DATA_STATE;
850     # reconsume
851    
852     return ({type => CHARACTER_TOKEN, data => '</',
853     line => $l, column => $c,
854     });
855    
856     redo A;
857 wakaba 1.10 } elsif (not $self->{is_xml} or
858     $is_space->{$self->{nc}}) {
859 wakaba 1.1
860 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
861     line => $self->{line_prev}, # "<" of "</"
862     column => $self->{column_prev} - 1);
863 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
864     $self->{ct} = {type => COMMENT_TOKEN, data => '',
865     line => $self->{line_prev}, # "<" of "</"
866     column => $self->{column_prev} - 1,
867     };
868     ## NOTE: $self->{nc} is intentionally left as is.
869     ## Although the "anything else" case of the spec not explicitly
870     ## states that the next input character is to be reconsumed,
871     ## it will be included to the |data| of the comment token
872     ## generated from the bogus end tag, as defined in the
873     ## "bogus comment state" entry.
874     redo A;
875 wakaba 1.10 } else {
876     ## XML5: "</:" is a parse error.
877    
878     $self->{ct} = {type => END_TAG_TOKEN,
879     tag_name => chr ($self->{nc}),
880     line => $l, column => $c};
881     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
882    
883     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
884     $self->{line_prev} = $self->{line};
885     $self->{column_prev} = $self->{column};
886     $self->{column}++;
887     $self->{nc}
888     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
889     } else {
890     $self->{set_nc}->($self);
891     }
892    
893     redo A;
894 wakaba 1.1 }
895     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
896 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
897 wakaba 1.1 if (length $ch) {
898     my $CH = $ch;
899     $ch =~ tr/a-z/A-Z/;
900     my $nch = chr $self->{nc};
901     if ($nch eq $ch or $nch eq $CH) {
902    
903     ## Stay in the state.
904 wakaba 1.12 $self->{kwd} .= $nch;
905 wakaba 1.1
906     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
907     $self->{line_prev} = $self->{line};
908     $self->{column_prev} = $self->{column};
909     $self->{column}++;
910     $self->{nc}
911     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
912     } else {
913     $self->{set_nc}->($self);
914     }
915    
916     redo A;
917     } else {
918    
919     $self->{state} = DATA_STATE;
920 wakaba 1.5 $self->{s_kwd} = '';
921 wakaba 1.1 ## Reconsume.
922     return ({type => CHARACTER_TOKEN,
923 wakaba 1.12 data => '</' . $self->{kwd},
924 wakaba 1.1 line => $self->{line_prev},
925 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
926 wakaba 1.1 });
927     redo A;
928     }
929     } else { # after "<{tag-name}"
930     unless ($is_space->{$self->{nc}} or
931     {
932     0x003E => 1, # >
933     0x002F => 1, # /
934     -1 => 1, # EOF
935     }->{$self->{nc}}) {
936    
937     ## Reconsume.
938     $self->{state} = DATA_STATE;
939 wakaba 1.5 $self->{s_kwd} = '';
940 wakaba 1.1 return ({type => CHARACTER_TOKEN,
941 wakaba 1.12 data => '</' . $self->{kwd},
942 wakaba 1.1 line => $self->{line_prev},
943 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
944 wakaba 1.1 });
945     redo A;
946     } else {
947    
948     $self->{ct}
949     = {type => END_TAG_TOKEN,
950     tag_name => $self->{last_stag_name},
951     line => $self->{line_prev},
952 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
953 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
954     ## Reconsume.
955     redo A;
956     }
957     }
958     } elsif ($self->{state} == TAG_NAME_STATE) {
959     if ($is_space->{$self->{nc}}) {
960    
961     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
962    
963     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
964     $self->{line_prev} = $self->{line};
965     $self->{column_prev} = $self->{column};
966     $self->{column}++;
967     $self->{nc}
968     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
969     } else {
970     $self->{set_nc}->($self);
971     }
972    
973     redo A;
974     } elsif ($self->{nc} == 0x003E) { # >
975     if ($self->{ct}->{type} == START_TAG_TOKEN) {
976    
977     $self->{last_stag_name} = $self->{ct}->{tag_name};
978     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
979     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
980     #if ($self->{ct}->{attributes}) {
981     # ## NOTE: This should never be reached.
982     # !!! cp (36);
983     # !!! parse-error (type => 'end tag attribute');
984     #} else {
985    
986     #}
987     } else {
988     die "$0: $self->{ct}->{type}: Unknown token type";
989     }
990     $self->{state} = DATA_STATE;
991 wakaba 1.5 $self->{s_kwd} = '';
992 wakaba 1.1
993     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
994     $self->{line_prev} = $self->{line};
995     $self->{column_prev} = $self->{column};
996     $self->{column}++;
997     $self->{nc}
998     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
999     } else {
1000     $self->{set_nc}->($self);
1001     }
1002    
1003    
1004     return ($self->{ct}); # start tag or end tag
1005    
1006     redo A;
1007     } elsif (0x0041 <= $self->{nc} and
1008     $self->{nc} <= 0x005A) { # A..Z
1009    
1010 wakaba 1.4 $self->{ct}->{tag_name}
1011     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1012 wakaba 1.1 # start tag or end tag
1013     ## Stay in this state
1014    
1015     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1016     $self->{line_prev} = $self->{line};
1017     $self->{column_prev} = $self->{column};
1018     $self->{column}++;
1019     $self->{nc}
1020     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1021     } else {
1022     $self->{set_nc}->($self);
1023     }
1024    
1025     redo A;
1026     } elsif ($self->{nc} == -1) {
1027     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1028     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1029    
1030     $self->{last_stag_name} = $self->{ct}->{tag_name};
1031     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1032     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1033     #if ($self->{ct}->{attributes}) {
1034     # ## NOTE: This state should never be reached.
1035     # !!! cp (40);
1036     # !!! parse-error (type => 'end tag attribute');
1037     #} else {
1038    
1039     #}
1040     } else {
1041     die "$0: $self->{ct}->{type}: Unknown token type";
1042     }
1043     $self->{state} = DATA_STATE;
1044 wakaba 1.5 $self->{s_kwd} = '';
1045 wakaba 1.1 # reconsume
1046    
1047     return ($self->{ct}); # start tag or end tag
1048    
1049     redo A;
1050     } elsif ($self->{nc} == 0x002F) { # /
1051    
1052     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1053    
1054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1055     $self->{line_prev} = $self->{line};
1056     $self->{column_prev} = $self->{column};
1057     $self->{column}++;
1058     $self->{nc}
1059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1060     } else {
1061     $self->{set_nc}->($self);
1062     }
1063    
1064     redo A;
1065     } else {
1066    
1067     $self->{ct}->{tag_name} .= chr $self->{nc};
1068     # start tag or end tag
1069     ## Stay in the state
1070    
1071     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072     $self->{line_prev} = $self->{line};
1073     $self->{column_prev} = $self->{column};
1074     $self->{column}++;
1075     $self->{nc}
1076     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077     } else {
1078     $self->{set_nc}->($self);
1079     }
1080    
1081     redo A;
1082     }
1083     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1084 wakaba 1.11 ## XML5: "Tag attribute name before state".
1085    
1086 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1087    
1088     ## Stay in the state
1089    
1090     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1091     $self->{line_prev} = $self->{line};
1092     $self->{column_prev} = $self->{column};
1093     $self->{column}++;
1094     $self->{nc}
1095     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1096     } else {
1097     $self->{set_nc}->($self);
1098     }
1099    
1100     redo A;
1101     } elsif ($self->{nc} == 0x003E) { # >
1102     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1103    
1104     $self->{last_stag_name} = $self->{ct}->{tag_name};
1105     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1106     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1107     if ($self->{ct}->{attributes}) {
1108    
1109     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1110     } else {
1111    
1112     }
1113     } else {
1114     die "$0: $self->{ct}->{type}: Unknown token type";
1115     }
1116     $self->{state} = DATA_STATE;
1117 wakaba 1.5 $self->{s_kwd} = '';
1118 wakaba 1.1
1119     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1120     $self->{line_prev} = $self->{line};
1121     $self->{column_prev} = $self->{column};
1122     $self->{column}++;
1123     $self->{nc}
1124     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1125     } else {
1126     $self->{set_nc}->($self);
1127     }
1128    
1129    
1130     return ($self->{ct}); # start tag or end tag
1131    
1132     redo A;
1133     } elsif (0x0041 <= $self->{nc} and
1134     $self->{nc} <= 0x005A) { # A..Z
1135    
1136     $self->{ca}
1137 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1138 wakaba 1.1 value => '',
1139     line => $self->{line}, column => $self->{column}};
1140     $self->{state} = ATTRIBUTE_NAME_STATE;
1141    
1142     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1143     $self->{line_prev} = $self->{line};
1144     $self->{column_prev} = $self->{column};
1145     $self->{column}++;
1146     $self->{nc}
1147     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1148     } else {
1149     $self->{set_nc}->($self);
1150     }
1151    
1152     redo A;
1153     } elsif ($self->{nc} == 0x002F) { # /
1154    
1155     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1156    
1157     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1158     $self->{line_prev} = $self->{line};
1159     $self->{column_prev} = $self->{column};
1160     $self->{column}++;
1161     $self->{nc}
1162     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1163     } else {
1164     $self->{set_nc}->($self);
1165     }
1166    
1167     redo A;
1168     } elsif ($self->{nc} == -1) {
1169     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1170     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1171    
1172     $self->{last_stag_name} = $self->{ct}->{tag_name};
1173     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1174     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1175     if ($self->{ct}->{attributes}) {
1176    
1177     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1178     } else {
1179    
1180     }
1181     } else {
1182     die "$0: $self->{ct}->{type}: Unknown token type";
1183     }
1184     $self->{state} = DATA_STATE;
1185 wakaba 1.5 $self->{s_kwd} = '';
1186 wakaba 1.1 # reconsume
1187    
1188     return ($self->{ct}); # start tag or end tag
1189    
1190     redo A;
1191     } else {
1192     if ({
1193     0x0022 => 1, # "
1194     0x0027 => 1, # '
1195     0x003D => 1, # =
1196     }->{$self->{nc}}) {
1197    
1198 wakaba 1.11 ## XML5: Not a parse error.
1199 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1200     } else {
1201    
1202 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1203 wakaba 1.1 }
1204     $self->{ca}
1205     = {name => chr ($self->{nc}),
1206     value => '',
1207     line => $self->{line}, column => $self->{column}};
1208     $self->{state} = ATTRIBUTE_NAME_STATE;
1209    
1210     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1211     $self->{line_prev} = $self->{line};
1212     $self->{column_prev} = $self->{column};
1213     $self->{column}++;
1214     $self->{nc}
1215     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1216     } else {
1217     $self->{set_nc}->($self);
1218     }
1219    
1220     redo A;
1221     }
1222     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1223 wakaba 1.11 ## XML5: "Tag attribute name state".
1224    
1225 wakaba 1.1 my $before_leave = sub {
1226     if (exists $self->{ct}->{attributes} # start tag or end tag
1227     ->{$self->{ca}->{name}}) { # MUST
1228    
1229     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1230     ## Discard $self->{ca} # MUST
1231     } else {
1232    
1233     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1234     = $self->{ca};
1235 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1236 wakaba 1.1 }
1237     }; # $before_leave
1238    
1239     if ($is_space->{$self->{nc}}) {
1240    
1241     $before_leave->();
1242     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1243    
1244     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1245     $self->{line_prev} = $self->{line};
1246     $self->{column_prev} = $self->{column};
1247     $self->{column}++;
1248     $self->{nc}
1249     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1250     } else {
1251     $self->{set_nc}->($self);
1252     }
1253    
1254     redo A;
1255     } elsif ($self->{nc} == 0x003D) { # =
1256    
1257     $before_leave->();
1258     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1259    
1260     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1261     $self->{line_prev} = $self->{line};
1262     $self->{column_prev} = $self->{column};
1263     $self->{column}++;
1264     $self->{nc}
1265     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1266     } else {
1267     $self->{set_nc}->($self);
1268     }
1269    
1270     redo A;
1271     } elsif ($self->{nc} == 0x003E) { # >
1272 wakaba 1.11 if ($self->{is_xml}) {
1273    
1274     ## XML5: Not a parse error.
1275     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1276     } else {
1277    
1278     }
1279    
1280 wakaba 1.1 $before_leave->();
1281     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1282    
1283     $self->{last_stag_name} = $self->{ct}->{tag_name};
1284     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1285    
1286     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1287     if ($self->{ct}->{attributes}) {
1288     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1289     }
1290     } else {
1291     die "$0: $self->{ct}->{type}: Unknown token type";
1292     }
1293     $self->{state} = DATA_STATE;
1294 wakaba 1.5 $self->{s_kwd} = '';
1295 wakaba 1.1
1296     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1297     $self->{line_prev} = $self->{line};
1298     $self->{column_prev} = $self->{column};
1299     $self->{column}++;
1300     $self->{nc}
1301     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1302     } else {
1303     $self->{set_nc}->($self);
1304     }
1305    
1306    
1307     return ($self->{ct}); # start tag or end tag
1308    
1309     redo A;
1310     } elsif (0x0041 <= $self->{nc} and
1311     $self->{nc} <= 0x005A) { # A..Z
1312    
1313 wakaba 1.4 $self->{ca}->{name}
1314     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1315 wakaba 1.1 ## Stay in the state
1316    
1317     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1318     $self->{line_prev} = $self->{line};
1319     $self->{column_prev} = $self->{column};
1320     $self->{column}++;
1321     $self->{nc}
1322     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1323     } else {
1324     $self->{set_nc}->($self);
1325     }
1326    
1327     redo A;
1328     } elsif ($self->{nc} == 0x002F) { # /
1329 wakaba 1.11 if ($self->{is_xml}) {
1330    
1331     ## XML5: Not a parse error.
1332     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1333     } else {
1334    
1335     }
1336 wakaba 1.1
1337     $before_leave->();
1338     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1339    
1340     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1341     $self->{line_prev} = $self->{line};
1342     $self->{column_prev} = $self->{column};
1343     $self->{column}++;
1344     $self->{nc}
1345     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1346     } else {
1347     $self->{set_nc}->($self);
1348     }
1349    
1350     redo A;
1351     } elsif ($self->{nc} == -1) {
1352     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1353     $before_leave->();
1354     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1355    
1356     $self->{last_stag_name} = $self->{ct}->{tag_name};
1357     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1358     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1359     if ($self->{ct}->{attributes}) {
1360    
1361     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1362     } else {
1363     ## NOTE: This state should never be reached.
1364    
1365     }
1366     } else {
1367     die "$0: $self->{ct}->{type}: Unknown token type";
1368     }
1369     $self->{state} = DATA_STATE;
1370 wakaba 1.5 $self->{s_kwd} = '';
1371 wakaba 1.1 # reconsume
1372    
1373     return ($self->{ct}); # start tag or end tag
1374    
1375     redo A;
1376     } else {
1377     if ($self->{nc} == 0x0022 or # "
1378     $self->{nc} == 0x0027) { # '
1379    
1380 wakaba 1.11 ## XML5: Not a parse error.
1381 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1382     } else {
1383    
1384     }
1385     $self->{ca}->{name} .= chr ($self->{nc});
1386     ## Stay in the state
1387    
1388     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1389     $self->{line_prev} = $self->{line};
1390     $self->{column_prev} = $self->{column};
1391     $self->{column}++;
1392     $self->{nc}
1393     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1394     } else {
1395     $self->{set_nc}->($self);
1396     }
1397    
1398     redo A;
1399     }
1400     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1401 wakaba 1.11 ## XML5: "Tag attribute name after state".
1402    
1403 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1404    
1405     ## Stay in the state
1406    
1407     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1408     $self->{line_prev} = $self->{line};
1409     $self->{column_prev} = $self->{column};
1410     $self->{column}++;
1411     $self->{nc}
1412     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1413     } else {
1414     $self->{set_nc}->($self);
1415     }
1416    
1417     redo A;
1418     } elsif ($self->{nc} == 0x003D) { # =
1419    
1420     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1421    
1422     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1423     $self->{line_prev} = $self->{line};
1424     $self->{column_prev} = $self->{column};
1425     $self->{column}++;
1426     $self->{nc}
1427     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1428     } else {
1429     $self->{set_nc}->($self);
1430     }
1431    
1432     redo A;
1433     } elsif ($self->{nc} == 0x003E) { # >
1434 wakaba 1.11 if ($self->{is_xml}) {
1435    
1436     ## XML5: Not a parse error.
1437     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1438     } else {
1439    
1440     }
1441    
1442 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1443    
1444     $self->{last_stag_name} = $self->{ct}->{tag_name};
1445     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1446     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1447     if ($self->{ct}->{attributes}) {
1448    
1449     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1450     } else {
1451     ## NOTE: This state should never be reached.
1452    
1453     }
1454     } else {
1455     die "$0: $self->{ct}->{type}: Unknown token type";
1456     }
1457     $self->{state} = DATA_STATE;
1458 wakaba 1.5 $self->{s_kwd} = '';
1459 wakaba 1.1
1460     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1461     $self->{line_prev} = $self->{line};
1462     $self->{column_prev} = $self->{column};
1463     $self->{column}++;
1464     $self->{nc}
1465     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1466     } else {
1467     $self->{set_nc}->($self);
1468     }
1469    
1470    
1471     return ($self->{ct}); # start tag or end tag
1472    
1473     redo A;
1474     } elsif (0x0041 <= $self->{nc} and
1475     $self->{nc} <= 0x005A) { # A..Z
1476    
1477     $self->{ca}
1478 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1479 wakaba 1.1 value => '',
1480     line => $self->{line}, column => $self->{column}};
1481     $self->{state} = ATTRIBUTE_NAME_STATE;
1482    
1483     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1484     $self->{line_prev} = $self->{line};
1485     $self->{column_prev} = $self->{column};
1486     $self->{column}++;
1487     $self->{nc}
1488     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1489     } else {
1490     $self->{set_nc}->($self);
1491     }
1492    
1493     redo A;
1494     } elsif ($self->{nc} == 0x002F) { # /
1495 wakaba 1.11 if ($self->{is_xml}) {
1496    
1497     ## XML5: Not a parse error.
1498     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1499     } else {
1500    
1501     }
1502 wakaba 1.1
1503     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1504    
1505     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1506     $self->{line_prev} = $self->{line};
1507     $self->{column_prev} = $self->{column};
1508     $self->{column}++;
1509     $self->{nc}
1510     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1511     } else {
1512     $self->{set_nc}->($self);
1513     }
1514    
1515     redo A;
1516     } elsif ($self->{nc} == -1) {
1517     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1518     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1519    
1520     $self->{last_stag_name} = $self->{ct}->{tag_name};
1521     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1522     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1523     if ($self->{ct}->{attributes}) {
1524    
1525     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1526     } else {
1527     ## NOTE: This state should never be reached.
1528    
1529     }
1530     } else {
1531     die "$0: $self->{ct}->{type}: Unknown token type";
1532     }
1533 wakaba 1.5 $self->{s_kwd} = '';
1534 wakaba 1.1 $self->{state} = DATA_STATE;
1535     # reconsume
1536    
1537     return ($self->{ct}); # start tag or end tag
1538    
1539     redo A;
1540     } else {
1541 wakaba 1.11 if ($self->{is_xml}) {
1542    
1543     ## XML5: Not a parse error.
1544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1545     } else {
1546    
1547     }
1548    
1549 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1550     $self->{nc} == 0x0027) { # '
1551    
1552 wakaba 1.11 ## XML5: Not a parse error.
1553 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1554     } else {
1555    
1556     }
1557     $self->{ca}
1558     = {name => chr ($self->{nc}),
1559     value => '',
1560     line => $self->{line}, column => $self->{column}};
1561     $self->{state} = ATTRIBUTE_NAME_STATE;
1562    
1563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1564     $self->{line_prev} = $self->{line};
1565     $self->{column_prev} = $self->{column};
1566     $self->{column}++;
1567     $self->{nc}
1568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1569     } else {
1570     $self->{set_nc}->($self);
1571     }
1572    
1573     redo A;
1574     }
1575     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1576 wakaba 1.11 ## XML5: "Tag attribute value before state".
1577    
1578 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1579    
1580     ## Stay in the state
1581    
1582     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1583     $self->{line_prev} = $self->{line};
1584     $self->{column_prev} = $self->{column};
1585     $self->{column}++;
1586     $self->{nc}
1587     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1588     } else {
1589     $self->{set_nc}->($self);
1590     }
1591    
1592     redo A;
1593     } elsif ($self->{nc} == 0x0022) { # "
1594    
1595     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1596    
1597     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1598     $self->{line_prev} = $self->{line};
1599     $self->{column_prev} = $self->{column};
1600     $self->{column}++;
1601     $self->{nc}
1602     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1603     } else {
1604     $self->{set_nc}->($self);
1605     }
1606    
1607     redo A;
1608     } elsif ($self->{nc} == 0x0026) { # &
1609    
1610     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1611     ## reconsume
1612     redo A;
1613     } elsif ($self->{nc} == 0x0027) { # '
1614    
1615     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1616    
1617     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1618     $self->{line_prev} = $self->{line};
1619     $self->{column_prev} = $self->{column};
1620     $self->{column}++;
1621     $self->{nc}
1622     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1623     } else {
1624     $self->{set_nc}->($self);
1625     }
1626    
1627     redo A;
1628     } elsif ($self->{nc} == 0x003E) { # >
1629     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1630     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1631    
1632     $self->{last_stag_name} = $self->{ct}->{tag_name};
1633     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1634     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1635     if ($self->{ct}->{attributes}) {
1636    
1637     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1638     } else {
1639     ## NOTE: This state should never be reached.
1640    
1641     }
1642     } else {
1643     die "$0: $self->{ct}->{type}: Unknown token type";
1644     }
1645     $self->{state} = DATA_STATE;
1646 wakaba 1.5 $self->{s_kwd} = '';
1647 wakaba 1.1
1648     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1649     $self->{line_prev} = $self->{line};
1650     $self->{column_prev} = $self->{column};
1651     $self->{column}++;
1652     $self->{nc}
1653     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1654     } else {
1655     $self->{set_nc}->($self);
1656     }
1657    
1658    
1659     return ($self->{ct}); # start tag or end tag
1660    
1661     redo A;
1662     } elsif ($self->{nc} == -1) {
1663     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1664     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1665    
1666     $self->{last_stag_name} = $self->{ct}->{tag_name};
1667     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1668     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1669     if ($self->{ct}->{attributes}) {
1670    
1671     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1672     } else {
1673     ## NOTE: This state should never be reached.
1674    
1675     }
1676     } else {
1677     die "$0: $self->{ct}->{type}: Unknown token type";
1678     }
1679     $self->{state} = DATA_STATE;
1680 wakaba 1.5 $self->{s_kwd} = '';
1681 wakaba 1.1 ## reconsume
1682    
1683     return ($self->{ct}); # start tag or end tag
1684    
1685     redo A;
1686     } else {
1687     if ($self->{nc} == 0x003D) { # =
1688    
1689 wakaba 1.11 ## XML5: Not a parse error.
1690 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1691 wakaba 1.11 } elsif ($self->{is_xml}) {
1692    
1693     ## XML5: No parse error.
1694     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1695 wakaba 1.1 } else {
1696    
1697     }
1698     $self->{ca}->{value} .= chr ($self->{nc});
1699     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1700    
1701     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1702     $self->{line_prev} = $self->{line};
1703     $self->{column_prev} = $self->{column};
1704     $self->{column}++;
1705     $self->{nc}
1706     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1707     } else {
1708     $self->{set_nc}->($self);
1709     }
1710    
1711     redo A;
1712     }
1713     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1714 wakaba 1.11 ## XML5: "Tag attribute value double quoted state".
1715    
1716 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1717    
1718 wakaba 1.11 ## XML5: "Tag attribute name before state".
1719 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1720    
1721     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1722     $self->{line_prev} = $self->{line};
1723     $self->{column_prev} = $self->{column};
1724     $self->{column}++;
1725     $self->{nc}
1726     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1727     } else {
1728     $self->{set_nc}->($self);
1729     }
1730    
1731     redo A;
1732     } elsif ($self->{nc} == 0x0026) { # &
1733    
1734 wakaba 1.11 ## XML5: Not defined yet.
1735    
1736 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1737     ## "entity in attribute value state". In this implementation, the
1738     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1739     ## implementation of the "consume a character reference" algorithm.
1740     $self->{prev_state} = $self->{state};
1741     $self->{entity_add} = 0x0022; # "
1742     $self->{state} = ENTITY_STATE;
1743    
1744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1745     $self->{line_prev} = $self->{line};
1746     $self->{column_prev} = $self->{column};
1747     $self->{column}++;
1748     $self->{nc}
1749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1750     } else {
1751     $self->{set_nc}->($self);
1752     }
1753    
1754     redo A;
1755     } elsif ($self->{nc} == -1) {
1756     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1757     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1758    
1759     $self->{last_stag_name} = $self->{ct}->{tag_name};
1760     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1761     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1762     if ($self->{ct}->{attributes}) {
1763    
1764     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1765     } else {
1766     ## NOTE: This state should never be reached.
1767    
1768     }
1769     } else {
1770     die "$0: $self->{ct}->{type}: Unknown token type";
1771     }
1772     $self->{state} = DATA_STATE;
1773 wakaba 1.5 $self->{s_kwd} = '';
1774 wakaba 1.1 ## reconsume
1775    
1776     return ($self->{ct}); # start tag or end tag
1777    
1778     redo A;
1779     } else {
1780 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1781    
1782     ## XML5: Not a parse error.
1783     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1784     } else {
1785    
1786     }
1787 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1788     $self->{read_until}->($self->{ca}->{value},
1789 wakaba 1.11 q["&<],
1790 wakaba 1.1 length $self->{ca}->{value});
1791    
1792     ## Stay in the state
1793    
1794     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1795     $self->{line_prev} = $self->{line};
1796     $self->{column_prev} = $self->{column};
1797     $self->{column}++;
1798     $self->{nc}
1799     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1800     } else {
1801     $self->{set_nc}->($self);
1802     }
1803    
1804     redo A;
1805     }
1806     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1807 wakaba 1.11 ## XML5: "Tag attribute value single quoted state".
1808    
1809 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1810    
1811 wakaba 1.11 ## XML5: "Before attribute name state" (sic).
1812 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1813    
1814     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1815     $self->{line_prev} = $self->{line};
1816     $self->{column_prev} = $self->{column};
1817     $self->{column}++;
1818     $self->{nc}
1819     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1820     } else {
1821     $self->{set_nc}->($self);
1822     }
1823    
1824     redo A;
1825     } elsif ($self->{nc} == 0x0026) { # &
1826    
1827 wakaba 1.11 ## XML5: Not defined yet.
1828    
1829 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1830     ## "entity in attribute value state". In this implementation, the
1831     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1832     ## implementation of the "consume a character reference" algorithm.
1833     $self->{entity_add} = 0x0027; # '
1834     $self->{prev_state} = $self->{state};
1835     $self->{state} = ENTITY_STATE;
1836    
1837     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1838     $self->{line_prev} = $self->{line};
1839     $self->{column_prev} = $self->{column};
1840     $self->{column}++;
1841     $self->{nc}
1842     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1843     } else {
1844     $self->{set_nc}->($self);
1845     }
1846    
1847     redo A;
1848     } elsif ($self->{nc} == -1) {
1849     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1850     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1851    
1852     $self->{last_stag_name} = $self->{ct}->{tag_name};
1853     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1854     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1855     if ($self->{ct}->{attributes}) {
1856    
1857     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1858     } else {
1859     ## NOTE: This state should never be reached.
1860    
1861     }
1862     } else {
1863     die "$0: $self->{ct}->{type}: Unknown token type";
1864     }
1865     $self->{state} = DATA_STATE;
1866 wakaba 1.5 $self->{s_kwd} = '';
1867 wakaba 1.1 ## reconsume
1868    
1869     return ($self->{ct}); # start tag or end tag
1870    
1871     redo A;
1872     } else {
1873 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1874    
1875     ## XML5: Not a parse error.
1876     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1877     } else {
1878    
1879     }
1880 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1881     $self->{read_until}->($self->{ca}->{value},
1882 wakaba 1.11 q['&<],
1883 wakaba 1.1 length $self->{ca}->{value});
1884    
1885     ## Stay in the state
1886    
1887     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1888     $self->{line_prev} = $self->{line};
1889     $self->{column_prev} = $self->{column};
1890     $self->{column}++;
1891     $self->{nc}
1892     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1893     } else {
1894     $self->{set_nc}->($self);
1895     }
1896    
1897     redo A;
1898     }
1899     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1900 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1901    
1902 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1903    
1904 wakaba 1.11 ## XML5: "Tag attribute name before state".
1905 wakaba 1.1 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1906    
1907     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1908     $self->{line_prev} = $self->{line};
1909     $self->{column_prev} = $self->{column};
1910     $self->{column}++;
1911     $self->{nc}
1912     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1913     } else {
1914     $self->{set_nc}->($self);
1915     }
1916    
1917     redo A;
1918     } elsif ($self->{nc} == 0x0026) { # &
1919    
1920 wakaba 1.11
1921     ## XML5: Not defined yet.
1922    
1923 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1924     ## "entity in attribute value state". In this implementation, the
1925     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1926     ## implementation of the "consume a character reference" algorithm.
1927     $self->{entity_add} = -1;
1928     $self->{prev_state} = $self->{state};
1929     $self->{state} = ENTITY_STATE;
1930    
1931     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1932     $self->{line_prev} = $self->{line};
1933     $self->{column_prev} = $self->{column};
1934     $self->{column}++;
1935     $self->{nc}
1936     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1937     } else {
1938     $self->{set_nc}->($self);
1939     }
1940    
1941     redo A;
1942     } elsif ($self->{nc} == 0x003E) { # >
1943     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1944    
1945     $self->{last_stag_name} = $self->{ct}->{tag_name};
1946     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1947     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1948     if ($self->{ct}->{attributes}) {
1949    
1950     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1951     } else {
1952     ## NOTE: This state should never be reached.
1953    
1954     }
1955     } else {
1956     die "$0: $self->{ct}->{type}: Unknown token type";
1957     }
1958     $self->{state} = DATA_STATE;
1959 wakaba 1.5 $self->{s_kwd} = '';
1960 wakaba 1.1
1961     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1962     $self->{line_prev} = $self->{line};
1963     $self->{column_prev} = $self->{column};
1964     $self->{column}++;
1965     $self->{nc}
1966     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1967     } else {
1968     $self->{set_nc}->($self);
1969     }
1970    
1971    
1972     return ($self->{ct}); # start tag or end tag
1973    
1974     redo A;
1975     } elsif ($self->{nc} == -1) {
1976     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1977     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1978    
1979     $self->{last_stag_name} = $self->{ct}->{tag_name};
1980     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1981     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1982     if ($self->{ct}->{attributes}) {
1983    
1984     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1985     } else {
1986     ## NOTE: This state should never be reached.
1987    
1988     }
1989     } else {
1990     die "$0: $self->{ct}->{type}: Unknown token type";
1991     }
1992     $self->{state} = DATA_STATE;
1993 wakaba 1.5 $self->{s_kwd} = '';
1994 wakaba 1.1 ## reconsume
1995    
1996     return ($self->{ct}); # start tag or end tag
1997    
1998     redo A;
1999     } else {
2000     if ({
2001     0x0022 => 1, # "
2002     0x0027 => 1, # '
2003     0x003D => 1, # =
2004     }->{$self->{nc}}) {
2005    
2006 wakaba 1.11 ## XML5: Not a parse error.
2007 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2008     } else {
2009    
2010     }
2011     $self->{ca}->{value} .= chr ($self->{nc});
2012     $self->{read_until}->($self->{ca}->{value},
2013     q["'=& >],
2014     length $self->{ca}->{value});
2015    
2016     ## Stay in the state
2017    
2018     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2019     $self->{line_prev} = $self->{line};
2020     $self->{column_prev} = $self->{column};
2021     $self->{column}++;
2022     $self->{nc}
2023     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2024     } else {
2025     $self->{set_nc}->($self);
2026     }
2027    
2028     redo A;
2029     }
2030     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2031     if ($is_space->{$self->{nc}}) {
2032    
2033     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2034    
2035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2036     $self->{line_prev} = $self->{line};
2037     $self->{column_prev} = $self->{column};
2038     $self->{column}++;
2039     $self->{nc}
2040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2041     } else {
2042     $self->{set_nc}->($self);
2043     }
2044    
2045     redo A;
2046     } elsif ($self->{nc} == 0x003E) { # >
2047     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2048    
2049     $self->{last_stag_name} = $self->{ct}->{tag_name};
2050     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2051     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2052     if ($self->{ct}->{attributes}) {
2053    
2054     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2055     } else {
2056     ## NOTE: This state should never be reached.
2057    
2058     }
2059     } else {
2060     die "$0: $self->{ct}->{type}: Unknown token type";
2061     }
2062     $self->{state} = DATA_STATE;
2063 wakaba 1.5 $self->{s_kwd} = '';
2064 wakaba 1.1
2065     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2066     $self->{line_prev} = $self->{line};
2067     $self->{column_prev} = $self->{column};
2068     $self->{column}++;
2069     $self->{nc}
2070     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2071     } else {
2072     $self->{set_nc}->($self);
2073     }
2074    
2075    
2076     return ($self->{ct}); # start tag or end tag
2077    
2078     redo A;
2079     } elsif ($self->{nc} == 0x002F) { # /
2080    
2081     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2082    
2083     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2084     $self->{line_prev} = $self->{line};
2085     $self->{column_prev} = $self->{column};
2086     $self->{column}++;
2087     $self->{nc}
2088     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2089     } else {
2090     $self->{set_nc}->($self);
2091     }
2092    
2093     redo A;
2094     } elsif ($self->{nc} == -1) {
2095     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2096     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2097    
2098     $self->{last_stag_name} = $self->{ct}->{tag_name};
2099     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2100     if ($self->{ct}->{attributes}) {
2101    
2102     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2103     } else {
2104     ## NOTE: This state should never be reached.
2105    
2106     }
2107     } else {
2108     die "$0: $self->{ct}->{type}: Unknown token type";
2109     }
2110     $self->{state} = DATA_STATE;
2111 wakaba 1.5 $self->{s_kwd} = '';
2112 wakaba 1.1 ## Reconsume.
2113     return ($self->{ct}); # start tag or end tag
2114     redo A;
2115     } else {
2116    
2117     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2118     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2119     ## reconsume
2120     redo A;
2121     }
2122     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2123 wakaba 1.11 ## XML5: "Empty tag state".
2124    
2125 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2126     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2127    
2128     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2129     ## TODO: Different type than slash in start tag
2130     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2131     if ($self->{ct}->{attributes}) {
2132    
2133     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2134     } else {
2135    
2136     }
2137     ## TODO: Test |<title></title/>|
2138     } else {
2139    
2140     $self->{self_closing} = 1;
2141     }
2142    
2143     $self->{state} = DATA_STATE;
2144 wakaba 1.5 $self->{s_kwd} = '';
2145 wakaba 1.1
2146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2147     $self->{line_prev} = $self->{line};
2148     $self->{column_prev} = $self->{column};
2149     $self->{column}++;
2150     $self->{nc}
2151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2152     } else {
2153     $self->{set_nc}->($self);
2154     }
2155    
2156    
2157     return ($self->{ct}); # start tag or end tag
2158    
2159     redo A;
2160     } elsif ($self->{nc} == -1) {
2161     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2162     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2163    
2164     $self->{last_stag_name} = $self->{ct}->{tag_name};
2165     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2166     if ($self->{ct}->{attributes}) {
2167    
2168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2169     } else {
2170     ## NOTE: This state should never be reached.
2171    
2172     }
2173     } else {
2174     die "$0: $self->{ct}->{type}: Unknown token type";
2175     }
2176 wakaba 1.11 ## XML5: "Tag attribute name before state".
2177 wakaba 1.1 $self->{state} = DATA_STATE;
2178 wakaba 1.5 $self->{s_kwd} = '';
2179 wakaba 1.1 ## Reconsume.
2180     return ($self->{ct}); # start tag or end tag
2181     redo A;
2182     } else {
2183    
2184     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2185     ## TODO: This error type is wrong.
2186     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2187     ## Reconsume.
2188     redo A;
2189     }
2190     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2191     ## NOTE: Unlike spec's "bogus comment state", this implementation
2192     ## consumes characters one-by-one basis.
2193    
2194     if ($self->{nc} == 0x003E) { # >
2195 wakaba 1.13 if ($self->{in_subset}) {
2196    
2197     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2198     } else {
2199    
2200     $self->{state} = DATA_STATE;
2201     $self->{s_kwd} = '';
2202     }
2203 wakaba 1.1
2204     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2205     $self->{line_prev} = $self->{line};
2206     $self->{column_prev} = $self->{column};
2207     $self->{column}++;
2208     $self->{nc}
2209     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2210     } else {
2211     $self->{set_nc}->($self);
2212     }
2213    
2214    
2215     return ($self->{ct}); # comment
2216     redo A;
2217     } elsif ($self->{nc} == -1) {
2218 wakaba 1.13 if ($self->{in_subset}) {
2219    
2220     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2221     } else {
2222    
2223     $self->{state} = DATA_STATE;
2224     $self->{s_kwd} = '';
2225     }
2226 wakaba 1.1 ## reconsume
2227    
2228     return ($self->{ct}); # comment
2229     redo A;
2230     } else {
2231    
2232     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2233     $self->{read_until}->($self->{ct}->{data},
2234     q[>],
2235     length $self->{ct}->{data});
2236    
2237     ## Stay in the state.
2238    
2239     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2240     $self->{line_prev} = $self->{line};
2241     $self->{column_prev} = $self->{column};
2242     $self->{column}++;
2243     $self->{nc}
2244     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2245     } else {
2246     $self->{set_nc}->($self);
2247     }
2248    
2249     redo A;
2250     }
2251     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2252 wakaba 1.13 ## XML5: "Markup declaration state" and "DOCTYPE markup
2253     ## declaration state".
2254 wakaba 1.1
2255     if ($self->{nc} == 0x002D) { # -
2256    
2257     $self->{state} = MD_HYPHEN_STATE;
2258    
2259     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2260     $self->{line_prev} = $self->{line};
2261     $self->{column_prev} = $self->{column};
2262     $self->{column}++;
2263     $self->{nc}
2264     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2265     } else {
2266     $self->{set_nc}->($self);
2267     }
2268    
2269     redo A;
2270     } elsif ($self->{nc} == 0x0044 or # D
2271     $self->{nc} == 0x0064) { # d
2272     ## ASCII case-insensitive.
2273    
2274     $self->{state} = MD_DOCTYPE_STATE;
2275 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2276 wakaba 1.1
2277     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2278     $self->{line_prev} = $self->{line};
2279     $self->{column_prev} = $self->{column};
2280     $self->{column}++;
2281     $self->{nc}
2282     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2283     } else {
2284     $self->{set_nc}->($self);
2285     }
2286    
2287     redo A;
2288 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2289     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2290     $self->{is_xml}) and
2291 wakaba 1.1 $self->{nc} == 0x005B) { # [
2292    
2293     $self->{state} = MD_CDATA_STATE;
2294 wakaba 1.12 $self->{kwd} = '[';
2295 wakaba 1.1
2296     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2297     $self->{line_prev} = $self->{line};
2298     $self->{column_prev} = $self->{column};
2299     $self->{column}++;
2300     $self->{nc}
2301     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2302     } else {
2303     $self->{set_nc}->($self);
2304     }
2305    
2306     redo A;
2307     } else {
2308    
2309     }
2310    
2311     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2312     line => $self->{line_prev},
2313     column => $self->{column_prev} - 1);
2314     ## Reconsume.
2315     $self->{state} = BOGUS_COMMENT_STATE;
2316     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2317     line => $self->{line_prev},
2318     column => $self->{column_prev} - 1,
2319     };
2320     redo A;
2321     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2322     if ($self->{nc} == 0x002D) { # -
2323    
2324     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2325     line => $self->{line_prev},
2326     column => $self->{column_prev} - 2,
2327     };
2328 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2329 wakaba 1.1
2330     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2331     $self->{line_prev} = $self->{line};
2332     $self->{column_prev} = $self->{column};
2333     $self->{column}++;
2334     $self->{nc}
2335     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2336     } else {
2337     $self->{set_nc}->($self);
2338     }
2339    
2340     redo A;
2341     } else {
2342    
2343     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2344     line => $self->{line_prev},
2345     column => $self->{column_prev} - 2);
2346     $self->{state} = BOGUS_COMMENT_STATE;
2347     ## Reconsume.
2348     $self->{ct} = {type => COMMENT_TOKEN,
2349     data => '-',
2350     line => $self->{line_prev},
2351     column => $self->{column_prev} - 2,
2352     };
2353     redo A;
2354     }
2355     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2356     ## ASCII case-insensitive.
2357     if ($self->{nc} == [
2358     undef,
2359     0x004F, # O
2360     0x0043, # C
2361     0x0054, # T
2362     0x0059, # Y
2363     0x0050, # P
2364 wakaba 1.12 ]->[length $self->{kwd}] or
2365 wakaba 1.1 $self->{nc} == [
2366     undef,
2367     0x006F, # o
2368     0x0063, # c
2369     0x0074, # t
2370     0x0079, # y
2371     0x0070, # p
2372 wakaba 1.12 ]->[length $self->{kwd}]) {
2373 wakaba 1.1
2374     ## Stay in the state.
2375 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2376 wakaba 1.1
2377     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2378     $self->{line_prev} = $self->{line};
2379     $self->{column_prev} = $self->{column};
2380     $self->{column}++;
2381     $self->{nc}
2382     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2383     } else {
2384     $self->{set_nc}->($self);
2385     }
2386    
2387     redo A;
2388 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2389 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2390     $self->{nc} == 0x0065)) { # e
2391 wakaba 1.12 if ($self->{is_xml} and
2392     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2393 wakaba 1.10
2394     ## XML5: case-sensitive.
2395     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2396     text => 'DOCTYPE',
2397     line => $self->{line_prev},
2398     column => $self->{column_prev} - 5);
2399     } else {
2400    
2401     }
2402 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2403     $self->{ct} = {type => DOCTYPE_TOKEN,
2404     quirks => 1,
2405     line => $self->{line_prev},
2406     column => $self->{column_prev} - 7,
2407     };
2408    
2409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2410     $self->{line_prev} = $self->{line};
2411     $self->{column_prev} = $self->{column};
2412     $self->{column}++;
2413     $self->{nc}
2414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2415     } else {
2416     $self->{set_nc}->($self);
2417     }
2418    
2419     redo A;
2420     } else {
2421    
2422     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2423     line => $self->{line_prev},
2424 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2425 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2426     ## Reconsume.
2427     $self->{ct} = {type => COMMENT_TOKEN,
2428 wakaba 1.12 data => $self->{kwd},
2429 wakaba 1.1 line => $self->{line_prev},
2430 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2431 wakaba 1.1 };
2432     redo A;
2433     }
2434     } elsif ($self->{state} == MD_CDATA_STATE) {
2435     if ($self->{nc} == {
2436     '[' => 0x0043, # C
2437     '[C' => 0x0044, # D
2438     '[CD' => 0x0041, # A
2439     '[CDA' => 0x0054, # T
2440     '[CDAT' => 0x0041, # A
2441 wakaba 1.12 }->{$self->{kwd}}) {
2442 wakaba 1.1
2443     ## Stay in the state.
2444 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2445 wakaba 1.1
2446     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2447     $self->{line_prev} = $self->{line};
2448     $self->{column_prev} = $self->{column};
2449     $self->{column}++;
2450     $self->{nc}
2451     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2452     } else {
2453     $self->{set_nc}->($self);
2454     }
2455    
2456     redo A;
2457 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2458 wakaba 1.1 $self->{nc} == 0x005B) { # [
2459 wakaba 1.6 if ($self->{is_xml} and
2460     not $self->{tainted} and
2461     @{$self->{open_elements} or []} == 0) {
2462 wakaba 1.8
2463 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2464     line => $self->{line_prev},
2465     column => $self->{column_prev} - 7);
2466     $self->{tainted} = 1;
2467 wakaba 1.8 } else {
2468    
2469 wakaba 1.6 }
2470    
2471 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2472     data => '',
2473     line => $self->{line_prev},
2474     column => $self->{column_prev} - 7};
2475     $self->{state} = CDATA_SECTION_STATE;
2476    
2477     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2478     $self->{line_prev} = $self->{line};
2479     $self->{column_prev} = $self->{column};
2480     $self->{column}++;
2481     $self->{nc}
2482     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2483     } else {
2484     $self->{set_nc}->($self);
2485     }
2486    
2487     redo A;
2488     } else {
2489    
2490     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2491     line => $self->{line_prev},
2492 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2493 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2494     ## Reconsume.
2495     $self->{ct} = {type => COMMENT_TOKEN,
2496 wakaba 1.12 data => $self->{kwd},
2497 wakaba 1.1 line => $self->{line_prev},
2498 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2499 wakaba 1.1 };
2500     redo A;
2501     }
2502     } elsif ($self->{state} == COMMENT_START_STATE) {
2503     if ($self->{nc} == 0x002D) { # -
2504    
2505     $self->{state} = COMMENT_START_DASH_STATE;
2506    
2507     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2508     $self->{line_prev} = $self->{line};
2509     $self->{column_prev} = $self->{column};
2510     $self->{column}++;
2511     $self->{nc}
2512     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2513     } else {
2514     $self->{set_nc}->($self);
2515     }
2516    
2517     redo A;
2518     } elsif ($self->{nc} == 0x003E) { # >
2519     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2520 wakaba 1.13 if ($self->{in_subset}) {
2521    
2522     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2523     } else {
2524    
2525     $self->{state} = DATA_STATE;
2526     $self->{s_kwd} = '';
2527     }
2528 wakaba 1.1
2529     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2530     $self->{line_prev} = $self->{line};
2531     $self->{column_prev} = $self->{column};
2532     $self->{column}++;
2533     $self->{nc}
2534     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2535     } else {
2536     $self->{set_nc}->($self);
2537     }
2538    
2539    
2540     return ($self->{ct}); # comment
2541    
2542     redo A;
2543     } elsif ($self->{nc} == -1) {
2544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2545 wakaba 1.13 if ($self->{in_subset}) {
2546    
2547     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2548     } else {
2549    
2550     $self->{state} = DATA_STATE;
2551     $self->{s_kwd} = '';
2552     }
2553 wakaba 1.1 ## reconsume
2554    
2555     return ($self->{ct}); # comment
2556    
2557     redo A;
2558     } else {
2559    
2560     $self->{ct}->{data} # comment
2561     .= chr ($self->{nc});
2562     $self->{state} = COMMENT_STATE;
2563    
2564     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2565     $self->{line_prev} = $self->{line};
2566     $self->{column_prev} = $self->{column};
2567     $self->{column}++;
2568     $self->{nc}
2569     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2570     } else {
2571     $self->{set_nc}->($self);
2572     }
2573    
2574     redo A;
2575     }
2576     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2577     if ($self->{nc} == 0x002D) { # -
2578    
2579     $self->{state} = COMMENT_END_STATE;
2580    
2581     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2582     $self->{line_prev} = $self->{line};
2583     $self->{column_prev} = $self->{column};
2584     $self->{column}++;
2585     $self->{nc}
2586     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2587     } else {
2588     $self->{set_nc}->($self);
2589     }
2590    
2591     redo A;
2592     } elsif ($self->{nc} == 0x003E) { # >
2593     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2594 wakaba 1.13 if ($self->{in_subset}) {
2595    
2596     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2597     } else {
2598    
2599     $self->{state} = DATA_STATE;
2600     $self->{s_kwd} = '';
2601     }
2602 wakaba 1.1
2603     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2604     $self->{line_prev} = $self->{line};
2605     $self->{column_prev} = $self->{column};
2606     $self->{column}++;
2607     $self->{nc}
2608     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2609     } else {
2610     $self->{set_nc}->($self);
2611     }
2612    
2613    
2614     return ($self->{ct}); # comment
2615    
2616     redo A;
2617     } elsif ($self->{nc} == -1) {
2618     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2619 wakaba 1.13 if ($self->{in_subset}) {
2620    
2621     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2622     } else {
2623    
2624     $self->{state} = DATA_STATE;
2625     $self->{s_kwd} = '';
2626     }
2627 wakaba 1.1 ## reconsume
2628    
2629     return ($self->{ct}); # comment
2630    
2631     redo A;
2632     } else {
2633    
2634     $self->{ct}->{data} # comment
2635     .= '-' . chr ($self->{nc});
2636     $self->{state} = COMMENT_STATE;
2637    
2638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2639     $self->{line_prev} = $self->{line};
2640     $self->{column_prev} = $self->{column};
2641     $self->{column}++;
2642     $self->{nc}
2643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2644     } else {
2645     $self->{set_nc}->($self);
2646     }
2647    
2648     redo A;
2649     }
2650     } elsif ($self->{state} == COMMENT_STATE) {
2651     if ($self->{nc} == 0x002D) { # -
2652    
2653     $self->{state} = COMMENT_END_DASH_STATE;
2654    
2655     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2656     $self->{line_prev} = $self->{line};
2657     $self->{column_prev} = $self->{column};
2658     $self->{column}++;
2659     $self->{nc}
2660     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2661     } else {
2662     $self->{set_nc}->($self);
2663     }
2664    
2665     redo A;
2666     } elsif ($self->{nc} == -1) {
2667     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2668 wakaba 1.13 if ($self->{in_subset}) {
2669    
2670     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2671     } else {
2672    
2673     $self->{state} = DATA_STATE;
2674     $self->{s_kwd} = '';
2675     }
2676 wakaba 1.1 ## reconsume
2677    
2678     return ($self->{ct}); # comment
2679    
2680     redo A;
2681     } else {
2682    
2683     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2684     $self->{read_until}->($self->{ct}->{data},
2685     q[-],
2686     length $self->{ct}->{data});
2687    
2688     ## Stay in the state
2689    
2690     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2691     $self->{line_prev} = $self->{line};
2692     $self->{column_prev} = $self->{column};
2693     $self->{column}++;
2694     $self->{nc}
2695     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2696     } else {
2697     $self->{set_nc}->($self);
2698     }
2699    
2700     redo A;
2701     }
2702     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2703 wakaba 1.10 ## XML5: "comment dash state".
2704    
2705 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2706    
2707     $self->{state} = COMMENT_END_STATE;
2708    
2709     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2710     $self->{line_prev} = $self->{line};
2711     $self->{column_prev} = $self->{column};
2712     $self->{column}++;
2713     $self->{nc}
2714     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2715     } else {
2716     $self->{set_nc}->($self);
2717     }
2718    
2719     redo A;
2720     } elsif ($self->{nc} == -1) {
2721     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2722 wakaba 1.13 if ($self->{in_subset}) {
2723    
2724     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2725     } else {
2726    
2727     $self->{state} = DATA_STATE;
2728     $self->{s_kwd} = '';
2729     }
2730 wakaba 1.1 ## reconsume
2731    
2732     return ($self->{ct}); # comment
2733    
2734     redo A;
2735     } else {
2736    
2737     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2738     $self->{state} = COMMENT_STATE;
2739    
2740     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2741     $self->{line_prev} = $self->{line};
2742     $self->{column_prev} = $self->{column};
2743     $self->{column}++;
2744     $self->{nc}
2745     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2746     } else {
2747     $self->{set_nc}->($self);
2748     }
2749    
2750     redo A;
2751     }
2752     } elsif ($self->{state} == COMMENT_END_STATE) {
2753     if ($self->{nc} == 0x003E) { # >
2754 wakaba 1.13 if ($self->{in_subset}) {
2755    
2756     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2757     } else {
2758    
2759     $self->{state} = DATA_STATE;
2760     $self->{s_kwd} = '';
2761     }
2762 wakaba 1.1
2763     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2764     $self->{line_prev} = $self->{line};
2765     $self->{column_prev} = $self->{column};
2766     $self->{column}++;
2767     $self->{nc}
2768     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2769     } else {
2770     $self->{set_nc}->($self);
2771     }
2772    
2773    
2774     return ($self->{ct}); # comment
2775    
2776     redo A;
2777     } elsif ($self->{nc} == 0x002D) { # -
2778    
2779 wakaba 1.10 ## XML5: Not a parse error.
2780 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2781     line => $self->{line_prev},
2782     column => $self->{column_prev});
2783     $self->{ct}->{data} .= '-'; # comment
2784     ## Stay in the state
2785    
2786     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2787     $self->{line_prev} = $self->{line};
2788     $self->{column_prev} = $self->{column};
2789     $self->{column}++;
2790     $self->{nc}
2791     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2792     } else {
2793     $self->{set_nc}->($self);
2794     }
2795    
2796     redo A;
2797     } elsif ($self->{nc} == -1) {
2798     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2799 wakaba 1.13 if ($self->{in_subset}) {
2800    
2801     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2802     } else {
2803    
2804     $self->{state} = DATA_STATE;
2805     $self->{s_kwd} = '';
2806     }
2807 wakaba 1.1 ## reconsume
2808    
2809     return ($self->{ct}); # comment
2810    
2811     redo A;
2812     } else {
2813    
2814 wakaba 1.10 ## XML5: Not a parse error.
2815 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2816     line => $self->{line_prev},
2817     column => $self->{column_prev});
2818     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2819     $self->{state} = COMMENT_STATE;
2820    
2821     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2822     $self->{line_prev} = $self->{line};
2823     $self->{column_prev} = $self->{column};
2824     $self->{column}++;
2825     $self->{nc}
2826     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2827     } else {
2828     $self->{set_nc}->($self);
2829     }
2830    
2831     redo A;
2832     }
2833     } elsif ($self->{state} == DOCTYPE_STATE) {
2834     if ($is_space->{$self->{nc}}) {
2835    
2836     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2837    
2838     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2839     $self->{line_prev} = $self->{line};
2840     $self->{column_prev} = $self->{column};
2841     $self->{column}++;
2842     $self->{nc}
2843     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2844     } else {
2845     $self->{set_nc}->($self);
2846     }
2847    
2848     redo A;
2849     } else {
2850    
2851 wakaba 1.12 ## XML5: Unless EOF, swith to the bogus comment state.
2852 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2853     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2854     ## reconsume
2855     redo A;
2856     }
2857     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2858 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
2859    
2860 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2861    
2862     ## Stay in the state
2863    
2864     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2865     $self->{line_prev} = $self->{line};
2866     $self->{column_prev} = $self->{column};
2867     $self->{column}++;
2868     $self->{nc}
2869     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2870     } else {
2871     $self->{set_nc}->($self);
2872     }
2873    
2874     redo A;
2875     } elsif ($self->{nc} == 0x003E) { # >
2876    
2877 wakaba 1.12 ## XML5: No parse error.
2878 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2879     $self->{state} = DATA_STATE;
2880 wakaba 1.5 $self->{s_kwd} = '';
2881 wakaba 1.1
2882     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2883     $self->{line_prev} = $self->{line};
2884     $self->{column_prev} = $self->{column};
2885     $self->{column}++;
2886     $self->{nc}
2887     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2888     } else {
2889     $self->{set_nc}->($self);
2890     }
2891    
2892    
2893     return ($self->{ct}); # DOCTYPE (quirks)
2894    
2895     redo A;
2896     } elsif ($self->{nc} == -1) {
2897    
2898     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2899     $self->{state} = DATA_STATE;
2900 wakaba 1.5 $self->{s_kwd} = '';
2901 wakaba 1.1 ## reconsume
2902    
2903     return ($self->{ct}); # DOCTYPE (quirks)
2904    
2905     redo A;
2906 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2907    
2908     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2909     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2911     $self->{in_subset} = 1;
2912 wakaba 1.12
2913     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2914     $self->{line_prev} = $self->{line};
2915     $self->{column_prev} = $self->{column};
2916     $self->{column}++;
2917     $self->{nc}
2918     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2919     } else {
2920     $self->{set_nc}->($self);
2921     }
2922    
2923 wakaba 1.13 return ($self->{ct}); # DOCTYPE
2924 wakaba 1.12 redo A;
2925 wakaba 1.1 } else {
2926    
2927     $self->{ct}->{name} = chr $self->{nc};
2928     delete $self->{ct}->{quirks};
2929     $self->{state} = DOCTYPE_NAME_STATE;
2930    
2931     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2932     $self->{line_prev} = $self->{line};
2933     $self->{column_prev} = $self->{column};
2934     $self->{column}++;
2935     $self->{nc}
2936     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2937     } else {
2938     $self->{set_nc}->($self);
2939     }
2940    
2941     redo A;
2942     }
2943     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2944 wakaba 1.12 ## XML5: "DOCTYPE root name state".
2945    
2946     ## ISSUE: Redundant "First," in the spec.
2947    
2948 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2949    
2950     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2951    
2952     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2953     $self->{line_prev} = $self->{line};
2954     $self->{column_prev} = $self->{column};
2955     $self->{column}++;
2956     $self->{nc}
2957     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2958     } else {
2959     $self->{set_nc}->($self);
2960     }
2961    
2962     redo A;
2963     } elsif ($self->{nc} == 0x003E) { # >
2964    
2965     $self->{state} = DATA_STATE;
2966 wakaba 1.5 $self->{s_kwd} = '';
2967 wakaba 1.1
2968     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2969     $self->{line_prev} = $self->{line};
2970     $self->{column_prev} = $self->{column};
2971     $self->{column}++;
2972     $self->{nc}
2973     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2974     } else {
2975     $self->{set_nc}->($self);
2976     }
2977    
2978    
2979     return ($self->{ct}); # DOCTYPE
2980    
2981     redo A;
2982     } elsif ($self->{nc} == -1) {
2983    
2984     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2985     $self->{state} = DATA_STATE;
2986 wakaba 1.5 $self->{s_kwd} = '';
2987 wakaba 1.1 ## reconsume
2988    
2989     $self->{ct}->{quirks} = 1;
2990     return ($self->{ct}); # DOCTYPE
2991    
2992     redo A;
2993 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2994    
2995     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2996 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2997     $self->{in_subset} = 1;
2998 wakaba 1.12
2999     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3000     $self->{line_prev} = $self->{line};
3001     $self->{column_prev} = $self->{column};
3002     $self->{column}++;
3003     $self->{nc}
3004     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3005     } else {
3006     $self->{set_nc}->($self);
3007     }
3008    
3009 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3010 wakaba 1.12 redo A;
3011 wakaba 1.1 } else {
3012    
3013     $self->{ct}->{name}
3014     .= chr ($self->{nc}); # DOCTYPE
3015     ## Stay in the state
3016    
3017     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3018     $self->{line_prev} = $self->{line};
3019     $self->{column_prev} = $self->{column};
3020     $self->{column}++;
3021     $self->{nc}
3022     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3023     } else {
3024     $self->{set_nc}->($self);
3025     }
3026    
3027     redo A;
3028     }
3029     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3030 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3031     ## state", but implemented differently.
3032    
3033 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3034    
3035     ## Stay in the state
3036    
3037     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3038     $self->{line_prev} = $self->{line};
3039     $self->{column_prev} = $self->{column};
3040     $self->{column}++;
3041     $self->{nc}
3042     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3043     } else {
3044     $self->{set_nc}->($self);
3045     }
3046    
3047     redo A;
3048     } elsif ($self->{nc} == 0x003E) { # >
3049    
3050     $self->{state} = DATA_STATE;
3051 wakaba 1.5 $self->{s_kwd} = '';
3052 wakaba 1.1
3053     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3054     $self->{line_prev} = $self->{line};
3055     $self->{column_prev} = $self->{column};
3056     $self->{column}++;
3057     $self->{nc}
3058     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3059     } else {
3060     $self->{set_nc}->($self);
3061     }
3062    
3063    
3064     return ($self->{ct}); # DOCTYPE
3065    
3066     redo A;
3067     } elsif ($self->{nc} == -1) {
3068    
3069     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3070     $self->{state} = DATA_STATE;
3071 wakaba 1.5 $self->{s_kwd} = '';
3072 wakaba 1.1 ## reconsume
3073    
3074     $self->{ct}->{quirks} = 1;
3075     return ($self->{ct}); # DOCTYPE
3076    
3077     redo A;
3078     } elsif ($self->{nc} == 0x0050 or # P
3079     $self->{nc} == 0x0070) { # p
3080 wakaba 1.12
3081 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3082 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3083 wakaba 1.1
3084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085     $self->{line_prev} = $self->{line};
3086     $self->{column_prev} = $self->{column};
3087     $self->{column}++;
3088     $self->{nc}
3089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3090     } else {
3091     $self->{set_nc}->($self);
3092     }
3093    
3094     redo A;
3095     } elsif ($self->{nc} == 0x0053 or # S
3096     $self->{nc} == 0x0073) { # s
3097 wakaba 1.12
3098 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3099 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3100    
3101     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3102     $self->{line_prev} = $self->{line};
3103     $self->{column_prev} = $self->{column};
3104     $self->{column}++;
3105     $self->{nc}
3106     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3107     } else {
3108     $self->{set_nc}->($self);
3109     }
3110    
3111     redo A;
3112     } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3113    
3114     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3115     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3116 wakaba 1.13 $self->{in_subset} = 1;
3117 wakaba 1.1
3118     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3119     $self->{line_prev} = $self->{line};
3120     $self->{column_prev} = $self->{column};
3121     $self->{column}++;
3122     $self->{nc}
3123     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3124     } else {
3125     $self->{set_nc}->($self);
3126     }
3127    
3128 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3129 wakaba 1.1 redo A;
3130     } else {
3131    
3132     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
3133     $self->{ct}->{quirks} = 1;
3134    
3135     $self->{state} = BOGUS_DOCTYPE_STATE;
3136    
3137     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3138     $self->{line_prev} = $self->{line};
3139     $self->{column_prev} = $self->{column};
3140     $self->{column}++;
3141     $self->{nc}
3142     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3143     } else {
3144     $self->{set_nc}->($self);
3145     }
3146    
3147     redo A;
3148     }
3149     } elsif ($self->{state} == PUBLIC_STATE) {
3150     ## ASCII case-insensitive
3151     if ($self->{nc} == [
3152     undef,
3153     0x0055, # U
3154     0x0042, # B
3155     0x004C, # L
3156     0x0049, # I
3157 wakaba 1.12 ]->[length $self->{kwd}] or
3158 wakaba 1.1 $self->{nc} == [
3159     undef,
3160     0x0075, # u
3161     0x0062, # b
3162     0x006C, # l
3163     0x0069, # i
3164 wakaba 1.12 ]->[length $self->{kwd}]) {
3165 wakaba 1.1
3166     ## Stay in the state.
3167 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3168 wakaba 1.1
3169     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3170     $self->{line_prev} = $self->{line};
3171     $self->{column_prev} = $self->{column};
3172     $self->{column}++;
3173     $self->{nc}
3174     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3175     } else {
3176     $self->{set_nc}->($self);
3177     }
3178    
3179     redo A;
3180 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3181 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3182     $self->{nc} == 0x0063)) { # c
3183 wakaba 1.12 if ($self->{is_xml} and
3184     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3185    
3186     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3187     text => 'PUBLIC',
3188     line => $self->{line_prev},
3189     column => $self->{column_prev} - 4);
3190     } else {
3191    
3192     }
3193 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3194    
3195     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3196     $self->{line_prev} = $self->{line};
3197     $self->{column_prev} = $self->{column};
3198     $self->{column}++;
3199     $self->{nc}
3200     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3201     } else {
3202     $self->{set_nc}->($self);
3203     }
3204    
3205     redo A;
3206     } else {
3207    
3208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3209     line => $self->{line_prev},
3210 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3211 wakaba 1.1 $self->{ct}->{quirks} = 1;
3212    
3213     $self->{state} = BOGUS_DOCTYPE_STATE;
3214     ## Reconsume.
3215     redo A;
3216     }
3217     } elsif ($self->{state} == SYSTEM_STATE) {
3218     ## ASCII case-insensitive
3219     if ($self->{nc} == [
3220     undef,
3221     0x0059, # Y
3222     0x0053, # S
3223     0x0054, # T
3224     0x0045, # E
3225 wakaba 1.12 ]->[length $self->{kwd}] or
3226 wakaba 1.1 $self->{nc} == [
3227     undef,
3228     0x0079, # y
3229     0x0073, # s
3230     0x0074, # t
3231     0x0065, # e
3232 wakaba 1.12 ]->[length $self->{kwd}]) {
3233 wakaba 1.1
3234     ## Stay in the state.
3235 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3236 wakaba 1.1
3237     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238     $self->{line_prev} = $self->{line};
3239     $self->{column_prev} = $self->{column};
3240     $self->{column}++;
3241     $self->{nc}
3242     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3243     } else {
3244     $self->{set_nc}->($self);
3245     }
3246    
3247     redo A;
3248 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3249 wakaba 1.1 ($self->{nc} == 0x004D or # M
3250     $self->{nc} == 0x006D)) { # m
3251 wakaba 1.12 if ($self->{is_xml} and
3252     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3253    
3254     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3255     text => 'SYSTEM',
3256     line => $self->{line_prev},
3257     column => $self->{column_prev} - 4);
3258     } else {
3259    
3260     }
3261 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3262    
3263     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3264     $self->{line_prev} = $self->{line};
3265     $self->{column_prev} = $self->{column};
3266     $self->{column}++;
3267     $self->{nc}
3268     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3269     } else {
3270     $self->{set_nc}->($self);
3271     }
3272    
3273     redo A;
3274     } else {
3275    
3276     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3277     line => $self->{line_prev},
3278 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3279 wakaba 1.1 $self->{ct}->{quirks} = 1;
3280    
3281     $self->{state} = BOGUS_DOCTYPE_STATE;
3282     ## Reconsume.
3283     redo A;
3284     }
3285     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3286     if ($is_space->{$self->{nc}}) {
3287    
3288     ## Stay in the state
3289    
3290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3291     $self->{line_prev} = $self->{line};
3292     $self->{column_prev} = $self->{column};
3293     $self->{column}++;
3294     $self->{nc}
3295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3296     } else {
3297     $self->{set_nc}->($self);
3298     }
3299    
3300     redo A;
3301     } elsif ($self->{nc} eq 0x0022) { # "
3302    
3303     $self->{ct}->{pubid} = ''; # DOCTYPE
3304     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3305    
3306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3307     $self->{line_prev} = $self->{line};
3308     $self->{column_prev} = $self->{column};
3309     $self->{column}++;
3310     $self->{nc}
3311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3312     } else {
3313     $self->{set_nc}->($self);
3314     }
3315    
3316     redo A;
3317     } elsif ($self->{nc} eq 0x0027) { # '
3318    
3319     $self->{ct}->{pubid} = ''; # DOCTYPE
3320     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3321    
3322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3323     $self->{line_prev} = $self->{line};
3324     $self->{column_prev} = $self->{column};
3325     $self->{column}++;
3326     $self->{nc}
3327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3328     } else {
3329     $self->{set_nc}->($self);
3330     }
3331    
3332     redo A;
3333     } elsif ($self->{nc} eq 0x003E) { # >
3334    
3335     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3336    
3337     $self->{state} = DATA_STATE;
3338 wakaba 1.5 $self->{s_kwd} = '';
3339 wakaba 1.1
3340     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3341     $self->{line_prev} = $self->{line};
3342     $self->{column_prev} = $self->{column};
3343     $self->{column}++;
3344     $self->{nc}
3345     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3346     } else {
3347     $self->{set_nc}->($self);
3348     }
3349    
3350    
3351     $self->{ct}->{quirks} = 1;
3352     return ($self->{ct}); # DOCTYPE
3353    
3354     redo A;
3355     } elsif ($self->{nc} == -1) {
3356    
3357     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3358    
3359     $self->{state} = DATA_STATE;
3360 wakaba 1.5 $self->{s_kwd} = '';
3361 wakaba 1.1 ## reconsume
3362    
3363     $self->{ct}->{quirks} = 1;
3364     return ($self->{ct}); # DOCTYPE
3365    
3366     redo A;
3367 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3368    
3369     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3370     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3371     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3372 wakaba 1.13 $self->{in_subset} = 1;
3373 wakaba 1.12
3374     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3375     $self->{line_prev} = $self->{line};
3376     $self->{column_prev} = $self->{column};
3377     $self->{column}++;
3378     $self->{nc}
3379     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3380     } else {
3381     $self->{set_nc}->($self);
3382     }
3383    
3384 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3385 wakaba 1.12 redo A;
3386 wakaba 1.1 } else {
3387    
3388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3389     $self->{ct}->{quirks} = 1;
3390    
3391     $self->{state} = BOGUS_DOCTYPE_STATE;
3392    
3393     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3394     $self->{line_prev} = $self->{line};
3395     $self->{column_prev} = $self->{column};
3396     $self->{column}++;
3397     $self->{nc}
3398     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3399     } else {
3400     $self->{set_nc}->($self);
3401     }
3402    
3403     redo A;
3404     }
3405     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3406     if ($self->{nc} == 0x0022) { # "
3407    
3408     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3409    
3410     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3411     $self->{line_prev} = $self->{line};
3412     $self->{column_prev} = $self->{column};
3413     $self->{column}++;
3414     $self->{nc}
3415     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3416     } else {
3417     $self->{set_nc}->($self);
3418     }
3419    
3420     redo A;
3421     } elsif ($self->{nc} == 0x003E) { # >
3422    
3423     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3424    
3425     $self->{state} = DATA_STATE;
3426 wakaba 1.5 $self->{s_kwd} = '';
3427 wakaba 1.1
3428     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3429     $self->{line_prev} = $self->{line};
3430     $self->{column_prev} = $self->{column};
3431     $self->{column}++;
3432     $self->{nc}
3433     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3434     } else {
3435     $self->{set_nc}->($self);
3436     }
3437    
3438    
3439     $self->{ct}->{quirks} = 1;
3440     return ($self->{ct}); # DOCTYPE
3441    
3442     redo A;
3443     } elsif ($self->{nc} == -1) {
3444    
3445     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3446    
3447     $self->{state} = DATA_STATE;
3448 wakaba 1.5 $self->{s_kwd} = '';
3449 wakaba 1.1 ## reconsume
3450    
3451     $self->{ct}->{quirks} = 1;
3452     return ($self->{ct}); # DOCTYPE
3453    
3454     redo A;
3455     } else {
3456    
3457     $self->{ct}->{pubid} # DOCTYPE
3458     .= chr $self->{nc};
3459     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3460     length $self->{ct}->{pubid});
3461    
3462     ## Stay in the state
3463    
3464     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3465     $self->{line_prev} = $self->{line};
3466     $self->{column_prev} = $self->{column};
3467     $self->{column}++;
3468     $self->{nc}
3469     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3470     } else {
3471     $self->{set_nc}->($self);
3472     }
3473    
3474     redo A;
3475     }
3476     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3477     if ($self->{nc} == 0x0027) { # '
3478    
3479     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3480    
3481     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3482     $self->{line_prev} = $self->{line};
3483     $self->{column_prev} = $self->{column};
3484     $self->{column}++;
3485     $self->{nc}
3486     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3487     } else {
3488     $self->{set_nc}->($self);
3489     }
3490    
3491     redo A;
3492     } elsif ($self->{nc} == 0x003E) { # >
3493    
3494     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3495    
3496     $self->{state} = DATA_STATE;
3497 wakaba 1.5 $self->{s_kwd} = '';
3498 wakaba 1.1
3499     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3500     $self->{line_prev} = $self->{line};
3501     $self->{column_prev} = $self->{column};
3502     $self->{column}++;
3503     $self->{nc}
3504     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3505     } else {
3506     $self->{set_nc}->($self);
3507     }
3508    
3509    
3510     $self->{ct}->{quirks} = 1;
3511     return ($self->{ct}); # DOCTYPE
3512    
3513     redo A;
3514     } elsif ($self->{nc} == -1) {
3515    
3516     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3517    
3518     $self->{state} = DATA_STATE;
3519 wakaba 1.5 $self->{s_kwd} = '';
3520 wakaba 1.1 ## reconsume
3521    
3522     $self->{ct}->{quirks} = 1;
3523     return ($self->{ct}); # DOCTYPE
3524    
3525     redo A;
3526     } else {
3527    
3528     $self->{ct}->{pubid} # DOCTYPE
3529     .= chr $self->{nc};
3530     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3531     length $self->{ct}->{pubid});
3532    
3533     ## Stay in the state
3534    
3535     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3536     $self->{line_prev} = $self->{line};
3537     $self->{column_prev} = $self->{column};
3538     $self->{column}++;
3539     $self->{nc}
3540     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3541     } else {
3542     $self->{set_nc}->($self);
3543     }
3544    
3545     redo A;
3546     }
3547     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3548     if ($is_space->{$self->{nc}}) {
3549    
3550     ## Stay in the state
3551    
3552     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3553     $self->{line_prev} = $self->{line};
3554     $self->{column_prev} = $self->{column};
3555     $self->{column}++;
3556     $self->{nc}
3557     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3558     } else {
3559     $self->{set_nc}->($self);
3560     }
3561    
3562     redo A;
3563     } elsif ($self->{nc} == 0x0022) { # "
3564    
3565     $self->{ct}->{sysid} = ''; # DOCTYPE
3566     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3567    
3568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3569     $self->{line_prev} = $self->{line};
3570     $self->{column_prev} = $self->{column};
3571     $self->{column}++;
3572     $self->{nc}
3573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3574     } else {
3575     $self->{set_nc}->($self);
3576     }
3577    
3578     redo A;
3579     } elsif ($self->{nc} == 0x0027) { # '
3580    
3581     $self->{ct}->{sysid} = ''; # DOCTYPE
3582     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3583    
3584     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3585     $self->{line_prev} = $self->{line};
3586     $self->{column_prev} = $self->{column};
3587     $self->{column}++;
3588     $self->{nc}
3589     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3590     } else {
3591     $self->{set_nc}->($self);
3592     }
3593    
3594     redo A;
3595     } elsif ($self->{nc} == 0x003E) { # >
3596 wakaba 1.12 if ($self->{is_xml}) {
3597    
3598     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3599     } else {
3600    
3601     }
3602 wakaba 1.1 $self->{state} = DATA_STATE;
3603 wakaba 1.5 $self->{s_kwd} = '';
3604 wakaba 1.1
3605     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3606     $self->{line_prev} = $self->{line};
3607     $self->{column_prev} = $self->{column};
3608     $self->{column}++;
3609     $self->{nc}
3610     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3611     } else {
3612     $self->{set_nc}->($self);
3613     }
3614    
3615    
3616     return ($self->{ct}); # DOCTYPE
3617    
3618     redo A;
3619     } elsif ($self->{nc} == -1) {
3620    
3621     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3622    
3623     $self->{state} = DATA_STATE;
3624 wakaba 1.5 $self->{s_kwd} = '';
3625 wakaba 1.1 ## reconsume
3626    
3627     $self->{ct}->{quirks} = 1;
3628     return ($self->{ct}); # DOCTYPE
3629    
3630     redo A;
3631 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3632    
3633     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3634     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3635     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3636 wakaba 1.13 $self->{in_subset} = 1;
3637 wakaba 1.12
3638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3639     $self->{line_prev} = $self->{line};
3640     $self->{column_prev} = $self->{column};
3641     $self->{column}++;
3642     $self->{nc}
3643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3644     } else {
3645     $self->{set_nc}->($self);
3646     }
3647    
3648 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3649 wakaba 1.12 redo A;
3650 wakaba 1.1 } else {
3651    
3652     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3653     $self->{ct}->{quirks} = 1;
3654    
3655     $self->{state} = BOGUS_DOCTYPE_STATE;
3656    
3657     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3658     $self->{line_prev} = $self->{line};
3659     $self->{column_prev} = $self->{column};
3660     $self->{column}++;
3661     $self->{nc}
3662     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3663     } else {
3664     $self->{set_nc}->($self);
3665     }
3666    
3667     redo A;
3668     }
3669     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3670     if ($is_space->{$self->{nc}}) {
3671    
3672     ## Stay in the state
3673    
3674     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3675     $self->{line_prev} = $self->{line};
3676     $self->{column_prev} = $self->{column};
3677     $self->{column}++;
3678     $self->{nc}
3679     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3680     } else {
3681     $self->{set_nc}->($self);
3682     }
3683    
3684     redo A;
3685     } elsif ($self->{nc} == 0x0022) { # "
3686    
3687     $self->{ct}->{sysid} = ''; # DOCTYPE
3688     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3689    
3690     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3691     $self->{line_prev} = $self->{line};
3692     $self->{column_prev} = $self->{column};
3693     $self->{column}++;
3694     $self->{nc}
3695     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3696     } else {
3697     $self->{set_nc}->($self);
3698     }
3699    
3700     redo A;
3701     } elsif ($self->{nc} == 0x0027) { # '
3702    
3703     $self->{ct}->{sysid} = ''; # DOCTYPE
3704     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3705    
3706     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3707     $self->{line_prev} = $self->{line};
3708     $self->{column_prev} = $self->{column};
3709     $self->{column}++;
3710     $self->{nc}
3711     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3712     } else {
3713     $self->{set_nc}->($self);
3714     }
3715    
3716     redo A;
3717     } elsif ($self->{nc} == 0x003E) { # >
3718    
3719     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3720     $self->{state} = DATA_STATE;
3721 wakaba 1.5 $self->{s_kwd} = '';
3722 wakaba 1.1
3723     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3724     $self->{line_prev} = $self->{line};
3725     $self->{column_prev} = $self->{column};
3726     $self->{column}++;
3727     $self->{nc}
3728     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3729     } else {
3730     $self->{set_nc}->($self);
3731     }
3732    
3733    
3734     $self->{ct}->{quirks} = 1;
3735     return ($self->{ct}); # DOCTYPE
3736    
3737     redo A;
3738     } elsif ($self->{nc} == -1) {
3739    
3740     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3741    
3742     $self->{state} = DATA_STATE;
3743 wakaba 1.5 $self->{s_kwd} = '';
3744 wakaba 1.1 ## reconsume
3745    
3746     $self->{ct}->{quirks} = 1;
3747     return ($self->{ct}); # DOCTYPE
3748    
3749     redo A;
3750 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3751    
3752     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3753    
3754     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3755     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3756 wakaba 1.13 $self->{in_subset} = 1;
3757 wakaba 1.12
3758     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3759     $self->{line_prev} = $self->{line};
3760     $self->{column_prev} = $self->{column};
3761     $self->{column}++;
3762     $self->{nc}
3763     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3764     } else {
3765     $self->{set_nc}->($self);
3766     }
3767    
3768 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3769 wakaba 1.12 redo A;
3770 wakaba 1.1 } else {
3771    
3772     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3773     $self->{ct}->{quirks} = 1;
3774    
3775     $self->{state} = BOGUS_DOCTYPE_STATE;
3776    
3777     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3778     $self->{line_prev} = $self->{line};
3779     $self->{column_prev} = $self->{column};
3780     $self->{column}++;
3781     $self->{nc}
3782     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3783     } else {
3784     $self->{set_nc}->($self);
3785     }
3786    
3787     redo A;
3788     }
3789     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3790     if ($self->{nc} == 0x0022) { # "
3791    
3792     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3793    
3794     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3795     $self->{line_prev} = $self->{line};
3796     $self->{column_prev} = $self->{column};
3797     $self->{column}++;
3798     $self->{nc}
3799     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3800     } else {
3801     $self->{set_nc}->($self);
3802     }
3803    
3804     redo A;
3805 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3806 wakaba 1.1
3807     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3808    
3809     $self->{state} = DATA_STATE;
3810 wakaba 1.5 $self->{s_kwd} = '';
3811 wakaba 1.1
3812     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3813     $self->{line_prev} = $self->{line};
3814     $self->{column_prev} = $self->{column};
3815     $self->{column}++;
3816     $self->{nc}
3817     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3818     } else {
3819     $self->{set_nc}->($self);
3820     }
3821    
3822    
3823     $self->{ct}->{quirks} = 1;
3824     return ($self->{ct}); # DOCTYPE
3825    
3826     redo A;
3827     } elsif ($self->{nc} == -1) {
3828    
3829     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3830    
3831     $self->{state} = DATA_STATE;
3832 wakaba 1.5 $self->{s_kwd} = '';
3833 wakaba 1.1 ## reconsume
3834    
3835     $self->{ct}->{quirks} = 1;
3836     return ($self->{ct}); # DOCTYPE
3837    
3838     redo A;
3839     } else {
3840    
3841     $self->{ct}->{sysid} # DOCTYPE
3842     .= chr $self->{nc};
3843     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3844     length $self->{ct}->{sysid});
3845    
3846     ## Stay in the state
3847    
3848     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3849     $self->{line_prev} = $self->{line};
3850     $self->{column_prev} = $self->{column};
3851     $self->{column}++;
3852     $self->{nc}
3853     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3854     } else {
3855     $self->{set_nc}->($self);
3856     }
3857    
3858     redo A;
3859     }
3860     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3861     if ($self->{nc} == 0x0027) { # '
3862    
3863     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3864    
3865     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3866     $self->{line_prev} = $self->{line};
3867     $self->{column_prev} = $self->{column};
3868     $self->{column}++;
3869     $self->{nc}
3870     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3871     } else {
3872     $self->{set_nc}->($self);
3873     }
3874    
3875     redo A;
3876 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3877 wakaba 1.1
3878     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3879    
3880     $self->{state} = DATA_STATE;
3881 wakaba 1.5 $self->{s_kwd} = '';
3882 wakaba 1.1
3883     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3884     $self->{line_prev} = $self->{line};
3885     $self->{column_prev} = $self->{column};
3886     $self->{column}++;
3887     $self->{nc}
3888     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3889     } else {
3890     $self->{set_nc}->($self);
3891     }
3892    
3893    
3894     $self->{ct}->{quirks} = 1;
3895     return ($self->{ct}); # DOCTYPE
3896    
3897     redo A;
3898     } elsif ($self->{nc} == -1) {
3899    
3900     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3901    
3902     $self->{state} = DATA_STATE;
3903 wakaba 1.5 $self->{s_kwd} = '';
3904 wakaba 1.1 ## reconsume
3905    
3906     $self->{ct}->{quirks} = 1;
3907     return ($self->{ct}); # DOCTYPE
3908    
3909     redo A;
3910     } else {
3911    
3912     $self->{ct}->{sysid} # DOCTYPE
3913     .= chr $self->{nc};
3914     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3915     length $self->{ct}->{sysid});
3916    
3917     ## Stay in the state
3918    
3919     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3920     $self->{line_prev} = $self->{line};
3921     $self->{column_prev} = $self->{column};
3922     $self->{column}++;
3923     $self->{nc}
3924     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3925     } else {
3926     $self->{set_nc}->($self);
3927     }
3928    
3929     redo A;
3930     }
3931     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3932     if ($is_space->{$self->{nc}}) {
3933    
3934     ## Stay in the state
3935    
3936     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3937     $self->{line_prev} = $self->{line};
3938     $self->{column_prev} = $self->{column};
3939     $self->{column}++;
3940     $self->{nc}
3941     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3942     } else {
3943     $self->{set_nc}->($self);
3944     }
3945    
3946     redo A;
3947     } elsif ($self->{nc} == 0x003E) { # >
3948    
3949     $self->{state} = DATA_STATE;
3950 wakaba 1.5 $self->{s_kwd} = '';
3951 wakaba 1.1
3952     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3953     $self->{line_prev} = $self->{line};
3954     $self->{column_prev} = $self->{column};
3955     $self->{column}++;
3956     $self->{nc}
3957     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3958     } else {
3959     $self->{set_nc}->($self);
3960     }
3961    
3962    
3963     return ($self->{ct}); # DOCTYPE
3964    
3965     redo A;
3966     } elsif ($self->{nc} == -1) {
3967    
3968     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3969     $self->{state} = DATA_STATE;
3970 wakaba 1.5 $self->{s_kwd} = '';
3971 wakaba 1.1 ## reconsume
3972    
3973     $self->{ct}->{quirks} = 1;
3974     return ($self->{ct}); # DOCTYPE
3975    
3976     redo A;
3977 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3978    
3979     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3980     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3981 wakaba 1.13 $self->{in_subset} = 1;
3982 wakaba 1.12
3983     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3984     $self->{line_prev} = $self->{line};
3985     $self->{column_prev} = $self->{column};
3986     $self->{column}++;
3987     $self->{nc}
3988     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3989     } else {
3990     $self->{set_nc}->($self);
3991     }
3992    
3993 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3994 wakaba 1.12 redo A;
3995 wakaba 1.1 } else {
3996    
3997     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3998     #$self->{ct}->{quirks} = 1;
3999    
4000     $self->{state} = BOGUS_DOCTYPE_STATE;
4001    
4002     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4003     $self->{line_prev} = $self->{line};
4004     $self->{column_prev} = $self->{column};
4005     $self->{column}++;
4006     $self->{nc}
4007     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4008     } else {
4009     $self->{set_nc}->($self);
4010     }
4011    
4012     redo A;
4013     }
4014     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4015     if ($self->{nc} == 0x003E) { # >
4016    
4017     $self->{state} = DATA_STATE;
4018 wakaba 1.5 $self->{s_kwd} = '';
4019 wakaba 1.1
4020     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4021     $self->{line_prev} = $self->{line};
4022     $self->{column_prev} = $self->{column};
4023     $self->{column}++;
4024     $self->{nc}
4025     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4026     } else {
4027     $self->{set_nc}->($self);
4028     }
4029    
4030    
4031     return ($self->{ct}); # DOCTYPE
4032    
4033     redo A;
4034 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4035 wakaba 1.13
4036     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4037     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4038     $self->{in_subset} = 1;
4039    
4040 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4041     $self->{line_prev} = $self->{line};
4042     $self->{column_prev} = $self->{column};
4043     $self->{column}++;
4044     $self->{nc}
4045     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4046     } else {
4047     $self->{set_nc}->($self);
4048     }
4049    
4050 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4051     redo A;
4052 wakaba 1.1 } elsif ($self->{nc} == -1) {
4053    
4054     $self->{state} = DATA_STATE;
4055 wakaba 1.5 $self->{s_kwd} = '';
4056 wakaba 1.1 ## reconsume
4057    
4058     return ($self->{ct}); # DOCTYPE
4059    
4060     redo A;
4061     } else {
4062    
4063     my $s = '';
4064 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4065 wakaba 1.1
4066     ## Stay in the state
4067    
4068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4069     $self->{line_prev} = $self->{line};
4070     $self->{column_prev} = $self->{column};
4071     $self->{column}++;
4072     $self->{nc}
4073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4074     } else {
4075     $self->{set_nc}->($self);
4076     }
4077    
4078     redo A;
4079     }
4080     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4081     ## NOTE: "CDATA section state" in the state is jointly implemented
4082     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4083     ## and |CDATA_SECTION_MSE2_STATE|.
4084 wakaba 1.10
4085     ## XML5: "CDATA state".
4086 wakaba 1.1
4087     if ($self->{nc} == 0x005D) { # ]
4088    
4089     $self->{state} = CDATA_SECTION_MSE1_STATE;
4090    
4091     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4092     $self->{line_prev} = $self->{line};
4093     $self->{column_prev} = $self->{column};
4094     $self->{column}++;
4095     $self->{nc}
4096     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4097     } else {
4098     $self->{set_nc}->($self);
4099     }
4100    
4101     redo A;
4102     } elsif ($self->{nc} == -1) {
4103 wakaba 1.6 if ($self->{is_xml}) {
4104 wakaba 1.8
4105 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4106 wakaba 1.8 } else {
4107    
4108 wakaba 1.6 }
4109    
4110 wakaba 1.1 $self->{state} = DATA_STATE;
4111 wakaba 1.5 $self->{s_kwd} = '';
4112 wakaba 1.10 ## Reconsume.
4113 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4114    
4115     return ($self->{ct}); # character
4116     } else {
4117    
4118     ## No token to emit. $self->{ct} is discarded.
4119     }
4120     redo A;
4121     } else {
4122    
4123     $self->{ct}->{data} .= chr $self->{nc};
4124     $self->{read_until}->($self->{ct}->{data},
4125     q<]>,
4126     length $self->{ct}->{data});
4127    
4128     ## Stay in the state.
4129    
4130     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4131     $self->{line_prev} = $self->{line};
4132     $self->{column_prev} = $self->{column};
4133     $self->{column}++;
4134     $self->{nc}
4135     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4136     } else {
4137     $self->{set_nc}->($self);
4138     }
4139    
4140     redo A;
4141     }
4142    
4143     ## ISSUE: "text tokens" in spec.
4144     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4145 wakaba 1.10 ## XML5: "CDATA bracket state".
4146    
4147 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4148    
4149     $self->{state} = CDATA_SECTION_MSE2_STATE;
4150    
4151     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4152     $self->{line_prev} = $self->{line};
4153     $self->{column_prev} = $self->{column};
4154     $self->{column}++;
4155     $self->{nc}
4156     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4157     } else {
4158     $self->{set_nc}->($self);
4159     }
4160    
4161     redo A;
4162     } else {
4163    
4164 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4165 wakaba 1.1 $self->{ct}->{data} .= ']';
4166 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4167 wakaba 1.1 ## Reconsume.
4168     redo A;
4169     }
4170     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4171 wakaba 1.10 ## XML5: "CDATA end state".
4172    
4173 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4174     $self->{state} = DATA_STATE;
4175 wakaba 1.5 $self->{s_kwd} = '';
4176 wakaba 1.1
4177     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4178     $self->{line_prev} = $self->{line};
4179     $self->{column_prev} = $self->{column};
4180     $self->{column}++;
4181     $self->{nc}
4182     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4183     } else {
4184     $self->{set_nc}->($self);
4185     }
4186    
4187     if (length $self->{ct}->{data}) { # character
4188    
4189     return ($self->{ct}); # character
4190     } else {
4191    
4192     ## No token to emit. $self->{ct} is discarded.
4193     }
4194     redo A;
4195     } elsif ($self->{nc} == 0x005D) { # ]
4196     # character
4197     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4198     ## Stay in the state.
4199    
4200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4201     $self->{line_prev} = $self->{line};
4202     $self->{column_prev} = $self->{column};
4203     $self->{column}++;
4204     $self->{nc}
4205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4206     } else {
4207     $self->{set_nc}->($self);
4208     }
4209    
4210     redo A;
4211     } else {
4212    
4213     $self->{ct}->{data} .= ']]'; # character
4214     $self->{state} = CDATA_SECTION_STATE;
4215 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4216 wakaba 1.1 redo A;
4217     }
4218     } elsif ($self->{state} == ENTITY_STATE) {
4219     if ($is_space->{$self->{nc}} or
4220     {
4221     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4222     $self->{entity_add} => 1,
4223     }->{$self->{nc}}) {
4224    
4225     ## Don't consume
4226     ## No error
4227     ## Return nothing.
4228     #
4229     } elsif ($self->{nc} == 0x0023) { # #
4230    
4231     $self->{state} = ENTITY_HASH_STATE;
4232 wakaba 1.12 $self->{kwd} = '#';
4233 wakaba 1.1
4234     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235     $self->{line_prev} = $self->{line};
4236     $self->{column_prev} = $self->{column};
4237     $self->{column}++;
4238     $self->{nc}
4239     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4240     } else {
4241     $self->{set_nc}->($self);
4242     }
4243    
4244     redo A;
4245     } elsif ((0x0041 <= $self->{nc} and
4246     $self->{nc} <= 0x005A) or # A..Z
4247     (0x0061 <= $self->{nc} and
4248     $self->{nc} <= 0x007A)) { # a..z
4249    
4250     require Whatpm::_NamedEntityList;
4251     $self->{state} = ENTITY_NAME_STATE;
4252 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4253     $self->{entity__value} = $self->{kwd};
4254 wakaba 1.1 $self->{entity__match} = 0;
4255    
4256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257     $self->{line_prev} = $self->{line};
4258     $self->{column_prev} = $self->{column};
4259     $self->{column}++;
4260     $self->{nc}
4261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262     } else {
4263     $self->{set_nc}->($self);
4264     }
4265    
4266     redo A;
4267     } else {
4268    
4269     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4270     ## Return nothing.
4271     #
4272     }
4273    
4274     ## NOTE: No character is consumed by the "consume a character
4275     ## reference" algorithm. In other word, there is an "&" character
4276     ## that does not introduce a character reference, which would be
4277     ## appended to the parent element or the attribute value in later
4278     ## process of the tokenizer.
4279    
4280     if ($self->{prev_state} == DATA_STATE) {
4281    
4282     $self->{state} = $self->{prev_state};
4283 wakaba 1.5 $self->{s_kwd} = '';
4284 wakaba 1.1 ## Reconsume.
4285     return ({type => CHARACTER_TOKEN, data => '&',
4286     line => $self->{line_prev},
4287     column => $self->{column_prev},
4288     });
4289     redo A;
4290     } else {
4291    
4292     $self->{ca}->{value} .= '&';
4293     $self->{state} = $self->{prev_state};
4294 wakaba 1.5 $self->{s_kwd} = '';
4295 wakaba 1.1 ## Reconsume.
4296     redo A;
4297     }
4298     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4299     if ($self->{nc} == 0x0078 or # x
4300     $self->{nc} == 0x0058) { # X
4301    
4302     $self->{state} = HEXREF_X_STATE;
4303 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4304 wakaba 1.1
4305     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4306     $self->{line_prev} = $self->{line};
4307     $self->{column_prev} = $self->{column};
4308     $self->{column}++;
4309     $self->{nc}
4310     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4311     } else {
4312     $self->{set_nc}->($self);
4313     }
4314    
4315     redo A;
4316     } elsif (0x0030 <= $self->{nc} and
4317     $self->{nc} <= 0x0039) { # 0..9
4318    
4319     $self->{state} = NCR_NUM_STATE;
4320 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4321 wakaba 1.1
4322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4323     $self->{line_prev} = $self->{line};
4324     $self->{column_prev} = $self->{column};
4325     $self->{column}++;
4326     $self->{nc}
4327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4328     } else {
4329     $self->{set_nc}->($self);
4330     }
4331    
4332     redo A;
4333     } else {
4334     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4335     line => $self->{line_prev},
4336     column => $self->{column_prev} - 1);
4337    
4338     ## NOTE: According to the spec algorithm, nothing is returned,
4339     ## and then "&#" is appended to the parent element or the attribute
4340     ## value in the later processing.
4341    
4342     if ($self->{prev_state} == DATA_STATE) {
4343    
4344     $self->{state} = $self->{prev_state};
4345 wakaba 1.5 $self->{s_kwd} = '';
4346 wakaba 1.1 ## Reconsume.
4347     return ({type => CHARACTER_TOKEN,
4348     data => '&#',
4349     line => $self->{line_prev},
4350     column => $self->{column_prev} - 1,
4351     });
4352     redo A;
4353     } else {
4354    
4355     $self->{ca}->{value} .= '&#';
4356     $self->{state} = $self->{prev_state};
4357 wakaba 1.5 $self->{s_kwd} = '';
4358 wakaba 1.1 ## Reconsume.
4359     redo A;
4360     }
4361     }
4362     } elsif ($self->{state} == NCR_NUM_STATE) {
4363     if (0x0030 <= $self->{nc} and
4364     $self->{nc} <= 0x0039) { # 0..9
4365    
4366 wakaba 1.12 $self->{kwd} *= 10;
4367     $self->{kwd} += $self->{nc} - 0x0030;
4368 wakaba 1.1
4369     ## Stay in the state.
4370    
4371     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4372     $self->{line_prev} = $self->{line};
4373     $self->{column_prev} = $self->{column};
4374     $self->{column}++;
4375     $self->{nc}
4376     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4377     } else {
4378     $self->{set_nc}->($self);
4379     }
4380    
4381     redo A;
4382     } elsif ($self->{nc} == 0x003B) { # ;
4383    
4384    
4385     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4386     $self->{line_prev} = $self->{line};
4387     $self->{column_prev} = $self->{column};
4388     $self->{column}++;
4389     $self->{nc}
4390     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4391     } else {
4392     $self->{set_nc}->($self);
4393     }
4394    
4395     #
4396     } else {
4397    
4398     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4399     ## Reconsume.
4400     #
4401     }
4402    
4403 wakaba 1.12 my $code = $self->{kwd};
4404 wakaba 1.1 my $l = $self->{line_prev};
4405     my $c = $self->{column_prev};
4406     if ($charref_map->{$code}) {
4407    
4408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4409     text => (sprintf 'U+%04X', $code),
4410     line => $l, column => $c);
4411     $code = $charref_map->{$code};
4412     } elsif ($code > 0x10FFFF) {
4413    
4414     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4415     text => (sprintf 'U-%08X', $code),
4416     line => $l, column => $c);
4417     $code = 0xFFFD;
4418     }
4419    
4420     if ($self->{prev_state} == DATA_STATE) {
4421    
4422     $self->{state} = $self->{prev_state};
4423 wakaba 1.5 $self->{s_kwd} = '';
4424 wakaba 1.1 ## Reconsume.
4425     return ({type => CHARACTER_TOKEN, data => chr $code,
4426 wakaba 1.7 has_reference => 1,
4427 wakaba 1.1 line => $l, column => $c,
4428     });
4429     redo A;
4430     } else {
4431    
4432     $self->{ca}->{value} .= chr $code;
4433     $self->{ca}->{has_reference} = 1;
4434     $self->{state} = $self->{prev_state};
4435 wakaba 1.5 $self->{s_kwd} = '';
4436 wakaba 1.1 ## Reconsume.
4437     redo A;
4438     }
4439     } elsif ($self->{state} == HEXREF_X_STATE) {
4440     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4441     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4442     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4443     # 0..9, A..F, a..f
4444    
4445     $self->{state} = HEXREF_HEX_STATE;
4446 wakaba 1.12 $self->{kwd} = 0;
4447 wakaba 1.1 ## Reconsume.
4448     redo A;
4449     } else {
4450     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4451     line => $self->{line_prev},
4452     column => $self->{column_prev} - 2);
4453    
4454     ## NOTE: According to the spec algorithm, nothing is returned,
4455     ## and then "&#" followed by "X" or "x" is appended to the parent
4456     ## element or the attribute value in the later processing.
4457    
4458     if ($self->{prev_state} == DATA_STATE) {
4459    
4460     $self->{state} = $self->{prev_state};
4461 wakaba 1.5 $self->{s_kwd} = '';
4462 wakaba 1.1 ## Reconsume.
4463     return ({type => CHARACTER_TOKEN,
4464 wakaba 1.12 data => '&' . $self->{kwd},
4465 wakaba 1.1 line => $self->{line_prev},
4466 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4467 wakaba 1.1 });
4468     redo A;
4469     } else {
4470    
4471 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4472 wakaba 1.1 $self->{state} = $self->{prev_state};
4473 wakaba 1.5 $self->{s_kwd} = '';
4474 wakaba 1.1 ## Reconsume.
4475     redo A;
4476     }
4477     }
4478     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4479     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4480     # 0..9
4481    
4482 wakaba 1.12 $self->{kwd} *= 0x10;
4483     $self->{kwd} += $self->{nc} - 0x0030;
4484 wakaba 1.1 ## Stay in the state.
4485    
4486     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4487     $self->{line_prev} = $self->{line};
4488     $self->{column_prev} = $self->{column};
4489     $self->{column}++;
4490     $self->{nc}
4491     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4492     } else {
4493     $self->{set_nc}->($self);
4494     }
4495    
4496     redo A;
4497     } elsif (0x0061 <= $self->{nc} and
4498     $self->{nc} <= 0x0066) { # a..f
4499    
4500 wakaba 1.12 $self->{kwd} *= 0x10;
4501     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4502 wakaba 1.1 ## Stay in the state.
4503    
4504     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4505     $self->{line_prev} = $self->{line};
4506     $self->{column_prev} = $self->{column};
4507     $self->{column}++;
4508     $self->{nc}
4509     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4510     } else {
4511     $self->{set_nc}->($self);
4512     }
4513    
4514     redo A;
4515     } elsif (0x0041 <= $self->{nc} and
4516     $self->{nc} <= 0x0046) { # A..F
4517    
4518 wakaba 1.12 $self->{kwd} *= 0x10;
4519     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4520 wakaba 1.1 ## Stay in the state.
4521    
4522     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4523     $self->{line_prev} = $self->{line};
4524     $self->{column_prev} = $self->{column};
4525     $self->{column}++;
4526     $self->{nc}
4527     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4528     } else {
4529     $self->{set_nc}->($self);
4530     }
4531    
4532     redo A;
4533     } elsif ($self->{nc} == 0x003B) { # ;
4534    
4535    
4536     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4537     $self->{line_prev} = $self->{line};
4538     $self->{column_prev} = $self->{column};
4539     $self->{column}++;
4540     $self->{nc}
4541     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4542     } else {
4543     $self->{set_nc}->($self);
4544     }
4545    
4546     #
4547     } else {
4548    
4549     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4550     line => $self->{line},
4551     column => $self->{column});
4552     ## Reconsume.
4553     #
4554     }
4555    
4556 wakaba 1.12 my $code = $self->{kwd};
4557 wakaba 1.1 my $l = $self->{line_prev};
4558     my $c = $self->{column_prev};
4559     if ($charref_map->{$code}) {
4560    
4561     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4562     text => (sprintf 'U+%04X', $code),
4563     line => $l, column => $c);
4564     $code = $charref_map->{$code};
4565     } elsif ($code > 0x10FFFF) {
4566    
4567     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4568     text => (sprintf 'U-%08X', $code),
4569     line => $l, column => $c);
4570     $code = 0xFFFD;
4571     }
4572    
4573     if ($self->{prev_state} == DATA_STATE) {
4574    
4575     $self->{state} = $self->{prev_state};
4576 wakaba 1.5 $self->{s_kwd} = '';
4577 wakaba 1.1 ## Reconsume.
4578     return ({type => CHARACTER_TOKEN, data => chr $code,
4579 wakaba 1.7 has_reference => 1,
4580 wakaba 1.1 line => $l, column => $c,
4581     });
4582     redo A;
4583     } else {
4584    
4585     $self->{ca}->{value} .= chr $code;
4586     $self->{ca}->{has_reference} = 1;
4587     $self->{state} = $self->{prev_state};
4588 wakaba 1.5 $self->{s_kwd} = '';
4589 wakaba 1.1 ## Reconsume.
4590     redo A;
4591     }
4592     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4593 wakaba 1.12 if (length $self->{kwd} < 30 and
4594 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
4595     ((0x0041 <= $self->{nc} and # a
4596     $self->{nc} <= 0x005A) or # x
4597     (0x0061 <= $self->{nc} and # a
4598     $self->{nc} <= 0x007A) or # z
4599     (0x0030 <= $self->{nc} and # 0
4600     $self->{nc} <= 0x0039) or # 9
4601     $self->{nc} == 0x003B)) { # ;
4602     our $EntityChar;
4603 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4604     if (defined $EntityChar->{$self->{kwd}}) {
4605 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
4606    
4607 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4608 wakaba 1.1 $self->{entity__match} = 1;
4609    
4610     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4611     $self->{line_prev} = $self->{line};
4612     $self->{column_prev} = $self->{column};
4613     $self->{column}++;
4614     $self->{nc}
4615     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4616     } else {
4617     $self->{set_nc}->($self);
4618     }
4619    
4620     #
4621     } else {
4622    
4623 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4624 wakaba 1.1 $self->{entity__match} = -1;
4625     ## Stay in the state.
4626    
4627     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4628     $self->{line_prev} = $self->{line};
4629     $self->{column_prev} = $self->{column};
4630     $self->{column}++;
4631     $self->{nc}
4632     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4633     } else {
4634     $self->{set_nc}->($self);
4635     }
4636    
4637     redo A;
4638     }
4639     } else {
4640    
4641     $self->{entity__value} .= chr $self->{nc};
4642     $self->{entity__match} *= 2;
4643     ## Stay in the state.
4644    
4645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4646     $self->{line_prev} = $self->{line};
4647     $self->{column_prev} = $self->{column};
4648     $self->{column}++;
4649     $self->{nc}
4650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4651     } else {
4652     $self->{set_nc}->($self);
4653     }
4654    
4655     redo A;
4656     }
4657     }
4658    
4659     my $data;
4660     my $has_ref;
4661     if ($self->{entity__match} > 0) {
4662    
4663     $data = $self->{entity__value};
4664     $has_ref = 1;
4665     #
4666     } elsif ($self->{entity__match} < 0) {
4667     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4668     if ($self->{prev_state} != DATA_STATE and # in attribute
4669     $self->{entity__match} < -1) {
4670    
4671 wakaba 1.12 $data = '&' . $self->{kwd};
4672 wakaba 1.1 #
4673     } else {
4674    
4675     $data = $self->{entity__value};
4676     $has_ref = 1;
4677     #
4678     }
4679     } else {
4680    
4681     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4682     line => $self->{line_prev},
4683 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
4684     $data = '&' . $self->{kwd};
4685 wakaba 1.1 #
4686     }
4687    
4688     ## NOTE: In these cases, when a character reference is found,
4689     ## it is consumed and a character token is returned, or, otherwise,
4690     ## nothing is consumed and returned, according to the spec algorithm.
4691     ## In this implementation, anything that has been examined by the
4692     ## tokenizer is appended to the parent element or the attribute value
4693     ## as string, either literal string when no character reference or
4694     ## entity-replaced string otherwise, in this stage, since any characters
4695     ## that would not be consumed are appended in the data state or in an
4696     ## appropriate attribute value state anyway.
4697    
4698     if ($self->{prev_state} == DATA_STATE) {
4699    
4700     $self->{state} = $self->{prev_state};
4701 wakaba 1.5 $self->{s_kwd} = '';
4702 wakaba 1.1 ## Reconsume.
4703     return ({type => CHARACTER_TOKEN,
4704     data => $data,
4705 wakaba 1.7 has_reference => $has_ref,
4706 wakaba 1.1 line => $self->{line_prev},
4707 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
4708 wakaba 1.1 });
4709     redo A;
4710     } else {
4711    
4712     $self->{ca}->{value} .= $data;
4713     $self->{ca}->{has_reference} = 1 if $has_ref;
4714     $self->{state} = $self->{prev_state};
4715 wakaba 1.5 $self->{s_kwd} = '';
4716 wakaba 1.1 ## Reconsume.
4717     redo A;
4718     }
4719 wakaba 1.8
4720     ## XML-only states
4721    
4722     } elsif ($self->{state} == PI_STATE) {
4723     if ($is_space->{$self->{nc}} or
4724     $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4725     $self->{nc} == -1) {
4726     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4727     line => $self->{line_prev},
4728     column => $self->{column_prev}
4729     - 1 * ($self->{nc} != -1));
4730     $self->{state} = BOGUS_COMMENT_STATE;
4731     ## Reconsume.
4732     $self->{ct} = {type => COMMENT_TOKEN,
4733     data => '?',
4734     line => $self->{line_prev},
4735     column => $self->{column_prev}
4736     - 1 * ($self->{nc} != -1),
4737     };
4738     redo A;
4739     } else {
4740     $self->{ct} = {type => PI_TOKEN,
4741     target => chr $self->{nc},
4742     data => '',
4743     line => $self->{line_prev},
4744     column => $self->{column_prev} - 1,
4745     };
4746     $self->{state} = PI_TARGET_STATE;
4747    
4748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4749     $self->{line_prev} = $self->{line};
4750     $self->{column_prev} = $self->{column};
4751     $self->{column}++;
4752     $self->{nc}
4753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4754     } else {
4755     $self->{set_nc}->($self);
4756     }
4757    
4758     redo A;
4759     }
4760     } elsif ($self->{state} == PI_TARGET_STATE) {
4761     if ($is_space->{$self->{nc}}) {
4762     $self->{state} = PI_TARGET_AFTER_STATE;
4763    
4764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4765     $self->{line_prev} = $self->{line};
4766     $self->{column_prev} = $self->{column};
4767     $self->{column}++;
4768     $self->{nc}
4769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4770     } else {
4771     $self->{set_nc}->($self);
4772     }
4773    
4774     redo A;
4775     } elsif ($self->{nc} == -1) {
4776     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4777 wakaba 1.13 if ($self->{in_subset}) {
4778     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4779     } else {
4780     $self->{state} = DATA_STATE;
4781     $self->{s_kwd} = '';
4782     }
4783 wakaba 1.8 ## Reconsume.
4784     return ($self->{ct}); # pi
4785     redo A;
4786     } elsif ($self->{nc} == 0x003F) { # ?
4787     $self->{state} = PI_AFTER_STATE;
4788    
4789     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4790     $self->{line_prev} = $self->{line};
4791     $self->{column_prev} = $self->{column};
4792     $self->{column}++;
4793     $self->{nc}
4794     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4795     } else {
4796     $self->{set_nc}->($self);
4797     }
4798    
4799     redo A;
4800     } else {
4801     ## XML5: typo ("tag name" -> "target")
4802     $self->{ct}->{target} .= chr $self->{nc}; # pi
4803    
4804     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4805     $self->{line_prev} = $self->{line};
4806     $self->{column_prev} = $self->{column};
4807     $self->{column}++;
4808     $self->{nc}
4809     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4810     } else {
4811     $self->{set_nc}->($self);
4812     }
4813    
4814     redo A;
4815     }
4816     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4817     if ($is_space->{$self->{nc}}) {
4818     ## Stay in the state.
4819    
4820     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4821     $self->{line_prev} = $self->{line};
4822     $self->{column_prev} = $self->{column};
4823     $self->{column}++;
4824     $self->{nc}
4825     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4826     } else {
4827     $self->{set_nc}->($self);
4828     }
4829    
4830     redo A;
4831     } else {
4832     $self->{state} = PI_DATA_STATE;
4833     ## Reprocess.
4834     redo A;
4835     }
4836     } elsif ($self->{state} == PI_DATA_STATE) {
4837     if ($self->{nc} == 0x003F) { # ?
4838     $self->{state} = PI_DATA_AFTER_STATE;
4839    
4840     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4841     $self->{line_prev} = $self->{line};
4842     $self->{column_prev} = $self->{column};
4843     $self->{column}++;
4844     $self->{nc}
4845     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4846     } else {
4847     $self->{set_nc}->($self);
4848     }
4849    
4850     redo A;
4851     } elsif ($self->{nc} == -1) {
4852     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4853 wakaba 1.13 if ($self->{in_subset}) {
4854     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4855     } else {
4856     $self->{state} = DATA_STATE;
4857     $self->{s_kwd} = '';
4858     }
4859 wakaba 1.8 ## Reprocess.
4860     return ($self->{ct}); # pi
4861     redo A;
4862     } else {
4863     $self->{ct}->{data} .= chr $self->{nc}; # pi
4864     $self->{read_until}->($self->{ct}->{data}, q[?],
4865     length $self->{ct}->{data});
4866     ## Stay in the state.
4867    
4868     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4869     $self->{line_prev} = $self->{line};
4870     $self->{column_prev} = $self->{column};
4871     $self->{column}++;
4872     $self->{nc}
4873     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4874     } else {
4875     $self->{set_nc}->($self);
4876     }
4877    
4878     ## Reprocess.
4879     redo A;
4880     }
4881     } elsif ($self->{state} == PI_AFTER_STATE) {
4882     if ($self->{nc} == 0x003E) { # >
4883 wakaba 1.13 if ($self->{in_subset}) {
4884     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4885     } else {
4886     $self->{state} = DATA_STATE;
4887     $self->{s_kwd} = '';
4888     }
4889 wakaba 1.8
4890     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4891     $self->{line_prev} = $self->{line};
4892     $self->{column_prev} = $self->{column};
4893     $self->{column}++;
4894     $self->{nc}
4895     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4896     } else {
4897     $self->{set_nc}->($self);
4898     }
4899    
4900     return ($self->{ct}); # pi
4901     redo A;
4902     } elsif ($self->{nc} == 0x003F) { # ?
4903     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4904     line => $self->{line_prev},
4905     column => $self->{column_prev}); ## XML5: no error
4906     $self->{ct}->{data} .= '?';
4907     $self->{state} = PI_DATA_AFTER_STATE;
4908    
4909     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4910     $self->{line_prev} = $self->{line};
4911     $self->{column_prev} = $self->{column};
4912     $self->{column}++;
4913     $self->{nc}
4914     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4915     } else {
4916     $self->{set_nc}->($self);
4917     }
4918    
4919     redo A;
4920     } else {
4921     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4922     line => $self->{line_prev},
4923     column => $self->{column_prev}
4924     + 1 * ($self->{nc} == -1)); ## XML5: no error
4925     $self->{ct}->{data} .= '?'; ## XML5: not appended
4926     $self->{state} = PI_DATA_STATE;
4927     ## Reprocess.
4928     redo A;
4929     }
4930     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4931     ## XML5: Same as "pi after state" in XML5
4932     if ($self->{nc} == 0x003E) { # >
4933 wakaba 1.13 if ($self->{in_subset}) {
4934     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4935     } else {
4936     $self->{state} = DATA_STATE;
4937     $self->{s_kwd} = '';
4938     }
4939 wakaba 1.8
4940     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4941     $self->{line_prev} = $self->{line};
4942     $self->{column_prev} = $self->{column};
4943     $self->{column}++;
4944     $self->{nc}
4945     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4946     } else {
4947     $self->{set_nc}->($self);
4948     }
4949    
4950     return ($self->{ct}); # pi
4951     redo A;
4952     } elsif ($self->{nc} == 0x003F) { # ?
4953     $self->{ct}->{data} .= '?';
4954     ## Stay in the state.
4955    
4956     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4957     $self->{line_prev} = $self->{line};
4958     $self->{column_prev} = $self->{column};
4959     $self->{column}++;
4960     $self->{nc}
4961     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4962     } else {
4963     $self->{set_nc}->($self);
4964     }
4965    
4966     redo A;
4967     } else {
4968     $self->{ct}->{data} .= '?'; ## XML5: not appended
4969     $self->{state} = PI_DATA_STATE;
4970     ## Reprocess.
4971     redo A;
4972     }
4973 wakaba 1.12
4974     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
4975     if ($self->{nc} == 0x003C) { # <
4976 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
4977 wakaba 1.12
4978     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4979     $self->{line_prev} = $self->{line};
4980     $self->{column_prev} = $self->{column};
4981     $self->{column}++;
4982     $self->{nc}
4983     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4984     } else {
4985     $self->{set_nc}->($self);
4986     }
4987    
4988     redo A;
4989     } elsif ($self->{nc} == 0x0025) { # %
4990     ## XML5: Not defined yet.
4991    
4992     ## TODO:
4993    
4994     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4995     $self->{line_prev} = $self->{line};
4996     $self->{column_prev} = $self->{column};
4997     $self->{column}++;
4998     $self->{nc}
4999     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5000     } else {
5001     $self->{set_nc}->($self);
5002     }
5003    
5004     redo A;
5005     } elsif ($self->{nc} == 0x005D) { # ]
5006 wakaba 1.13 delete $self->{in_subset};
5007 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5008    
5009     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5010     $self->{line_prev} = $self->{line};
5011     $self->{column_prev} = $self->{column};
5012     $self->{column}++;
5013     $self->{nc}
5014     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5015     } else {
5016     $self->{set_nc}->($self);
5017     }
5018    
5019     redo A;
5020     } elsif ($is_space->{$self->{nc}}) {
5021     ## Stay in the state.
5022    
5023     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5024     $self->{line_prev} = $self->{line};
5025     $self->{column_prev} = $self->{column};
5026     $self->{column}++;
5027     $self->{nc}
5028     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5029     } else {
5030     $self->{set_nc}->($self);
5031     }
5032    
5033     redo A;
5034     } elsif ($self->{nc} == -1) {
5035     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5036 wakaba 1.13 delete $self->{in_subset};
5037 wakaba 1.12 $self->{state} = DATA_STATE;
5038     $self->{s_kwd} = '';
5039     ## Reconsume.
5040 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5041 wakaba 1.12 redo A;
5042     } else {
5043     unless ($self->{internal_subset_tainted}) {
5044     ## XML5: No parse error.
5045     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5046     $self->{internal_subset_tainted} = 1;
5047     }
5048     ## Stay in the state.
5049    
5050     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5051     $self->{line_prev} = $self->{line};
5052     $self->{column_prev} = $self->{column};
5053     $self->{column}++;
5054     $self->{nc}
5055     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5056     } else {
5057     $self->{set_nc}->($self);
5058     }
5059    
5060     redo A;
5061     }
5062     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5063     if ($self->{nc} == 0x003E) { # >
5064     $self->{state} = DATA_STATE;
5065     $self->{s_kwd} = '';
5066    
5067     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5068     $self->{line_prev} = $self->{line};
5069     $self->{column_prev} = $self->{column};
5070     $self->{column}++;
5071     $self->{nc}
5072     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5073     } else {
5074     $self->{set_nc}->($self);
5075     }
5076    
5077 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5078 wakaba 1.12 redo A;
5079     } elsif ($self->{nc} == -1) {
5080     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5081     $self->{state} = DATA_STATE;
5082     $self->{s_kwd} = '';
5083     ## Reconsume.
5084 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5085 wakaba 1.12 redo A;
5086     } else {
5087     ## XML5: No parse error and stay in the state.
5088     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5089    
5090 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5091    
5092     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5093     $self->{line_prev} = $self->{line};
5094     $self->{column_prev} = $self->{column};
5095     $self->{column}++;
5096     $self->{nc}
5097     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5098     } else {
5099     $self->{set_nc}->($self);
5100     }
5101    
5102     redo A;
5103     }
5104     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5105     if ($self->{nc} == 0x003E) { # >
5106     $self->{state} = DATA_STATE;
5107     $self->{s_kwd} = '';
5108    
5109     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5110     $self->{line_prev} = $self->{line};
5111     $self->{column_prev} = $self->{column};
5112     $self->{column}++;
5113     $self->{nc}
5114     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5115     } else {
5116     $self->{set_nc}->($self);
5117     }
5118    
5119     return ({type => END_OF_DOCTYPE_TOKEN});
5120     redo A;
5121     } elsif ($self->{nc} == -1) {
5122     $self->{state} = DATA_STATE;
5123     $self->{s_kwd} = '';
5124     ## Reconsume.
5125     return ({type => END_OF_DOCTYPE_TOKEN});
5126     redo A;
5127     } else {
5128     ## Stay in the state.
5129    
5130     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5131     $self->{line_prev} = $self->{line};
5132     $self->{column_prev} = $self->{column};
5133     $self->{column}++;
5134     $self->{nc}
5135     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5136     } else {
5137     $self->{set_nc}->($self);
5138     }
5139    
5140     redo A;
5141     }
5142     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5143     if ($self->{nc} == 0x0021) { # !
5144     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
5145    
5146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5147     $self->{line_prev} = $self->{line};
5148     $self->{column_prev} = $self->{column};
5149     $self->{column}++;
5150     $self->{nc}
5151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5152     } else {
5153     $self->{set_nc}->($self);
5154     }
5155    
5156     redo A;
5157     } elsif ($self->{nc} == 0x003F) { # ?
5158     $self->{state} = PI_STATE;
5159    
5160     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5161     $self->{line_prev} = $self->{line};
5162     $self->{column_prev} = $self->{column};
5163     $self->{column}++;
5164     $self->{nc}
5165     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5166     } else {
5167     $self->{set_nc}->($self);
5168     }
5169    
5170     redo A;
5171     } elsif ($self->{nc} == -1) {
5172     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5173     $self->{state} = DATA_STATE;
5174     $self->{s_kwd} = '';
5175     ## Reconsume.
5176     redo A;
5177     } else {
5178     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5179     line => $self->{line_prev},
5180     column => $self->{column_prev});
5181     $self->{state} = BOGUS_COMMENT_STATE;
5182     $self->{ct} = {type => COMMENT_TOKEN,
5183     data => '',
5184     }; ## NOTE: Will be discarded.
5185 wakaba 1.12
5186     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5187     $self->{line_prev} = $self->{line};
5188     $self->{column_prev} = $self->{column};
5189     $self->{column}++;
5190     $self->{nc}
5191     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5192     } else {
5193     $self->{set_nc}->($self);
5194     }
5195    
5196     redo A;
5197     }
5198 wakaba 1.8
5199 wakaba 1.1 } else {
5200     die "$0: $self->{state}: Unknown state";
5201     }
5202     } # A
5203    
5204     die "$0: _get_next_token: unexpected case";
5205     } # _get_next_token
5206    
5207     1;
5208 wakaba 1.13 ## $Date: 2008/10/15 12:49:49 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24