/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (hide annotations) (download)
Wed Oct 15 10:50:38 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.10: +99 -8 lines
++ whatpm/t/xml/ChangeLog	15 Oct 2008 10:50:31 -0000
	* attrs-1.dat: Test cases for tokenizing errors are added.

	* elements-1.dat: A test result updated.

	* ns-attrs-1.dat: Test results updated.  New test cases for
	duplicate namespaced attributes are added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 10:48:03 -0000
	* Tokenizer.pm.src: Set index attribute to each attribute token,
	for ignoring namespaced duplicate attribute at the XML namespace
	parser layer.  Raise a parse error if the attribute value is
	omitted, in XML mode.  Raise a parse error if the attribute value
	is not quoted, in XML mode.  Raise a parse error if "<" character
	is found in a quoted attribute value, in XML mode.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 10:49:16 -0000
	* Parser.pm.src: Use source order to determine which attribute is
	duplicate.  Preserve duplicate namespaced attributes as
	non-namespaced attributes.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.11 our $VERSION=do{my @r=(q$Revision: 1.10 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 }
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117 wakaba 1.8 ## XML states
118     sub PI_STATE () { 51 }
119     sub PI_TARGET_STATE () { 52 }
120     sub PI_TARGET_AFTER_STATE () { 53 }
121     sub PI_DATA_STATE () { 54 }
122     sub PI_AFTER_STATE () { 55 }
123     sub PI_DATA_AFTER_STATE () { 56 }
124    
125 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
126     ## list and descriptions)
127    
128     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
129     sub FOREIGN_EL () { 0b1_00000000000 }
130    
131     ## Character reference mappings
132    
133     my $charref_map = {
134     0x0D => 0x000A,
135     0x80 => 0x20AC,
136     0x81 => 0xFFFD,
137     0x82 => 0x201A,
138     0x83 => 0x0192,
139     0x84 => 0x201E,
140     0x85 => 0x2026,
141     0x86 => 0x2020,
142     0x87 => 0x2021,
143     0x88 => 0x02C6,
144     0x89 => 0x2030,
145     0x8A => 0x0160,
146     0x8B => 0x2039,
147     0x8C => 0x0152,
148     0x8D => 0xFFFD,
149     0x8E => 0x017D,
150     0x8F => 0xFFFD,
151     0x90 => 0xFFFD,
152     0x91 => 0x2018,
153     0x92 => 0x2019,
154     0x93 => 0x201C,
155     0x94 => 0x201D,
156     0x95 => 0x2022,
157     0x96 => 0x2013,
158     0x97 => 0x2014,
159     0x98 => 0x02DC,
160     0x99 => 0x2122,
161     0x9A => 0x0161,
162     0x9B => 0x203A,
163     0x9C => 0x0153,
164     0x9D => 0xFFFD,
165     0x9E => 0x017E,
166     0x9F => 0x0178,
167     }; # $charref_map
168     $charref_map->{$_} = 0xFFFD
169     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
170     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
171     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
172     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
173     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
174     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
175     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
176    
177     ## Implementations MUST act as if state machine in the spec
178    
179     sub _initialize_tokenizer ($) {
180     my $self = shift;
181    
182     ## NOTE: Fields set by |new| constructor:
183     #$self->{level}
184     #$self->{set_nc}
185     #$self->{parse_error}
186 wakaba 1.3 #$self->{is_xml} (if XML)
187 wakaba 1.1
188     $self->{state} = DATA_STATE; # MUST
189 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
190 wakaba 1.1 #$self->{entity__value}; # initialized when used
191     #$self->{entity__match}; # initialized when used
192     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
193     undef $self->{ct}; # current token
194     undef $self->{ca}; # current attribute
195     undef $self->{last_stag_name}; # last emitted start tag name
196     #$self->{prev_state}; # initialized when used
197     delete $self->{self_closing};
198     $self->{char_buffer} = '';
199     $self->{char_buffer_pos} = 0;
200     $self->{nc} = -1; # next input character
201     #$self->{next_nc}
202    
203     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
204     $self->{line_prev} = $self->{line};
205     $self->{column_prev} = $self->{column};
206     $self->{column}++;
207     $self->{nc}
208     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
209     } else {
210     $self->{set_nc}->($self);
211     }
212    
213     $self->{token} = [];
214     # $self->{escape}
215     } # _initialize_tokenizer
216    
217     ## A token has:
218     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
219 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
220 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
221     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
222 wakaba 1.11 ## ->{target} (PI_TOKEN)
223 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
224     ## ->{sysid} (DOCTYPE_TOKEN)
225     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
226     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
227     ## ->{name}
228     ## ->{value}
229     ## ->{has_reference} == 1 or 0
230 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
231     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
232 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
233 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
234 wakaba 1.1 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
235     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
236     ## while the token is pushed back to the stack.
237    
238     ## Emitted token MUST immediately be handled by the tree construction state.
239    
240     ## Before each step, UA MAY check to see if either one of the scripts in
241     ## "list of scripts that will execute as soon as possible" or the first
242     ## script in the "list of scripts that will execute asynchronously",
243     ## has completed loading. If one has, then it MUST be executed
244     ## and removed from the list.
245    
246     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
247     ## (This requirement was dropped from HTML5 spec, unfortunately.)
248    
249     my $is_space = {
250     0x0009 => 1, # CHARACTER TABULATION (HT)
251     0x000A => 1, # LINE FEED (LF)
252     #0x000B => 0, # LINE TABULATION (VT)
253     0x000C => 1, # FORM FEED (FF)
254     #0x000D => 1, # CARRIAGE RETURN (CR)
255     0x0020 => 1, # SPACE (SP)
256     };
257    
258     sub _get_next_token ($) {
259     my $self = shift;
260    
261     if ($self->{self_closing}) {
262     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
263     ## NOTE: The |self_closing| flag is only set by start tag token.
264     ## In addition, when a start tag token is emitted, it is always set to
265     ## |ct|.
266     delete $self->{self_closing};
267     }
268    
269     if (@{$self->{token}}) {
270     $self->{self_closing} = $self->{token}->[0]->{self_closing};
271     return shift @{$self->{token}};
272     }
273    
274     A: {
275     if ($self->{state} == PCDATA_STATE) {
276     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
277    
278     if ($self->{nc} == 0x0026) { # &
279    
280     ## NOTE: In the spec, the tokenizer is switched to the
281     ## "entity data state". In this implementation, the tokenizer
282     ## is switched to the |ENTITY_STATE|, which is an implementation
283     ## of the "consume a character reference" algorithm.
284     $self->{entity_add} = -1;
285     $self->{prev_state} = DATA_STATE;
286     $self->{state} = ENTITY_STATE;
287    
288     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
289     $self->{line_prev} = $self->{line};
290     $self->{column_prev} = $self->{column};
291     $self->{column}++;
292     $self->{nc}
293     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
294     } else {
295     $self->{set_nc}->($self);
296     }
297    
298     redo A;
299     } elsif ($self->{nc} == 0x003C) { # <
300    
301     $self->{state} = TAG_OPEN_STATE;
302    
303     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
304     $self->{line_prev} = $self->{line};
305     $self->{column_prev} = $self->{column};
306     $self->{column}++;
307     $self->{nc}
308     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
309     } else {
310     $self->{set_nc}->($self);
311     }
312    
313     redo A;
314     } elsif ($self->{nc} == -1) {
315    
316     return ({type => END_OF_FILE_TOKEN,
317     line => $self->{line}, column => $self->{column}});
318     last A; ## TODO: ok?
319     } else {
320    
321     #
322     }
323    
324     # Anything else
325     my $token = {type => CHARACTER_TOKEN,
326     data => chr $self->{nc},
327     line => $self->{line}, column => $self->{column},
328     };
329     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
330    
331     ## Stay in the state.
332    
333     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
334     $self->{line_prev} = $self->{line};
335     $self->{column_prev} = $self->{column};
336     $self->{column}++;
337     $self->{nc}
338     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
339     } else {
340     $self->{set_nc}->($self);
341     }
342    
343     return ($token);
344     redo A;
345     } elsif ($self->{state} == DATA_STATE) {
346     $self->{s_kwd} = '' unless defined $self->{s_kwd};
347     if ($self->{nc} == 0x0026) { # &
348     $self->{s_kwd} = '';
349     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
350     not $self->{escape}) {
351    
352     ## NOTE: In the spec, the tokenizer is switched to the
353     ## "entity data state". In this implementation, the tokenizer
354     ## is switched to the |ENTITY_STATE|, which is an implementation
355     ## of the "consume a character reference" algorithm.
356     $self->{entity_add} = -1;
357     $self->{prev_state} = DATA_STATE;
358     $self->{state} = ENTITY_STATE;
359    
360     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
361     $self->{line_prev} = $self->{line};
362     $self->{column_prev} = $self->{column};
363     $self->{column}++;
364     $self->{nc}
365     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
366     } else {
367     $self->{set_nc}->($self);
368     }
369    
370     redo A;
371     } else {
372    
373     #
374     }
375     } elsif ($self->{nc} == 0x002D) { # -
376     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
377 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
378 wakaba 1.1
379     $self->{escape} = 1; # unless $self->{escape};
380     $self->{s_kwd} = '--';
381     #
382 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
383 wakaba 1.1
384     $self->{s_kwd} = '--';
385     #
386 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
387    
388     $self->{s_kwd} .= '-';
389     #
390 wakaba 1.1 } else {
391    
392 wakaba 1.5 $self->{s_kwd} = '-';
393 wakaba 1.1 #
394     }
395     }
396    
397     #
398     } elsif ($self->{nc} == 0x0021) { # !
399     if (length $self->{s_kwd}) {
400    
401     $self->{s_kwd} .= '!';
402     #
403     } else {
404    
405     #$self->{s_kwd} = '';
406     #
407     }
408     #
409     } elsif ($self->{nc} == 0x003C) { # <
410     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
411     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
412     not $self->{escape})) {
413    
414     $self->{state} = TAG_OPEN_STATE;
415    
416     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
417     $self->{line_prev} = $self->{line};
418     $self->{column_prev} = $self->{column};
419     $self->{column}++;
420     $self->{nc}
421     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
422     } else {
423     $self->{set_nc}->($self);
424     }
425    
426     redo A;
427     } else {
428    
429     $self->{s_kwd} = '';
430     #
431     }
432     } elsif ($self->{nc} == 0x003E) { # >
433     if ($self->{escape} and
434     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
435     if ($self->{s_kwd} eq '--') {
436    
437     delete $self->{escape};
438 wakaba 1.5 #
439 wakaba 1.1 } else {
440    
441 wakaba 1.5 #
442 wakaba 1.1 }
443 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
444    
445     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
446     line => $self->{line_prev},
447     column => $self->{column_prev} - 1);
448     #
449 wakaba 1.1 } else {
450    
451 wakaba 1.5 #
452 wakaba 1.1 }
453    
454     $self->{s_kwd} = '';
455     #
456 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
457     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
458    
459     $self->{s_kwd} .= ']';
460     } elsif ($self->{s_kwd} eq ']]') {
461    
462     #
463     } else {
464    
465     $self->{s_kwd} = '';
466     }
467     #
468 wakaba 1.1 } elsif ($self->{nc} == -1) {
469    
470     $self->{s_kwd} = '';
471     return ({type => END_OF_FILE_TOKEN,
472     line => $self->{line}, column => $self->{column}});
473     last A; ## TODO: ok?
474     } else {
475    
476     $self->{s_kwd} = '';
477     #
478     }
479    
480     # Anything else
481     my $token = {type => CHARACTER_TOKEN,
482     data => chr $self->{nc},
483     line => $self->{line}, column => $self->{column},
484     };
485 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
486 wakaba 1.1 length $token->{data})) {
487     $self->{s_kwd} = '';
488     }
489    
490     ## Stay in the data state.
491 wakaba 1.5 if (not $self->{is_xml} and
492     $self->{content_model} == PCDATA_CONTENT_MODEL) {
493 wakaba 1.1
494     $self->{state} = PCDATA_STATE;
495     } else {
496    
497     ## Stay in the state.
498     }
499    
500     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
501     $self->{line_prev} = $self->{line};
502     $self->{column_prev} = $self->{column};
503     $self->{column}++;
504     $self->{nc}
505     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
506     } else {
507     $self->{set_nc}->($self);
508     }
509    
510     return ($token);
511     redo A;
512     } elsif ($self->{state} == TAG_OPEN_STATE) {
513 wakaba 1.10 ## XML5: "tag state".
514    
515 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
516     if ($self->{nc} == 0x002F) { # /
517    
518    
519     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
520     $self->{line_prev} = $self->{line};
521     $self->{column_prev} = $self->{column};
522     $self->{column}++;
523     $self->{nc}
524     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
525     } else {
526     $self->{set_nc}->($self);
527     }
528    
529     $self->{state} = CLOSE_TAG_OPEN_STATE;
530     redo A;
531     } elsif ($self->{nc} == 0x0021) { # !
532    
533     $self->{s_kwd} = '<' unless $self->{escape};
534     #
535     } else {
536    
537     #
538     }
539    
540     ## reconsume
541     $self->{state} = DATA_STATE;
542 wakaba 1.5 $self->{s_kwd} = '';
543 wakaba 1.1 return ({type => CHARACTER_TOKEN, data => '<',
544     line => $self->{line_prev},
545     column => $self->{column_prev},
546     });
547     redo A;
548     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
549     if ($self->{nc} == 0x0021) { # !
550    
551     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
552    
553     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
554     $self->{line_prev} = $self->{line};
555     $self->{column_prev} = $self->{column};
556     $self->{column}++;
557     $self->{nc}
558     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
559     } else {
560     $self->{set_nc}->($self);
561     }
562    
563     redo A;
564     } elsif ($self->{nc} == 0x002F) { # /
565    
566     $self->{state} = CLOSE_TAG_OPEN_STATE;
567    
568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
569     $self->{line_prev} = $self->{line};
570     $self->{column_prev} = $self->{column};
571     $self->{column}++;
572     $self->{nc}
573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
574     } else {
575     $self->{set_nc}->($self);
576     }
577    
578     redo A;
579     } elsif (0x0041 <= $self->{nc} and
580     $self->{nc} <= 0x005A) { # A..Z
581    
582     $self->{ct}
583     = {type => START_TAG_TOKEN,
584 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
585 wakaba 1.1 line => $self->{line_prev},
586     column => $self->{column_prev}};
587     $self->{state} = TAG_NAME_STATE;
588    
589     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
590     $self->{line_prev} = $self->{line};
591     $self->{column_prev} = $self->{column};
592     $self->{column}++;
593     $self->{nc}
594     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
595     } else {
596     $self->{set_nc}->($self);
597     }
598    
599     redo A;
600     } elsif (0x0061 <= $self->{nc} and
601     $self->{nc} <= 0x007A) { # a..z
602    
603     $self->{ct} = {type => START_TAG_TOKEN,
604     tag_name => chr ($self->{nc}),
605     line => $self->{line_prev},
606     column => $self->{column_prev}};
607     $self->{state} = TAG_NAME_STATE;
608    
609     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
610     $self->{line_prev} = $self->{line};
611     $self->{column_prev} = $self->{column};
612     $self->{column}++;
613     $self->{nc}
614     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
615     } else {
616     $self->{set_nc}->($self);
617     }
618    
619     redo A;
620     } elsif ($self->{nc} == 0x003E) { # >
621    
622     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
623     line => $self->{line_prev},
624     column => $self->{column_prev});
625     $self->{state} = DATA_STATE;
626 wakaba 1.5 $self->{s_kwd} = '';
627 wakaba 1.1
628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
629     $self->{line_prev} = $self->{line};
630     $self->{column_prev} = $self->{column};
631     $self->{column}++;
632     $self->{nc}
633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
634     } else {
635     $self->{set_nc}->($self);
636     }
637    
638    
639     return ({type => CHARACTER_TOKEN, data => '<>',
640     line => $self->{line_prev},
641     column => $self->{column_prev},
642     });
643    
644     redo A;
645     } elsif ($self->{nc} == 0x003F) { # ?
646 wakaba 1.8 if ($self->{is_xml}) {
647    
648     $self->{state} = PI_STATE;
649    
650     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
651     $self->{line_prev} = $self->{line};
652     $self->{column_prev} = $self->{column};
653     $self->{column}++;
654     $self->{nc}
655     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
656     } else {
657     $self->{set_nc}->($self);
658     }
659    
660     redo A;
661     } else {
662    
663     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
664     line => $self->{line_prev},
665     column => $self->{column_prev});
666     $self->{state} = BOGUS_COMMENT_STATE;
667     $self->{ct} = {type => COMMENT_TOKEN, data => '',
668     line => $self->{line_prev},
669     column => $self->{column_prev},
670     };
671     ## $self->{nc} is intentionally left as is
672     redo A;
673     }
674 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
675 wakaba 1.1
676     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
677     line => $self->{line_prev},
678     column => $self->{column_prev});
679     $self->{state} = DATA_STATE;
680 wakaba 1.5 $self->{s_kwd} = '';
681 wakaba 1.1 ## reconsume
682    
683     return ({type => CHARACTER_TOKEN, data => '<',
684     line => $self->{line_prev},
685     column => $self->{column_prev},
686     });
687    
688     redo A;
689 wakaba 1.9 } else {
690     ## XML5: "<:" is a parse error.
691    
692     $self->{ct} = {type => START_TAG_TOKEN,
693     tag_name => chr ($self->{nc}),
694     line => $self->{line_prev},
695     column => $self->{column_prev}};
696     $self->{state} = TAG_NAME_STATE;
697    
698     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
699     $self->{line_prev} = $self->{line};
700     $self->{column_prev} = $self->{column};
701     $self->{column}++;
702     $self->{nc}
703     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
704     } else {
705     $self->{set_nc}->($self);
706     }
707    
708     redo A;
709 wakaba 1.1 }
710     } else {
711     die "$0: $self->{content_model} in tag open";
712     }
713     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
714     ## NOTE: The "close tag open state" in the spec is implemented as
715     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
716    
717 wakaba 1.10 ## XML5: "end tag state".
718    
719 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
720     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
721     if (defined $self->{last_stag_name}) {
722     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
723     $self->{s_kwd} = '';
724     ## Reconsume.
725     redo A;
726     } else {
727     ## No start tag token has ever been emitted
728     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
729    
730     $self->{state} = DATA_STATE;
731 wakaba 1.5 $self->{s_kwd} = '';
732 wakaba 1.1 ## Reconsume.
733     return ({type => CHARACTER_TOKEN, data => '</',
734     line => $l, column => $c,
735     });
736     redo A;
737     }
738     }
739    
740     if (0x0041 <= $self->{nc} and
741     $self->{nc} <= 0x005A) { # A..Z
742    
743     $self->{ct}
744     = {type => END_TAG_TOKEN,
745 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
746 wakaba 1.1 line => $l, column => $c};
747     $self->{state} = TAG_NAME_STATE;
748    
749     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
750     $self->{line_prev} = $self->{line};
751     $self->{column_prev} = $self->{column};
752     $self->{column}++;
753     $self->{nc}
754     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
755     } else {
756     $self->{set_nc}->($self);
757     }
758    
759     redo A;
760     } elsif (0x0061 <= $self->{nc} and
761     $self->{nc} <= 0x007A) { # a..z
762    
763     $self->{ct} = {type => END_TAG_TOKEN,
764     tag_name => chr ($self->{nc}),
765     line => $l, column => $c};
766     $self->{state} = TAG_NAME_STATE;
767    
768     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
769     $self->{line_prev} = $self->{line};
770     $self->{column_prev} = $self->{column};
771     $self->{column}++;
772     $self->{nc}
773     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
774     } else {
775     $self->{set_nc}->($self);
776     }
777    
778     redo A;
779     } elsif ($self->{nc} == 0x003E) { # >
780     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
781     line => $self->{line_prev}, ## "<" in "</>"
782     column => $self->{column_prev} - 1);
783     $self->{state} = DATA_STATE;
784 wakaba 1.5 $self->{s_kwd} = '';
785 wakaba 1.10 if ($self->{is_xml}) {
786    
787     ## XML5: No parse error.
788    
789     ## NOTE: This parser raises a parse error, since it supports
790     ## XML1, not XML5.
791    
792     ## NOTE: A short end tag token.
793     my $ct = {type => END_TAG_TOKEN,
794     tag_name => '',
795     line => $self->{line_prev},
796     column => $self->{column_prev} - 1,
797     };
798    
799     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
800     $self->{line_prev} = $self->{line};
801     $self->{column_prev} = $self->{column};
802     $self->{column}++;
803     $self->{nc}
804     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
805     } else {
806     $self->{set_nc}->($self);
807     }
808    
809     return ($ct);
810     } else {
811    
812    
813 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
814     $self->{line_prev} = $self->{line};
815     $self->{column_prev} = $self->{column};
816     $self->{column}++;
817     $self->{nc}
818     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
819     } else {
820     $self->{set_nc}->($self);
821     }
822    
823 wakaba 1.10 }
824 wakaba 1.1 redo A;
825     } elsif ($self->{nc} == -1) {
826    
827     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
828 wakaba 1.5 $self->{s_kwd} = '';
829 wakaba 1.1 $self->{state} = DATA_STATE;
830     # reconsume
831    
832     return ({type => CHARACTER_TOKEN, data => '</',
833     line => $l, column => $c,
834     });
835    
836     redo A;
837 wakaba 1.10 } elsif (not $self->{is_xml} or
838     $is_space->{$self->{nc}}) {
839 wakaba 1.1
840 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
841     line => $self->{line_prev}, # "<" of "</"
842     column => $self->{column_prev} - 1);
843 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
844     $self->{ct} = {type => COMMENT_TOKEN, data => '',
845     line => $self->{line_prev}, # "<" of "</"
846     column => $self->{column_prev} - 1,
847     };
848     ## NOTE: $self->{nc} is intentionally left as is.
849     ## Although the "anything else" case of the spec not explicitly
850     ## states that the next input character is to be reconsumed,
851     ## it will be included to the |data| of the comment token
852     ## generated from the bogus end tag, as defined in the
853     ## "bogus comment state" entry.
854     redo A;
855 wakaba 1.10 } else {
856     ## XML5: "</:" is a parse error.
857    
858     $self->{ct} = {type => END_TAG_TOKEN,
859     tag_name => chr ($self->{nc}),
860     line => $l, column => $c};
861     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
862    
863     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
864     $self->{line_prev} = $self->{line};
865     $self->{column_prev} = $self->{column};
866     $self->{column}++;
867     $self->{nc}
868     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
869     } else {
870     $self->{set_nc}->($self);
871     }
872    
873     redo A;
874 wakaba 1.1 }
875     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
876     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
877     if (length $ch) {
878     my $CH = $ch;
879     $ch =~ tr/a-z/A-Z/;
880     my $nch = chr $self->{nc};
881     if ($nch eq $ch or $nch eq $CH) {
882    
883     ## Stay in the state.
884     $self->{s_kwd} .= $nch;
885    
886     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
887     $self->{line_prev} = $self->{line};
888     $self->{column_prev} = $self->{column};
889     $self->{column}++;
890     $self->{nc}
891     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
892     } else {
893     $self->{set_nc}->($self);
894     }
895    
896     redo A;
897     } else {
898    
899     $self->{state} = DATA_STATE;
900 wakaba 1.5 $self->{s_kwd} = '';
901 wakaba 1.1 ## Reconsume.
902     return ({type => CHARACTER_TOKEN,
903     data => '</' . $self->{s_kwd},
904     line => $self->{line_prev},
905     column => $self->{column_prev} - 1 - length $self->{s_kwd},
906     });
907     redo A;
908     }
909     } else { # after "<{tag-name}"
910     unless ($is_space->{$self->{nc}} or
911     {
912     0x003E => 1, # >
913     0x002F => 1, # /
914     -1 => 1, # EOF
915     }->{$self->{nc}}) {
916    
917     ## Reconsume.
918     $self->{state} = DATA_STATE;
919 wakaba 1.5 $self->{s_kwd} = '';
920 wakaba 1.1 return ({type => CHARACTER_TOKEN,
921     data => '</' . $self->{s_kwd},
922     line => $self->{line_prev},
923     column => $self->{column_prev} - 1 - length $self->{s_kwd},
924     });
925     redo A;
926     } else {
927    
928     $self->{ct}
929     = {type => END_TAG_TOKEN,
930     tag_name => $self->{last_stag_name},
931     line => $self->{line_prev},
932     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
933     $self->{state} = TAG_NAME_STATE;
934     ## Reconsume.
935     redo A;
936     }
937     }
938     } elsif ($self->{state} == TAG_NAME_STATE) {
939     if ($is_space->{$self->{nc}}) {
940    
941     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
942    
943     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
944     $self->{line_prev} = $self->{line};
945     $self->{column_prev} = $self->{column};
946     $self->{column}++;
947     $self->{nc}
948     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
949     } else {
950     $self->{set_nc}->($self);
951     }
952    
953     redo A;
954     } elsif ($self->{nc} == 0x003E) { # >
955     if ($self->{ct}->{type} == START_TAG_TOKEN) {
956    
957     $self->{last_stag_name} = $self->{ct}->{tag_name};
958     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
959     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
960     #if ($self->{ct}->{attributes}) {
961     # ## NOTE: This should never be reached.
962     # !!! cp (36);
963     # !!! parse-error (type => 'end tag attribute');
964     #} else {
965    
966     #}
967     } else {
968     die "$0: $self->{ct}->{type}: Unknown token type";
969     }
970     $self->{state} = DATA_STATE;
971 wakaba 1.5 $self->{s_kwd} = '';
972 wakaba 1.1
973     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
974     $self->{line_prev} = $self->{line};
975     $self->{column_prev} = $self->{column};
976     $self->{column}++;
977     $self->{nc}
978     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
979     } else {
980     $self->{set_nc}->($self);
981     }
982    
983    
984     return ($self->{ct}); # start tag or end tag
985    
986     redo A;
987     } elsif (0x0041 <= $self->{nc} and
988     $self->{nc} <= 0x005A) { # A..Z
989    
990 wakaba 1.4 $self->{ct}->{tag_name}
991     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
992 wakaba 1.1 # start tag or end tag
993     ## Stay in this state
994    
995     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
996     $self->{line_prev} = $self->{line};
997     $self->{column_prev} = $self->{column};
998     $self->{column}++;
999     $self->{nc}
1000     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1001     } else {
1002     $self->{set_nc}->($self);
1003     }
1004    
1005     redo A;
1006     } elsif ($self->{nc} == -1) {
1007     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1008     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1009    
1010     $self->{last_stag_name} = $self->{ct}->{tag_name};
1011     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1012     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1013     #if ($self->{ct}->{attributes}) {
1014     # ## NOTE: This state should never be reached.
1015     # !!! cp (40);
1016     # !!! parse-error (type => 'end tag attribute');
1017     #} else {
1018    
1019     #}
1020     } else {
1021     die "$0: $self->{ct}->{type}: Unknown token type";
1022     }
1023     $self->{state} = DATA_STATE;
1024 wakaba 1.5 $self->{s_kwd} = '';
1025 wakaba 1.1 # reconsume
1026    
1027     return ($self->{ct}); # start tag or end tag
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x002F) { # /
1031    
1032     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1033    
1034     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1035     $self->{line_prev} = $self->{line};
1036     $self->{column_prev} = $self->{column};
1037     $self->{column}++;
1038     $self->{nc}
1039     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1040     } else {
1041     $self->{set_nc}->($self);
1042     }
1043    
1044     redo A;
1045     } else {
1046    
1047     $self->{ct}->{tag_name} .= chr $self->{nc};
1048     # start tag or end tag
1049     ## Stay in the state
1050    
1051     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1052     $self->{line_prev} = $self->{line};
1053     $self->{column_prev} = $self->{column};
1054     $self->{column}++;
1055     $self->{nc}
1056     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1057     } else {
1058     $self->{set_nc}->($self);
1059     }
1060    
1061     redo A;
1062     }
1063     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1064 wakaba 1.11 ## XML5: "Tag attribute name before state".
1065    
1066 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1067    
1068     ## Stay in the state
1069    
1070     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1071     $self->{line_prev} = $self->{line};
1072     $self->{column_prev} = $self->{column};
1073     $self->{column}++;
1074     $self->{nc}
1075     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1076     } else {
1077     $self->{set_nc}->($self);
1078     }
1079    
1080     redo A;
1081     } elsif ($self->{nc} == 0x003E) { # >
1082     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1083    
1084     $self->{last_stag_name} = $self->{ct}->{tag_name};
1085     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1086     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1087     if ($self->{ct}->{attributes}) {
1088    
1089     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1090     } else {
1091    
1092     }
1093     } else {
1094     die "$0: $self->{ct}->{type}: Unknown token type";
1095     }
1096     $self->{state} = DATA_STATE;
1097 wakaba 1.5 $self->{s_kwd} = '';
1098 wakaba 1.1
1099     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1100     $self->{line_prev} = $self->{line};
1101     $self->{column_prev} = $self->{column};
1102     $self->{column}++;
1103     $self->{nc}
1104     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1105     } else {
1106     $self->{set_nc}->($self);
1107     }
1108    
1109    
1110     return ($self->{ct}); # start tag or end tag
1111    
1112     redo A;
1113     } elsif (0x0041 <= $self->{nc} and
1114     $self->{nc} <= 0x005A) { # A..Z
1115    
1116     $self->{ca}
1117 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1118 wakaba 1.1 value => '',
1119     line => $self->{line}, column => $self->{column}};
1120     $self->{state} = ATTRIBUTE_NAME_STATE;
1121    
1122     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1123     $self->{line_prev} = $self->{line};
1124     $self->{column_prev} = $self->{column};
1125     $self->{column}++;
1126     $self->{nc}
1127     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1128     } else {
1129     $self->{set_nc}->($self);
1130     }
1131    
1132     redo A;
1133     } elsif ($self->{nc} == 0x002F) { # /
1134    
1135     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1136    
1137     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1138     $self->{line_prev} = $self->{line};
1139     $self->{column_prev} = $self->{column};
1140     $self->{column}++;
1141     $self->{nc}
1142     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1143     } else {
1144     $self->{set_nc}->($self);
1145     }
1146    
1147     redo A;
1148     } elsif ($self->{nc} == -1) {
1149     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1150     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1151    
1152     $self->{last_stag_name} = $self->{ct}->{tag_name};
1153     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1154     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1155     if ($self->{ct}->{attributes}) {
1156    
1157     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1158     } else {
1159    
1160     }
1161     } else {
1162     die "$0: $self->{ct}->{type}: Unknown token type";
1163     }
1164     $self->{state} = DATA_STATE;
1165 wakaba 1.5 $self->{s_kwd} = '';
1166 wakaba 1.1 # reconsume
1167    
1168     return ($self->{ct}); # start tag or end tag
1169    
1170     redo A;
1171     } else {
1172     if ({
1173     0x0022 => 1, # "
1174     0x0027 => 1, # '
1175     0x003D => 1, # =
1176     }->{$self->{nc}}) {
1177    
1178 wakaba 1.11 ## XML5: Not a parse error.
1179 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1180     } else {
1181    
1182 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1183 wakaba 1.1 }
1184     $self->{ca}
1185     = {name => chr ($self->{nc}),
1186     value => '',
1187     line => $self->{line}, column => $self->{column}};
1188     $self->{state} = ATTRIBUTE_NAME_STATE;
1189    
1190     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1191     $self->{line_prev} = $self->{line};
1192     $self->{column_prev} = $self->{column};
1193     $self->{column}++;
1194     $self->{nc}
1195     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1196     } else {
1197     $self->{set_nc}->($self);
1198     }
1199    
1200     redo A;
1201     }
1202     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1203 wakaba 1.11 ## XML5: "Tag attribute name state".
1204    
1205 wakaba 1.1 my $before_leave = sub {
1206     if (exists $self->{ct}->{attributes} # start tag or end tag
1207     ->{$self->{ca}->{name}}) { # MUST
1208    
1209     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1210     ## Discard $self->{ca} # MUST
1211     } else {
1212    
1213     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1214     = $self->{ca};
1215 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1216 wakaba 1.1 }
1217     }; # $before_leave
1218    
1219     if ($is_space->{$self->{nc}}) {
1220    
1221     $before_leave->();
1222     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1223    
1224     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1225     $self->{line_prev} = $self->{line};
1226     $self->{column_prev} = $self->{column};
1227     $self->{column}++;
1228     $self->{nc}
1229     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1230     } else {
1231     $self->{set_nc}->($self);
1232     }
1233    
1234     redo A;
1235     } elsif ($self->{nc} == 0x003D) { # =
1236    
1237     $before_leave->();
1238     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1239    
1240     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1241     $self->{line_prev} = $self->{line};
1242     $self->{column_prev} = $self->{column};
1243     $self->{column}++;
1244     $self->{nc}
1245     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1246     } else {
1247     $self->{set_nc}->($self);
1248     }
1249    
1250     redo A;
1251     } elsif ($self->{nc} == 0x003E) { # >
1252 wakaba 1.11 if ($self->{is_xml}) {
1253    
1254     ## XML5: Not a parse error.
1255     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1256     } else {
1257    
1258     }
1259    
1260 wakaba 1.1 $before_leave->();
1261     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1262    
1263     $self->{last_stag_name} = $self->{ct}->{tag_name};
1264     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1265    
1266     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1267     if ($self->{ct}->{attributes}) {
1268     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1269     }
1270     } else {
1271     die "$0: $self->{ct}->{type}: Unknown token type";
1272     }
1273     $self->{state} = DATA_STATE;
1274 wakaba 1.5 $self->{s_kwd} = '';
1275 wakaba 1.1
1276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1277     $self->{line_prev} = $self->{line};
1278     $self->{column_prev} = $self->{column};
1279     $self->{column}++;
1280     $self->{nc}
1281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1282     } else {
1283     $self->{set_nc}->($self);
1284     }
1285    
1286    
1287     return ($self->{ct}); # start tag or end tag
1288    
1289     redo A;
1290     } elsif (0x0041 <= $self->{nc} and
1291     $self->{nc} <= 0x005A) { # A..Z
1292    
1293 wakaba 1.4 $self->{ca}->{name}
1294     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1295 wakaba 1.1 ## Stay in the state
1296    
1297     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1298     $self->{line_prev} = $self->{line};
1299     $self->{column_prev} = $self->{column};
1300     $self->{column}++;
1301     $self->{nc}
1302     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1303     } else {
1304     $self->{set_nc}->($self);
1305     }
1306    
1307     redo A;
1308     } elsif ($self->{nc} == 0x002F) { # /
1309 wakaba 1.11 if ($self->{is_xml}) {
1310    
1311     ## XML5: Not a parse error.
1312     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1313     } else {
1314    
1315     }
1316 wakaba 1.1
1317     $before_leave->();
1318     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1319    
1320     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1321     $self->{line_prev} = $self->{line};
1322     $self->{column_prev} = $self->{column};
1323     $self->{column}++;
1324     $self->{nc}
1325     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1326     } else {
1327     $self->{set_nc}->($self);
1328     }
1329    
1330     redo A;
1331     } elsif ($self->{nc} == -1) {
1332     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1333     $before_leave->();
1334     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1335    
1336     $self->{last_stag_name} = $self->{ct}->{tag_name};
1337     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1338     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1339     if ($self->{ct}->{attributes}) {
1340    
1341     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1342     } else {
1343     ## NOTE: This state should never be reached.
1344    
1345     }
1346     } else {
1347     die "$0: $self->{ct}->{type}: Unknown token type";
1348     }
1349     $self->{state} = DATA_STATE;
1350 wakaba 1.5 $self->{s_kwd} = '';
1351 wakaba 1.1 # reconsume
1352    
1353     return ($self->{ct}); # start tag or end tag
1354    
1355     redo A;
1356     } else {
1357     if ($self->{nc} == 0x0022 or # "
1358     $self->{nc} == 0x0027) { # '
1359    
1360 wakaba 1.11 ## XML5: Not a parse error.
1361 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1362     } else {
1363    
1364     }
1365     $self->{ca}->{name} .= chr ($self->{nc});
1366     ## Stay in the state
1367    
1368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1369     $self->{line_prev} = $self->{line};
1370     $self->{column_prev} = $self->{column};
1371     $self->{column}++;
1372     $self->{nc}
1373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1374     } else {
1375     $self->{set_nc}->($self);
1376     }
1377    
1378     redo A;
1379     }
1380     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1381 wakaba 1.11 ## XML5: "Tag attribute name after state".
1382    
1383 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1384    
1385     ## Stay in the state
1386    
1387     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1388     $self->{line_prev} = $self->{line};
1389     $self->{column_prev} = $self->{column};
1390     $self->{column}++;
1391     $self->{nc}
1392     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1393     } else {
1394     $self->{set_nc}->($self);
1395     }
1396    
1397     redo A;
1398     } elsif ($self->{nc} == 0x003D) { # =
1399    
1400     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1401    
1402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1403     $self->{line_prev} = $self->{line};
1404     $self->{column_prev} = $self->{column};
1405     $self->{column}++;
1406     $self->{nc}
1407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1408     } else {
1409     $self->{set_nc}->($self);
1410     }
1411    
1412     redo A;
1413     } elsif ($self->{nc} == 0x003E) { # >
1414 wakaba 1.11 if ($self->{is_xml}) {
1415    
1416     ## XML5: Not a parse error.
1417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1418     } else {
1419    
1420     }
1421    
1422 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1423    
1424     $self->{last_stag_name} = $self->{ct}->{tag_name};
1425     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1426     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1427     if ($self->{ct}->{attributes}) {
1428    
1429     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1430     } else {
1431     ## NOTE: This state should never be reached.
1432    
1433     }
1434     } else {
1435     die "$0: $self->{ct}->{type}: Unknown token type";
1436     }
1437     $self->{state} = DATA_STATE;
1438 wakaba 1.5 $self->{s_kwd} = '';
1439 wakaba 1.1
1440     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1441     $self->{line_prev} = $self->{line};
1442     $self->{column_prev} = $self->{column};
1443     $self->{column}++;
1444     $self->{nc}
1445     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1446     } else {
1447     $self->{set_nc}->($self);
1448     }
1449    
1450    
1451     return ($self->{ct}); # start tag or end tag
1452    
1453     redo A;
1454     } elsif (0x0041 <= $self->{nc} and
1455     $self->{nc} <= 0x005A) { # A..Z
1456    
1457     $self->{ca}
1458 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1459 wakaba 1.1 value => '',
1460     line => $self->{line}, column => $self->{column}};
1461     $self->{state} = ATTRIBUTE_NAME_STATE;
1462    
1463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464     $self->{line_prev} = $self->{line};
1465     $self->{column_prev} = $self->{column};
1466     $self->{column}++;
1467     $self->{nc}
1468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469     } else {
1470     $self->{set_nc}->($self);
1471     }
1472    
1473     redo A;
1474     } elsif ($self->{nc} == 0x002F) { # /
1475 wakaba 1.11 if ($self->{is_xml}) {
1476    
1477     ## XML5: Not a parse error.
1478     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1479     } else {
1480    
1481     }
1482 wakaba 1.1
1483     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1484    
1485     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1486     $self->{line_prev} = $self->{line};
1487     $self->{column_prev} = $self->{column};
1488     $self->{column}++;
1489     $self->{nc}
1490     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1491     } else {
1492     $self->{set_nc}->($self);
1493     }
1494    
1495     redo A;
1496     } elsif ($self->{nc} == -1) {
1497     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1498     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499    
1500     $self->{last_stag_name} = $self->{ct}->{tag_name};
1501     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503     if ($self->{ct}->{attributes}) {
1504    
1505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506     } else {
1507     ## NOTE: This state should never be reached.
1508    
1509     }
1510     } else {
1511     die "$0: $self->{ct}->{type}: Unknown token type";
1512     }
1513 wakaba 1.5 $self->{s_kwd} = '';
1514 wakaba 1.1 $self->{state} = DATA_STATE;
1515     # reconsume
1516    
1517     return ($self->{ct}); # start tag or end tag
1518    
1519     redo A;
1520     } else {
1521 wakaba 1.11 if ($self->{is_xml}) {
1522    
1523     ## XML5: Not a parse error.
1524     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1525     } else {
1526    
1527     }
1528    
1529 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1530     $self->{nc} == 0x0027) { # '
1531    
1532 wakaba 1.11 ## XML5: Not a parse error.
1533 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1534     } else {
1535    
1536     }
1537     $self->{ca}
1538     = {name => chr ($self->{nc}),
1539     value => '',
1540     line => $self->{line}, column => $self->{column}};
1541     $self->{state} = ATTRIBUTE_NAME_STATE;
1542    
1543     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1544     $self->{line_prev} = $self->{line};
1545     $self->{column_prev} = $self->{column};
1546     $self->{column}++;
1547     $self->{nc}
1548     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1549     } else {
1550     $self->{set_nc}->($self);
1551     }
1552    
1553     redo A;
1554     }
1555     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1556 wakaba 1.11 ## XML5: "Tag attribute value before state".
1557    
1558 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1559    
1560     ## Stay in the state
1561    
1562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1563     $self->{line_prev} = $self->{line};
1564     $self->{column_prev} = $self->{column};
1565     $self->{column}++;
1566     $self->{nc}
1567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1568     } else {
1569     $self->{set_nc}->($self);
1570     }
1571    
1572     redo A;
1573     } elsif ($self->{nc} == 0x0022) { # "
1574    
1575     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1576    
1577     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1578     $self->{line_prev} = $self->{line};
1579     $self->{column_prev} = $self->{column};
1580     $self->{column}++;
1581     $self->{nc}
1582     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1583     } else {
1584     $self->{set_nc}->($self);
1585     }
1586    
1587     redo A;
1588     } elsif ($self->{nc} == 0x0026) { # &
1589    
1590     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1591     ## reconsume
1592     redo A;
1593     } elsif ($self->{nc} == 0x0027) { # '
1594    
1595     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1596    
1597     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1598     $self->{line_prev} = $self->{line};
1599     $self->{column_prev} = $self->{column};
1600     $self->{column}++;
1601     $self->{nc}
1602     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1603     } else {
1604     $self->{set_nc}->($self);
1605     }
1606    
1607     redo A;
1608     } elsif ($self->{nc} == 0x003E) { # >
1609     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1610     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1611    
1612     $self->{last_stag_name} = $self->{ct}->{tag_name};
1613     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1614     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1615     if ($self->{ct}->{attributes}) {
1616    
1617     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1618     } else {
1619     ## NOTE: This state should never be reached.
1620    
1621     }
1622     } else {
1623     die "$0: $self->{ct}->{type}: Unknown token type";
1624     }
1625     $self->{state} = DATA_STATE;
1626 wakaba 1.5 $self->{s_kwd} = '';
1627 wakaba 1.1
1628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1629     $self->{line_prev} = $self->{line};
1630     $self->{column_prev} = $self->{column};
1631     $self->{column}++;
1632     $self->{nc}
1633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1634     } else {
1635     $self->{set_nc}->($self);
1636     }
1637    
1638    
1639     return ($self->{ct}); # start tag or end tag
1640    
1641     redo A;
1642     } elsif ($self->{nc} == -1) {
1643     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1644     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1645    
1646     $self->{last_stag_name} = $self->{ct}->{tag_name};
1647     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1648     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1649     if ($self->{ct}->{attributes}) {
1650    
1651     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1652     } else {
1653     ## NOTE: This state should never be reached.
1654    
1655     }
1656     } else {
1657     die "$0: $self->{ct}->{type}: Unknown token type";
1658     }
1659     $self->{state} = DATA_STATE;
1660 wakaba 1.5 $self->{s_kwd} = '';
1661 wakaba 1.1 ## reconsume
1662    
1663     return ($self->{ct}); # start tag or end tag
1664    
1665     redo A;
1666     } else {
1667     if ($self->{nc} == 0x003D) { # =
1668    
1669 wakaba 1.11 ## XML5: Not a parse error.
1670 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1671 wakaba 1.11 } elsif ($self->{is_xml}) {
1672    
1673     ## XML5: No parse error.
1674     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1675 wakaba 1.1 } else {
1676    
1677     }
1678     $self->{ca}->{value} .= chr ($self->{nc});
1679     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1680    
1681     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1682     $self->{line_prev} = $self->{line};
1683     $self->{column_prev} = $self->{column};
1684     $self->{column}++;
1685     $self->{nc}
1686     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1687     } else {
1688     $self->{set_nc}->($self);
1689     }
1690    
1691     redo A;
1692     }
1693     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1694 wakaba 1.11 ## XML5: "Tag attribute value double quoted state".
1695    
1696 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1697    
1698 wakaba 1.11 ## XML5: "Tag attribute name before state".
1699 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1700    
1701     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1702     $self->{line_prev} = $self->{line};
1703     $self->{column_prev} = $self->{column};
1704     $self->{column}++;
1705     $self->{nc}
1706     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1707     } else {
1708     $self->{set_nc}->($self);
1709     }
1710    
1711     redo A;
1712     } elsif ($self->{nc} == 0x0026) { # &
1713    
1714 wakaba 1.11 ## XML5: Not defined yet.
1715    
1716 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1717     ## "entity in attribute value state". In this implementation, the
1718     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1719     ## implementation of the "consume a character reference" algorithm.
1720     $self->{prev_state} = $self->{state};
1721     $self->{entity_add} = 0x0022; # "
1722     $self->{state} = ENTITY_STATE;
1723    
1724     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1725     $self->{line_prev} = $self->{line};
1726     $self->{column_prev} = $self->{column};
1727     $self->{column}++;
1728     $self->{nc}
1729     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1730     } else {
1731     $self->{set_nc}->($self);
1732     }
1733    
1734     redo A;
1735     } elsif ($self->{nc} == -1) {
1736     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1737     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1738    
1739     $self->{last_stag_name} = $self->{ct}->{tag_name};
1740     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1741     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1742     if ($self->{ct}->{attributes}) {
1743    
1744     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1745     } else {
1746     ## NOTE: This state should never be reached.
1747    
1748     }
1749     } else {
1750     die "$0: $self->{ct}->{type}: Unknown token type";
1751     }
1752     $self->{state} = DATA_STATE;
1753 wakaba 1.5 $self->{s_kwd} = '';
1754 wakaba 1.1 ## reconsume
1755    
1756     return ($self->{ct}); # start tag or end tag
1757    
1758     redo A;
1759     } else {
1760 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1761    
1762     ## XML5: Not a parse error.
1763     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1764     } else {
1765    
1766     }
1767 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1768     $self->{read_until}->($self->{ca}->{value},
1769 wakaba 1.11 q["&<],
1770 wakaba 1.1 length $self->{ca}->{value});
1771    
1772     ## Stay in the state
1773    
1774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1775     $self->{line_prev} = $self->{line};
1776     $self->{column_prev} = $self->{column};
1777     $self->{column}++;
1778     $self->{nc}
1779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1780     } else {
1781     $self->{set_nc}->($self);
1782     }
1783    
1784     redo A;
1785     }
1786     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1787 wakaba 1.11 ## XML5: "Tag attribute value single quoted state".
1788    
1789 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1790    
1791 wakaba 1.11 ## XML5: "Before attribute name state" (sic).
1792 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1793    
1794     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1795     $self->{line_prev} = $self->{line};
1796     $self->{column_prev} = $self->{column};
1797     $self->{column}++;
1798     $self->{nc}
1799     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1800     } else {
1801     $self->{set_nc}->($self);
1802     }
1803    
1804     redo A;
1805     } elsif ($self->{nc} == 0x0026) { # &
1806    
1807 wakaba 1.11 ## XML5: Not defined yet.
1808    
1809 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1810     ## "entity in attribute value state". In this implementation, the
1811     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1812     ## implementation of the "consume a character reference" algorithm.
1813     $self->{entity_add} = 0x0027; # '
1814     $self->{prev_state} = $self->{state};
1815     $self->{state} = ENTITY_STATE;
1816    
1817     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1818     $self->{line_prev} = $self->{line};
1819     $self->{column_prev} = $self->{column};
1820     $self->{column}++;
1821     $self->{nc}
1822     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1823     } else {
1824     $self->{set_nc}->($self);
1825     }
1826    
1827     redo A;
1828     } elsif ($self->{nc} == -1) {
1829     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1830     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1831    
1832     $self->{last_stag_name} = $self->{ct}->{tag_name};
1833     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1834     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1835     if ($self->{ct}->{attributes}) {
1836    
1837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1838     } else {
1839     ## NOTE: This state should never be reached.
1840    
1841     }
1842     } else {
1843     die "$0: $self->{ct}->{type}: Unknown token type";
1844     }
1845     $self->{state} = DATA_STATE;
1846 wakaba 1.5 $self->{s_kwd} = '';
1847 wakaba 1.1 ## reconsume
1848    
1849     return ($self->{ct}); # start tag or end tag
1850    
1851     redo A;
1852     } else {
1853 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1854    
1855     ## XML5: Not a parse error.
1856     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1857     } else {
1858    
1859     }
1860 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1861     $self->{read_until}->($self->{ca}->{value},
1862 wakaba 1.11 q['&<],
1863 wakaba 1.1 length $self->{ca}->{value});
1864    
1865     ## Stay in the state
1866    
1867     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1868     $self->{line_prev} = $self->{line};
1869     $self->{column_prev} = $self->{column};
1870     $self->{column}++;
1871     $self->{nc}
1872     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1873     } else {
1874     $self->{set_nc}->($self);
1875     }
1876    
1877     redo A;
1878     }
1879     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1880 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1881    
1882 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1883    
1884 wakaba 1.11 ## XML5: "Tag attribute name before state".
1885 wakaba 1.1 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1886    
1887     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1888     $self->{line_prev} = $self->{line};
1889     $self->{column_prev} = $self->{column};
1890     $self->{column}++;
1891     $self->{nc}
1892     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1893     } else {
1894     $self->{set_nc}->($self);
1895     }
1896    
1897     redo A;
1898     } elsif ($self->{nc} == 0x0026) { # &
1899    
1900 wakaba 1.11
1901     ## XML5: Not defined yet.
1902    
1903 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1904     ## "entity in attribute value state". In this implementation, the
1905     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1906     ## implementation of the "consume a character reference" algorithm.
1907     $self->{entity_add} = -1;
1908     $self->{prev_state} = $self->{state};
1909     $self->{state} = ENTITY_STATE;
1910    
1911     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1912     $self->{line_prev} = $self->{line};
1913     $self->{column_prev} = $self->{column};
1914     $self->{column}++;
1915     $self->{nc}
1916     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1917     } else {
1918     $self->{set_nc}->($self);
1919     }
1920    
1921     redo A;
1922     } elsif ($self->{nc} == 0x003E) { # >
1923     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1924    
1925     $self->{last_stag_name} = $self->{ct}->{tag_name};
1926     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1927     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1928     if ($self->{ct}->{attributes}) {
1929    
1930     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1931     } else {
1932     ## NOTE: This state should never be reached.
1933    
1934     }
1935     } else {
1936     die "$0: $self->{ct}->{type}: Unknown token type";
1937     }
1938     $self->{state} = DATA_STATE;
1939 wakaba 1.5 $self->{s_kwd} = '';
1940 wakaba 1.1
1941     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1942     $self->{line_prev} = $self->{line};
1943     $self->{column_prev} = $self->{column};
1944     $self->{column}++;
1945     $self->{nc}
1946     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1947     } else {
1948     $self->{set_nc}->($self);
1949     }
1950    
1951    
1952     return ($self->{ct}); # start tag or end tag
1953    
1954     redo A;
1955     } elsif ($self->{nc} == -1) {
1956     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1957     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1958    
1959     $self->{last_stag_name} = $self->{ct}->{tag_name};
1960     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1961     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1962     if ($self->{ct}->{attributes}) {
1963    
1964     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1965     } else {
1966     ## NOTE: This state should never be reached.
1967    
1968     }
1969     } else {
1970     die "$0: $self->{ct}->{type}: Unknown token type";
1971     }
1972     $self->{state} = DATA_STATE;
1973 wakaba 1.5 $self->{s_kwd} = '';
1974 wakaba 1.1 ## reconsume
1975    
1976     return ($self->{ct}); # start tag or end tag
1977    
1978     redo A;
1979     } else {
1980     if ({
1981     0x0022 => 1, # "
1982     0x0027 => 1, # '
1983     0x003D => 1, # =
1984     }->{$self->{nc}}) {
1985    
1986 wakaba 1.11 ## XML5: Not a parse error.
1987 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1988     } else {
1989    
1990     }
1991     $self->{ca}->{value} .= chr ($self->{nc});
1992     $self->{read_until}->($self->{ca}->{value},
1993     q["'=& >],
1994     length $self->{ca}->{value});
1995    
1996     ## Stay in the state
1997    
1998     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1999     $self->{line_prev} = $self->{line};
2000     $self->{column_prev} = $self->{column};
2001     $self->{column}++;
2002     $self->{nc}
2003     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2004     } else {
2005     $self->{set_nc}->($self);
2006     }
2007    
2008     redo A;
2009     }
2010     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2011     if ($is_space->{$self->{nc}}) {
2012    
2013     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2014    
2015     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2016     $self->{line_prev} = $self->{line};
2017     $self->{column_prev} = $self->{column};
2018     $self->{column}++;
2019     $self->{nc}
2020     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2021     } else {
2022     $self->{set_nc}->($self);
2023     }
2024    
2025     redo A;
2026     } elsif ($self->{nc} == 0x003E) { # >
2027     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2028    
2029     $self->{last_stag_name} = $self->{ct}->{tag_name};
2030     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2031     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2032     if ($self->{ct}->{attributes}) {
2033    
2034     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2035     } else {
2036     ## NOTE: This state should never be reached.
2037    
2038     }
2039     } else {
2040     die "$0: $self->{ct}->{type}: Unknown token type";
2041     }
2042     $self->{state} = DATA_STATE;
2043 wakaba 1.5 $self->{s_kwd} = '';
2044 wakaba 1.1
2045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2046     $self->{line_prev} = $self->{line};
2047     $self->{column_prev} = $self->{column};
2048     $self->{column}++;
2049     $self->{nc}
2050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2051     } else {
2052     $self->{set_nc}->($self);
2053     }
2054    
2055    
2056     return ($self->{ct}); # start tag or end tag
2057    
2058     redo A;
2059     } elsif ($self->{nc} == 0x002F) { # /
2060    
2061     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2062    
2063     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2064     $self->{line_prev} = $self->{line};
2065     $self->{column_prev} = $self->{column};
2066     $self->{column}++;
2067     $self->{nc}
2068     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2069     } else {
2070     $self->{set_nc}->($self);
2071     }
2072    
2073     redo A;
2074     } elsif ($self->{nc} == -1) {
2075     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2076     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2077    
2078     $self->{last_stag_name} = $self->{ct}->{tag_name};
2079     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2080     if ($self->{ct}->{attributes}) {
2081    
2082     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2083     } else {
2084     ## NOTE: This state should never be reached.
2085    
2086     }
2087     } else {
2088     die "$0: $self->{ct}->{type}: Unknown token type";
2089     }
2090     $self->{state} = DATA_STATE;
2091 wakaba 1.5 $self->{s_kwd} = '';
2092 wakaba 1.1 ## Reconsume.
2093     return ($self->{ct}); # start tag or end tag
2094     redo A;
2095     } else {
2096    
2097     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2098     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2099     ## reconsume
2100     redo A;
2101     }
2102     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2103 wakaba 1.11 ## XML5: "Empty tag state".
2104    
2105 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2106     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2107    
2108     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2109     ## TODO: Different type than slash in start tag
2110     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2111     if ($self->{ct}->{attributes}) {
2112    
2113     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2114     } else {
2115    
2116     }
2117     ## TODO: Test |<title></title/>|
2118     } else {
2119    
2120     $self->{self_closing} = 1;
2121     }
2122    
2123     $self->{state} = DATA_STATE;
2124 wakaba 1.5 $self->{s_kwd} = '';
2125 wakaba 1.1
2126     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2127     $self->{line_prev} = $self->{line};
2128     $self->{column_prev} = $self->{column};
2129     $self->{column}++;
2130     $self->{nc}
2131     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2132     } else {
2133     $self->{set_nc}->($self);
2134     }
2135    
2136    
2137     return ($self->{ct}); # start tag or end tag
2138    
2139     redo A;
2140     } elsif ($self->{nc} == -1) {
2141     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2142     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2143    
2144     $self->{last_stag_name} = $self->{ct}->{tag_name};
2145     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2146     if ($self->{ct}->{attributes}) {
2147    
2148     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2149     } else {
2150     ## NOTE: This state should never be reached.
2151    
2152     }
2153     } else {
2154     die "$0: $self->{ct}->{type}: Unknown token type";
2155     }
2156 wakaba 1.11 ## XML5: "Tag attribute name before state".
2157 wakaba 1.1 $self->{state} = DATA_STATE;
2158 wakaba 1.5 $self->{s_kwd} = '';
2159 wakaba 1.1 ## Reconsume.
2160     return ($self->{ct}); # start tag or end tag
2161     redo A;
2162     } else {
2163    
2164     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2165     ## TODO: This error type is wrong.
2166     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2167     ## Reconsume.
2168     redo A;
2169     }
2170     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2171     ## (only happen if PCDATA state)
2172    
2173     ## NOTE: Unlike spec's "bogus comment state", this implementation
2174     ## consumes characters one-by-one basis.
2175    
2176     if ($self->{nc} == 0x003E) { # >
2177    
2178     $self->{state} = DATA_STATE;
2179 wakaba 1.5 $self->{s_kwd} = '';
2180 wakaba 1.1
2181     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2182     $self->{line_prev} = $self->{line};
2183     $self->{column_prev} = $self->{column};
2184     $self->{column}++;
2185     $self->{nc}
2186     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2187     } else {
2188     $self->{set_nc}->($self);
2189     }
2190    
2191    
2192     return ($self->{ct}); # comment
2193     redo A;
2194     } elsif ($self->{nc} == -1) {
2195    
2196     $self->{state} = DATA_STATE;
2197 wakaba 1.5 $self->{s_kwd} = '';
2198 wakaba 1.1 ## reconsume
2199    
2200     return ($self->{ct}); # comment
2201     redo A;
2202     } else {
2203    
2204     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2205     $self->{read_until}->($self->{ct}->{data},
2206     q[>],
2207     length $self->{ct}->{data});
2208    
2209     ## Stay in the state.
2210    
2211     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2212     $self->{line_prev} = $self->{line};
2213     $self->{column_prev} = $self->{column};
2214     $self->{column}++;
2215     $self->{nc}
2216     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2217     } else {
2218     $self->{set_nc}->($self);
2219     }
2220    
2221     redo A;
2222     }
2223     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2224     ## (only happen if PCDATA state)
2225    
2226     if ($self->{nc} == 0x002D) { # -
2227    
2228     $self->{state} = MD_HYPHEN_STATE;
2229    
2230     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2231     $self->{line_prev} = $self->{line};
2232     $self->{column_prev} = $self->{column};
2233     $self->{column}++;
2234     $self->{nc}
2235     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2236     } else {
2237     $self->{set_nc}->($self);
2238     }
2239    
2240     redo A;
2241     } elsif ($self->{nc} == 0x0044 or # D
2242     $self->{nc} == 0x0064) { # d
2243     ## ASCII case-insensitive.
2244    
2245     $self->{state} = MD_DOCTYPE_STATE;
2246     $self->{s_kwd} = chr $self->{nc};
2247    
2248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2249     $self->{line_prev} = $self->{line};
2250     $self->{column_prev} = $self->{column};
2251     $self->{column}++;
2252     $self->{nc}
2253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2254     } else {
2255     $self->{set_nc}->($self);
2256     }
2257    
2258     redo A;
2259 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2260     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2261     $self->{is_xml}) and
2262 wakaba 1.1 $self->{nc} == 0x005B) { # [
2263    
2264     $self->{state} = MD_CDATA_STATE;
2265     $self->{s_kwd} = '[';
2266    
2267     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2268     $self->{line_prev} = $self->{line};
2269     $self->{column_prev} = $self->{column};
2270     $self->{column}++;
2271     $self->{nc}
2272     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2273     } else {
2274     $self->{set_nc}->($self);
2275     }
2276    
2277     redo A;
2278     } else {
2279    
2280     }
2281    
2282     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2283     line => $self->{line_prev},
2284     column => $self->{column_prev} - 1);
2285     ## Reconsume.
2286     $self->{state} = BOGUS_COMMENT_STATE;
2287     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2288     line => $self->{line_prev},
2289     column => $self->{column_prev} - 1,
2290     };
2291     redo A;
2292     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2293     if ($self->{nc} == 0x002D) { # -
2294    
2295     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2296     line => $self->{line_prev},
2297     column => $self->{column_prev} - 2,
2298     };
2299 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2300 wakaba 1.1
2301     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2302     $self->{line_prev} = $self->{line};
2303     $self->{column_prev} = $self->{column};
2304     $self->{column}++;
2305     $self->{nc}
2306     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2307     } else {
2308     $self->{set_nc}->($self);
2309     }
2310    
2311     redo A;
2312     } else {
2313    
2314     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2315     line => $self->{line_prev},
2316     column => $self->{column_prev} - 2);
2317     $self->{state} = BOGUS_COMMENT_STATE;
2318     ## Reconsume.
2319     $self->{ct} = {type => COMMENT_TOKEN,
2320     data => '-',
2321     line => $self->{line_prev},
2322     column => $self->{column_prev} - 2,
2323     };
2324     redo A;
2325     }
2326     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2327     ## ASCII case-insensitive.
2328     if ($self->{nc} == [
2329     undef,
2330     0x004F, # O
2331     0x0043, # C
2332     0x0054, # T
2333     0x0059, # Y
2334     0x0050, # P
2335     ]->[length $self->{s_kwd}] or
2336     $self->{nc} == [
2337     undef,
2338     0x006F, # o
2339     0x0063, # c
2340     0x0074, # t
2341     0x0079, # y
2342     0x0070, # p
2343     ]->[length $self->{s_kwd}]) {
2344    
2345     ## Stay in the state.
2346     $self->{s_kwd} .= chr $self->{nc};
2347    
2348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2349     $self->{line_prev} = $self->{line};
2350     $self->{column_prev} = $self->{column};
2351     $self->{column}++;
2352     $self->{nc}
2353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2354     } else {
2355     $self->{set_nc}->($self);
2356     }
2357    
2358     redo A;
2359     } elsif ((length $self->{s_kwd}) == 6 and
2360     ($self->{nc} == 0x0045 or # E
2361     $self->{nc} == 0x0065)) { # e
2362 wakaba 1.10 if ($self->{s_kwd} ne 'DOCTYP') {
2363    
2364     ## XML5: case-sensitive.
2365     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2366     text => 'DOCTYPE',
2367     line => $self->{line_prev},
2368     column => $self->{column_prev} - 5);
2369     } else {
2370    
2371     }
2372 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2373     $self->{ct} = {type => DOCTYPE_TOKEN,
2374     quirks => 1,
2375     line => $self->{line_prev},
2376     column => $self->{column_prev} - 7,
2377     };
2378    
2379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2380     $self->{line_prev} = $self->{line};
2381     $self->{column_prev} = $self->{column};
2382     $self->{column}++;
2383     $self->{nc}
2384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2385     } else {
2386     $self->{set_nc}->($self);
2387     }
2388    
2389     redo A;
2390     } else {
2391    
2392     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2393     line => $self->{line_prev},
2394     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2395     $self->{state} = BOGUS_COMMENT_STATE;
2396     ## Reconsume.
2397     $self->{ct} = {type => COMMENT_TOKEN,
2398     data => $self->{s_kwd},
2399     line => $self->{line_prev},
2400     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2401     };
2402     redo A;
2403     }
2404     } elsif ($self->{state} == MD_CDATA_STATE) {
2405     if ($self->{nc} == {
2406     '[' => 0x0043, # C
2407     '[C' => 0x0044, # D
2408     '[CD' => 0x0041, # A
2409     '[CDA' => 0x0054, # T
2410     '[CDAT' => 0x0041, # A
2411     }->{$self->{s_kwd}}) {
2412    
2413     ## Stay in the state.
2414     $self->{s_kwd} .= chr $self->{nc};
2415    
2416     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2417     $self->{line_prev} = $self->{line};
2418     $self->{column_prev} = $self->{column};
2419     $self->{column}++;
2420     $self->{nc}
2421     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2422     } else {
2423     $self->{set_nc}->($self);
2424     }
2425    
2426     redo A;
2427     } elsif ($self->{s_kwd} eq '[CDATA' and
2428     $self->{nc} == 0x005B) { # [
2429 wakaba 1.6 if ($self->{is_xml} and
2430     not $self->{tainted} and
2431     @{$self->{open_elements} or []} == 0) {
2432 wakaba 1.8
2433 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2434     line => $self->{line_prev},
2435     column => $self->{column_prev} - 7);
2436     $self->{tainted} = 1;
2437 wakaba 1.8 } else {
2438    
2439 wakaba 1.6 }
2440    
2441 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2442     data => '',
2443     line => $self->{line_prev},
2444     column => $self->{column_prev} - 7};
2445     $self->{state} = CDATA_SECTION_STATE;
2446    
2447     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2448     $self->{line_prev} = $self->{line};
2449     $self->{column_prev} = $self->{column};
2450     $self->{column}++;
2451     $self->{nc}
2452     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2453     } else {
2454     $self->{set_nc}->($self);
2455     }
2456    
2457     redo A;
2458     } else {
2459    
2460     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2461     line => $self->{line_prev},
2462     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2463     $self->{state} = BOGUS_COMMENT_STATE;
2464     ## Reconsume.
2465     $self->{ct} = {type => COMMENT_TOKEN,
2466     data => $self->{s_kwd},
2467     line => $self->{line_prev},
2468     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2469     };
2470     redo A;
2471     }
2472     } elsif ($self->{state} == COMMENT_START_STATE) {
2473     if ($self->{nc} == 0x002D) { # -
2474    
2475     $self->{state} = COMMENT_START_DASH_STATE;
2476    
2477     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2478     $self->{line_prev} = $self->{line};
2479     $self->{column_prev} = $self->{column};
2480     $self->{column}++;
2481     $self->{nc}
2482     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2483     } else {
2484     $self->{set_nc}->($self);
2485     }
2486    
2487     redo A;
2488     } elsif ($self->{nc} == 0x003E) { # >
2489    
2490     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2491     $self->{state} = DATA_STATE;
2492 wakaba 1.5 $self->{s_kwd} = '';
2493 wakaba 1.1
2494     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2495     $self->{line_prev} = $self->{line};
2496     $self->{column_prev} = $self->{column};
2497     $self->{column}++;
2498     $self->{nc}
2499     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2500     } else {
2501     $self->{set_nc}->($self);
2502     }
2503    
2504    
2505     return ($self->{ct}); # comment
2506    
2507     redo A;
2508     } elsif ($self->{nc} == -1) {
2509    
2510     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2511     $self->{state} = DATA_STATE;
2512 wakaba 1.5 $self->{s_kwd} = '';
2513 wakaba 1.1 ## reconsume
2514    
2515     return ($self->{ct}); # comment
2516    
2517     redo A;
2518     } else {
2519    
2520     $self->{ct}->{data} # comment
2521     .= chr ($self->{nc});
2522     $self->{state} = COMMENT_STATE;
2523    
2524     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2525     $self->{line_prev} = $self->{line};
2526     $self->{column_prev} = $self->{column};
2527     $self->{column}++;
2528     $self->{nc}
2529     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2530     } else {
2531     $self->{set_nc}->($self);
2532     }
2533    
2534     redo A;
2535     }
2536     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2537     if ($self->{nc} == 0x002D) { # -
2538    
2539     $self->{state} = COMMENT_END_STATE;
2540    
2541     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2542     $self->{line_prev} = $self->{line};
2543     $self->{column_prev} = $self->{column};
2544     $self->{column}++;
2545     $self->{nc}
2546     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2547     } else {
2548     $self->{set_nc}->($self);
2549     }
2550    
2551     redo A;
2552     } elsif ($self->{nc} == 0x003E) { # >
2553    
2554     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2555     $self->{state} = DATA_STATE;
2556 wakaba 1.5 $self->{s_kwd} = '';
2557 wakaba 1.1
2558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559     $self->{line_prev} = $self->{line};
2560     $self->{column_prev} = $self->{column};
2561     $self->{column}++;
2562     $self->{nc}
2563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564     } else {
2565     $self->{set_nc}->($self);
2566     }
2567    
2568    
2569     return ($self->{ct}); # comment
2570    
2571     redo A;
2572     } elsif ($self->{nc} == -1) {
2573    
2574     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2575     $self->{state} = DATA_STATE;
2576 wakaba 1.5 $self->{s_kwd} = '';
2577 wakaba 1.1 ## reconsume
2578    
2579     return ($self->{ct}); # comment
2580    
2581     redo A;
2582     } else {
2583    
2584     $self->{ct}->{data} # comment
2585     .= '-' . chr ($self->{nc});
2586     $self->{state} = COMMENT_STATE;
2587    
2588     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2589     $self->{line_prev} = $self->{line};
2590     $self->{column_prev} = $self->{column};
2591     $self->{column}++;
2592     $self->{nc}
2593     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2594     } else {
2595     $self->{set_nc}->($self);
2596     }
2597    
2598     redo A;
2599     }
2600     } elsif ($self->{state} == COMMENT_STATE) {
2601     if ($self->{nc} == 0x002D) { # -
2602    
2603     $self->{state} = COMMENT_END_DASH_STATE;
2604    
2605     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2606     $self->{line_prev} = $self->{line};
2607     $self->{column_prev} = $self->{column};
2608     $self->{column}++;
2609     $self->{nc}
2610     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2611     } else {
2612     $self->{set_nc}->($self);
2613     }
2614    
2615     redo A;
2616     } elsif ($self->{nc} == -1) {
2617    
2618     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2619     $self->{state} = DATA_STATE;
2620 wakaba 1.5 $self->{s_kwd} = '';
2621 wakaba 1.1 ## reconsume
2622    
2623     return ($self->{ct}); # comment
2624    
2625     redo A;
2626     } else {
2627    
2628     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2629     $self->{read_until}->($self->{ct}->{data},
2630     q[-],
2631     length $self->{ct}->{data});
2632    
2633     ## Stay in the state
2634    
2635     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2636     $self->{line_prev} = $self->{line};
2637     $self->{column_prev} = $self->{column};
2638     $self->{column}++;
2639     $self->{nc}
2640     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2641     } else {
2642     $self->{set_nc}->($self);
2643     }
2644    
2645     redo A;
2646     }
2647     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2648 wakaba 1.10 ## XML5: "comment dash state".
2649    
2650 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2651    
2652     $self->{state} = COMMENT_END_STATE;
2653    
2654     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2655     $self->{line_prev} = $self->{line};
2656     $self->{column_prev} = $self->{column};
2657     $self->{column}++;
2658     $self->{nc}
2659     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2660     } else {
2661     $self->{set_nc}->($self);
2662     }
2663    
2664     redo A;
2665     } elsif ($self->{nc} == -1) {
2666    
2667     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2668 wakaba 1.5 $self->{s_kwd} = '';
2669 wakaba 1.1 $self->{state} = DATA_STATE;
2670 wakaba 1.5 $self->{s_kwd} = '';
2671 wakaba 1.1 ## reconsume
2672    
2673     return ($self->{ct}); # comment
2674    
2675     redo A;
2676     } else {
2677    
2678     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2679     $self->{state} = COMMENT_STATE;
2680    
2681     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2682     $self->{line_prev} = $self->{line};
2683     $self->{column_prev} = $self->{column};
2684     $self->{column}++;
2685     $self->{nc}
2686     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2687     } else {
2688     $self->{set_nc}->($self);
2689     }
2690    
2691     redo A;
2692     }
2693     } elsif ($self->{state} == COMMENT_END_STATE) {
2694     if ($self->{nc} == 0x003E) { # >
2695    
2696     $self->{state} = DATA_STATE;
2697 wakaba 1.5 $self->{s_kwd} = '';
2698 wakaba 1.1
2699     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2700     $self->{line_prev} = $self->{line};
2701     $self->{column_prev} = $self->{column};
2702     $self->{column}++;
2703     $self->{nc}
2704     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2705     } else {
2706     $self->{set_nc}->($self);
2707     }
2708    
2709    
2710     return ($self->{ct}); # comment
2711    
2712     redo A;
2713     } elsif ($self->{nc} == 0x002D) { # -
2714    
2715 wakaba 1.10 ## XML5: Not a parse error.
2716 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2717     line => $self->{line_prev},
2718     column => $self->{column_prev});
2719     $self->{ct}->{data} .= '-'; # comment
2720     ## Stay in the state
2721    
2722     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2723     $self->{line_prev} = $self->{line};
2724     $self->{column_prev} = $self->{column};
2725     $self->{column}++;
2726     $self->{nc}
2727     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2728     } else {
2729     $self->{set_nc}->($self);
2730     }
2731    
2732     redo A;
2733     } elsif ($self->{nc} == -1) {
2734    
2735     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2736     $self->{state} = DATA_STATE;
2737 wakaba 1.5 $self->{s_kwd} = '';
2738 wakaba 1.1 ## reconsume
2739    
2740     return ($self->{ct}); # comment
2741    
2742     redo A;
2743     } else {
2744    
2745 wakaba 1.10 ## XML5: Not a parse error.
2746 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2747     line => $self->{line_prev},
2748     column => $self->{column_prev});
2749     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2750     $self->{state} = COMMENT_STATE;
2751    
2752     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2753     $self->{line_prev} = $self->{line};
2754     $self->{column_prev} = $self->{column};
2755     $self->{column}++;
2756     $self->{nc}
2757     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2758     } else {
2759     $self->{set_nc}->($self);
2760     }
2761    
2762     redo A;
2763     }
2764     } elsif ($self->{state} == DOCTYPE_STATE) {
2765     if ($is_space->{$self->{nc}}) {
2766    
2767     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2768    
2769     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2770     $self->{line_prev} = $self->{line};
2771     $self->{column_prev} = $self->{column};
2772     $self->{column}++;
2773     $self->{nc}
2774     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2775     } else {
2776     $self->{set_nc}->($self);
2777     }
2778    
2779     redo A;
2780     } else {
2781    
2782     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2783     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2784     ## reconsume
2785     redo A;
2786     }
2787     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2788     if ($is_space->{$self->{nc}}) {
2789    
2790     ## Stay in the state
2791    
2792     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2793     $self->{line_prev} = $self->{line};
2794     $self->{column_prev} = $self->{column};
2795     $self->{column}++;
2796     $self->{nc}
2797     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2798     } else {
2799     $self->{set_nc}->($self);
2800     }
2801    
2802     redo A;
2803     } elsif ($self->{nc} == 0x003E) { # >
2804    
2805     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2806     $self->{state} = DATA_STATE;
2807 wakaba 1.5 $self->{s_kwd} = '';
2808 wakaba 1.1
2809     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2810     $self->{line_prev} = $self->{line};
2811     $self->{column_prev} = $self->{column};
2812     $self->{column}++;
2813     $self->{nc}
2814     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2815     } else {
2816     $self->{set_nc}->($self);
2817     }
2818    
2819    
2820     return ($self->{ct}); # DOCTYPE (quirks)
2821    
2822     redo A;
2823     } elsif ($self->{nc} == -1) {
2824    
2825     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2826     $self->{state} = DATA_STATE;
2827 wakaba 1.5 $self->{s_kwd} = '';
2828 wakaba 1.1 ## reconsume
2829    
2830     return ($self->{ct}); # DOCTYPE (quirks)
2831    
2832     redo A;
2833     } else {
2834    
2835     $self->{ct}->{name} = chr $self->{nc};
2836     delete $self->{ct}->{quirks};
2837     $self->{state} = DOCTYPE_NAME_STATE;
2838    
2839     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2840     $self->{line_prev} = $self->{line};
2841     $self->{column_prev} = $self->{column};
2842     $self->{column}++;
2843     $self->{nc}
2844     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2845     } else {
2846     $self->{set_nc}->($self);
2847     }
2848    
2849     redo A;
2850     }
2851     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2852     ## ISSUE: Redundant "First," in the spec.
2853     if ($is_space->{$self->{nc}}) {
2854    
2855     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2856    
2857     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2858     $self->{line_prev} = $self->{line};
2859     $self->{column_prev} = $self->{column};
2860     $self->{column}++;
2861     $self->{nc}
2862     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2863     } else {
2864     $self->{set_nc}->($self);
2865     }
2866    
2867     redo A;
2868     } elsif ($self->{nc} == 0x003E) { # >
2869    
2870     $self->{state} = DATA_STATE;
2871 wakaba 1.5 $self->{s_kwd} = '';
2872 wakaba 1.1
2873     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2874     $self->{line_prev} = $self->{line};
2875     $self->{column_prev} = $self->{column};
2876     $self->{column}++;
2877     $self->{nc}
2878     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2879     } else {
2880     $self->{set_nc}->($self);
2881     }
2882    
2883    
2884     return ($self->{ct}); # DOCTYPE
2885    
2886     redo A;
2887     } elsif ($self->{nc} == -1) {
2888    
2889     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2890     $self->{state} = DATA_STATE;
2891 wakaba 1.5 $self->{s_kwd} = '';
2892 wakaba 1.1 ## reconsume
2893    
2894     $self->{ct}->{quirks} = 1;
2895     return ($self->{ct}); # DOCTYPE
2896    
2897     redo A;
2898     } else {
2899    
2900     $self->{ct}->{name}
2901     .= chr ($self->{nc}); # DOCTYPE
2902     ## Stay in the state
2903    
2904     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2905     $self->{line_prev} = $self->{line};
2906     $self->{column_prev} = $self->{column};
2907     $self->{column}++;
2908     $self->{nc}
2909     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2910     } else {
2911     $self->{set_nc}->($self);
2912     }
2913    
2914     redo A;
2915     }
2916     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2917     if ($is_space->{$self->{nc}}) {
2918    
2919     ## Stay in the state
2920    
2921     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2922     $self->{line_prev} = $self->{line};
2923     $self->{column_prev} = $self->{column};
2924     $self->{column}++;
2925     $self->{nc}
2926     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2927     } else {
2928     $self->{set_nc}->($self);
2929     }
2930    
2931     redo A;
2932     } elsif ($self->{nc} == 0x003E) { # >
2933    
2934     $self->{state} = DATA_STATE;
2935 wakaba 1.5 $self->{s_kwd} = '';
2936 wakaba 1.1
2937     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2938     $self->{line_prev} = $self->{line};
2939     $self->{column_prev} = $self->{column};
2940     $self->{column}++;
2941     $self->{nc}
2942     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2943     } else {
2944     $self->{set_nc}->($self);
2945     }
2946    
2947    
2948     return ($self->{ct}); # DOCTYPE
2949    
2950     redo A;
2951     } elsif ($self->{nc} == -1) {
2952    
2953     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2954     $self->{state} = DATA_STATE;
2955 wakaba 1.5 $self->{s_kwd} = '';
2956 wakaba 1.1 ## reconsume
2957    
2958     $self->{ct}->{quirks} = 1;
2959     return ($self->{ct}); # DOCTYPE
2960    
2961     redo A;
2962     } elsif ($self->{nc} == 0x0050 or # P
2963     $self->{nc} == 0x0070) { # p
2964     $self->{state} = PUBLIC_STATE;
2965     $self->{s_kwd} = chr $self->{nc};
2966    
2967     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2968     $self->{line_prev} = $self->{line};
2969     $self->{column_prev} = $self->{column};
2970     $self->{column}++;
2971     $self->{nc}
2972     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2973     } else {
2974     $self->{set_nc}->($self);
2975     }
2976    
2977     redo A;
2978     } elsif ($self->{nc} == 0x0053 or # S
2979     $self->{nc} == 0x0073) { # s
2980     $self->{state} = SYSTEM_STATE;
2981     $self->{s_kwd} = chr $self->{nc};
2982    
2983     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2984     $self->{line_prev} = $self->{line};
2985     $self->{column_prev} = $self->{column};
2986     $self->{column}++;
2987     $self->{nc}
2988     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2989     } else {
2990     $self->{set_nc}->($self);
2991     }
2992    
2993     redo A;
2994     } else {
2995    
2996     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2997     $self->{ct}->{quirks} = 1;
2998    
2999     $self->{state} = BOGUS_DOCTYPE_STATE;
3000    
3001     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3002     $self->{line_prev} = $self->{line};
3003     $self->{column_prev} = $self->{column};
3004     $self->{column}++;
3005     $self->{nc}
3006     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3007     } else {
3008     $self->{set_nc}->($self);
3009     }
3010    
3011     redo A;
3012     }
3013     } elsif ($self->{state} == PUBLIC_STATE) {
3014     ## ASCII case-insensitive
3015     if ($self->{nc} == [
3016     undef,
3017     0x0055, # U
3018     0x0042, # B
3019     0x004C, # L
3020     0x0049, # I
3021     ]->[length $self->{s_kwd}] or
3022     $self->{nc} == [
3023     undef,
3024     0x0075, # u
3025     0x0062, # b
3026     0x006C, # l
3027     0x0069, # i
3028     ]->[length $self->{s_kwd}]) {
3029    
3030     ## Stay in the state.
3031     $self->{s_kwd} .= chr $self->{nc};
3032    
3033     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3034     $self->{line_prev} = $self->{line};
3035     $self->{column_prev} = $self->{column};
3036     $self->{column}++;
3037     $self->{nc}
3038     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3039     } else {
3040     $self->{set_nc}->($self);
3041     }
3042    
3043     redo A;
3044     } elsif ((length $self->{s_kwd}) == 5 and
3045     ($self->{nc} == 0x0043 or # C
3046     $self->{nc} == 0x0063)) { # c
3047    
3048     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3049    
3050     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3051     $self->{line_prev} = $self->{line};
3052     $self->{column_prev} = $self->{column};
3053     $self->{column}++;
3054     $self->{nc}
3055     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3056     } else {
3057     $self->{set_nc}->($self);
3058     }
3059    
3060     redo A;
3061     } else {
3062    
3063     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3064     line => $self->{line_prev},
3065     column => $self->{column_prev} + 1 - length $self->{s_kwd});
3066     $self->{ct}->{quirks} = 1;
3067    
3068     $self->{state} = BOGUS_DOCTYPE_STATE;
3069     ## Reconsume.
3070     redo A;
3071     }
3072     } elsif ($self->{state} == SYSTEM_STATE) {
3073     ## ASCII case-insensitive
3074     if ($self->{nc} == [
3075     undef,
3076     0x0059, # Y
3077     0x0053, # S
3078     0x0054, # T
3079     0x0045, # E
3080     ]->[length $self->{s_kwd}] or
3081     $self->{nc} == [
3082     undef,
3083     0x0079, # y
3084     0x0073, # s
3085     0x0074, # t
3086     0x0065, # e
3087     ]->[length $self->{s_kwd}]) {
3088    
3089     ## Stay in the state.
3090     $self->{s_kwd} .= chr $self->{nc};
3091    
3092     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3093     $self->{line_prev} = $self->{line};
3094     $self->{column_prev} = $self->{column};
3095     $self->{column}++;
3096     $self->{nc}
3097     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3098     } else {
3099     $self->{set_nc}->($self);
3100     }
3101    
3102     redo A;
3103     } elsif ((length $self->{s_kwd}) == 5 and
3104     ($self->{nc} == 0x004D or # M
3105     $self->{nc} == 0x006D)) { # m
3106    
3107     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3108    
3109     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3110     $self->{line_prev} = $self->{line};
3111     $self->{column_prev} = $self->{column};
3112     $self->{column}++;
3113     $self->{nc}
3114     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3115     } else {
3116     $self->{set_nc}->($self);
3117     }
3118    
3119     redo A;
3120     } else {
3121    
3122     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3123     line => $self->{line_prev},
3124     column => $self->{column_prev} + 1 - length $self->{s_kwd});
3125     $self->{ct}->{quirks} = 1;
3126    
3127     $self->{state} = BOGUS_DOCTYPE_STATE;
3128     ## Reconsume.
3129     redo A;
3130     }
3131     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3132     if ($is_space->{$self->{nc}}) {
3133    
3134     ## Stay in the state
3135    
3136     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137     $self->{line_prev} = $self->{line};
3138     $self->{column_prev} = $self->{column};
3139     $self->{column}++;
3140     $self->{nc}
3141     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142     } else {
3143     $self->{set_nc}->($self);
3144     }
3145    
3146     redo A;
3147     } elsif ($self->{nc} eq 0x0022) { # "
3148    
3149     $self->{ct}->{pubid} = ''; # DOCTYPE
3150     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3151    
3152     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3153     $self->{line_prev} = $self->{line};
3154     $self->{column_prev} = $self->{column};
3155     $self->{column}++;
3156     $self->{nc}
3157     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3158     } else {
3159     $self->{set_nc}->($self);
3160     }
3161    
3162     redo A;
3163     } elsif ($self->{nc} eq 0x0027) { # '
3164    
3165     $self->{ct}->{pubid} = ''; # DOCTYPE
3166     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3167    
3168     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3169     $self->{line_prev} = $self->{line};
3170     $self->{column_prev} = $self->{column};
3171     $self->{column}++;
3172     $self->{nc}
3173     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3174     } else {
3175     $self->{set_nc}->($self);
3176     }
3177    
3178     redo A;
3179     } elsif ($self->{nc} eq 0x003E) { # >
3180    
3181     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3182    
3183     $self->{state} = DATA_STATE;
3184 wakaba 1.5 $self->{s_kwd} = '';
3185 wakaba 1.1
3186     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3187     $self->{line_prev} = $self->{line};
3188     $self->{column_prev} = $self->{column};
3189     $self->{column}++;
3190     $self->{nc}
3191     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3192     } else {
3193     $self->{set_nc}->($self);
3194     }
3195    
3196    
3197     $self->{ct}->{quirks} = 1;
3198     return ($self->{ct}); # DOCTYPE
3199    
3200     redo A;
3201     } elsif ($self->{nc} == -1) {
3202    
3203     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3204    
3205     $self->{state} = DATA_STATE;
3206 wakaba 1.5 $self->{s_kwd} = '';
3207 wakaba 1.1 ## reconsume
3208    
3209     $self->{ct}->{quirks} = 1;
3210     return ($self->{ct}); # DOCTYPE
3211    
3212     redo A;
3213     } else {
3214    
3215     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3216     $self->{ct}->{quirks} = 1;
3217    
3218     $self->{state} = BOGUS_DOCTYPE_STATE;
3219    
3220     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3221     $self->{line_prev} = $self->{line};
3222     $self->{column_prev} = $self->{column};
3223     $self->{column}++;
3224     $self->{nc}
3225     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3226     } else {
3227     $self->{set_nc}->($self);
3228     }
3229    
3230     redo A;
3231     }
3232     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3233     if ($self->{nc} == 0x0022) { # "
3234    
3235     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3236    
3237     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238     $self->{line_prev} = $self->{line};
3239     $self->{column_prev} = $self->{column};
3240     $self->{column}++;
3241     $self->{nc}
3242     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3243     } else {
3244     $self->{set_nc}->($self);
3245     }
3246    
3247     redo A;
3248     } elsif ($self->{nc} == 0x003E) { # >
3249    
3250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3251    
3252     $self->{state} = DATA_STATE;
3253 wakaba 1.5 $self->{s_kwd} = '';
3254 wakaba 1.1
3255     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3256     $self->{line_prev} = $self->{line};
3257     $self->{column_prev} = $self->{column};
3258     $self->{column}++;
3259     $self->{nc}
3260     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3261     } else {
3262     $self->{set_nc}->($self);
3263     }
3264    
3265    
3266     $self->{ct}->{quirks} = 1;
3267     return ($self->{ct}); # DOCTYPE
3268    
3269     redo A;
3270     } elsif ($self->{nc} == -1) {
3271    
3272     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3273    
3274     $self->{state} = DATA_STATE;
3275 wakaba 1.5 $self->{s_kwd} = '';
3276 wakaba 1.1 ## reconsume
3277    
3278     $self->{ct}->{quirks} = 1;
3279     return ($self->{ct}); # DOCTYPE
3280    
3281     redo A;
3282     } else {
3283    
3284     $self->{ct}->{pubid} # DOCTYPE
3285     .= chr $self->{nc};
3286     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3287     length $self->{ct}->{pubid});
3288    
3289     ## Stay in the state
3290    
3291     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3292     $self->{line_prev} = $self->{line};
3293     $self->{column_prev} = $self->{column};
3294     $self->{column}++;
3295     $self->{nc}
3296     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3297     } else {
3298     $self->{set_nc}->($self);
3299     }
3300    
3301     redo A;
3302     }
3303     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3304     if ($self->{nc} == 0x0027) { # '
3305    
3306     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3307    
3308     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3309     $self->{line_prev} = $self->{line};
3310     $self->{column_prev} = $self->{column};
3311     $self->{column}++;
3312     $self->{nc}
3313     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3314     } else {
3315     $self->{set_nc}->($self);
3316     }
3317    
3318     redo A;
3319     } elsif ($self->{nc} == 0x003E) { # >
3320    
3321     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3322    
3323     $self->{state} = DATA_STATE;
3324 wakaba 1.5 $self->{s_kwd} = '';
3325 wakaba 1.1
3326     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3327     $self->{line_prev} = $self->{line};
3328     $self->{column_prev} = $self->{column};
3329     $self->{column}++;
3330     $self->{nc}
3331     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3332     } else {
3333     $self->{set_nc}->($self);
3334     }
3335    
3336    
3337     $self->{ct}->{quirks} = 1;
3338     return ($self->{ct}); # DOCTYPE
3339    
3340     redo A;
3341     } elsif ($self->{nc} == -1) {
3342    
3343     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3344    
3345     $self->{state} = DATA_STATE;
3346 wakaba 1.5 $self->{s_kwd} = '';
3347 wakaba 1.1 ## reconsume
3348    
3349     $self->{ct}->{quirks} = 1;
3350     return ($self->{ct}); # DOCTYPE
3351    
3352     redo A;
3353     } else {
3354    
3355     $self->{ct}->{pubid} # DOCTYPE
3356     .= chr $self->{nc};
3357     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3358     length $self->{ct}->{pubid});
3359    
3360     ## Stay in the state
3361    
3362     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3363     $self->{line_prev} = $self->{line};
3364     $self->{column_prev} = $self->{column};
3365     $self->{column}++;
3366     $self->{nc}
3367     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3368     } else {
3369     $self->{set_nc}->($self);
3370     }
3371    
3372     redo A;
3373     }
3374     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3375     if ($is_space->{$self->{nc}}) {
3376    
3377     ## Stay in the state
3378    
3379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3380     $self->{line_prev} = $self->{line};
3381     $self->{column_prev} = $self->{column};
3382     $self->{column}++;
3383     $self->{nc}
3384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3385     } else {
3386     $self->{set_nc}->($self);
3387     }
3388    
3389     redo A;
3390     } elsif ($self->{nc} == 0x0022) { # "
3391    
3392     $self->{ct}->{sysid} = ''; # DOCTYPE
3393     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3394    
3395     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3396     $self->{line_prev} = $self->{line};
3397     $self->{column_prev} = $self->{column};
3398     $self->{column}++;
3399     $self->{nc}
3400     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3401     } else {
3402     $self->{set_nc}->($self);
3403     }
3404    
3405     redo A;
3406     } elsif ($self->{nc} == 0x0027) { # '
3407    
3408     $self->{ct}->{sysid} = ''; # DOCTYPE
3409     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3410    
3411     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3412     $self->{line_prev} = $self->{line};
3413     $self->{column_prev} = $self->{column};
3414     $self->{column}++;
3415     $self->{nc}
3416     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3417     } else {
3418     $self->{set_nc}->($self);
3419     }
3420    
3421     redo A;
3422     } elsif ($self->{nc} == 0x003E) { # >
3423    
3424     $self->{state} = DATA_STATE;
3425 wakaba 1.5 $self->{s_kwd} = '';
3426 wakaba 1.1
3427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3428     $self->{line_prev} = $self->{line};
3429     $self->{column_prev} = $self->{column};
3430     $self->{column}++;
3431     $self->{nc}
3432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3433     } else {
3434     $self->{set_nc}->($self);
3435     }
3436    
3437    
3438     return ($self->{ct}); # DOCTYPE
3439    
3440     redo A;
3441     } elsif ($self->{nc} == -1) {
3442    
3443     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3444    
3445     $self->{state} = DATA_STATE;
3446 wakaba 1.5 $self->{s_kwd} = '';
3447 wakaba 1.1 ## reconsume
3448    
3449     $self->{ct}->{quirks} = 1;
3450     return ($self->{ct}); # DOCTYPE
3451    
3452     redo A;
3453     } else {
3454    
3455     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3456     $self->{ct}->{quirks} = 1;
3457    
3458     $self->{state} = BOGUS_DOCTYPE_STATE;
3459    
3460     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3461     $self->{line_prev} = $self->{line};
3462     $self->{column_prev} = $self->{column};
3463     $self->{column}++;
3464     $self->{nc}
3465     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3466     } else {
3467     $self->{set_nc}->($self);
3468     }
3469    
3470     redo A;
3471     }
3472     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3473     if ($is_space->{$self->{nc}}) {
3474    
3475     ## Stay in the state
3476    
3477     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3478     $self->{line_prev} = $self->{line};
3479     $self->{column_prev} = $self->{column};
3480     $self->{column}++;
3481     $self->{nc}
3482     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3483     } else {
3484     $self->{set_nc}->($self);
3485     }
3486    
3487     redo A;
3488     } elsif ($self->{nc} == 0x0022) { # "
3489    
3490     $self->{ct}->{sysid} = ''; # DOCTYPE
3491     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3492    
3493     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3494     $self->{line_prev} = $self->{line};
3495     $self->{column_prev} = $self->{column};
3496     $self->{column}++;
3497     $self->{nc}
3498     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3499     } else {
3500     $self->{set_nc}->($self);
3501     }
3502    
3503     redo A;
3504     } elsif ($self->{nc} == 0x0027) { # '
3505    
3506     $self->{ct}->{sysid} = ''; # DOCTYPE
3507     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3508    
3509     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3510     $self->{line_prev} = $self->{line};
3511     $self->{column_prev} = $self->{column};
3512     $self->{column}++;
3513     $self->{nc}
3514     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3515     } else {
3516     $self->{set_nc}->($self);
3517     }
3518    
3519     redo A;
3520     } elsif ($self->{nc} == 0x003E) { # >
3521    
3522     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3523     $self->{state} = DATA_STATE;
3524 wakaba 1.5 $self->{s_kwd} = '';
3525 wakaba 1.1
3526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3527     $self->{line_prev} = $self->{line};
3528     $self->{column_prev} = $self->{column};
3529     $self->{column}++;
3530     $self->{nc}
3531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3532     } else {
3533     $self->{set_nc}->($self);
3534     }
3535    
3536    
3537     $self->{ct}->{quirks} = 1;
3538     return ($self->{ct}); # DOCTYPE
3539    
3540     redo A;
3541     } elsif ($self->{nc} == -1) {
3542    
3543     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3544    
3545     $self->{state} = DATA_STATE;
3546 wakaba 1.5 $self->{s_kwd} = '';
3547 wakaba 1.1 ## reconsume
3548    
3549     $self->{ct}->{quirks} = 1;
3550     return ($self->{ct}); # DOCTYPE
3551    
3552     redo A;
3553     } else {
3554    
3555     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3556     $self->{ct}->{quirks} = 1;
3557    
3558     $self->{state} = BOGUS_DOCTYPE_STATE;
3559    
3560     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3561     $self->{line_prev} = $self->{line};
3562     $self->{column_prev} = $self->{column};
3563     $self->{column}++;
3564     $self->{nc}
3565     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3566     } else {
3567     $self->{set_nc}->($self);
3568     }
3569    
3570     redo A;
3571     }
3572     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3573     if ($self->{nc} == 0x0022) { # "
3574    
3575     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3576    
3577     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3578     $self->{line_prev} = $self->{line};
3579     $self->{column_prev} = $self->{column};
3580     $self->{column}++;
3581     $self->{nc}
3582     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3583     } else {
3584     $self->{set_nc}->($self);
3585     }
3586    
3587     redo A;
3588     } elsif ($self->{nc} == 0x003E) { # >
3589    
3590     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3591    
3592     $self->{state} = DATA_STATE;
3593 wakaba 1.5 $self->{s_kwd} = '';
3594 wakaba 1.1
3595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3596     $self->{line_prev} = $self->{line};
3597     $self->{column_prev} = $self->{column};
3598     $self->{column}++;
3599     $self->{nc}
3600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3601     } else {
3602     $self->{set_nc}->($self);
3603     }
3604    
3605    
3606     $self->{ct}->{quirks} = 1;
3607     return ($self->{ct}); # DOCTYPE
3608    
3609     redo A;
3610     } elsif ($self->{nc} == -1) {
3611    
3612     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3613    
3614     $self->{state} = DATA_STATE;
3615 wakaba 1.5 $self->{s_kwd} = '';
3616 wakaba 1.1 ## reconsume
3617    
3618     $self->{ct}->{quirks} = 1;
3619     return ($self->{ct}); # DOCTYPE
3620    
3621     redo A;
3622     } else {
3623    
3624     $self->{ct}->{sysid} # DOCTYPE
3625     .= chr $self->{nc};
3626     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3627     length $self->{ct}->{sysid});
3628    
3629     ## Stay in the state
3630    
3631     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3632     $self->{line_prev} = $self->{line};
3633     $self->{column_prev} = $self->{column};
3634     $self->{column}++;
3635     $self->{nc}
3636     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3637     } else {
3638     $self->{set_nc}->($self);
3639     }
3640    
3641     redo A;
3642     }
3643     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3644     if ($self->{nc} == 0x0027) { # '
3645    
3646     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3647    
3648     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3649     $self->{line_prev} = $self->{line};
3650     $self->{column_prev} = $self->{column};
3651     $self->{column}++;
3652     $self->{nc}
3653     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3654     } else {
3655     $self->{set_nc}->($self);
3656     }
3657    
3658     redo A;
3659     } elsif ($self->{nc} == 0x003E) { # >
3660    
3661     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3662    
3663     $self->{state} = DATA_STATE;
3664 wakaba 1.5 $self->{s_kwd} = '';
3665 wakaba 1.1
3666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3667     $self->{line_prev} = $self->{line};
3668     $self->{column_prev} = $self->{column};
3669     $self->{column}++;
3670     $self->{nc}
3671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3672     } else {
3673     $self->{set_nc}->($self);
3674     }
3675    
3676    
3677     $self->{ct}->{quirks} = 1;
3678     return ($self->{ct}); # DOCTYPE
3679    
3680     redo A;
3681     } elsif ($self->{nc} == -1) {
3682    
3683     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3684    
3685     $self->{state} = DATA_STATE;
3686 wakaba 1.5 $self->{s_kwd} = '';
3687 wakaba 1.1 ## reconsume
3688    
3689     $self->{ct}->{quirks} = 1;
3690     return ($self->{ct}); # DOCTYPE
3691    
3692     redo A;
3693     } else {
3694    
3695     $self->{ct}->{sysid} # DOCTYPE
3696     .= chr $self->{nc};
3697     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3698     length $self->{ct}->{sysid});
3699    
3700     ## Stay in the state
3701    
3702     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3703     $self->{line_prev} = $self->{line};
3704     $self->{column_prev} = $self->{column};
3705     $self->{column}++;
3706     $self->{nc}
3707     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3708     } else {
3709     $self->{set_nc}->($self);
3710     }
3711    
3712     redo A;
3713     }
3714     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3715     if ($is_space->{$self->{nc}}) {
3716    
3717     ## Stay in the state
3718    
3719     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3720     $self->{line_prev} = $self->{line};
3721     $self->{column_prev} = $self->{column};
3722     $self->{column}++;
3723     $self->{nc}
3724     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3725     } else {
3726     $self->{set_nc}->($self);
3727     }
3728    
3729     redo A;
3730     } elsif ($self->{nc} == 0x003E) { # >
3731    
3732     $self->{state} = DATA_STATE;
3733 wakaba 1.5 $self->{s_kwd} = '';
3734 wakaba 1.1
3735     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3736     $self->{line_prev} = $self->{line};
3737     $self->{column_prev} = $self->{column};
3738     $self->{column}++;
3739     $self->{nc}
3740     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3741     } else {
3742     $self->{set_nc}->($self);
3743     }
3744    
3745    
3746     return ($self->{ct}); # DOCTYPE
3747    
3748     redo A;
3749     } elsif ($self->{nc} == -1) {
3750    
3751     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3752     $self->{state} = DATA_STATE;
3753 wakaba 1.5 $self->{s_kwd} = '';
3754 wakaba 1.1 ## reconsume
3755    
3756     $self->{ct}->{quirks} = 1;
3757     return ($self->{ct}); # DOCTYPE
3758    
3759     redo A;
3760     } else {
3761    
3762     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3763     #$self->{ct}->{quirks} = 1;
3764    
3765     $self->{state} = BOGUS_DOCTYPE_STATE;
3766    
3767     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3768     $self->{line_prev} = $self->{line};
3769     $self->{column_prev} = $self->{column};
3770     $self->{column}++;
3771     $self->{nc}
3772     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3773     } else {
3774     $self->{set_nc}->($self);
3775     }
3776    
3777     redo A;
3778     }
3779     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3780     if ($self->{nc} == 0x003E) { # >
3781    
3782     $self->{state} = DATA_STATE;
3783 wakaba 1.5 $self->{s_kwd} = '';
3784 wakaba 1.1
3785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3786     $self->{line_prev} = $self->{line};
3787     $self->{column_prev} = $self->{column};
3788     $self->{column}++;
3789     $self->{nc}
3790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3791     } else {
3792     $self->{set_nc}->($self);
3793     }
3794    
3795    
3796     return ($self->{ct}); # DOCTYPE
3797    
3798     redo A;
3799     } elsif ($self->{nc} == -1) {
3800    
3801     $self->{state} = DATA_STATE;
3802 wakaba 1.5 $self->{s_kwd} = '';
3803 wakaba 1.1 ## reconsume
3804    
3805     return ($self->{ct}); # DOCTYPE
3806    
3807     redo A;
3808     } else {
3809    
3810     my $s = '';
3811     $self->{read_until}->($s, q[>], 0);
3812    
3813     ## Stay in the state
3814    
3815     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3816     $self->{line_prev} = $self->{line};
3817     $self->{column_prev} = $self->{column};
3818     $self->{column}++;
3819     $self->{nc}
3820     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3821     } else {
3822     $self->{set_nc}->($self);
3823     }
3824    
3825     redo A;
3826     }
3827     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3828     ## NOTE: "CDATA section state" in the state is jointly implemented
3829     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3830     ## and |CDATA_SECTION_MSE2_STATE|.
3831 wakaba 1.10
3832     ## XML5: "CDATA state".
3833 wakaba 1.1
3834     if ($self->{nc} == 0x005D) { # ]
3835    
3836     $self->{state} = CDATA_SECTION_MSE1_STATE;
3837    
3838     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3839     $self->{line_prev} = $self->{line};
3840     $self->{column_prev} = $self->{column};
3841     $self->{column}++;
3842     $self->{nc}
3843     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3844     } else {
3845     $self->{set_nc}->($self);
3846     }
3847    
3848     redo A;
3849     } elsif ($self->{nc} == -1) {
3850 wakaba 1.6 if ($self->{is_xml}) {
3851 wakaba 1.8
3852 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3853 wakaba 1.8 } else {
3854    
3855 wakaba 1.6 }
3856    
3857 wakaba 1.1 $self->{state} = DATA_STATE;
3858 wakaba 1.5 $self->{s_kwd} = '';
3859 wakaba 1.10 ## Reconsume.
3860 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3861    
3862     return ($self->{ct}); # character
3863     } else {
3864    
3865     ## No token to emit. $self->{ct} is discarded.
3866     }
3867     redo A;
3868     } else {
3869    
3870     $self->{ct}->{data} .= chr $self->{nc};
3871     $self->{read_until}->($self->{ct}->{data},
3872     q<]>,
3873     length $self->{ct}->{data});
3874    
3875     ## Stay in the state.
3876    
3877     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3878     $self->{line_prev} = $self->{line};
3879     $self->{column_prev} = $self->{column};
3880     $self->{column}++;
3881     $self->{nc}
3882     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3883     } else {
3884     $self->{set_nc}->($self);
3885     }
3886    
3887     redo A;
3888     }
3889    
3890     ## ISSUE: "text tokens" in spec.
3891     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3892 wakaba 1.10 ## XML5: "CDATA bracket state".
3893    
3894 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3895    
3896     $self->{state} = CDATA_SECTION_MSE2_STATE;
3897    
3898     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3899     $self->{line_prev} = $self->{line};
3900     $self->{column_prev} = $self->{column};
3901     $self->{column}++;
3902     $self->{nc}
3903     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3904     } else {
3905     $self->{set_nc}->($self);
3906     }
3907    
3908     redo A;
3909     } else {
3910    
3911 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3912 wakaba 1.1 $self->{ct}->{data} .= ']';
3913 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3914 wakaba 1.1 ## Reconsume.
3915     redo A;
3916     }
3917     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3918 wakaba 1.10 ## XML5: "CDATA end state".
3919    
3920 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3921     $self->{state} = DATA_STATE;
3922 wakaba 1.5 $self->{s_kwd} = '';
3923 wakaba 1.1
3924     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3925     $self->{line_prev} = $self->{line};
3926     $self->{column_prev} = $self->{column};
3927     $self->{column}++;
3928     $self->{nc}
3929     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3930     } else {
3931     $self->{set_nc}->($self);
3932     }
3933    
3934     if (length $self->{ct}->{data}) { # character
3935    
3936     return ($self->{ct}); # character
3937     } else {
3938    
3939     ## No token to emit. $self->{ct} is discarded.
3940     }
3941     redo A;
3942     } elsif ($self->{nc} == 0x005D) { # ]
3943     # character
3944     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3945     ## Stay in the state.
3946    
3947     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3948     $self->{line_prev} = $self->{line};
3949     $self->{column_prev} = $self->{column};
3950     $self->{column}++;
3951     $self->{nc}
3952     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3953     } else {
3954     $self->{set_nc}->($self);
3955     }
3956    
3957     redo A;
3958     } else {
3959    
3960     $self->{ct}->{data} .= ']]'; # character
3961     $self->{state} = CDATA_SECTION_STATE;
3962 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3963 wakaba 1.1 redo A;
3964     }
3965     } elsif ($self->{state} == ENTITY_STATE) {
3966     if ($is_space->{$self->{nc}} or
3967     {
3968     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3969     $self->{entity_add} => 1,
3970     }->{$self->{nc}}) {
3971    
3972     ## Don't consume
3973     ## No error
3974     ## Return nothing.
3975     #
3976     } elsif ($self->{nc} == 0x0023) { # #
3977    
3978     $self->{state} = ENTITY_HASH_STATE;
3979     $self->{s_kwd} = '#';
3980    
3981     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3982     $self->{line_prev} = $self->{line};
3983     $self->{column_prev} = $self->{column};
3984     $self->{column}++;
3985     $self->{nc}
3986     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3987     } else {
3988     $self->{set_nc}->($self);
3989     }
3990    
3991     redo A;
3992     } elsif ((0x0041 <= $self->{nc} and
3993     $self->{nc} <= 0x005A) or # A..Z
3994     (0x0061 <= $self->{nc} and
3995     $self->{nc} <= 0x007A)) { # a..z
3996    
3997     require Whatpm::_NamedEntityList;
3998     $self->{state} = ENTITY_NAME_STATE;
3999     $self->{s_kwd} = chr $self->{nc};
4000     $self->{entity__value} = $self->{s_kwd};
4001     $self->{entity__match} = 0;
4002    
4003     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4004     $self->{line_prev} = $self->{line};
4005     $self->{column_prev} = $self->{column};
4006     $self->{column}++;
4007     $self->{nc}
4008     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4009     } else {
4010     $self->{set_nc}->($self);
4011     }
4012    
4013     redo A;
4014     } else {
4015    
4016     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4017     ## Return nothing.
4018     #
4019     }
4020    
4021     ## NOTE: No character is consumed by the "consume a character
4022     ## reference" algorithm. In other word, there is an "&" character
4023     ## that does not introduce a character reference, which would be
4024     ## appended to the parent element or the attribute value in later
4025     ## process of the tokenizer.
4026    
4027     if ($self->{prev_state} == DATA_STATE) {
4028    
4029     $self->{state} = $self->{prev_state};
4030 wakaba 1.5 $self->{s_kwd} = '';
4031 wakaba 1.1 ## Reconsume.
4032     return ({type => CHARACTER_TOKEN, data => '&',
4033     line => $self->{line_prev},
4034     column => $self->{column_prev},
4035     });
4036     redo A;
4037     } else {
4038    
4039     $self->{ca}->{value} .= '&';
4040     $self->{state} = $self->{prev_state};
4041 wakaba 1.5 $self->{s_kwd} = '';
4042 wakaba 1.1 ## Reconsume.
4043     redo A;
4044     }
4045     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4046     if ($self->{nc} == 0x0078 or # x
4047     $self->{nc} == 0x0058) { # X
4048    
4049     $self->{state} = HEXREF_X_STATE;
4050     $self->{s_kwd} .= chr $self->{nc};
4051    
4052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4053     $self->{line_prev} = $self->{line};
4054     $self->{column_prev} = $self->{column};
4055     $self->{column}++;
4056     $self->{nc}
4057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4058     } else {
4059     $self->{set_nc}->($self);
4060     }
4061    
4062     redo A;
4063     } elsif (0x0030 <= $self->{nc} and
4064     $self->{nc} <= 0x0039) { # 0..9
4065    
4066     $self->{state} = NCR_NUM_STATE;
4067     $self->{s_kwd} = $self->{nc} - 0x0030;
4068    
4069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4070     $self->{line_prev} = $self->{line};
4071     $self->{column_prev} = $self->{column};
4072     $self->{column}++;
4073     $self->{nc}
4074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4075     } else {
4076     $self->{set_nc}->($self);
4077     }
4078    
4079     redo A;
4080     } else {
4081     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4082     line => $self->{line_prev},
4083     column => $self->{column_prev} - 1);
4084    
4085     ## NOTE: According to the spec algorithm, nothing is returned,
4086     ## and then "&#" is appended to the parent element or the attribute
4087     ## value in the later processing.
4088    
4089     if ($self->{prev_state} == DATA_STATE) {
4090    
4091     $self->{state} = $self->{prev_state};
4092 wakaba 1.5 $self->{s_kwd} = '';
4093 wakaba 1.1 ## Reconsume.
4094     return ({type => CHARACTER_TOKEN,
4095     data => '&#',
4096     line => $self->{line_prev},
4097     column => $self->{column_prev} - 1,
4098     });
4099     redo A;
4100     } else {
4101    
4102     $self->{ca}->{value} .= '&#';
4103     $self->{state} = $self->{prev_state};
4104 wakaba 1.5 $self->{s_kwd} = '';
4105 wakaba 1.1 ## Reconsume.
4106     redo A;
4107     }
4108     }
4109     } elsif ($self->{state} == NCR_NUM_STATE) {
4110     if (0x0030 <= $self->{nc} and
4111     $self->{nc} <= 0x0039) { # 0..9
4112    
4113     $self->{s_kwd} *= 10;
4114     $self->{s_kwd} += $self->{nc} - 0x0030;
4115    
4116     ## Stay in the state.
4117    
4118     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4119     $self->{line_prev} = $self->{line};
4120     $self->{column_prev} = $self->{column};
4121     $self->{column}++;
4122     $self->{nc}
4123     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4124     } else {
4125     $self->{set_nc}->($self);
4126     }
4127    
4128     redo A;
4129     } elsif ($self->{nc} == 0x003B) { # ;
4130    
4131    
4132     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4133     $self->{line_prev} = $self->{line};
4134     $self->{column_prev} = $self->{column};
4135     $self->{column}++;
4136     $self->{nc}
4137     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4138     } else {
4139     $self->{set_nc}->($self);
4140     }
4141    
4142     #
4143     } else {
4144    
4145     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4146     ## Reconsume.
4147     #
4148     }
4149    
4150     my $code = $self->{s_kwd};
4151     my $l = $self->{line_prev};
4152     my $c = $self->{column_prev};
4153     if ($charref_map->{$code}) {
4154    
4155     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4156     text => (sprintf 'U+%04X', $code),
4157     line => $l, column => $c);
4158     $code = $charref_map->{$code};
4159     } elsif ($code > 0x10FFFF) {
4160    
4161     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4162     text => (sprintf 'U-%08X', $code),
4163     line => $l, column => $c);
4164     $code = 0xFFFD;
4165     }
4166    
4167     if ($self->{prev_state} == DATA_STATE) {
4168    
4169     $self->{state} = $self->{prev_state};
4170 wakaba 1.5 $self->{s_kwd} = '';
4171 wakaba 1.1 ## Reconsume.
4172     return ({type => CHARACTER_TOKEN, data => chr $code,
4173 wakaba 1.7 has_reference => 1,
4174 wakaba 1.1 line => $l, column => $c,
4175     });
4176     redo A;
4177     } else {
4178    
4179     $self->{ca}->{value} .= chr $code;
4180     $self->{ca}->{has_reference} = 1;
4181     $self->{state} = $self->{prev_state};
4182 wakaba 1.5 $self->{s_kwd} = '';
4183 wakaba 1.1 ## Reconsume.
4184     redo A;
4185     }
4186     } elsif ($self->{state} == HEXREF_X_STATE) {
4187     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4188     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4189     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4190     # 0..9, A..F, a..f
4191    
4192     $self->{state} = HEXREF_HEX_STATE;
4193     $self->{s_kwd} = 0;
4194     ## Reconsume.
4195     redo A;
4196     } else {
4197     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4198     line => $self->{line_prev},
4199     column => $self->{column_prev} - 2);
4200    
4201     ## NOTE: According to the spec algorithm, nothing is returned,
4202     ## and then "&#" followed by "X" or "x" is appended to the parent
4203     ## element or the attribute value in the later processing.
4204    
4205     if ($self->{prev_state} == DATA_STATE) {
4206    
4207     $self->{state} = $self->{prev_state};
4208 wakaba 1.5 $self->{s_kwd} = '';
4209 wakaba 1.1 ## Reconsume.
4210     return ({type => CHARACTER_TOKEN,
4211     data => '&' . $self->{s_kwd},
4212     line => $self->{line_prev},
4213     column => $self->{column_prev} - length $self->{s_kwd},
4214     });
4215     redo A;
4216     } else {
4217    
4218     $self->{ca}->{value} .= '&' . $self->{s_kwd};
4219     $self->{state} = $self->{prev_state};
4220 wakaba 1.5 $self->{s_kwd} = '';
4221 wakaba 1.1 ## Reconsume.
4222     redo A;
4223     }
4224     }
4225     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4226     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4227     # 0..9
4228    
4229     $self->{s_kwd} *= 0x10;
4230     $self->{s_kwd} += $self->{nc} - 0x0030;
4231     ## Stay in the state.
4232    
4233     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4234     $self->{line_prev} = $self->{line};
4235     $self->{column_prev} = $self->{column};
4236     $self->{column}++;
4237     $self->{nc}
4238     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4239     } else {
4240     $self->{set_nc}->($self);
4241     }
4242    
4243     redo A;
4244     } elsif (0x0061 <= $self->{nc} and
4245     $self->{nc} <= 0x0066) { # a..f
4246    
4247     $self->{s_kwd} *= 0x10;
4248     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4249     ## Stay in the state.
4250    
4251     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4252     $self->{line_prev} = $self->{line};
4253     $self->{column_prev} = $self->{column};
4254     $self->{column}++;
4255     $self->{nc}
4256     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4257     } else {
4258     $self->{set_nc}->($self);
4259     }
4260    
4261     redo A;
4262     } elsif (0x0041 <= $self->{nc} and
4263     $self->{nc} <= 0x0046) { # A..F
4264    
4265     $self->{s_kwd} *= 0x10;
4266     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4267     ## Stay in the state.
4268    
4269     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4270     $self->{line_prev} = $self->{line};
4271     $self->{column_prev} = $self->{column};
4272     $self->{column}++;
4273     $self->{nc}
4274     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4275     } else {
4276     $self->{set_nc}->($self);
4277     }
4278    
4279     redo A;
4280     } elsif ($self->{nc} == 0x003B) { # ;
4281    
4282    
4283     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4284     $self->{line_prev} = $self->{line};
4285     $self->{column_prev} = $self->{column};
4286     $self->{column}++;
4287     $self->{nc}
4288     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4289     } else {
4290     $self->{set_nc}->($self);
4291     }
4292    
4293     #
4294     } else {
4295    
4296     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4297     line => $self->{line},
4298     column => $self->{column});
4299     ## Reconsume.
4300     #
4301     }
4302    
4303     my $code = $self->{s_kwd};
4304     my $l = $self->{line_prev};
4305     my $c = $self->{column_prev};
4306     if ($charref_map->{$code}) {
4307    
4308     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4309     text => (sprintf 'U+%04X', $code),
4310     line => $l, column => $c);
4311     $code = $charref_map->{$code};
4312     } elsif ($code > 0x10FFFF) {
4313    
4314     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4315     text => (sprintf 'U-%08X', $code),
4316     line => $l, column => $c);
4317     $code = 0xFFFD;
4318     }
4319    
4320     if ($self->{prev_state} == DATA_STATE) {
4321    
4322     $self->{state} = $self->{prev_state};
4323 wakaba 1.5 $self->{s_kwd} = '';
4324 wakaba 1.1 ## Reconsume.
4325     return ({type => CHARACTER_TOKEN, data => chr $code,
4326 wakaba 1.7 has_reference => 1,
4327 wakaba 1.1 line => $l, column => $c,
4328     });
4329     redo A;
4330     } else {
4331    
4332     $self->{ca}->{value} .= chr $code;
4333     $self->{ca}->{has_reference} = 1;
4334     $self->{state} = $self->{prev_state};
4335 wakaba 1.5 $self->{s_kwd} = '';
4336 wakaba 1.1 ## Reconsume.
4337     redo A;
4338     }
4339     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4340     if (length $self->{s_kwd} < 30 and
4341     ## NOTE: Some number greater than the maximum length of entity name
4342     ((0x0041 <= $self->{nc} and # a
4343     $self->{nc} <= 0x005A) or # x
4344     (0x0061 <= $self->{nc} and # a
4345     $self->{nc} <= 0x007A) or # z
4346     (0x0030 <= $self->{nc} and # 0
4347     $self->{nc} <= 0x0039) or # 9
4348     $self->{nc} == 0x003B)) { # ;
4349     our $EntityChar;
4350     $self->{s_kwd} .= chr $self->{nc};
4351     if (defined $EntityChar->{$self->{s_kwd}}) {
4352     if ($self->{nc} == 0x003B) { # ;
4353    
4354     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4355     $self->{entity__match} = 1;
4356    
4357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4358     $self->{line_prev} = $self->{line};
4359     $self->{column_prev} = $self->{column};
4360     $self->{column}++;
4361     $self->{nc}
4362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4363     } else {
4364     $self->{set_nc}->($self);
4365     }
4366    
4367     #
4368     } else {
4369    
4370     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4371     $self->{entity__match} = -1;
4372     ## Stay in the state.
4373    
4374     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4375     $self->{line_prev} = $self->{line};
4376     $self->{column_prev} = $self->{column};
4377     $self->{column}++;
4378     $self->{nc}
4379     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4380     } else {
4381     $self->{set_nc}->($self);
4382     }
4383    
4384     redo A;
4385     }
4386     } else {
4387    
4388     $self->{entity__value} .= chr $self->{nc};
4389     $self->{entity__match} *= 2;
4390     ## Stay in the state.
4391    
4392     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4393     $self->{line_prev} = $self->{line};
4394     $self->{column_prev} = $self->{column};
4395     $self->{column}++;
4396     $self->{nc}
4397     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4398     } else {
4399     $self->{set_nc}->($self);
4400     }
4401    
4402     redo A;
4403     }
4404     }
4405    
4406     my $data;
4407     my $has_ref;
4408     if ($self->{entity__match} > 0) {
4409    
4410     $data = $self->{entity__value};
4411     $has_ref = 1;
4412     #
4413     } elsif ($self->{entity__match} < 0) {
4414     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4415     if ($self->{prev_state} != DATA_STATE and # in attribute
4416     $self->{entity__match} < -1) {
4417    
4418     $data = '&' . $self->{s_kwd};
4419     #
4420     } else {
4421    
4422     $data = $self->{entity__value};
4423     $has_ref = 1;
4424     #
4425     }
4426     } else {
4427    
4428     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4429     line => $self->{line_prev},
4430     column => $self->{column_prev} - length $self->{s_kwd});
4431     $data = '&' . $self->{s_kwd};
4432     #
4433     }
4434    
4435     ## NOTE: In these cases, when a character reference is found,
4436     ## it is consumed and a character token is returned, or, otherwise,
4437     ## nothing is consumed and returned, according to the spec algorithm.
4438     ## In this implementation, anything that has been examined by the
4439     ## tokenizer is appended to the parent element or the attribute value
4440     ## as string, either literal string when no character reference or
4441     ## entity-replaced string otherwise, in this stage, since any characters
4442     ## that would not be consumed are appended in the data state or in an
4443     ## appropriate attribute value state anyway.
4444    
4445     if ($self->{prev_state} == DATA_STATE) {
4446    
4447     $self->{state} = $self->{prev_state};
4448 wakaba 1.5 $self->{s_kwd} = '';
4449 wakaba 1.1 ## Reconsume.
4450     return ({type => CHARACTER_TOKEN,
4451     data => $data,
4452 wakaba 1.7 has_reference => $has_ref,
4453 wakaba 1.1 line => $self->{line_prev},
4454     column => $self->{column_prev} + 1 - length $self->{s_kwd},
4455     });
4456     redo A;
4457     } else {
4458    
4459     $self->{ca}->{value} .= $data;
4460     $self->{ca}->{has_reference} = 1 if $has_ref;
4461     $self->{state} = $self->{prev_state};
4462 wakaba 1.5 $self->{s_kwd} = '';
4463 wakaba 1.1 ## Reconsume.
4464     redo A;
4465     }
4466 wakaba 1.8
4467     ## XML-only states
4468    
4469     } elsif ($self->{state} == PI_STATE) {
4470     if ($is_space->{$self->{nc}} or
4471     $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4472     $self->{nc} == -1) {
4473     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4474     line => $self->{line_prev},
4475     column => $self->{column_prev}
4476     - 1 * ($self->{nc} != -1));
4477     $self->{state} = BOGUS_COMMENT_STATE;
4478     ## Reconsume.
4479     $self->{ct} = {type => COMMENT_TOKEN,
4480     data => '?',
4481     line => $self->{line_prev},
4482     column => $self->{column_prev}
4483     - 1 * ($self->{nc} != -1),
4484     };
4485     redo A;
4486     } else {
4487     $self->{ct} = {type => PI_TOKEN,
4488     target => chr $self->{nc},
4489     data => '',
4490     line => $self->{line_prev},
4491     column => $self->{column_prev} - 1,
4492     };
4493     $self->{state} = PI_TARGET_STATE;
4494    
4495     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4496     $self->{line_prev} = $self->{line};
4497     $self->{column_prev} = $self->{column};
4498     $self->{column}++;
4499     $self->{nc}
4500     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4501     } else {
4502     $self->{set_nc}->($self);
4503     }
4504    
4505     redo A;
4506     }
4507     } elsif ($self->{state} == PI_TARGET_STATE) {
4508     if ($is_space->{$self->{nc}}) {
4509     $self->{state} = PI_TARGET_AFTER_STATE;
4510    
4511     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4512     $self->{line_prev} = $self->{line};
4513     $self->{column_prev} = $self->{column};
4514     $self->{column}++;
4515     $self->{nc}
4516     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4517     } else {
4518     $self->{set_nc}->($self);
4519     }
4520    
4521     redo A;
4522     } elsif ($self->{nc} == -1) {
4523     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4524     $self->{state} = DATA_STATE;
4525     $self->{s_kwd} = '';
4526     ## Reconsume.
4527     return ($self->{ct}); # pi
4528     redo A;
4529     } elsif ($self->{nc} == 0x003F) { # ?
4530     $self->{state} = PI_AFTER_STATE;
4531    
4532     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4533     $self->{line_prev} = $self->{line};
4534     $self->{column_prev} = $self->{column};
4535     $self->{column}++;
4536     $self->{nc}
4537     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4538     } else {
4539     $self->{set_nc}->($self);
4540     }
4541    
4542     redo A;
4543     } else {
4544     ## XML5: typo ("tag name" -> "target")
4545     $self->{ct}->{target} .= chr $self->{nc}; # pi
4546    
4547     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4548     $self->{line_prev} = $self->{line};
4549     $self->{column_prev} = $self->{column};
4550     $self->{column}++;
4551     $self->{nc}
4552     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4553     } else {
4554     $self->{set_nc}->($self);
4555     }
4556    
4557     redo A;
4558     }
4559     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4560     if ($is_space->{$self->{nc}}) {
4561     ## Stay in the state.
4562    
4563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4564     $self->{line_prev} = $self->{line};
4565     $self->{column_prev} = $self->{column};
4566     $self->{column}++;
4567     $self->{nc}
4568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4569     } else {
4570     $self->{set_nc}->($self);
4571     }
4572    
4573     redo A;
4574     } else {
4575     $self->{state} = PI_DATA_STATE;
4576     ## Reprocess.
4577     redo A;
4578     }
4579     } elsif ($self->{state} == PI_DATA_STATE) {
4580     if ($self->{nc} == 0x003F) { # ?
4581     $self->{state} = PI_DATA_AFTER_STATE;
4582    
4583     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4584     $self->{line_prev} = $self->{line};
4585     $self->{column_prev} = $self->{column};
4586     $self->{column}++;
4587     $self->{nc}
4588     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4589     } else {
4590     $self->{set_nc}->($self);
4591     }
4592    
4593     redo A;
4594     } elsif ($self->{nc} == -1) {
4595     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4596     $self->{state} = DATA_STATE;
4597     $self->{s_kwd} = '';
4598     ## Reprocess.
4599     return ($self->{ct}); # pi
4600     redo A;
4601     } else {
4602     $self->{ct}->{data} .= chr $self->{nc}; # pi
4603     $self->{read_until}->($self->{ct}->{data}, q[?],
4604     length $self->{ct}->{data});
4605     ## Stay in the state.
4606    
4607     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4608     $self->{line_prev} = $self->{line};
4609     $self->{column_prev} = $self->{column};
4610     $self->{column}++;
4611     $self->{nc}
4612     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4613     } else {
4614     $self->{set_nc}->($self);
4615     }
4616    
4617     ## Reprocess.
4618     redo A;
4619     }
4620     } elsif ($self->{state} == PI_AFTER_STATE) {
4621     if ($self->{nc} == 0x003E) { # >
4622     $self->{state} = DATA_STATE;
4623     $self->{s_kwd} = '';
4624    
4625     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4626     $self->{line_prev} = $self->{line};
4627     $self->{column_prev} = $self->{column};
4628     $self->{column}++;
4629     $self->{nc}
4630     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4631     } else {
4632     $self->{set_nc}->($self);
4633     }
4634    
4635     return ($self->{ct}); # pi
4636     redo A;
4637     } elsif ($self->{nc} == 0x003F) { # ?
4638     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4639     line => $self->{line_prev},
4640     column => $self->{column_prev}); ## XML5: no error
4641     $self->{ct}->{data} .= '?';
4642     $self->{state} = PI_DATA_AFTER_STATE;
4643    
4644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4645     $self->{line_prev} = $self->{line};
4646     $self->{column_prev} = $self->{column};
4647     $self->{column}++;
4648     $self->{nc}
4649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4650     } else {
4651     $self->{set_nc}->($self);
4652     }
4653    
4654     redo A;
4655     } else {
4656     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4657     line => $self->{line_prev},
4658     column => $self->{column_prev}
4659     + 1 * ($self->{nc} == -1)); ## XML5: no error
4660     $self->{ct}->{data} .= '?'; ## XML5: not appended
4661     $self->{state} = PI_DATA_STATE;
4662     ## Reprocess.
4663     redo A;
4664     }
4665     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4666     ## XML5: Same as "pi after state" in XML5
4667     if ($self->{nc} == 0x003E) { # >
4668     $self->{state} = DATA_STATE;
4669     $self->{s_kwd} = '';
4670    
4671     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4672     $self->{line_prev} = $self->{line};
4673     $self->{column_prev} = $self->{column};
4674     $self->{column}++;
4675     $self->{nc}
4676     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4677     } else {
4678     $self->{set_nc}->($self);
4679     }
4680    
4681     return ($self->{ct}); # pi
4682     redo A;
4683     } elsif ($self->{nc} == 0x003F) { # ?
4684     $self->{ct}->{data} .= '?';
4685     ## Stay in the state.
4686    
4687     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4688     $self->{line_prev} = $self->{line};
4689     $self->{column_prev} = $self->{column};
4690     $self->{column}++;
4691     $self->{nc}
4692     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4693     } else {
4694     $self->{set_nc}->($self);
4695     }
4696    
4697     redo A;
4698     } else {
4699     $self->{ct}->{data} .= '?'; ## XML5: not appended
4700     $self->{state} = PI_DATA_STATE;
4701     ## Reprocess.
4702     redo A;
4703     }
4704    
4705 wakaba 1.1 } else {
4706     die "$0: $self->{state}: Unknown state";
4707     }
4708     } # A
4709    
4710     die "$0: _get_next_token: unexpected case";
4711     } # _get_next_token
4712    
4713     1;
4714 wakaba 1.11 ## $Date: 2008/10/15 08:51:02 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24