/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (hide annotations) (download) (as text)
Wed Oct 15 10:50:38 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.10: +101 -10 lines
File MIME type: application/x-wais-source
++ whatpm/t/xml/ChangeLog	15 Oct 2008 10:50:31 -0000
	* attrs-1.dat: Test cases for tokenizing errors are added.

	* elements-1.dat: A test result updated.

	* ns-attrs-1.dat: Test results updated.  New test cases for
	duplicate namespaced attributes are added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 10:48:03 -0000
	* Tokenizer.pm.src: Set index attribute to each attribute token,
	for ignoring namespaced duplicate attribute at the XML namespace
	parser layer.  Raise a parse error if the attribute value is
	omitted, in XML mode.  Raise a parse error if the attribute value
	is not quoted, in XML mode.  Raise a parse error if "<" character
	is found in a quoted attribute value, in XML mode.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 10:49:16 -0000
	* Parser.pm.src: Use source order to determine which attribute is
	duplicate.  Preserve duplicate namespaced attributes as
	non-namespaced attributes.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.11 our $VERSION=do{my @r=(q$Revision: 1.10 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
      ## Compile-time Exporter setup.  The token type constants defined
      ## below can be imported one by one (|@EXPORT_OK|) or all together
      ## through the |:token| tag (|%EXPORT_TAGS|).
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
       ## The |token| tag lists the same eight names as |@EXPORT_OK|.
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35    
36     sub DOCTYPE_TOKEN () { 1 } # ->{name}, ->{pubid}, ->{sysid}, ->{quirks}
37     sub COMMENT_TOKEN () { 2 } # ->{data}
38     sub START_TAG_TOKEN () { 3 } # ->{tag_name}, ->{attributes}
39     sub END_TAG_TOKEN () { 4 } # ->{tag_name}, ->{attributes}
40     sub END_OF_FILE_TOKEN () { 5 } # emitted when the next input character is -1
41     sub CHARACTER_TOKEN () { 6 } # ->{data}, ->{has_reference}
42     sub PI_TOKEN () { 7 } # XML5; ->{target}, ->{data}
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
       ## The definitions that follow in this listing live in the
       ## |Whatpm::HTML| package.  The token type constants declared in
       ## |Whatpm::HTML::Tokenizer| are imported into it at compile time
       ## through the |:token| export tag.
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
       ## Content models are combinations of the three bit flags above;
       ## the tokenizer tests them with |&| against |$self->{content_model}|.
55     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # 0b010
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # 0b011
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # 0b101
59    
60     ## Tokenizer states
61    
62     sub DATA_STATE () { 0 }
       ## Each constant below is a possible value of |$self->{state}|, which
       ## |_get_next_token| compares with |==|.  The numbers 1 and 12 are
       ## unassigned: the states that used them are commented out.
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
        ## |PCDATA_STATE| is not one of the six states above.  |DATA_STATE|
        ## switches to it when the content model is PCDATA and |is_xml| is
        ## not set.
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117 wakaba 1.8 ## XML states
118     sub PI_STATE () { 51 } # entered from the tag open state on "<?" when |is_xml| is set
119     sub PI_TARGET_STATE () { 52 }
120     sub PI_TARGET_AFTER_STATE () { 53 }
121     sub PI_DATA_STATE () { 54 }
122     sub PI_AFTER_STATE () { 55 }
123     sub PI_DATA_AFTER_STATE () { 56 }
124    
125 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
126     ## list and descriptions)
127    
128     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # 0x800
        ## NOTE(review): both literals are a 1 followed by eleven zero bits,
        ## so the two constants have the same numeric value (0x800).
129     sub FOREIGN_EL () { 0b1_00000000000 } # 0x800
130    
131     ## Character reference mappings
132    
133     my $charref_map = {
        ## Code point => replacement code point.  Presumably consulted when a
        ## numeric character reference is decoded; the lookup itself is
        ## outside this part of the listing.
        ## U+000D (CR) is replaced by U+000A (LF).
134     0x0D => 0x000A,
        ## 0x80..0x9F: the replacements match the Windows-1252 assignments;
        ## positions with no assignment there map to U+FFFD.
135     0x80 => 0x20AC,
136     0x81 => 0xFFFD,
137     0x82 => 0x201A,
138     0x83 => 0x0192,
139     0x84 => 0x201E,
140     0x85 => 0x2026,
141     0x86 => 0x2020,
142     0x87 => 0x2021,
143     0x88 => 0x02C6,
144     0x89 => 0x2030,
145     0x8A => 0x0160,
146     0x8B => 0x2039,
147     0x8C => 0x0152,
148     0x8D => 0xFFFD,
149     0x8E => 0x017D,
150     0x8F => 0xFFFD,
151     0x90 => 0xFFFD,
152     0x91 => 0x2018,
153     0x92 => 0x2019,
154     0x93 => 0x201C,
155     0x94 => 0x201D,
156     0x95 => 0x2022,
157     0x96 => 0x2013,
158     0x97 => 0x2014,
159     0x98 => 0x02DC,
160     0x99 => 0x2122,
161     0x9A => 0x0161,
162     0x9B => 0x203A,
163     0x9C => 0x0153,
164     0x9D => 0xFFFD,
165     0x9E => 0x017E,
166     0x9F => 0x0178,
167     }; # $charref_map
        ## Every code point listed below maps to U+FFFD: most C0 controls,
        ## DEL, the surrogate range, and the U+xFFFE / U+xFFFF pair of each
        ## plane.
168     $charref_map->{$_} = 0xFFFD
169     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
170     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (Unicode noncharacters run through U+FDEF; this range stops at U+FDDF)
171     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
172     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
173     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
174     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
175     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
176    
177     ## Implementations MUST act as if they used the state machine in the spec
178    
179     sub _initialize_tokenizer ($) {
        ## Resets the tokenizer fields stored on the parser object so that
        ## tokenization starts in |DATA_STATE| with the PCDATA content model,
        ## no current token or attribute, and an empty pending-token queue.
        ## Takes the parser object as its only argument.
180     my $self = shift;
181    
182     ## NOTE: Fields set by |new| constructor:
183     #$self->{level}
184     #$self->{set_nc}
185     #$self->{parse_error}
186 wakaba 1.3 #$self->{is_xml} (if XML)
187 wakaba 1.1
188     $self->{state} = DATA_STATE; # MUST
189 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
190 wakaba 1.1 #$self->{entity__value}; # initialized when used
191     #$self->{entity__match}; # initialized when used
192     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
193     undef $self->{ct}; # current token
194     undef $self->{ca}; # current attribute
195     undef $self->{last_stag_name}; # last emitted start tag name
196     #$self->{prev_state}; # initialized when used
197     delete $self->{self_closing};
198     $self->{char_buffer} = '';
199     $self->{char_buffer_pos} = 0;
200     $self->{nc} = -1; # next input character
201     #$self->{next_nc}
        ## NOTE(review): |!!!...| lines are macros expanded when this .src
        ## file is processed.  Presumably this one reads the first input
        ## character into |$self->{nc}| -- confirm against the macro
        ## definition.
202     !!!next-input-character;
203     $self->{token} = []; # pending tokens; |_get_next_token| returns these first
204     # $self->{escape}
205     } # _initialize_tokenizer
206    
207     ## A token has:
208     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
209 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
210 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
211     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
212 wakaba 1.11 ## ->{target} (PI_TOKEN)
213 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
214     ## ->{sysid} (DOCTYPE_TOKEN)
215     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
216     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
217     ## ->{name}
218     ## ->{value}
219     ## ->{has_reference} == 1 or 0
220 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
221     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
222 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
223 wakaba 1.11 ## ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Next attribute's index - 1.
224 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
225     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
226     ## while the token is pushed back to the stack.
227    
228     ## Emitted token MUST immediately be handled by the tree construction state.
229    
230     ## Before each step, UA MAY check to see if either one of the scripts in
231     ## "list of scripts that will execute as soon as possible" or the first
232     ## script in the "list of scripts that will execute asynchronously",
233     ## has completed loading. If one has, then it MUST be executed
234     ## and removed from the list.
235    
236     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
237     ## (This requirement was dropped from HTML5 spec, unfortunately.)
238    
239     my $is_space = {
        ## Set of code points treated as white space, keyed by code point
        ## number.  It is looked up with |$self->{nc}|.  VT and CR are
        ## deliberately left out (their entries are commented out).
240     0x0009 => 1, # CHARACTER TABULATION (HT)
241     0x000A => 1, # LINE FEED (LF)
242     #0x000B => 0, # LINE TABULATION (VT)
243     0x000C => 1, # FORM FEED (FF)
244     #0x000D => 1, # CARRIAGE RETURN (CR)
245     0x0020 => 1, # SPACE (SP)
246     };
247    
248     sub _get_next_token ($) {
249     my $self = shift;
250    
251     if ($self->{self_closing}) {
252     !!!parse-error (type => 'nestc', token => $self->{ct});
253     ## NOTE: The |self_closing| flag is only set by start tag token.
254     ## In addition, when a start tag token is emitted, it is always set to
255     ## |ct|.
256     delete $self->{self_closing};
257     }
258    
259     if (@{$self->{token}}) {
260     $self->{self_closing} = $self->{token}->[0]->{self_closing};
261     return shift @{$self->{token}};
262     }
263    
264     A: {
265     if ($self->{state} == PCDATA_STATE) {
266     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
267    
268     if ($self->{nc} == 0x0026) { # &
269     !!!cp (0.1);
270     ## NOTE: In the spec, the tokenizer is switched to the
271     ## "entity data state". In this implementation, the tokenizer
272     ## is switched to the |ENTITY_STATE|, which is an implementation
273     ## of the "consume a character reference" algorithm.
274     $self->{entity_add} = -1;
275     $self->{prev_state} = DATA_STATE;
276     $self->{state} = ENTITY_STATE;
277     !!!next-input-character;
278     redo A;
279     } elsif ($self->{nc} == 0x003C) { # <
280     !!!cp (0.2);
281     $self->{state} = TAG_OPEN_STATE;
282     !!!next-input-character;
283     redo A;
284     } elsif ($self->{nc} == -1) {
285     !!!cp (0.3);
286     !!!emit ({type => END_OF_FILE_TOKEN,
287     line => $self->{line}, column => $self->{column}});
288     last A; ## TODO: ok?
289     } else {
290     !!!cp (0.4);
291     #
292     }
293    
294     # Anything else
295     my $token = {type => CHARACTER_TOKEN,
296     data => chr $self->{nc},
297     line => $self->{line}, column => $self->{column},
298     };
299     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
300    
301     ## Stay in the state.
302     !!!next-input-character;
303     !!!emit ($token);
304     redo A;
305     } elsif ($self->{state} == DATA_STATE) {
306     $self->{s_kwd} = '' unless defined $self->{s_kwd};
307     if ($self->{nc} == 0x0026) { # &
308     $self->{s_kwd} = '';
309     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
310     not $self->{escape}) {
311     !!!cp (1);
312     ## NOTE: In the spec, the tokenizer is switched to the
313     ## "entity data state". In this implementation, the tokenizer
314     ## is switched to the |ENTITY_STATE|, which is an implementation
315     ## of the "consume a character reference" algorithm.
316     $self->{entity_add} = -1;
317     $self->{prev_state} = DATA_STATE;
318     $self->{state} = ENTITY_STATE;
319     !!!next-input-character;
320     redo A;
321     } else {
322     !!!cp (2);
323     #
324     }
325     } elsif ($self->{nc} == 0x002D) { # -
326     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
327 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
328 wakaba 1.1 !!!cp (3);
329     $self->{escape} = 1; # unless $self->{escape};
330     $self->{s_kwd} = '--';
331     #
332 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
333 wakaba 1.1 !!!cp (4);
334     $self->{s_kwd} = '--';
335     #
336 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
337     !!!cp (4.1);
338     $self->{s_kwd} .= '-';
339     #
340 wakaba 1.1 } else {
341     !!!cp (5);
342 wakaba 1.5 $self->{s_kwd} = '-';
343 wakaba 1.1 #
344     }
345     }
346    
347     #
348     } elsif ($self->{nc} == 0x0021) { # !
349     if (length $self->{s_kwd}) {
350     !!!cp (5.1);
351     $self->{s_kwd} .= '!';
352     #
353     } else {
354     !!!cp (5.2);
355     #$self->{s_kwd} = '';
356     #
357     }
358     #
359     } elsif ($self->{nc} == 0x003C) { # <
360     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
361     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
362     not $self->{escape})) {
363     !!!cp (6);
364     $self->{state} = TAG_OPEN_STATE;
365     !!!next-input-character;
366     redo A;
367     } else {
368     !!!cp (7);
369     $self->{s_kwd} = '';
370     #
371     }
372     } elsif ($self->{nc} == 0x003E) { # >
373     if ($self->{escape} and
374     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
375     if ($self->{s_kwd} eq '--') {
376     !!!cp (8);
377     delete $self->{escape};
378 wakaba 1.5 #
379 wakaba 1.1 } else {
380     !!!cp (9);
381 wakaba 1.5 #
382 wakaba 1.1 }
383 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
384     !!!cp (9.1);
385     !!!parse-error (type => 'unmatched mse', ## TODO: type
386     line => $self->{line_prev},
387     column => $self->{column_prev} - 1);
388     #
389 wakaba 1.1 } else {
390     !!!cp (10);
391 wakaba 1.5 #
392 wakaba 1.1 }
393    
394     $self->{s_kwd} = '';
395     #
396 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
397     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
398     !!!cp (10.1);
399     $self->{s_kwd} .= ']';
400     } elsif ($self->{s_kwd} eq ']]') {
401     !!!cp (10.2);
402     #
403     } else {
404     !!!cp (10.3);
405     $self->{s_kwd} = '';
406     }
407     #
408 wakaba 1.1 } elsif ($self->{nc} == -1) {
409     !!!cp (11);
410     $self->{s_kwd} = '';
411     !!!emit ({type => END_OF_FILE_TOKEN,
412     line => $self->{line}, column => $self->{column}});
413     last A; ## TODO: ok?
414     } else {
415     !!!cp (12);
416     $self->{s_kwd} = '';
417     #
418     }
419    
420     # Anything else
421     my $token = {type => CHARACTER_TOKEN,
422     data => chr $self->{nc},
423     line => $self->{line}, column => $self->{column},
424     };
425 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
426 wakaba 1.1 length $token->{data})) {
427     $self->{s_kwd} = '';
428     }
429    
430     ## Stay in the data state.
431 wakaba 1.5 if (not $self->{is_xml} and
432     $self->{content_model} == PCDATA_CONTENT_MODEL) {
433 wakaba 1.1 !!!cp (13);
434     $self->{state} = PCDATA_STATE;
435     } else {
436     !!!cp (14);
437     ## Stay in the state.
438     }
439     !!!next-input-character;
440     !!!emit ($token);
441     redo A;
442     } elsif ($self->{state} == TAG_OPEN_STATE) {
443 wakaba 1.10 ## XML5: "tag state".
444    
445 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
446     if ($self->{nc} == 0x002F) { # /
447     !!!cp (15);
448     !!!next-input-character;
449     $self->{state} = CLOSE_TAG_OPEN_STATE;
450     redo A;
451     } elsif ($self->{nc} == 0x0021) { # !
452     !!!cp (15.1);
453     $self->{s_kwd} = '<' unless $self->{escape};
454     #
455     } else {
456     !!!cp (16);
457     #
458     }
459    
460     ## reconsume
461     $self->{state} = DATA_STATE;
462 wakaba 1.5 $self->{s_kwd} = '';
463 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN, data => '<',
464     line => $self->{line_prev},
465     column => $self->{column_prev},
466     });
467     redo A;
468     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
469     if ($self->{nc} == 0x0021) { # !
470     !!!cp (17);
471     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
472     !!!next-input-character;
473     redo A;
474     } elsif ($self->{nc} == 0x002F) { # /
475     !!!cp (18);
476     $self->{state} = CLOSE_TAG_OPEN_STATE;
477     !!!next-input-character;
478     redo A;
479     } elsif (0x0041 <= $self->{nc} and
480     $self->{nc} <= 0x005A) { # A..Z
481     !!!cp (19);
482     $self->{ct}
483     = {type => START_TAG_TOKEN,
484 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
485 wakaba 1.1 line => $self->{line_prev},
486     column => $self->{column_prev}};
487     $self->{state} = TAG_NAME_STATE;
488     !!!next-input-character;
489     redo A;
490     } elsif (0x0061 <= $self->{nc} and
491     $self->{nc} <= 0x007A) { # a..z
492     !!!cp (20);
493     $self->{ct} = {type => START_TAG_TOKEN,
494     tag_name => chr ($self->{nc}),
495     line => $self->{line_prev},
496     column => $self->{column_prev}};
497     $self->{state} = TAG_NAME_STATE;
498     !!!next-input-character;
499     redo A;
500     } elsif ($self->{nc} == 0x003E) { # >
501     !!!cp (21);
502     !!!parse-error (type => 'empty start tag',
503     line => $self->{line_prev},
504     column => $self->{column_prev});
505     $self->{state} = DATA_STATE;
506 wakaba 1.5 $self->{s_kwd} = '';
507 wakaba 1.1 !!!next-input-character;
508    
509     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
510     line => $self->{line_prev},
511     column => $self->{column_prev},
512     });
513    
514     redo A;
515     } elsif ($self->{nc} == 0x003F) { # ?
516 wakaba 1.8 if ($self->{is_xml}) {
517     !!!cp (22.1);
518     $self->{state} = PI_STATE;
519     !!!next-input-character;
520     redo A;
521     } else {
522     !!!cp (22);
523     !!!parse-error (type => 'pio',
524     line => $self->{line_prev},
525     column => $self->{column_prev});
526     $self->{state} = BOGUS_COMMENT_STATE;
527     $self->{ct} = {type => COMMENT_TOKEN, data => '',
528     line => $self->{line_prev},
529     column => $self->{column_prev},
530     };
531     ## $self->{nc} is intentionally left as is
532     redo A;
533     }
534 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
535 wakaba 1.1 !!!cp (23);
536     !!!parse-error (type => 'bare stago',
537     line => $self->{line_prev},
538     column => $self->{column_prev});
539     $self->{state} = DATA_STATE;
540 wakaba 1.5 $self->{s_kwd} = '';
541 wakaba 1.1 ## reconsume
542    
543     !!!emit ({type => CHARACTER_TOKEN, data => '<',
544     line => $self->{line_prev},
545     column => $self->{column_prev},
546     });
547    
548     redo A;
549 wakaba 1.9 } else {
550     ## XML5: "<:" is a parse error.
551     !!!cp (23.1);
552     $self->{ct} = {type => START_TAG_TOKEN,
553     tag_name => chr ($self->{nc}),
554     line => $self->{line_prev},
555     column => $self->{column_prev}};
556     $self->{state} = TAG_NAME_STATE;
557     !!!next-input-character;
558     redo A;
559 wakaba 1.1 }
560     } else {
561     die "$0: $self->{content_model} in tag open";
562     }
563     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
564     ## NOTE: The "close tag open state" in the spec is implemented as
565     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
566    
567 wakaba 1.10 ## XML5: "end tag state".
568    
569 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
570     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
571     if (defined $self->{last_stag_name}) {
572     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
573     $self->{s_kwd} = '';
574     ## Reconsume.
575     redo A;
576     } else {
577     ## No start tag token has ever been emitted
578     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
579     !!!cp (28);
580     $self->{state} = DATA_STATE;
581 wakaba 1.5 $self->{s_kwd} = '';
582 wakaba 1.1 ## Reconsume.
583     !!!emit ({type => CHARACTER_TOKEN, data => '</',
584     line => $l, column => $c,
585     });
586     redo A;
587     }
588     }
589    
590     if (0x0041 <= $self->{nc} and
591     $self->{nc} <= 0x005A) { # A..Z
592     !!!cp (29);
593     $self->{ct}
594     = {type => END_TAG_TOKEN,
595 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
596 wakaba 1.1 line => $l, column => $c};
597     $self->{state} = TAG_NAME_STATE;
598     !!!next-input-character;
599     redo A;
600     } elsif (0x0061 <= $self->{nc} and
601     $self->{nc} <= 0x007A) { # a..z
602     !!!cp (30);
603     $self->{ct} = {type => END_TAG_TOKEN,
604     tag_name => chr ($self->{nc}),
605     line => $l, column => $c};
606     $self->{state} = TAG_NAME_STATE;
607     !!!next-input-character;
608     redo A;
609     } elsif ($self->{nc} == 0x003E) { # >
610     !!!parse-error (type => 'empty end tag',
611     line => $self->{line_prev}, ## "<" in "</>"
612     column => $self->{column_prev} - 1);
613     $self->{state} = DATA_STATE;
614 wakaba 1.5 $self->{s_kwd} = '';
615 wakaba 1.10 if ($self->{is_xml}) {
616     !!!cp (31);
617     ## XML5: No parse error.
618    
619     ## NOTE: This parser raises a parse error, since it supports
620     ## XML1, not XML5.
621    
622     ## NOTE: A short end tag token.
623     my $ct = {type => END_TAG_TOKEN,
624     tag_name => '',
625     line => $self->{line_prev},
626     column => $self->{column_prev} - 1,
627     };
628     !!!next-input-character;
629     !!!emit ($ct);
630     } else {
631     !!!cp (31.1);
632     !!!next-input-character;
633     }
634 wakaba 1.1 redo A;
635     } elsif ($self->{nc} == -1) {
636     !!!cp (32);
637     !!!parse-error (type => 'bare etago');
638 wakaba 1.5 $self->{s_kwd} = '';
639 wakaba 1.1 $self->{state} = DATA_STATE;
640     # reconsume
641    
642     !!!emit ({type => CHARACTER_TOKEN, data => '</',
643     line => $l, column => $c,
644     });
645    
646     redo A;
647 wakaba 1.10 } elsif (not $self->{is_xml} or
648     $is_space->{$self->{nc}}) {
649 wakaba 1.1 !!!cp (33);
650 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
651     line => $self->{line_prev}, # "<" of "</"
652     column => $self->{column_prev} - 1);
653 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
654     $self->{ct} = {type => COMMENT_TOKEN, data => '',
655     line => $self->{line_prev}, # "<" of "</"
656     column => $self->{column_prev} - 1,
657     };
658     ## NOTE: $self->{nc} is intentionally left as is.
659     ## Although the "anything else" case of the spec not explicitly
660     ## states that the next input character is to be reconsumed,
661     ## it will be included to the |data| of the comment token
662     ## generated from the bogus end tag, as defined in the
663     ## "bogus comment state" entry.
664     redo A;
665 wakaba 1.10 } else {
666     ## XML5: "</:" is a parse error.
667     !!!cp (30.1);
668     $self->{ct} = {type => END_TAG_TOKEN,
669     tag_name => chr ($self->{nc}),
670     line => $l, column => $c};
671     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
672     !!!next-input-character;
673     redo A;
674 wakaba 1.1 }
675     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
676     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
677     if (length $ch) {
678     my $CH = $ch;
679     $ch =~ tr/a-z/A-Z/;
680     my $nch = chr $self->{nc};
681     if ($nch eq $ch or $nch eq $CH) {
682     !!!cp (24);
683     ## Stay in the state.
684     $self->{s_kwd} .= $nch;
685     !!!next-input-character;
686     redo A;
687     } else {
688     !!!cp (25);
689     $self->{state} = DATA_STATE;
690 wakaba 1.5 $self->{s_kwd} = '';
691 wakaba 1.1 ## Reconsume.
692     !!!emit ({type => CHARACTER_TOKEN,
693     data => '</' . $self->{s_kwd},
694     line => $self->{line_prev},
695     column => $self->{column_prev} - 1 - length $self->{s_kwd},
696     });
697     redo A;
698     }
699     } else { # after "<{tag-name}"
700     unless ($is_space->{$self->{nc}} or
701     {
702     0x003E => 1, # >
703     0x002F => 1, # /
704     -1 => 1, # EOF
705     }->{$self->{nc}}) {
706     !!!cp (26);
707     ## Reconsume.
708     $self->{state} = DATA_STATE;
709 wakaba 1.5 $self->{s_kwd} = '';
710 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
711     data => '</' . $self->{s_kwd},
712     line => $self->{line_prev},
713     column => $self->{column_prev} - 1 - length $self->{s_kwd},
714     });
715     redo A;
716     } else {
717     !!!cp (27);
718     $self->{ct}
719     = {type => END_TAG_TOKEN,
720     tag_name => $self->{last_stag_name},
721     line => $self->{line_prev},
722     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
723     $self->{state} = TAG_NAME_STATE;
724     ## Reconsume.
725     redo A;
726     }
727     }
728     } elsif ($self->{state} == TAG_NAME_STATE) {
729     if ($is_space->{$self->{nc}}) {
730     !!!cp (34);
731     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
732     !!!next-input-character;
733     redo A;
734     } elsif ($self->{nc} == 0x003E) { # >
735     if ($self->{ct}->{type} == START_TAG_TOKEN) {
736     !!!cp (35);
737     $self->{last_stag_name} = $self->{ct}->{tag_name};
738     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
739     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
740     #if ($self->{ct}->{attributes}) {
741     # ## NOTE: This should never be reached.
742     # !!! cp (36);
743     # !!! parse-error (type => 'end tag attribute');
744     #} else {
745     !!!cp (37);
746     #}
747     } else {
748     die "$0: $self->{ct}->{type}: Unknown token type";
749     }
750     $self->{state} = DATA_STATE;
751 wakaba 1.5 $self->{s_kwd} = '';
752 wakaba 1.1 !!!next-input-character;
753    
754     !!!emit ($self->{ct}); # start tag or end tag
755    
756     redo A;
757     } elsif (0x0041 <= $self->{nc} and
758     $self->{nc} <= 0x005A) { # A..Z
759     !!!cp (38);
760 wakaba 1.4 $self->{ct}->{tag_name}
761     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
762 wakaba 1.1 # start tag or end tag
763     ## Stay in this state
764     !!!next-input-character;
765     redo A;
766     } elsif ($self->{nc} == -1) {
767     !!!parse-error (type => 'unclosed tag');
768     if ($self->{ct}->{type} == START_TAG_TOKEN) {
769     !!!cp (39);
770     $self->{last_stag_name} = $self->{ct}->{tag_name};
771     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
772     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
773     #if ($self->{ct}->{attributes}) {
774     # ## NOTE: This state should never be reached.
775     # !!! cp (40);
776     # !!! parse-error (type => 'end tag attribute');
777     #} else {
778     !!!cp (41);
779     #}
780     } else {
781     die "$0: $self->{ct}->{type}: Unknown token type";
782     }
783     $self->{state} = DATA_STATE;
784 wakaba 1.5 $self->{s_kwd} = '';
785 wakaba 1.1 # reconsume
786    
787     !!!emit ($self->{ct}); # start tag or end tag
788    
789     redo A;
790     } elsif ($self->{nc} == 0x002F) { # /
791     !!!cp (42);
792     $self->{state} = SELF_CLOSING_START_TAG_STATE;
793     !!!next-input-character;
794     redo A;
795     } else {
796     !!!cp (44);
797     $self->{ct}->{tag_name} .= chr $self->{nc};
798     # start tag or end tag
799     ## Stay in the state
800     !!!next-input-character;
801     redo A;
802     }
803     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
804 wakaba 1.11 ## XML5: "Tag attribute name before state".
805    
806 wakaba 1.1 if ($is_space->{$self->{nc}}) {
807     !!!cp (45);
808     ## Stay in the state
809     !!!next-input-character;
810     redo A;
811     } elsif ($self->{nc} == 0x003E) { # >
812     if ($self->{ct}->{type} == START_TAG_TOKEN) {
813     !!!cp (46);
814     $self->{last_stag_name} = $self->{ct}->{tag_name};
815     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
816     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
817     if ($self->{ct}->{attributes}) {
818     !!!cp (47);
819     !!!parse-error (type => 'end tag attribute');
820     } else {
821     !!!cp (48);
822     }
823     } else {
824     die "$0: $self->{ct}->{type}: Unknown token type";
825     }
826     $self->{state} = DATA_STATE;
827 wakaba 1.5 $self->{s_kwd} = '';
828 wakaba 1.1 !!!next-input-character;
829    
830     !!!emit ($self->{ct}); # start tag or end tag
831    
832     redo A;
833     } elsif (0x0041 <= $self->{nc} and
834     $self->{nc} <= 0x005A) { # A..Z
835     !!!cp (49);
836     $self->{ca}
837 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
838 wakaba 1.1 value => '',
839     line => $self->{line}, column => $self->{column}};
840     $self->{state} = ATTRIBUTE_NAME_STATE;
841     !!!next-input-character;
842     redo A;
843     } elsif ($self->{nc} == 0x002F) { # /
844     !!!cp (50);
845     $self->{state} = SELF_CLOSING_START_TAG_STATE;
846     !!!next-input-character;
847     redo A;
848     } elsif ($self->{nc} == -1) {
849     !!!parse-error (type => 'unclosed tag');
850     if ($self->{ct}->{type} == START_TAG_TOKEN) {
851     !!!cp (52);
852     $self->{last_stag_name} = $self->{ct}->{tag_name};
853     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
854     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
855     if ($self->{ct}->{attributes}) {
856     !!!cp (53);
857     !!!parse-error (type => 'end tag attribute');
858     } else {
859     !!!cp (54);
860     }
861     } else {
862     die "$0: $self->{ct}->{type}: Unknown token type";
863     }
864     $self->{state} = DATA_STATE;
865 wakaba 1.5 $self->{s_kwd} = '';
866 wakaba 1.1 # reconsume
867    
868     !!!emit ($self->{ct}); # start tag or end tag
869    
870     redo A;
871     } else {
872     if ({
873     0x0022 => 1, # "
874     0x0027 => 1, # '
875     0x003D => 1, # =
876     }->{$self->{nc}}) {
877     !!!cp (55);
878 wakaba 1.11 ## XML5: Not a parse error.
879 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
880     } else {
881     !!!cp (56);
882 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
883 wakaba 1.1 }
884     $self->{ca}
885     = {name => chr ($self->{nc}),
886     value => '',
887     line => $self->{line}, column => $self->{column}};
888     $self->{state} = ATTRIBUTE_NAME_STATE;
889     !!!next-input-character;
890     redo A;
891     }
892     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
893 wakaba 1.11 ## XML5: "Tag attribute name state".
894    
895 wakaba 1.1 my $before_leave = sub { ## Called on leaving this state: files the finished attribute $self->{ca} on the current tag token $self->{ct}.
896     if (exists $self->{ct}->{attributes} # start tag or end tag
897     ->{$self->{ca}->{name}}) { # MUST: an attribute with this name is already on the token
898     !!!cp (57);
899     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
900     ## Discard $self->{ca}: it is not stored, so the earlier attribute with this name is kept. # MUST
901     } else {
902     !!!cp (58);
903     $self->{ct}->{attributes}->{$self->{ca}->{name}}
904     = $self->{ca}; # first attribute with this name: keep it, keyed by name
905 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index}; # source-order number of the kept attribute; per the ChangeLog, used by the XML namespace parser layer to decide which namespaced attribute is the duplicate
906 wakaba 1.1 }
907     }; # $before_leave
908    
909     if ($is_space->{$self->{nc}}) {
910     !!!cp (59);
911     $before_leave->();
912     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
913     !!!next-input-character;
914     redo A;
915     } elsif ($self->{nc} == 0x003D) { # =
916     !!!cp (60);
917     $before_leave->();
918     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
919     !!!next-input-character;
920     redo A;
921     } elsif ($self->{nc} == 0x003E) { # >
922 wakaba 1.11 if ($self->{is_xml}) {
923     !!!cp (60.1);
924     ## XML5: Not a parse error.
925     !!!parse-error (type => 'no attr value'); ## TODO: type
926     } else {
927     !!!cp (60.2);
928     }
929    
930 wakaba 1.1 $before_leave->();
931     if ($self->{ct}->{type} == START_TAG_TOKEN) {
932     !!!cp (61);
933     $self->{last_stag_name} = $self->{ct}->{tag_name};
934     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
935     !!!cp (62);
936     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
937     if ($self->{ct}->{attributes}) {
938     !!!parse-error (type => 'end tag attribute');
939     }
940     } else {
941     die "$0: $self->{ct}->{type}: Unknown token type";
942     }
943     $self->{state} = DATA_STATE;
944 wakaba 1.5 $self->{s_kwd} = '';
945 wakaba 1.1 !!!next-input-character;
946    
947     !!!emit ($self->{ct}); # start tag or end tag
948    
949     redo A;
950     } elsif (0x0041 <= $self->{nc} and
951     $self->{nc} <= 0x005A) { # A..Z
952     !!!cp (63);
953 wakaba 1.4 $self->{ca}->{name}
954     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
955 wakaba 1.1 ## Stay in the state
956     !!!next-input-character;
957     redo A;
958     } elsif ($self->{nc} == 0x002F) { # /
959 wakaba 1.11 if ($self->{is_xml}) {
960     !!!cp (64);
961     ## XML5: Not a parse error.
962     !!!parse-error (type => 'no attr value'); ## TODO: type
963     } else {
964     !!!cp (64.1);
965     }
966    
967 wakaba 1.1 $before_leave->();
968     $self->{state} = SELF_CLOSING_START_TAG_STATE;
969     !!!next-input-character;
970     redo A;
971     } elsif ($self->{nc} == -1) {
972     !!!parse-error (type => 'unclosed tag');
973     $before_leave->();
974     if ($self->{ct}->{type} == START_TAG_TOKEN) {
975     !!!cp (66);
976     $self->{last_stag_name} = $self->{ct}->{tag_name};
977     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
978     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
979     if ($self->{ct}->{attributes}) {
980     !!!cp (67);
981     !!!parse-error (type => 'end tag attribute');
982     } else {
983     ## NOTE: This state should never be reached.
984     !!!cp (68);
985     }
986     } else {
987     die "$0: $self->{ct}->{type}: Unknown token type";
988     }
989     $self->{state} = DATA_STATE;
990 wakaba 1.5 $self->{s_kwd} = '';
991 wakaba 1.1 # reconsume
992    
993     !!!emit ($self->{ct}); # start tag or end tag
994    
995     redo A;
996     } else {
997     if ($self->{nc} == 0x0022 or # "
998     $self->{nc} == 0x0027) { # '
999     !!!cp (69);
1000 wakaba 1.11 ## XML5: Not a parse error.
1001 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1002     } else {
1003     !!!cp (70);
1004     }
1005     $self->{ca}->{name} .= chr ($self->{nc});
1006     ## Stay in the state
1007     !!!next-input-character;
1008     redo A;
1009     }
1010     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1011 wakaba 1.11 ## XML5: "Tag attribute name after state".
1012    
1013 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1014     !!!cp (71);
1015     ## Stay in the state
1016     !!!next-input-character;
1017     redo A;
1018     } elsif ($self->{nc} == 0x003D) { # =
1019     !!!cp (72);
1020     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1021     !!!next-input-character;
1022     redo A;
1023     } elsif ($self->{nc} == 0x003E) { # >
1024 wakaba 1.11 if ($self->{is_xml}) {
1025     !!!cp (72.1);
1026     ## XML5: Not a parse error.
1027     !!!parse-error (type => 'no attr value'); ## TODO: type
1028     } else {
1029     !!!cp (72.2);
1030     }
1031    
1032 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1033     !!!cp (73);
1034     $self->{last_stag_name} = $self->{ct}->{tag_name};
1035     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1036     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1037     if ($self->{ct}->{attributes}) {
1038     !!!cp (74);
1039     !!!parse-error (type => 'end tag attribute');
1040     } else {
1041     ## NOTE: This state should never be reached.
1042     !!!cp (75);
1043     }
1044     } else {
1045     die "$0: $self->{ct}->{type}: Unknown token type";
1046     }
1047     $self->{state} = DATA_STATE;
1048 wakaba 1.5 $self->{s_kwd} = '';
1049 wakaba 1.1 !!!next-input-character;
1050    
1051     !!!emit ($self->{ct}); # start tag or end tag
1052    
1053     redo A;
1054     } elsif (0x0041 <= $self->{nc} and
1055     $self->{nc} <= 0x005A) { # A..Z
1056     !!!cp (76);
1057     $self->{ca}
1058 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1059 wakaba 1.1 value => '',
1060     line => $self->{line}, column => $self->{column}};
1061     $self->{state} = ATTRIBUTE_NAME_STATE;
1062     !!!next-input-character;
1063     redo A;
1064     } elsif ($self->{nc} == 0x002F) { # /
1065 wakaba 1.11 if ($self->{is_xml}) {
1066     !!!cp (77);
1067     ## XML5: Not a parse error.
1068     !!!parse-error (type => 'no attr value'); ## TODO: type
1069     } else {
1070     !!!cp (77.1);
1071     }
1072    
1073 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1074     !!!next-input-character;
1075     redo A;
1076     } elsif ($self->{nc} == -1) {
1077     !!!parse-error (type => 'unclosed tag');
1078     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1079     !!!cp (79);
1080     $self->{last_stag_name} = $self->{ct}->{tag_name};
1081     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1082     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1083     if ($self->{ct}->{attributes}) {
1084     !!!cp (80);
1085     !!!parse-error (type => 'end tag attribute');
1086     } else {
1087     ## NOTE: This state should never be reached.
1088     !!!cp (81);
1089     }
1090     } else {
1091     die "$0: $self->{ct}->{type}: Unknown token type";
1092     }
1093 wakaba 1.5 $self->{s_kwd} = '';
1094 wakaba 1.1 $self->{state} = DATA_STATE;
1095     # reconsume
1096    
1097     !!!emit ($self->{ct}); # start tag or end tag
1098    
1099     redo A;
1100     } else {
1101 wakaba 1.11 if ($self->{is_xml}) {
1102     !!!cp (78.1);
1103     ## XML5: Not a parse error.
1104     !!!parse-error (type => 'no attr value'); ## TODO: type
1105     } else {
1106     !!!cp (78.2);
1107     }
1108    
1109 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1110     $self->{nc} == 0x0027) { # '
1111     !!!cp (78);
1112 wakaba 1.11 ## XML5: Not a parse error.
1113 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1114     } else {
1115     !!!cp (82);
1116     }
1117     $self->{ca}
1118     = {name => chr ($self->{nc}),
1119     value => '',
1120     line => $self->{line}, column => $self->{column}};
1121     $self->{state} = ATTRIBUTE_NAME_STATE;
1122     !!!next-input-character;
1123     redo A;
1124     }
1125     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1126 wakaba 1.11 ## XML5: "Tag attribute value before state".
1127    
1128 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1129     !!!cp (83);
1130     ## Stay in the state
1131     !!!next-input-character;
1132     redo A;
1133     } elsif ($self->{nc} == 0x0022) { # "
1134     !!!cp (84);
1135     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1136     !!!next-input-character;
1137     redo A;
1138     } elsif ($self->{nc} == 0x0026) { # &
1139     !!!cp (85);
1140     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1141     ## reconsume
1142     redo A;
1143     } elsif ($self->{nc} == 0x0027) { # '
1144     !!!cp (86);
1145     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1146     !!!next-input-character;
1147     redo A;
1148     } elsif ($self->{nc} == 0x003E) { # >
1149     !!!parse-error (type => 'empty unquoted attribute value');
1150     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1151     !!!cp (87);
1152     $self->{last_stag_name} = $self->{ct}->{tag_name};
1153     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1154     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1155     if ($self->{ct}->{attributes}) {
1156     !!!cp (88);
1157     !!!parse-error (type => 'end tag attribute');
1158     } else {
1159     ## NOTE: This state should never be reached.
1160     !!!cp (89);
1161     }
1162     } else {
1163     die "$0: $self->{ct}->{type}: Unknown token type";
1164     }
1165     $self->{state} = DATA_STATE;
1166 wakaba 1.5 $self->{s_kwd} = '';
1167 wakaba 1.1 !!!next-input-character;
1168    
1169     !!!emit ($self->{ct}); # start tag or end tag
1170    
1171     redo A;
1172     } elsif ($self->{nc} == -1) {
1173     !!!parse-error (type => 'unclosed tag');
1174     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1175     !!!cp (90);
1176     $self->{last_stag_name} = $self->{ct}->{tag_name};
1177     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1178     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1179     if ($self->{ct}->{attributes}) {
1180     !!!cp (91);
1181     !!!parse-error (type => 'end tag attribute');
1182     } else {
1183     ## NOTE: This state should never be reached.
1184     !!!cp (92);
1185     }
1186     } else {
1187     die "$0: $self->{ct}->{type}: Unknown token type";
1188     }
1189     $self->{state} = DATA_STATE;
1190 wakaba 1.5 $self->{s_kwd} = '';
1191 wakaba 1.1 ## reconsume
1192    
1193     !!!emit ($self->{ct}); # start tag or end tag
1194    
1195     redo A;
1196     } else {
1197     if ($self->{nc} == 0x003D) { # =
1198     !!!cp (93);
1199 wakaba 1.11 ## XML5: Not a parse error.
1200 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1201 wakaba 1.11 } elsif ($self->{is_xml}) {
1202     !!!cp (93.1);
1203     ## XML5: No parse error.
1204     !!!parse-error (type => 'unquoted attr value'); ## TODO
1205 wakaba 1.1 } else {
1206     !!!cp (94);
1207     }
1208     $self->{ca}->{value} .= chr ($self->{nc});
1209     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1210     !!!next-input-character;
1211     redo A;
1212     }
1213     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1214 wakaba 1.11 ## XML5: "Tag attribute value double quoted state".
1215    
1216 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1217     !!!cp (95);
1218 wakaba 1.11 ## XML5: "Tag attribute name before state".
1219 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1220     !!!next-input-character;
1221     redo A;
1222     } elsif ($self->{nc} == 0x0026) { # &
1223     !!!cp (96);
1224 wakaba 1.11 ## XML5: Not defined yet.
1225    
1226 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1227     ## "entity in attribute value state". In this implementation, the
1228     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1229     ## implementation of the "consume a character reference" algorithm.
1230     $self->{prev_state} = $self->{state};
1231     $self->{entity_add} = 0x0022; # "
1232     $self->{state} = ENTITY_STATE;
1233     !!!next-input-character;
1234     redo A;
1235     } elsif ($self->{nc} == -1) {
1236     !!!parse-error (type => 'unclosed attribute value');
1237     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1238     !!!cp (97);
1239     $self->{last_stag_name} = $self->{ct}->{tag_name};
1240     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1241     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1242     if ($self->{ct}->{attributes}) {
1243     !!!cp (98);
1244     !!!parse-error (type => 'end tag attribute');
1245     } else {
1246     ## NOTE: This state should never be reached.
1247     !!!cp (99);
1248     }
1249     } else {
1250     die "$0: $self->{ct}->{type}: Unknown token type";
1251     }
1252     $self->{state} = DATA_STATE;
1253 wakaba 1.5 $self->{s_kwd} = '';
1254 wakaba 1.1 ## reconsume
1255    
1256     !!!emit ($self->{ct}); # start tag or end tag
1257    
1258     redo A;
1259     } else {
1260 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1261     !!!cp (100);
1262     ## XML5: Not a parse error.
1263     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1264     } else {
1265     !!!cp (100.1);
1266     }
1267 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1268     $self->{read_until}->($self->{ca}->{value},
1269 wakaba 1.11 q["&<],
1270 wakaba 1.1 length $self->{ca}->{value});
1271    
1272     ## Stay in the state
1273     !!!next-input-character;
1274     redo A;
1275     }
1276     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1277 wakaba 1.11 ## XML5: "Tag attribute value single quoted state".
1278    
1279 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1280     !!!cp (101);
1281 wakaba 1.11 ## XML5: "Before attribute name state" (sic).
1282 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1283     !!!next-input-character;
1284     redo A;
1285     } elsif ($self->{nc} == 0x0026) { # &
1286     !!!cp (102);
1287 wakaba 1.11 ## XML5: Not defined yet.
1288    
1289 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1290     ## "entity in attribute value state". In this implementation, the
1291     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1292     ## implementation of the "consume a character reference" algorithm.
1293     $self->{entity_add} = 0x0027; # '
1294     $self->{prev_state} = $self->{state};
1295     $self->{state} = ENTITY_STATE;
1296     !!!next-input-character;
1297     redo A;
1298     } elsif ($self->{nc} == -1) {
1299     !!!parse-error (type => 'unclosed attribute value');
1300     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1301     !!!cp (103);
1302     $self->{last_stag_name} = $self->{ct}->{tag_name};
1303     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1304     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1305     if ($self->{ct}->{attributes}) {
1306     !!!cp (104);
1307     !!!parse-error (type => 'end tag attribute');
1308     } else {
1309     ## NOTE: This state should never be reached.
1310     !!!cp (105);
1311     }
1312     } else {
1313     die "$0: $self->{ct}->{type}: Unknown token type";
1314     }
1315     $self->{state} = DATA_STATE;
1316 wakaba 1.5 $self->{s_kwd} = '';
1317 wakaba 1.1 ## reconsume
1318    
1319     !!!emit ($self->{ct}); # start tag or end tag
1320    
1321     redo A;
1322     } else {
1323 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1324     !!!cp (106);
1325     ## XML5: Not a parse error.
1326     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1327     } else {
1328     !!!cp (106.1);
1329     }
1330 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1331     $self->{read_until}->($self->{ca}->{value},
1332 wakaba 1.11 q['&<],
1333 wakaba 1.1 length $self->{ca}->{value});
1334    
1335     ## Stay in the state
1336     !!!next-input-character;
1337     redo A;
1338     }
1339     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1340 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1341    
1342 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1343     !!!cp (107);
1344 wakaba 1.11 ## XML5: "Tag attribute name before state".
1345 wakaba 1.1 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1346     !!!next-input-character;
1347     redo A;
1348     } elsif ($self->{nc} == 0x0026) { # &
1349     !!!cp (108);
1350 wakaba 1.11
1351     ## XML5: Not defined yet.
1352    
1353 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1354     ## "entity in attribute value state". In this implementation, the
1355     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1356     ## implementation of the "consume a character reference" algorithm.
1357     $self->{entity_add} = -1;
1358     $self->{prev_state} = $self->{state};
1359     $self->{state} = ENTITY_STATE;
1360     !!!next-input-character;
1361     redo A;
1362     } elsif ($self->{nc} == 0x003E) { # >
1363     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1364     !!!cp (109);
1365     $self->{last_stag_name} = $self->{ct}->{tag_name};
1366     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1367     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1368     if ($self->{ct}->{attributes}) {
1369     !!!cp (110);
1370     !!!parse-error (type => 'end tag attribute');
1371     } else {
1372     ## NOTE: This state should never be reached.
1373     !!!cp (111);
1374     }
1375     } else {
1376     die "$0: $self->{ct}->{type}: Unknown token type";
1377     }
1378     $self->{state} = DATA_STATE;
1379 wakaba 1.5 $self->{s_kwd} = '';
1380 wakaba 1.1 !!!next-input-character;
1381    
1382     !!!emit ($self->{ct}); # start tag or end tag
1383    
1384     redo A;
1385     } elsif ($self->{nc} == -1) {
1386     !!!parse-error (type => 'unclosed tag');
1387     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1388     !!!cp (112);
1389     $self->{last_stag_name} = $self->{ct}->{tag_name};
1390     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1391     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1392     if ($self->{ct}->{attributes}) {
1393     !!!cp (113);
1394     !!!parse-error (type => 'end tag attribute');
1395     } else {
1396     ## NOTE: This state should never be reached.
1397     !!!cp (114);
1398     }
1399     } else {
1400     die "$0: $self->{ct}->{type}: Unknown token type";
1401     }
1402     $self->{state} = DATA_STATE;
1403 wakaba 1.5 $self->{s_kwd} = '';
1404 wakaba 1.1 ## reconsume
1405    
1406     !!!emit ($self->{ct}); # start tag or end tag
1407    
1408     redo A;
1409     } else {
1410     if ({
1411     0x0022 => 1, # "
1412     0x0027 => 1, # '
1413     0x003D => 1, # =
1414     }->{$self->{nc}}) {
1415     !!!cp (115);
1416 wakaba 1.11 ## XML5: Not a parse error.
1417 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1418     } else {
1419     !!!cp (116);
1420     }
1421     $self->{ca}->{value} .= chr ($self->{nc});
1422     $self->{read_until}->($self->{ca}->{value},
1423     q["'=& >],
1424     length $self->{ca}->{value});
1425    
1426     ## Stay in the state
1427     !!!next-input-character;
1428     redo A;
1429     }
1430     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1431     if ($is_space->{$self->{nc}}) {
1432     !!!cp (118);
1433     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1434     !!!next-input-character;
1435     redo A;
1436     } elsif ($self->{nc} == 0x003E) { # >
1437     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1438     !!!cp (119);
1439     $self->{last_stag_name} = $self->{ct}->{tag_name};
1440     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1441     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1442     if ($self->{ct}->{attributes}) {
1443     !!!cp (120);
1444     !!!parse-error (type => 'end tag attribute');
1445     } else {
1446     ## NOTE: This state should never be reached.
1447     !!!cp (121);
1448     }
1449     } else {
1450     die "$0: $self->{ct}->{type}: Unknown token type";
1451     }
1452     $self->{state} = DATA_STATE;
1453 wakaba 1.5 $self->{s_kwd} = '';
1454 wakaba 1.1 !!!next-input-character;
1455    
1456     !!!emit ($self->{ct}); # start tag or end tag
1457    
1458     redo A;
1459     } elsif ($self->{nc} == 0x002F) { # /
1460     !!!cp (122);
1461     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1462     !!!next-input-character;
1463     redo A;
1464     } elsif ($self->{nc} == -1) {
1465     !!!parse-error (type => 'unclosed tag');
1466     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1467     !!!cp (122.3);
1468     $self->{last_stag_name} = $self->{ct}->{tag_name};
1469     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1470     if ($self->{ct}->{attributes}) {
1471     !!!cp (122.1);
1472     !!!parse-error (type => 'end tag attribute');
1473     } else {
1474     ## NOTE: This state should never be reached.
1475     !!!cp (122.2);
1476     }
1477     } else {
1478     die "$0: $self->{ct}->{type}: Unknown token type";
1479     }
1480     $self->{state} = DATA_STATE;
1481 wakaba 1.5 $self->{s_kwd} = '';
1482 wakaba 1.1 ## Reconsume.
1483     !!!emit ($self->{ct}); # start tag or end tag
1484     redo A;
1485     } else {
1486     !!!cp ('124.1');
1487     !!!parse-error (type => 'no space between attributes');
1488     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1489     ## reconsume
1490     redo A;
1491     }
1492     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1493 wakaba 1.11 ## XML5: "Empty tag state".
1494    
1495 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1496     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1497     !!!cp ('124.2');
1498     !!!parse-error (type => 'nestc', token => $self->{ct});
1499     ## TODO: Different type than slash in start tag
1500     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1501     if ($self->{ct}->{attributes}) {
1502     !!!cp ('124.4');
1503     !!!parse-error (type => 'end tag attribute');
1504     } else {
1505     !!!cp ('124.5');
1506     }
1507     ## TODO: Test |<title></title/>|
1508     } else {
1509     !!!cp ('124.3');
1510     $self->{self_closing} = 1;
1511     }
1512    
1513     $self->{state} = DATA_STATE;
1514 wakaba 1.5 $self->{s_kwd} = '';
1515 wakaba 1.1 !!!next-input-character;
1516    
1517     !!!emit ($self->{ct}); # start tag or end tag
1518    
1519     redo A;
1520     } elsif ($self->{nc} == -1) {
1521     !!!parse-error (type => 'unclosed tag');
1522     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1523     !!!cp (124.7);
1524     $self->{last_stag_name} = $self->{ct}->{tag_name};
1525     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1526     if ($self->{ct}->{attributes}) {
1527     !!!cp (124.5);
1528     !!!parse-error (type => 'end tag attribute');
1529     } else {
1530     ## NOTE: This state should never be reached.
1531     !!!cp (124.6);
1532     }
1533     } else {
1534     die "$0: $self->{ct}->{type}: Unknown token type";
1535     }
1536 wakaba 1.11 ## XML5: "Tag attribute name before state".
1537 wakaba 1.1 $self->{state} = DATA_STATE;
1538 wakaba 1.5 $self->{s_kwd} = '';
1539 wakaba 1.1 ## Reconsume.
1540     !!!emit ($self->{ct}); # start tag or end tag
1541     redo A;
1542     } else {
1543     !!!cp ('124.4');
1544     !!!parse-error (type => 'nestc');
1545     ## TODO: This error type is wrong.
1546     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1547     ## Reconsume.
1548     redo A;
1549     }
1550     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1551     ## (only happens in the PCDATA state)
1552    
1553     ## NOTE: Unlike the spec's "bogus comment state", this implementation
1554     ## consumes characters on a one-by-one basis.
1555    
1556     if ($self->{nc} == 0x003E) { # >
1557     !!!cp (124);
1558     $self->{state} = DATA_STATE;
1559 wakaba 1.5 $self->{s_kwd} = '';
1560 wakaba 1.1 !!!next-input-character;
1561    
1562     !!!emit ($self->{ct}); # comment
1563     redo A;
1564     } elsif ($self->{nc} == -1) {
1565     !!!cp (125);
1566     $self->{state} = DATA_STATE;
1567 wakaba 1.5 $self->{s_kwd} = '';
1568 wakaba 1.1 ## reconsume
1569    
1570     !!!emit ($self->{ct}); # comment
1571     redo A;
1572     } else {
1573     !!!cp (126);
1574     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1575     $self->{read_until}->($self->{ct}->{data},
1576     q[>],
1577     length $self->{ct}->{data});
1578    
1579     ## Stay in the state.
1580     !!!next-input-character;
1581     redo A;
1582     }
1583     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1584     ## (only happens in the PCDATA state)
1585    
1586     if ($self->{nc} == 0x002D) { # -
1587     !!!cp (133);
1588     $self->{state} = MD_HYPHEN_STATE;
1589     !!!next-input-character;
1590     redo A;
1591     } elsif ($self->{nc} == 0x0044 or # D
1592     $self->{nc} == 0x0064) { # d
1593     ## ASCII case-insensitive.
1594     !!!cp (130);
1595     $self->{state} = MD_DOCTYPE_STATE;
1596     $self->{s_kwd} = chr $self->{nc};
1597     !!!next-input-character;
1598     redo A;
1599 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1600     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1601     $self->{is_xml}) and
1602 wakaba 1.1 $self->{nc} == 0x005B) { # [
1603     !!!cp (135.4);
1604     $self->{state} = MD_CDATA_STATE;
1605     $self->{s_kwd} = '[';
1606     !!!next-input-character;
1607     redo A;
1608     } else {
1609     !!!cp (136);
1610     }
1611    
1612     !!!parse-error (type => 'bogus comment',
1613     line => $self->{line_prev},
1614     column => $self->{column_prev} - 1);
1615     ## Reconsume.
1616     $self->{state} = BOGUS_COMMENT_STATE;
1617     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1618     line => $self->{line_prev},
1619     column => $self->{column_prev} - 1,
1620     };
1621     redo A;
1622     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1623     if ($self->{nc} == 0x002D) { # -
1624     !!!cp (127);
1625     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1626     line => $self->{line_prev},
1627     column => $self->{column_prev} - 2,
1628     };
1629 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1630 wakaba 1.1 !!!next-input-character;
1631     redo A;
1632     } else {
1633     !!!cp (128);
1634     !!!parse-error (type => 'bogus comment',
1635     line => $self->{line_prev},
1636     column => $self->{column_prev} - 2);
1637     $self->{state} = BOGUS_COMMENT_STATE;
1638     ## Reconsume.
1639     $self->{ct} = {type => COMMENT_TOKEN,
1640     data => '-',
1641     line => $self->{line_prev},
1642     column => $self->{column_prev} - 2,
1643     };
1644     redo A;
1645     }
1646     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1647     ## ASCII case-insensitive.
1648     if ($self->{nc} == [
1649     undef,
1650     0x004F, # O
1651     0x0043, # C
1652     0x0054, # T
1653     0x0059, # Y
1654     0x0050, # P
1655     ]->[length $self->{s_kwd}] or
1656     $self->{nc} == [
1657     undef,
1658     0x006F, # o
1659     0x0063, # c
1660     0x0074, # t
1661     0x0079, # y
1662     0x0070, # p
1663     ]->[length $self->{s_kwd}]) {
1664     !!!cp (131);
1665     ## Stay in the state.
1666     $self->{s_kwd} .= chr $self->{nc};
1667     !!!next-input-character;
1668     redo A;
1669     } elsif ((length $self->{s_kwd}) == 6 and
1670     ($self->{nc} == 0x0045 or # E
1671     $self->{nc} == 0x0065)) { # e
1672 wakaba 1.10 if ($self->{s_kwd} ne 'DOCTYP') {
1673     !!!cp (129);
1674     ## XML5: case-sensitive.
1675     !!!parse-error (type => 'lowercase keyword', ## TODO
1676     text => 'DOCTYPE',
1677     line => $self->{line_prev},
1678     column => $self->{column_prev} - 5);
1679     } else {
1680     !!!cp (129.1);
1681     }
1682 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1683     $self->{ct} = {type => DOCTYPE_TOKEN,
1684     quirks => 1,
1685     line => $self->{line_prev},
1686     column => $self->{column_prev} - 7,
1687     };
1688     !!!next-input-character;
1689     redo A;
1690     } else {
1691     !!!cp (132);
1692     !!!parse-error (type => 'bogus comment',
1693     line => $self->{line_prev},
1694     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1695     $self->{state} = BOGUS_COMMENT_STATE;
1696     ## Reconsume.
1697     $self->{ct} = {type => COMMENT_TOKEN,
1698     data => $self->{s_kwd},
1699     line => $self->{line_prev},
1700     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1701     };
1702     redo A;
1703     }
1704     } elsif ($self->{state} == MD_CDATA_STATE) {
1705     if ($self->{nc} == {
1706     '[' => 0x0043, # C
1707     '[C' => 0x0044, # D
1708     '[CD' => 0x0041, # A
1709     '[CDA' => 0x0054, # T
1710     '[CDAT' => 0x0041, # A
1711     }->{$self->{s_kwd}}) {
1712     !!!cp (135.1);
1713     ## Stay in the state.
1714     $self->{s_kwd} .= chr $self->{nc};
1715     !!!next-input-character;
1716     redo A;
1717     } elsif ($self->{s_kwd} eq '[CDATA' and
1718     $self->{nc} == 0x005B) { # [
1719 wakaba 1.6 if ($self->{is_xml} and
1720     not $self->{tainted} and
1721     @{$self->{open_elements} or []} == 0) {
1722 wakaba 1.8 !!!cp (135.2);
1723 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1724     line => $self->{line_prev},
1725     column => $self->{column_prev} - 7);
1726     $self->{tainted} = 1;
1727 wakaba 1.8 } else {
1728     !!!cp (135.21);
1729 wakaba 1.6 }
1730    
1731 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1732     data => '',
1733     line => $self->{line_prev},
1734     column => $self->{column_prev} - 7};
1735     $self->{state} = CDATA_SECTION_STATE;
1736     !!!next-input-character;
1737     redo A;
1738     } else {
1739     !!!cp (135.3);
1740     !!!parse-error (type => 'bogus comment',
1741     line => $self->{line_prev},
1742     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1743     $self->{state} = BOGUS_COMMENT_STATE;
1744     ## Reconsume.
1745     $self->{ct} = {type => COMMENT_TOKEN,
1746     data => $self->{s_kwd},
1747     line => $self->{line_prev},
1748     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1749     };
1750     redo A;
1751     }
1752     } elsif ($self->{state} == COMMENT_START_STATE) {
1753     if ($self->{nc} == 0x002D) { # -
1754     !!!cp (137);
1755     $self->{state} = COMMENT_START_DASH_STATE;
1756     !!!next-input-character;
1757     redo A;
1758     } elsif ($self->{nc} == 0x003E) { # >
1759     !!!cp (138);
1760     !!!parse-error (type => 'bogus comment');
1761     $self->{state} = DATA_STATE;
1762 wakaba 1.5 $self->{s_kwd} = '';
1763 wakaba 1.1 !!!next-input-character;
1764    
1765     !!!emit ($self->{ct}); # comment
1766    
1767     redo A;
1768     } elsif ($self->{nc} == -1) {
1769     !!!cp (139);
1770     !!!parse-error (type => 'unclosed comment');
1771     $self->{state} = DATA_STATE;
1772 wakaba 1.5 $self->{s_kwd} = '';
1773 wakaba 1.1 ## reconsume
1774    
1775     !!!emit ($self->{ct}); # comment
1776    
1777     redo A;
1778     } else {
1779     !!!cp (140);
1780     $self->{ct}->{data} # comment
1781     .= chr ($self->{nc});
1782     $self->{state} = COMMENT_STATE;
1783     !!!next-input-character;
1784     redo A;
1785     }
1786     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1787     if ($self->{nc} == 0x002D) { # -
1788     !!!cp (141);
1789     $self->{state} = COMMENT_END_STATE;
1790     !!!next-input-character;
1791     redo A;
1792     } elsif ($self->{nc} == 0x003E) { # >
1793     !!!cp (142);
1794     !!!parse-error (type => 'bogus comment');
1795     $self->{state} = DATA_STATE;
1796 wakaba 1.5 $self->{s_kwd} = '';
1797 wakaba 1.1 !!!next-input-character;
1798    
1799     !!!emit ($self->{ct}); # comment
1800    
1801     redo A;
1802     } elsif ($self->{nc} == -1) {
1803     !!!cp (143);
1804     !!!parse-error (type => 'unclosed comment');
1805     $self->{state} = DATA_STATE;
1806 wakaba 1.5 $self->{s_kwd} = '';
1807 wakaba 1.1 ## reconsume
1808    
1809     !!!emit ($self->{ct}); # comment
1810    
1811     redo A;
1812     } else {
1813     !!!cp (144);
1814     $self->{ct}->{data} # comment
1815     .= '-' . chr ($self->{nc});
1816     $self->{state} = COMMENT_STATE;
1817     !!!next-input-character;
1818     redo A;
1819     }
1820     } elsif ($self->{state} == COMMENT_STATE) {
1821     if ($self->{nc} == 0x002D) { # -
1822     !!!cp (145);
1823     $self->{state} = COMMENT_END_DASH_STATE;
1824     !!!next-input-character;
1825     redo A;
1826     } elsif ($self->{nc} == -1) {
1827     !!!cp (146);
1828     !!!parse-error (type => 'unclosed comment');
1829     $self->{state} = DATA_STATE;
1830 wakaba 1.5 $self->{s_kwd} = '';
1831 wakaba 1.1 ## reconsume
1832    
1833     !!!emit ($self->{ct}); # comment
1834    
1835     redo A;
1836     } else {
1837     !!!cp (147);
1838     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1839     $self->{read_until}->($self->{ct}->{data},
1840     q[-],
1841     length $self->{ct}->{data});
1842    
1843     ## Stay in the state
1844     !!!next-input-character;
1845     redo A;
1846     }
1847     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1848 wakaba 1.10 ## XML5: "comment dash state".
         ## Entered from COMMENT_STATE after a single "-" inside a comment.
         ## A second "-" moves on to COMMENT_END_STATE; any other character
         ## puts the pending "-" plus that character into the comment data
         ## and returns to COMMENT_STATE.
1849    
1850 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
1851     !!!cp (148);
1852     $self->{state} = COMMENT_END_STATE;
1853     !!!next-input-character;
1854     redo A;
1855     } elsif ($self->{nc} == -1) {
1856     !!!cp (149);
1857     !!!parse-error (type => 'unclosed comment');
         ## |s_kwd| is reset once, right after switching to the data state,
         ## in the same order as the other branches that return to it.
1859 wakaba 1.1 $self->{state} = DATA_STATE;
1860 wakaba 1.5 $self->{s_kwd} = '';
1861 wakaba 1.1 ## reconsume
1862    
1863     !!!emit ($self->{ct}); # comment
1864    
1865     redo A;
1866     } else {
1867     !!!cp (150);
1868     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1869     $self->{state} = COMMENT_STATE;
1870     !!!next-input-character;
1871     redo A;
1872     }
1873     } elsif ($self->{state} == COMMENT_END_STATE) {
1874     if ($self->{nc} == 0x003E) { # >
1875     !!!cp (151);
1876     $self->{state} = DATA_STATE;
1877 wakaba 1.5 $self->{s_kwd} = '';
1878 wakaba 1.1 !!!next-input-character;
1879    
1880     !!!emit ($self->{ct}); # comment
1881    
1882     redo A;
1883     } elsif ($self->{nc} == 0x002D) { # -
1884     !!!cp (152);
1885 wakaba 1.10 ## XML5: Not a parse error.
1886 wakaba 1.1 !!!parse-error (type => 'dash in comment',
1887     line => $self->{line_prev},
1888     column => $self->{column_prev});
1889     $self->{ct}->{data} .= '-'; # comment
1890     ## Stay in the state
1891     !!!next-input-character;
1892     redo A;
1893     } elsif ($self->{nc} == -1) {
1894     !!!cp (153);
1895     !!!parse-error (type => 'unclosed comment');
1896     $self->{state} = DATA_STATE;
1897 wakaba 1.5 $self->{s_kwd} = '';
1898 wakaba 1.1 ## reconsume
1899    
1900     !!!emit ($self->{ct}); # comment
1901    
1902     redo A;
1903     } else {
1904     !!!cp (154);
1905 wakaba 1.10 ## XML5: Not a parse error.
1906 wakaba 1.1 !!!parse-error (type => 'dash in comment',
1907     line => $self->{line_prev},
1908     column => $self->{column_prev});
1909     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1910     $self->{state} = COMMENT_STATE;
1911     !!!next-input-character;
1912     redo A;
1913     }
1914     } elsif ($self->{state} == DOCTYPE_STATE) {
1915     if ($is_space->{$self->{nc}}) {
1916     !!!cp (155);
1917     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1918     !!!next-input-character;
1919     redo A;
1920     } else {
1921     !!!cp (156);
1922     !!!parse-error (type => 'no space before DOCTYPE name');
1923     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1924     ## reconsume
1925     redo A;
1926     }
1927     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1928     if ($is_space->{$self->{nc}}) {
1929     !!!cp (157);
1930     ## Stay in the state
1931     !!!next-input-character;
1932     redo A;
1933     } elsif ($self->{nc} == 0x003E) { # >
1934     !!!cp (158);
1935     !!!parse-error (type => 'no DOCTYPE name');
1936     $self->{state} = DATA_STATE;
1937 wakaba 1.5 $self->{s_kwd} = '';
1938 wakaba 1.1 !!!next-input-character;
1939    
1940     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1941    
1942     redo A;
1943     } elsif ($self->{nc} == -1) {
1944     !!!cp (159);
1945     !!!parse-error (type => 'no DOCTYPE name');
1946     $self->{state} = DATA_STATE;
1947 wakaba 1.5 $self->{s_kwd} = '';
1948 wakaba 1.1 ## reconsume
1949    
1950     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1951    
1952     redo A;
1953     } else {
1954     !!!cp (160);
1955     $self->{ct}->{name} = chr $self->{nc};
1956     delete $self->{ct}->{quirks};
1957     $self->{state} = DOCTYPE_NAME_STATE;
1958     !!!next-input-character;
1959     redo A;
1960     }
1961     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1962     ## ISSUE: Redundant "First," in the spec.
1963     if ($is_space->{$self->{nc}}) {
1964     !!!cp (161);
1965     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1966     !!!next-input-character;
1967     redo A;
1968     } elsif ($self->{nc} == 0x003E) { # >
1969     !!!cp (162);
1970     $self->{state} = DATA_STATE;
1971 wakaba 1.5 $self->{s_kwd} = '';
1972 wakaba 1.1 !!!next-input-character;
1973    
1974     !!!emit ($self->{ct}); # DOCTYPE
1975    
1976     redo A;
1977     } elsif ($self->{nc} == -1) {
1978     !!!cp (163);
1979     !!!parse-error (type => 'unclosed DOCTYPE');
1980     $self->{state} = DATA_STATE;
1981 wakaba 1.5 $self->{s_kwd} = '';
1982 wakaba 1.1 ## reconsume
1983    
1984     $self->{ct}->{quirks} = 1;
1985     !!!emit ($self->{ct}); # DOCTYPE
1986    
1987     redo A;
1988     } else {
1989     !!!cp (164);
1990     $self->{ct}->{name}
1991     .= chr ($self->{nc}); # DOCTYPE
1992     ## Stay in the state
1993     !!!next-input-character;
1994     redo A;
1995     }
1996     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1997     if ($is_space->{$self->{nc}}) {
1998     !!!cp (165);
1999     ## Stay in the state
2000     !!!next-input-character;
2001     redo A;
2002     } elsif ($self->{nc} == 0x003E) { # >
2003     !!!cp (166);
2004     $self->{state} = DATA_STATE;
2005 wakaba 1.5 $self->{s_kwd} = '';
2006 wakaba 1.1 !!!next-input-character;
2007    
2008     !!!emit ($self->{ct}); # DOCTYPE
2009    
2010     redo A;
2011     } elsif ($self->{nc} == -1) {
2012     !!!cp (167);
2013     !!!parse-error (type => 'unclosed DOCTYPE');
2014     $self->{state} = DATA_STATE;
2015 wakaba 1.5 $self->{s_kwd} = '';
2016 wakaba 1.1 ## reconsume
2017    
2018     $self->{ct}->{quirks} = 1;
2019     !!!emit ($self->{ct}); # DOCTYPE
2020    
2021     redo A;
2022     } elsif ($self->{nc} == 0x0050 or # P
2023     $self->{nc} == 0x0070) { # p
2024     $self->{state} = PUBLIC_STATE;
2025     $self->{s_kwd} = chr $self->{nc};
2026     !!!next-input-character;
2027     redo A;
2028     } elsif ($self->{nc} == 0x0053 or # S
2029     $self->{nc} == 0x0073) { # s
2030     $self->{state} = SYSTEM_STATE;
2031     $self->{s_kwd} = chr $self->{nc};
2032     !!!next-input-character;
2033     redo A;
2034     } else {
2035     !!!cp (180);
2036     !!!parse-error (type => 'string after DOCTYPE name');
2037     $self->{ct}->{quirks} = 1;
2038    
2039     $self->{state} = BOGUS_DOCTYPE_STATE;
2040     !!!next-input-character;
2041     redo A;
2042     }
2043     } elsif ($self->{state} == PUBLIC_STATE) {
2044     ## ASCII case-insensitive
2045     if ($self->{nc} == [
2046     undef,
2047     0x0055, # U
2048     0x0042, # B
2049     0x004C, # L
2050     0x0049, # I
2051     ]->[length $self->{s_kwd}] or
2052     $self->{nc} == [
2053     undef,
2054     0x0075, # u
2055     0x0062, # b
2056     0x006C, # l
2057     0x0069, # i
2058     ]->[length $self->{s_kwd}]) {
2059     !!!cp (175);
2060     ## Stay in the state.
2061     $self->{s_kwd} .= chr $self->{nc};
2062     !!!next-input-character;
2063     redo A;
2064     } elsif ((length $self->{s_kwd}) == 5 and
2065     ($self->{nc} == 0x0043 or # C
2066     $self->{nc} == 0x0063)) { # c
2067     !!!cp (168);
2068     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2069     !!!next-input-character;
2070     redo A;
2071     } else {
2072     !!!cp (169);
2073     !!!parse-error (type => 'string after DOCTYPE name',
2074     line => $self->{line_prev},
2075     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2076     $self->{ct}->{quirks} = 1;
2077    
2078     $self->{state} = BOGUS_DOCTYPE_STATE;
2079     ## Reconsume.
2080     redo A;
2081     }
2082     } elsif ($self->{state} == SYSTEM_STATE) {
2083     ## ASCII case-insensitive
2084     if ($self->{nc} == [
2085     undef,
2086     0x0059, # Y
2087     0x0053, # S
2088     0x0054, # T
2089     0x0045, # E
2090     ]->[length $self->{s_kwd}] or
2091     $self->{nc} == [
2092     undef,
2093     0x0079, # y
2094     0x0073, # s
2095     0x0074, # t
2096     0x0065, # e
2097     ]->[length $self->{s_kwd}]) {
2098     !!!cp (170);
2099     ## Stay in the state.
2100     $self->{s_kwd} .= chr $self->{nc};
2101     !!!next-input-character;
2102     redo A;
2103     } elsif ((length $self->{s_kwd}) == 5 and
2104     ($self->{nc} == 0x004D or # M
2105     $self->{nc} == 0x006D)) { # m
2106     !!!cp (171);
2107     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2108     !!!next-input-character;
2109     redo A;
2110     } else {
2111     !!!cp (172);
2112     !!!parse-error (type => 'string after DOCTYPE name',
2113     line => $self->{line_prev},
2114     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2115     $self->{ct}->{quirks} = 1;
2116    
2117     $self->{state} = BOGUS_DOCTYPE_STATE;
2118     ## Reconsume.
2119     redo A;
2120     }
2121     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
         ## Entered from PUBLIC_STATE once the whole "PUBLIC" keyword has been
         ## matched.  Skips white space, then expects the opening quote of the
         ## public identifier.  ">" and the -1 input value emit the DOCTYPE
         ## token with its |quirks| flag set; anything else switches to
         ## BOGUS_DOCTYPE_STATE.
         ##
         ## |nc| holds a code point number, so it is compared with |==| here,
         ## as the sibling branches do, rather than with the string
         ## operator |eq|.
2122     if ($is_space->{$self->{nc}}) {
2123     !!!cp (181);
2124     ## Stay in the state
2125     !!!next-input-character;
2126     redo A;
2127     } elsif ($self->{nc} == 0x0022) { # "
2128     !!!cp (182);
2129     $self->{ct}->{pubid} = ''; # DOCTYPE
2130     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2131     !!!next-input-character;
2132     redo A;
2133     } elsif ($self->{nc} == 0x0027) { # '
2134     !!!cp (183);
2135     $self->{ct}->{pubid} = ''; # DOCTYPE
2136     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2137     !!!next-input-character;
2138     redo A;
2139     } elsif ($self->{nc} == 0x003E) { # >
2140     !!!cp (184);
2141     !!!parse-error (type => 'no PUBLIC literal');
2142    
2143     $self->{state} = DATA_STATE;
2144 wakaba 1.5 $self->{s_kwd} = '';
2145 wakaba 1.1 !!!next-input-character;
2146    
2147     $self->{ct}->{quirks} = 1;
2148     !!!emit ($self->{ct}); # DOCTYPE
2149    
2150     redo A;
2151     } elsif ($self->{nc} == -1) {
2152     !!!cp (185);
2153     !!!parse-error (type => 'unclosed DOCTYPE');
2154    
2155     $self->{state} = DATA_STATE;
2156 wakaba 1.5 $self->{s_kwd} = '';
2157 wakaba 1.1 ## reconsume
2158    
2159     $self->{ct}->{quirks} = 1;
2160     !!!emit ($self->{ct}); # DOCTYPE
2161    
2162     redo A;
2163     } else {
2164     !!!cp (186);
2165     !!!parse-error (type => 'string after PUBLIC');
2166     $self->{ct}->{quirks} = 1;
2167    
2168     $self->{state} = BOGUS_DOCTYPE_STATE;
2169     !!!next-input-character;
2170     redo A;
2171     }
2172     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2173     if ($self->{nc} == 0x0022) { # "
2174     !!!cp (187);
2175     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2176     !!!next-input-character;
2177     redo A;
2178     } elsif ($self->{nc} == 0x003E) { # >
2179     !!!cp (188);
2180     !!!parse-error (type => 'unclosed PUBLIC literal');
2181    
2182     $self->{state} = DATA_STATE;
2183 wakaba 1.5 $self->{s_kwd} = '';
2184 wakaba 1.1 !!!next-input-character;
2185    
2186     $self->{ct}->{quirks} = 1;
2187     !!!emit ($self->{ct}); # DOCTYPE
2188    
2189     redo A;
2190     } elsif ($self->{nc} == -1) {
2191     !!!cp (189);
2192     !!!parse-error (type => 'unclosed PUBLIC literal');
2193    
2194     $self->{state} = DATA_STATE;
2195 wakaba 1.5 $self->{s_kwd} = '';
2196 wakaba 1.1 ## reconsume
2197    
2198     $self->{ct}->{quirks} = 1;
2199     !!!emit ($self->{ct}); # DOCTYPE
2200    
2201     redo A;
2202     } else {
2203     !!!cp (190);
2204     $self->{ct}->{pubid} # DOCTYPE
2205     .= chr $self->{nc};
2206     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2207     length $self->{ct}->{pubid});
2208    
2209     ## Stay in the state
2210     !!!next-input-character;
2211     redo A;
2212     }
2213     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2214     if ($self->{nc} == 0x0027) { # '
2215     !!!cp (191);
2216     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2217     !!!next-input-character;
2218     redo A;
2219     } elsif ($self->{nc} == 0x003E) { # >
2220     !!!cp (192);
2221     !!!parse-error (type => 'unclosed PUBLIC literal');
2222    
2223     $self->{state} = DATA_STATE;
2224 wakaba 1.5 $self->{s_kwd} = '';
2225 wakaba 1.1 !!!next-input-character;
2226    
2227     $self->{ct}->{quirks} = 1;
2228     !!!emit ($self->{ct}); # DOCTYPE
2229    
2230     redo A;
2231     } elsif ($self->{nc} == -1) {
2232     !!!cp (193);
2233     !!!parse-error (type => 'unclosed PUBLIC literal');
2234    
2235     $self->{state} = DATA_STATE;
2236 wakaba 1.5 $self->{s_kwd} = '';
2237 wakaba 1.1 ## reconsume
2238    
2239     $self->{ct}->{quirks} = 1;
2240     !!!emit ($self->{ct}); # DOCTYPE
2241    
2242     redo A;
2243     } else {
2244     !!!cp (194);
2245     $self->{ct}->{pubid} # DOCTYPE
2246     .= chr $self->{nc};
2247     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2248     length $self->{ct}->{pubid});
2249    
2250     ## Stay in the state
2251     !!!next-input-character;
2252     redo A;
2253     }
2254     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2255     if ($is_space->{$self->{nc}}) {
2256     !!!cp (195);
2257     ## Stay in the state
2258     !!!next-input-character;
2259     redo A;
2260     } elsif ($self->{nc} == 0x0022) { # "
2261     !!!cp (196);
2262     $self->{ct}->{sysid} = ''; # DOCTYPE
2263     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2264     !!!next-input-character;
2265     redo A;
2266     } elsif ($self->{nc} == 0x0027) { # '
2267     !!!cp (197);
2268     $self->{ct}->{sysid} = ''; # DOCTYPE
2269     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2270     !!!next-input-character;
2271     redo A;
2272     } elsif ($self->{nc} == 0x003E) { # >
2273     !!!cp (198);
2274     $self->{state} = DATA_STATE;
2275 wakaba 1.5 $self->{s_kwd} = '';
2276 wakaba 1.1 !!!next-input-character;
2277    
2278     !!!emit ($self->{ct}); # DOCTYPE
2279    
2280     redo A;
2281     } elsif ($self->{nc} == -1) {
2282     !!!cp (199);
2283     !!!parse-error (type => 'unclosed DOCTYPE');
2284    
2285     $self->{state} = DATA_STATE;
2286 wakaba 1.5 $self->{s_kwd} = '';
2287 wakaba 1.1 ## reconsume
2288    
2289     $self->{ct}->{quirks} = 1;
2290     !!!emit ($self->{ct}); # DOCTYPE
2291    
2292     redo A;
2293     } else {
2294     !!!cp (200);
2295     !!!parse-error (type => 'string after PUBLIC literal');
2296     $self->{ct}->{quirks} = 1;
2297    
2298     $self->{state} = BOGUS_DOCTYPE_STATE;
2299     !!!next-input-character;
2300     redo A;
2301     }
2302     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2303     if ($is_space->{$self->{nc}}) {
2304     !!!cp (201);
2305     ## Stay in the state
2306     !!!next-input-character;
2307     redo A;
2308     } elsif ($self->{nc} == 0x0022) { # "
2309     !!!cp (202);
2310     $self->{ct}->{sysid} = ''; # DOCTYPE
2311     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2312     !!!next-input-character;
2313     redo A;
2314     } elsif ($self->{nc} == 0x0027) { # '
2315     !!!cp (203);
2316     $self->{ct}->{sysid} = ''; # DOCTYPE
2317     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2318     !!!next-input-character;
2319     redo A;
2320     } elsif ($self->{nc} == 0x003E) { # >
2321     !!!cp (204);
2322     !!!parse-error (type => 'no SYSTEM literal');
2323     $self->{state} = DATA_STATE;
2324 wakaba 1.5 $self->{s_kwd} = '';
2325 wakaba 1.1 !!!next-input-character;
2326    
2327     $self->{ct}->{quirks} = 1;
2328     !!!emit ($self->{ct}); # DOCTYPE
2329    
2330     redo A;
2331     } elsif ($self->{nc} == -1) {
2332     !!!cp (205);
2333     !!!parse-error (type => 'unclosed DOCTYPE');
2334    
2335     $self->{state} = DATA_STATE;
2336 wakaba 1.5 $self->{s_kwd} = '';
2337 wakaba 1.1 ## reconsume
2338    
2339     $self->{ct}->{quirks} = 1;
2340     !!!emit ($self->{ct}); # DOCTYPE
2341    
2342     redo A;
2343     } else {
2344     !!!cp (206);
2345     !!!parse-error (type => 'string after SYSTEM');
2346     $self->{ct}->{quirks} = 1;
2347    
2348     $self->{state} = BOGUS_DOCTYPE_STATE;
2349     !!!next-input-character;
2350     redo A;
2351     }
2352     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2353     if ($self->{nc} == 0x0022) { # "
2354     !!!cp (207);
2355     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2356     !!!next-input-character;
2357     redo A;
2358     } elsif ($self->{nc} == 0x003E) { # >
2359     !!!cp (208);
2360     !!!parse-error (type => 'unclosed SYSTEM literal');
2361    
2362     $self->{state} = DATA_STATE;
2363 wakaba 1.5 $self->{s_kwd} = '';
2364 wakaba 1.1 !!!next-input-character;
2365    
2366     $self->{ct}->{quirks} = 1;
2367     !!!emit ($self->{ct}); # DOCTYPE
2368    
2369     redo A;
2370     } elsif ($self->{nc} == -1) {
2371     !!!cp (209);
2372     !!!parse-error (type => 'unclosed SYSTEM literal');
2373    
2374     $self->{state} = DATA_STATE;
2375 wakaba 1.5 $self->{s_kwd} = '';
2376 wakaba 1.1 ## reconsume
2377    
2378     $self->{ct}->{quirks} = 1;
2379     !!!emit ($self->{ct}); # DOCTYPE
2380    
2381     redo A;
2382     } else {
2383     !!!cp (210);
2384     $self->{ct}->{sysid} # DOCTYPE
2385     .= chr $self->{nc};
2386     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2387     length $self->{ct}->{sysid});
2388    
2389     ## Stay in the state
2390     !!!next-input-character;
2391     redo A;
2392     }
2393     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2394     if ($self->{nc} == 0x0027) { # '
2395     !!!cp (211);
2396     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2397     !!!next-input-character;
2398     redo A;
2399     } elsif ($self->{nc} == 0x003E) { # >
2400     !!!cp (212);
2401     !!!parse-error (type => 'unclosed SYSTEM literal');
2402    
2403     $self->{state} = DATA_STATE;
2404 wakaba 1.5 $self->{s_kwd} = '';
2405 wakaba 1.1 !!!next-input-character;
2406    
2407     $self->{ct}->{quirks} = 1;
2408     !!!emit ($self->{ct}); # DOCTYPE
2409    
2410     redo A;
2411     } elsif ($self->{nc} == -1) {
2412     !!!cp (213);
2413     !!!parse-error (type => 'unclosed SYSTEM literal');
2414    
2415     $self->{state} = DATA_STATE;
2416 wakaba 1.5 $self->{s_kwd} = '';
2417 wakaba 1.1 ## reconsume
2418    
2419     $self->{ct}->{quirks} = 1;
2420     !!!emit ($self->{ct}); # DOCTYPE
2421    
2422     redo A;
2423     } else {
2424     !!!cp (214);
2425     $self->{ct}->{sysid} # DOCTYPE
2426     .= chr $self->{nc};
2427     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2428     length $self->{ct}->{sysid});
2429    
2430     ## Stay in the state
2431     !!!next-input-character;
2432     redo A;
2433     }
2434     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2435     if ($is_space->{$self->{nc}}) {
2436     !!!cp (215);
2437     ## Stay in the state
2438     !!!next-input-character;
2439     redo A;
2440     } elsif ($self->{nc} == 0x003E) { # >
2441     !!!cp (216);
2442     $self->{state} = DATA_STATE;
2443 wakaba 1.5 $self->{s_kwd} = '';
2444 wakaba 1.1 !!!next-input-character;
2445    
2446     !!!emit ($self->{ct}); # DOCTYPE
2447    
2448     redo A;
2449     } elsif ($self->{nc} == -1) {
2450     !!!cp (217);
2451     !!!parse-error (type => 'unclosed DOCTYPE');
2452     $self->{state} = DATA_STATE;
2453 wakaba 1.5 $self->{s_kwd} = '';
2454 wakaba 1.1 ## reconsume
2455    
2456     $self->{ct}->{quirks} = 1;
2457     !!!emit ($self->{ct}); # DOCTYPE
2458    
2459     redo A;
2460     } else {
2461     !!!cp (218);
2462     !!!parse-error (type => 'string after SYSTEM literal');
2463     #$self->{ct}->{quirks} = 1;
2464    
2465     $self->{state} = BOGUS_DOCTYPE_STATE;
2466     !!!next-input-character;
2467     redo A;
2468     }
2469     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2470     if ($self->{nc} == 0x003E) { # >
2471     !!!cp (219);
2472     $self->{state} = DATA_STATE;
2473 wakaba 1.5 $self->{s_kwd} = '';
2474 wakaba 1.1 !!!next-input-character;
2475    
2476     !!!emit ($self->{ct}); # DOCTYPE
2477    
2478     redo A;
2479     } elsif ($self->{nc} == -1) {
2480     !!!cp (220);
2481     $self->{state} = DATA_STATE;
2482 wakaba 1.5 $self->{s_kwd} = '';
2483 wakaba 1.1 ## reconsume
2484    
2485     !!!emit ($self->{ct}); # DOCTYPE
2486    
2487     redo A;
2488     } else {
2489     !!!cp (221);
2490     my $s = '';
2491     $self->{read_until}->($s, q[>], 0);
2492    
2493     ## Stay in the state
2494     !!!next-input-character;
2495     redo A;
2496     }
2497     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2498     ## NOTE: "CDATA section state" in the spec is jointly implemented
2499     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2500     ## and |CDATA_SECTION_MSE2_STATE|.
2501 wakaba 1.10
2502     ## XML5: "CDATA state".
2503 wakaba 1.1
2504     if ($self->{nc} == 0x005D) { # ]
2505     !!!cp (221.1);
2506     $self->{state} = CDATA_SECTION_MSE1_STATE;
2507     !!!next-input-character;
2508     redo A;
2509     } elsif ($self->{nc} == -1) {
2510 wakaba 1.6 if ($self->{is_xml}) {
2511 wakaba 1.8 !!!cp (221.11);
2512 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
2513 wakaba 1.8 } else {
2514     !!!cp (221.12);
2515 wakaba 1.6 }
2516    
2517 wakaba 1.1 $self->{state} = DATA_STATE;
2518 wakaba 1.5 $self->{s_kwd} = '';
2519 wakaba 1.10 ## Reconsume.
2520 wakaba 1.1 if (length $self->{ct}->{data}) { # character
2521     !!!cp (221.2);
2522     !!!emit ($self->{ct}); # character
2523     } else {
2524     !!!cp (221.3);
2525     ## No token to emit. $self->{ct} is discarded.
2526     }
2527     redo A;
2528     } else {
2529     !!!cp (221.4);
2530     $self->{ct}->{data} .= chr $self->{nc};
2531     $self->{read_until}->($self->{ct}->{data},
2532     q<]>,
2533     length $self->{ct}->{data});
2534    
2535     ## Stay in the state.
2536     !!!next-input-character;
2537     redo A;
2538     }
2539    
2540     ## ISSUE: "text tokens" in spec.
2541     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2542 wakaba 1.10 ## XML5: "CDATA bracket state".
2543    
2544 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
2545     !!!cp (221.5);
2546     $self->{state} = CDATA_SECTION_MSE2_STATE;
2547     !!!next-input-character;
2548     redo A;
2549     } else {
2550     !!!cp (221.6);
2551 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
2552 wakaba 1.1 $self->{ct}->{data} .= ']';
2553 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2554 wakaba 1.1 ## Reconsume.
2555     redo A;
2556     }
2557     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2558 wakaba 1.10 ## XML5: "CDATA end state".
         ## Entered from CDATA_SECTION_MSE1_STATE after two consecutive "]"
         ## characters inside a CDATA section.  ">" ends the section and emits
         ## the accumulated character token if it has any data; a further "]"
         ## appends one "]" to the data and stays here; anything else appends
         ## the pending "]]" and goes back to CDATA_SECTION_STATE.
2559    
2560 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2561     $self->{state} = DATA_STATE;
2562 wakaba 1.5 $self->{s_kwd} = '';
2563 wakaba 1.1 !!!next-input-character;
2564     if (length $self->{ct}->{data}) { # character
2565     !!!cp (221.7);
2566     !!!emit ($self->{ct}); # character
2567     } else {
2568     !!!cp (221.8);
2569     ## No token to emit. $self->{ct} is discarded.
2570     }
2571     redo A;
2572     } elsif ($self->{nc} == 0x005D) { # ]
2573     !!!cp (221.9); # character
2574     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2575     ## Stay in the state.
2576     !!!next-input-character;
2577     redo A;
2578     } else {
         ## NOTE(review): checkpoint id 221.11 is also used by the
         ## end-of-input/XML branch of CDATA_SECTION_STATE.  If checkpoint
         ## ids are meant to be unique, one of the two needs a new id --
         ## TODO confirm against the full id list.
2579     !!!cp (221.11);
2580     $self->{ct}->{data} .= ']]'; # character
2581     $self->{state} = CDATA_SECTION_STATE;
2582 wakaba 1.10 ## Reconsume. ## XML5: Emit.
2583 wakaba 1.1 redo A;
2584     }
2585     } elsif ($self->{state} == ENTITY_STATE) {
2586     if ($is_space->{$self->{nc}} or
2587     {
2588     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2589     $self->{entity_add} => 1,
2590     }->{$self->{nc}}) {
2591     !!!cp (1001);
2592     ## Don't consume
2593     ## No error
2594     ## Return nothing.
2595     #
2596     } elsif ($self->{nc} == 0x0023) { # #
2597     !!!cp (999);
2598     $self->{state} = ENTITY_HASH_STATE;
2599     $self->{s_kwd} = '#';
2600     !!!next-input-character;
2601     redo A;
2602     } elsif ((0x0041 <= $self->{nc} and
2603     $self->{nc} <= 0x005A) or # A..Z
2604     (0x0061 <= $self->{nc} and
2605     $self->{nc} <= 0x007A)) { # a..z
2606     !!!cp (998);
2607     require Whatpm::_NamedEntityList;
2608     $self->{state} = ENTITY_NAME_STATE;
2609     $self->{s_kwd} = chr $self->{nc};
2610     $self->{entity__value} = $self->{s_kwd};
2611     $self->{entity__match} = 0;
2612     !!!next-input-character;
2613     redo A;
2614     } else {
2615     !!!cp (1027);
2616     !!!parse-error (type => 'bare ero');
2617     ## Return nothing.
2618     #
2619     }
2620    
2621     ## NOTE: No character is consumed by the "consume a character
2622     ## reference" algorithm. In other words, there is an "&" character
2623     ## that does not introduce a character reference, which would be
2624     ## appended to the parent element or the attribute value in later
2625     ## process of the tokenizer.
2626    
2627     if ($self->{prev_state} == DATA_STATE) {
2628     !!!cp (997);
2629     $self->{state} = $self->{prev_state};
2630 wakaba 1.5 $self->{s_kwd} = '';
2631 wakaba 1.1 ## Reconsume.
2632     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2633     line => $self->{line_prev},
2634     column => $self->{column_prev},
2635     });
2636     redo A;
2637     } else {
2638     !!!cp (996);
2639     $self->{ca}->{value} .= '&';
2640     $self->{state} = $self->{prev_state};
2641 wakaba 1.5 $self->{s_kwd} = '';
2642 wakaba 1.1 ## Reconsume.
2643     redo A;
2644     }
2645     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2646     if ($self->{nc} == 0x0078 or # x
2647     $self->{nc} == 0x0058) { # X
2648     !!!cp (995);
2649     $self->{state} = HEXREF_X_STATE;
2650     $self->{s_kwd} .= chr $self->{nc};
2651     !!!next-input-character;
2652     redo A;
2653     } elsif (0x0030 <= $self->{nc} and
2654     $self->{nc} <= 0x0039) { # 0..9
2655     !!!cp (994);
2656     $self->{state} = NCR_NUM_STATE;
2657     $self->{s_kwd} = $self->{nc} - 0x0030;
2658     !!!next-input-character;
2659     redo A;
2660     } else {
2661     !!!parse-error (type => 'bare nero',
2662     line => $self->{line_prev},
2663     column => $self->{column_prev} - 1);
2664    
2665     ## NOTE: According to the spec algorithm, nothing is returned,
2666     ## and then "&#" is appended to the parent element or the attribute
2667     ## value in the later processing.
2668    
2669     if ($self->{prev_state} == DATA_STATE) {
2670     !!!cp (1019);
2671     $self->{state} = $self->{prev_state};
2672 wakaba 1.5 $self->{s_kwd} = '';
2673 wakaba 1.1 ## Reconsume.
2674     !!!emit ({type => CHARACTER_TOKEN,
2675     data => '&#',
2676     line => $self->{line_prev},
2677     column => $self->{column_prev} - 1,
2678     });
2679     redo A;
2680     } else {
2681     !!!cp (993);
2682     $self->{ca}->{value} .= '&#';
2683     $self->{state} = $self->{prev_state};
2684 wakaba 1.5 $self->{s_kwd} = '';
2685 wakaba 1.1 ## Reconsume.
2686     redo A;
2687     }
2688     }
2689     } elsif ($self->{state} == NCR_NUM_STATE) {
2690     if (0x0030 <= $self->{nc} and
2691     $self->{nc} <= 0x0039) { # 0..9
2692     !!!cp (1012);
2693     $self->{s_kwd} *= 10;
2694     $self->{s_kwd} += $self->{nc} - 0x0030;
2695    
2696     ## Stay in the state.
2697     !!!next-input-character;
2698     redo A;
2699     } elsif ($self->{nc} == 0x003B) { # ;
2700     !!!cp (1013);
2701     !!!next-input-character;
2702     #
2703     } else {
2704     !!!cp (1014);
2705     !!!parse-error (type => 'no refc');
2706     ## Reconsume.
2707     #
2708     }
2709    
2710     my $code = $self->{s_kwd};
2711     my $l = $self->{line_prev};
2712     my $c = $self->{column_prev};
2713     if ($charref_map->{$code}) {
2714     !!!cp (1015);
2715     !!!parse-error (type => 'invalid character reference',
2716     text => (sprintf 'U+%04X', $code),
2717     line => $l, column => $c);
2718     $code = $charref_map->{$code};
2719     } elsif ($code > 0x10FFFF) {
2720     !!!cp (1016);
2721     !!!parse-error (type => 'invalid character reference',
2722     text => (sprintf 'U-%08X', $code),
2723     line => $l, column => $c);
2724     $code = 0xFFFD;
2725     }
2726    
2727     if ($self->{prev_state} == DATA_STATE) {
2728     !!!cp (992);
2729     $self->{state} = $self->{prev_state};
2730 wakaba 1.5 $self->{s_kwd} = '';
2731 wakaba 1.1 ## Reconsume.
2732     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2733 wakaba 1.7 has_reference => 1,
2734 wakaba 1.1 line => $l, column => $c,
2735     });
2736     redo A;
2737     } else {
2738     !!!cp (991);
2739     $self->{ca}->{value} .= chr $code;
2740     $self->{ca}->{has_reference} = 1;
2741     $self->{state} = $self->{prev_state};
2742 wakaba 1.5 $self->{s_kwd} = '';
2743 wakaba 1.1 ## Reconsume.
2744     redo A;
2745     }
2746     } elsif ($self->{state} == HEXREF_X_STATE) {
         ## Entered from ENTITY_HASH_STATE after "&#x" or "&#X".  At this
         ## point |s_kwd| holds "#x" or "#X".  A hexadecimal digit starts the
         ## numeric accumulation in HEXREF_HEX_STATE; anything else means the
         ## "&#x" does not start a character reference and is passed through
         ## as literal text.
2747     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2748     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2749     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2750     # 0..9, A..F, a..f
2751     !!!cp (990);
2752     $self->{state} = HEXREF_HEX_STATE;
2753     $self->{s_kwd} = 0;
2754     ## Reconsume.
2755     redo A;
2756     } else {
2757     !!!parse-error (type => 'bare hcro',
2758     line => $self->{line_prev},
2759     column => $self->{column_prev} - 2);
2760    
2761     ## NOTE: According to the spec algorithm, nothing is returned,
2762     ## and then "&#" followed by "X" or "x" is appended to the parent
2763     ## element or the attribute value in the later processing.
2764    
2765     if ($self->{prev_state} == DATA_STATE) {
2766     !!!cp (1005);
2767     $self->{state} = $self->{prev_state};
         ## Keep the consumed "#x"/"#X" before |s_kwd| is cleared.  Reading
         ## |s_kwd| after the reset made the token data a bare "&" and left
         ## the column pointing past the reference instead of at its "&".
         my $kwd = $self->{s_kwd};
2768 wakaba 1.5 $self->{s_kwd} = '';
2769 wakaba 1.1 ## Reconsume.
2770     !!!emit ({type => CHARACTER_TOKEN,
2771     data => '&' . $kwd,
2772     line => $self->{line_prev},
2773     column => $self->{column_prev} - length $kwd,
2774     });
2775     redo A;
2776     } else {
2777     !!!cp (989);
         ## Here |s_kwd| is read before it is cleared, so the full "&#x"
         ## or "&#X" reaches the attribute value.
2778     $self->{ca}->{value} .= '&' . $self->{s_kwd};
2779     $self->{state} = $self->{prev_state};
2780 wakaba 1.5 $self->{s_kwd} = '';
2781 wakaba 1.1 ## Reconsume.
2782     redo A;
2783     }
2784     }
2785     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2786     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2787     # 0..9
2788     !!!cp (1002);
2789     $self->{s_kwd} *= 0x10;
2790     $self->{s_kwd} += $self->{nc} - 0x0030;
2791     ## Stay in the state.
2792     !!!next-input-character;
2793     redo A;
2794     } elsif (0x0061 <= $self->{nc} and
2795     $self->{nc} <= 0x0066) { # a..f
2796     !!!cp (1003);
2797     $self->{s_kwd} *= 0x10;
2798     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2799     ## Stay in the state.
2800     !!!next-input-character;
2801     redo A;
2802     } elsif (0x0041 <= $self->{nc} and
2803     $self->{nc} <= 0x0046) { # A..F
2804     !!!cp (1004);
2805     $self->{s_kwd} *= 0x10;
2806     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2807     ## Stay in the state.
2808     !!!next-input-character;
2809     redo A;
2810     } elsif ($self->{nc} == 0x003B) { # ;
2811     !!!cp (1006);
2812     !!!next-input-character;
2813     #
2814     } else {
2815     !!!cp (1007);
2816     !!!parse-error (type => 'no refc',
2817     line => $self->{line},
2818     column => $self->{column});
2819     ## Reconsume.
2820     #
2821     }
2822    
2823     my $code = $self->{s_kwd};
2824     my $l = $self->{line_prev};
2825     my $c = $self->{column_prev};
2826     if ($charref_map->{$code}) {
2827     !!!cp (1008);
2828     !!!parse-error (type => 'invalid character reference',
2829     text => (sprintf 'U+%04X', $code),
2830     line => $l, column => $c);
2831     $code = $charref_map->{$code};
2832     } elsif ($code > 0x10FFFF) {
2833     !!!cp (1009);
2834     !!!parse-error (type => 'invalid character reference',
2835     text => (sprintf 'U-%08X', $code),
2836     line => $l, column => $c);
2837     $code = 0xFFFD;
2838     }
2839    
2840     if ($self->{prev_state} == DATA_STATE) {
2841     !!!cp (988);
2842     $self->{state} = $self->{prev_state};
2843 wakaba 1.5 $self->{s_kwd} = '';
2844 wakaba 1.1 ## Reconsume.
2845     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2846 wakaba 1.7 has_reference => 1,
2847 wakaba 1.1 line => $l, column => $c,
2848     });
2849     redo A;
2850     } else {
2851     !!!cp (987);
2852     $self->{ca}->{value} .= chr $code;
2853     $self->{ca}->{has_reference} = 1;
2854     $self->{state} = $self->{prev_state};
2855 wakaba 1.5 $self->{s_kwd} = '';
2856 wakaba 1.1 ## Reconsume.
2857     redo A;
2858     }
2859     } elsif ($self->{state} == ENTITY_NAME_STATE) {
2860     if (length $self->{s_kwd} < 30 and
2861     ## NOTE: 30 is an arbitrary limit, chosen to be greater than the length of the longest entity name.
2862     ((0x0041 <= $self->{nc} and # A
2863     $self->{nc} <= 0x005A) or # Z
2864     (0x0061 <= $self->{nc} and # a
2865     $self->{nc} <= 0x007A) or # z
2866     (0x0030 <= $self->{nc} and # 0
2867     $self->{nc} <= 0x0039) or # 9
2868     $self->{nc} == 0x003B)) { # ;
2869     our $EntityChar;
2870     $self->{s_kwd} .= chr $self->{nc};
2871     if (defined $EntityChar->{$self->{s_kwd}}) {
2872     if ($self->{nc} == 0x003B) { # ;
2873     !!!cp (1020);
2874     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2875     $self->{entity__match} = 1;
2876     !!!next-input-character;
2877     #
2878     } else {
2879     !!!cp (1021);
2880     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2881     $self->{entity__match} = -1;
2882     ## Stay in the state.
2883     !!!next-input-character;
2884     redo A;
2885     }
2886     } else {
2887     !!!cp (1022);
2888     $self->{entity__value} .= chr $self->{nc};
2889     $self->{entity__match} *= 2;
2890     ## Stay in the state.
2891     !!!next-input-character;
2892     redo A;
2893     }
2894     }
2895    
2896     my $data;
2897     my $has_ref;
2898     if ($self->{entity__match} > 0) {
2899     !!!cp (1023);
2900     $data = $self->{entity__value};
2901     $has_ref = 1;
2902     #
2903     } elsif ($self->{entity__match} < 0) {
2904     !!!parse-error (type => 'no refc');
2905     if ($self->{prev_state} != DATA_STATE and # in attribute
2906     $self->{entity__match} < -1) {
2907     !!!cp (1024);
2908     $data = '&' . $self->{s_kwd};
2909     #
2910     } else {
2911     !!!cp (1025);
2912     $data = $self->{entity__value};
2913     $has_ref = 1;
2914     #
2915     }
2916     } else {
2917     !!!cp (1026);
2918     !!!parse-error (type => 'bare ero',
2919     line => $self->{line_prev},
2920     column => $self->{column_prev} - length $self->{s_kwd});
2921     $data = '&' . $self->{s_kwd};
2922     #
2923     }
2924    
2925     ## NOTE: In these cases, when a character reference is found,
2926     ## it is consumed and a character token is returned, or, otherwise,
2927     ## nothing is consumed and returned, according to the spec algorithm.
2928     ## In this implementation, anything that has been examined by the
2929     ## tokenizer is appended to the parent element or the attribute value
2930     ## as string, either literal string when no character reference or
2931     ## entity-replaced string otherwise, in this stage, since any characters
2932     ## that would not be consumed are appended in the data state or in an
2933     ## appropriate attribute value state anyway.
2934    
2935     if ($self->{prev_state} == DATA_STATE) {
2936     !!!cp (986);
2937     $self->{state} = $self->{prev_state};
2938 wakaba 1.5 $self->{s_kwd} = '';
2939 wakaba 1.1 ## Reconsume.
2940     !!!emit ({type => CHARACTER_TOKEN,
2941     data => $data,
2942 wakaba 1.7 has_reference => $has_ref,
2943 wakaba 1.1 line => $self->{line_prev},
2944     column => $self->{column_prev} + 1 - length $self->{s_kwd},
2945     });
2946     redo A;
2947     } else {
2948     !!!cp (985);
2949     $self->{ca}->{value} .= $data;
2950     $self->{ca}->{has_reference} = 1 if $has_ref;
2951     $self->{state} = $self->{prev_state};
2952 wakaba 1.5 $self->{s_kwd} = '';
2953 wakaba 1.1 ## Reconsume.
2954     redo A;
2955     }
2956 wakaba 1.8
2957     ## XML-only states
2958    
2959     } elsif ($self->{state} == PI_STATE) {
2960     if ($is_space->{$self->{nc}} or
2961     $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
2962     $self->{nc} == -1) {
2963     !!!parse-error (type => 'bare pio', ## TODO: type
2964     line => $self->{line_prev},
2965     column => $self->{column_prev}
2966     - 1 * ($self->{nc} != -1));
2967     $self->{state} = BOGUS_COMMENT_STATE;
2968     ## Reconsume.
2969     $self->{ct} = {type => COMMENT_TOKEN,
2970     data => '?',
2971     line => $self->{line_prev},
2972     column => $self->{column_prev}
2973     - 1 * ($self->{nc} != -1),
2974     };
2975     redo A;
2976     } else {
2977     $self->{ct} = {type => PI_TOKEN,
2978     target => chr $self->{nc},
2979     data => '',
2980     line => $self->{line_prev},
2981     column => $self->{column_prev} - 1,
2982     };
2983     $self->{state} = PI_TARGET_STATE;
2984     !!!next-input-character;
2985     redo A;
2986     }
2987     } elsif ($self->{state} == PI_TARGET_STATE) {
2988     if ($is_space->{$self->{nc}}) {
2989     $self->{state} = PI_TARGET_AFTER_STATE;
2990     !!!next-input-character;
2991     redo A;
2992     } elsif ($self->{nc} == -1) {
2993     !!!parse-error (type => 'no pic'); ## TODO: type
2994     $self->{state} = DATA_STATE;
2995     $self->{s_kwd} = '';
2996     ## Reconsume.
2997     !!!emit ($self->{ct}); # pi
2998     redo A;
2999     } elsif ($self->{nc} == 0x003F) { # ?
3000     $self->{state} = PI_AFTER_STATE;
3001     !!!next-input-character;
3002     redo A;
3003     } else {
3004     ## XML5: typo ("tag name" -> "target")
3005     $self->{ct}->{target} .= chr $self->{nc}; # pi
3006     !!!next-input-character;
3007     redo A;
3008     }
3009     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3010     if ($is_space->{$self->{nc}}) {
3011     ## Stay in the state.
3012     !!!next-input-character;
3013     redo A;
3014     } else {
3015     $self->{state} = PI_DATA_STATE;
3016     ## Reprocess.
3017     redo A;
3018     }
3019     } elsif ($self->{state} == PI_DATA_STATE) {
3020     if ($self->{nc} == 0x003F) { # ?
3021     $self->{state} = PI_DATA_AFTER_STATE;
3022     !!!next-input-character;
3023     redo A;
3024     } elsif ($self->{nc} == -1) {
3025     !!!parse-error (type => 'no pic'); ## TODO: type
3026     $self->{state} = DATA_STATE;
3027     $self->{s_kwd} = '';
3028     ## Reprocess.
3029     !!!emit ($self->{ct}); # pi
3030     redo A;
3031     } else {
3032     $self->{ct}->{data} .= chr $self->{nc}; # pi
3033     $self->{read_until}->($self->{ct}->{data}, q[?],
3034     length $self->{ct}->{data});
3035     ## Stay in the state.
3036     !!!next-input-character;
3037     ## Reprocess.
3038     redo A;
3039     }
3040     } elsif ($self->{state} == PI_AFTER_STATE) {
3041     if ($self->{nc} == 0x003E) { # >
3042     $self->{state} = DATA_STATE;
3043     $self->{s_kwd} = '';
3044     !!!next-input-character;
3045     !!!emit ($self->{ct}); # pi
3046     redo A;
3047     } elsif ($self->{nc} == 0x003F) { # ?
3048     !!!parse-error (type => 'no s after target', ## TODO: type
3049     line => $self->{line_prev},
3050     column => $self->{column_prev}); ## XML5: no error
3051     $self->{ct}->{data} .= '?';
3052     $self->{state} = PI_DATA_AFTER_STATE;
3053     !!!next-input-character;
3054     redo A;
3055     } else {
3056     !!!parse-error (type => 'no s after target', ## TODO: type
3057     line => $self->{line_prev},
3058     column => $self->{column_prev}
3059     + 1 * ($self->{nc} == -1)); ## XML5: no error
3060     $self->{ct}->{data} .= '?'; ## XML5: not appended
3061     $self->{state} = PI_DATA_STATE;
3062     ## Reprocess.
3063     redo A;
3064     }
3065     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3066     ## XML5: Same as "pi after state" in XML5
3067     if ($self->{nc} == 0x003E) { # >
3068     $self->{state} = DATA_STATE;
3069     $self->{s_kwd} = '';
3070     !!!next-input-character;
3071     !!!emit ($self->{ct}); # pi
3072     redo A;
3073     } elsif ($self->{nc} == 0x003F) { # ?
3074     $self->{ct}->{data} .= '?';
3075     ## Stay in the state.
3076     !!!next-input-character;
3077     redo A;
3078     } else {
3079     $self->{ct}->{data} .= '?'; ## XML5: not appended
3080     $self->{state} = PI_DATA_STATE;
3081     ## Reprocess.
3082     redo A;
3083     }
3084    
3085 wakaba 1.1 } else {
3086     die "$0: $self->{state}: Unknown state";
3087     }
3088     } # A
3089    
3090     die "$0: _get_next_token: unexpected case";
3091     } # _get_next_token
3092    
3093     1;
3094 wakaba 1.11 ## $Date: 2008/10/15 08:51:02 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24