/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (hide annotations) (download) (as text)
Tue Oct 14 14:57:52 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.5: +16 -2 lines
File MIME type: application/x-wais-source
++ whatpm/t/xml/ChangeLog	14 Oct 2008 14:56:52 -0000
	* cdata-1.dat: Tests on CDATA section outside of the root element
	added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 14:57:28 -0000
	* Tokenizer.pm.src: Parse error if CDATA section is not closed or
	is placed outside of the root element.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
        ## |$VERSION| is derived from the CVS |Revision| keyword: all digit
        ## groups in the keyword are extracted and formatted as "%d." followed
        ## by one "%02d" per remaining group (e.g. revision 1.5 gives "1.05").
3 wakaba 1.6 our $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
        ## Exporter is set up inside a BEGIN block so that |@ISA|,
        ## |@EXPORT_OK| and |%EXPORT_TAGS| are already populated when the
        ## |Whatpm::HTML| package calls
        ## |Whatpm::HTML::Tokenizer->import (':token')| from its own BEGIN
        ## block later in this file.
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
        ## Names that may be imported individually on request.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
        ## The |:token| tag imports every token type constant at once.
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
        ## Each constant is a sub with an empty prototype and a constant
        ## body, so it can be written as a bareword, e.g.
        ## |type => CHARACTER_TOKEN|.  The values are what the tokenizer
        ## stores in the |type| field of the tokens it emits.
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not actually a token
44 wakaba 1.1
        ## From here on, subs are defined in the |Whatpm::HTML| package rather
        ## than in |Whatpm::HTML::Tokenizer|.
45     package Whatpm::HTML;
46    
        ## Import the token type constants at compile time so that they can
        ## be used as barewords in the code below.
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
        ## The |CM_*| constants are single-bit flags.  The |*_CONTENT_MODEL|
        ## constants are the combinations stored in |$self->{content_model}|,
        ## which the tokenizer tests with bitwise |&| against the flags.
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flags set
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < only
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & and limited <
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & and any <
59    
60     ## Tokenizer states
        ## Values stored in |$self->{state}| (and |$self->{prev_state}|) and
        ## compared with |==| in |_get_next_token|.  The values 1 and 12 are
        ## unused: the subs that would define them are commented out below.
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117     ## Tree constructor state constants (see Whatpm::HTML for the full
118     ## list and descriptions)
        ## NOTE(review): both literals below evaluate to the same number
        ## (2048, i.e. only bit 11 set); the |_| in the second one is just a
        ## digit separator.  Presumably the two constants are used in
        ## different bit-fields, so the clash is harmless -- confirm against
        ## Whatpm::HTML.
119    
120     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121     sub FOREIGN_EL () { 0b1_00000000000 }
122    
123     ## Character reference mappings
        ## |$charref_map| maps a code point (key) to the code point that
        ## replaces it (value).  It is presumably consulted when a numeric
        ## character reference is resolved; the lookup is not in this part
        ## of the file.  The 0x80..0x9F entries appear to follow the
        ## Windows-1252 interpretation of those code points, with 0xFFFD
        ## (REPLACEMENT CHARACTER) where Windows-1252 defines no character
        ## -- TODO confirm against the spec.
124    
125     my $charref_map = {
126     0x0D => 0x000A, # CR -> LF
127     0x80 => 0x20AC,
128     0x81 => 0xFFFD,
129     0x82 => 0x201A,
130     0x83 => 0x0192,
131     0x84 => 0x201E,
132     0x85 => 0x2026,
133     0x86 => 0x2020,
134     0x87 => 0x2021,
135     0x88 => 0x02C6,
136     0x89 => 0x2030,
137     0x8A => 0x0160,
138     0x8B => 0x2039,
139     0x8C => 0x0152,
140     0x8D => 0xFFFD,
141     0x8E => 0x017D,
142     0x8F => 0xFFFD,
143     0x90 => 0xFFFD,
144     0x91 => 0x2018,
145     0x92 => 0x2019,
146     0x93 => 0x201C,
147     0x94 => 0x201D,
148     0x95 => 0x2022,
149     0x96 => 0x2013,
150     0x97 => 0x2014,
151     0x98 => 0x02DC,
152     0x99 => 0x2122,
153     0x9A => 0x0161,
154     0x9B => 0x203A,
155     0x9C => 0x0153,
156     0x9D => 0xFFFD,
157     0x9E => 0x017E,
158     0x9F => 0x0178,
159     }; # $charref_map
        ## Every code point in the list below is additionally mapped to
        ## 0xFFFD.  None of them is a key of the hash literal above, so no
        ## entry is overwritten.
160     $charref_map->{$_} = 0xFFFD
161     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: should the range end at 0xFDEF? (Unicode noncharacters are U+FDD0..U+FDEF)
163     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168    
169     ## Implementations MUST act as if they used the state machine in the spec.
170    
171     sub _initialize_tokenizer ($) {
        ## Resets the tokenizer-related fields of |$self| to their initial
        ## values and then reads the first input character.  Takes the
        ## object as its only argument.
        ## NOTE(review): |!!!next-input-character| is not plain Perl; it is
        ## presumably a macro that is expanded when the .pm file is
        ## generated from this .src file -- its expansion is not visible
        ## here.
172     my $self = shift;
173    
174     ## NOTE: Fields set by |new| constructor:
175     #$self->{level}
176     #$self->{set_nc}
177     #$self->{parse_error}
178 wakaba 1.3 #$self->{is_xml} (if XML)
179 wakaba 1.1
180     $self->{state} = DATA_STATE; # MUST
181 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
182 wakaba 1.1 #$self->{entity__value}; # initialized when used
183     #$self->{entity__match}; # initialized when used
184     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
185     undef $self->{ct}; # current token
186     undef $self->{ca}; # current attribute
187     undef $self->{last_stag_name}; # last emitted start tag name
188     #$self->{prev_state}; # initialized when used
189     delete $self->{self_closing}; # no self-closing flag pending
190     $self->{char_buffer} = '';
191     $self->{char_buffer_pos} = 0;
192     $self->{nc} = -1; # next input character; -1 is what |_get_next_token| treats as end of file
193     #$self->{next_nc}
194     !!!next-input-character;
195     $self->{token} = []; # queued tokens; |_get_next_token| returns these first
196     # $self->{escape}
197     } # _initialize_tokenizer
198    
199     ## A token has:
200     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
201     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
202     ## ->{name} (DOCTYPE_TOKEN)
203     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
204     ## ->{pubid} (DOCTYPE_TOKEN)
205     ## ->{sysid} (DOCTYPE_TOKEN)
206     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
207     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
208     ## ->{name}
209     ## ->{value}
210     ## ->{has_reference} == 1 or 0
211     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
212     ## NOTE: The "self-closing flag" is held in |$self->{self_closing}|.
213     ## A token's own |->{self_closing}| field saves the value of
214     ## |$self->{self_closing}| while the token is pushed back onto the stack.
215    
216     ## Emitted token MUST immediately be handled by the tree construction state.
217    
218     ## Before each step, UA MAY check to see if either one of the scripts in
219     ## "list of scripts that will execute as soon as possible" or the first
220     ## script in the "list of scripts that will execute asynchronously",
221     ## has completed loading. If one has, then it MUST be executed
222     ## and removed from the list.
223    
224     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
225     ## (This requirement was dropped from HTML5 spec, unfortunately.)
226    
227     my $is_space = {
        ## Lookup table keyed by code point, used as
        ## |$is_space->{$self->{nc}}|: true for the characters the tokenizer
        ## treats as white space.  The VT and CR entries are commented out,
        ## so both look up as false.
        ## NOTE(review): CR is presumably absent because it never reaches
        ## the tokenizer after newline normalization -- confirm.
228     0x0009 => 1, # CHARACTER TABULATION (HT)
229     0x000A => 1, # LINE FEED (LF)
230     #0x000B => 0, # LINE TABULATION (VT)
231     0x000C => 1, # FORM FEED (FF)
232     #0x000D => 1, # CARRIAGE RETURN (CR)
233     0x0020 => 1, # SPACE (SP)
234     };
235    
236     sub _get_next_token ($) {
237     my $self = shift;
238    
239     if ($self->{self_closing}) {
240     !!!parse-error (type => 'nestc', token => $self->{ct});
241     ## NOTE: The |self_closing| flag is only set by start tag token.
242     ## In addition, when a start tag token is emitted, it is always set to
243     ## |ct|.
244     delete $self->{self_closing};
245     }
246    
247     if (@{$self->{token}}) {
248     $self->{self_closing} = $self->{token}->[0]->{self_closing};
249     return shift @{$self->{token}};
250     }
251    
252     A: {
253     if ($self->{state} == PCDATA_STATE) {
254     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
255    
256     if ($self->{nc} == 0x0026) { # &
257     !!!cp (0.1);
258     ## NOTE: In the spec, the tokenizer is switched to the
259     ## "entity data state". In this implementation, the tokenizer
260     ## is switched to the |ENTITY_STATE|, which is an implementation
261     ## of the "consume a character reference" algorithm.
262     $self->{entity_add} = -1;
263     $self->{prev_state} = DATA_STATE;
264     $self->{state} = ENTITY_STATE;
265     !!!next-input-character;
266     redo A;
267     } elsif ($self->{nc} == 0x003C) { # <
268     !!!cp (0.2);
269     $self->{state} = TAG_OPEN_STATE;
270     !!!next-input-character;
271     redo A;
272     } elsif ($self->{nc} == -1) {
273     !!!cp (0.3);
274     !!!emit ({type => END_OF_FILE_TOKEN,
275     line => $self->{line}, column => $self->{column}});
276     last A; ## TODO: ok?
277     } else {
278     !!!cp (0.4);
279     #
280     }
281    
282     # Anything else
283     my $token = {type => CHARACTER_TOKEN,
284     data => chr $self->{nc},
285     line => $self->{line}, column => $self->{column},
286     };
287     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
288    
289     ## Stay in the state.
290     !!!next-input-character;
291     !!!emit ($token);
292     redo A;
293     } elsif ($self->{state} == DATA_STATE) {
294     $self->{s_kwd} = '' unless defined $self->{s_kwd};
295     if ($self->{nc} == 0x0026) { # &
296     $self->{s_kwd} = '';
297     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
298     not $self->{escape}) {
299     !!!cp (1);
300     ## NOTE: In the spec, the tokenizer is switched to the
301     ## "entity data state". In this implementation, the tokenizer
302     ## is switched to the |ENTITY_STATE|, which is an implementation
303     ## of the "consume a character reference" algorithm.
304     $self->{entity_add} = -1;
305     $self->{prev_state} = DATA_STATE;
306     $self->{state} = ENTITY_STATE;
307     !!!next-input-character;
308     redo A;
309     } else {
310     !!!cp (2);
311     #
312     }
313     } elsif ($self->{nc} == 0x002D) { # -
314     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
315 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
316 wakaba 1.1 !!!cp (3);
317     $self->{escape} = 1; # unless $self->{escape};
318     $self->{s_kwd} = '--';
319     #
320 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
321 wakaba 1.1 !!!cp (4);
322     $self->{s_kwd} = '--';
323     #
324 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
325     !!!cp (4.1);
326     $self->{s_kwd} .= '-';
327     #
328 wakaba 1.1 } else {
329     !!!cp (5);
330 wakaba 1.5 $self->{s_kwd} = '-';
331 wakaba 1.1 #
332     }
333     }
334    
335     #
336     } elsif ($self->{nc} == 0x0021) { # !
337     if (length $self->{s_kwd}) {
338     !!!cp (5.1);
339     $self->{s_kwd} .= '!';
340     #
341     } else {
342     !!!cp (5.2);
343     #$self->{s_kwd} = '';
344     #
345     }
346     #
347     } elsif ($self->{nc} == 0x003C) { # <
348     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
349     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
350     not $self->{escape})) {
351     !!!cp (6);
352     $self->{state} = TAG_OPEN_STATE;
353     !!!next-input-character;
354     redo A;
355     } else {
356     !!!cp (7);
357     $self->{s_kwd} = '';
358     #
359     }
360     } elsif ($self->{nc} == 0x003E) { # >
361     if ($self->{escape} and
362     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
363     if ($self->{s_kwd} eq '--') {
364     !!!cp (8);
365     delete $self->{escape};
366 wakaba 1.5 #
367 wakaba 1.1 } else {
368     !!!cp (9);
369 wakaba 1.5 #
370 wakaba 1.1 }
371 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
372     !!!cp (9.1);
373     !!!parse-error (type => 'unmatched mse', ## TODO: type
374     line => $self->{line_prev},
375     column => $self->{column_prev} - 1);
376     #
377 wakaba 1.1 } else {
378     !!!cp (10);
379 wakaba 1.5 #
380 wakaba 1.1 }
381    
382     $self->{s_kwd} = '';
383     #
384 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
385     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
386     !!!cp (10.1);
387     $self->{s_kwd} .= ']';
388     } elsif ($self->{s_kwd} eq ']]') {
389     !!!cp (10.2);
390     #
391     } else {
392     !!!cp (10.3);
393     $self->{s_kwd} = '';
394     }
395     #
396 wakaba 1.1 } elsif ($self->{nc} == -1) {
397     !!!cp (11);
398     $self->{s_kwd} = '';
399     !!!emit ({type => END_OF_FILE_TOKEN,
400     line => $self->{line}, column => $self->{column}});
401     last A; ## TODO: ok?
402     } else {
403     !!!cp (12);
404     $self->{s_kwd} = '';
405     #
406     }
407    
408     # Anything else
409     my $token = {type => CHARACTER_TOKEN,
410     data => chr $self->{nc},
411     line => $self->{line}, column => $self->{column},
412     };
413 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
414 wakaba 1.1 length $token->{data})) {
415     $self->{s_kwd} = '';
416     }
417    
418     ## Stay in the data state.
419 wakaba 1.5 if (not $self->{is_xml} and
420     $self->{content_model} == PCDATA_CONTENT_MODEL) {
421 wakaba 1.1 !!!cp (13);
422     $self->{state} = PCDATA_STATE;
423     } else {
424     !!!cp (14);
425     ## Stay in the state.
426     }
427     !!!next-input-character;
428     !!!emit ($token);
429     redo A;
430     } elsif ($self->{state} == TAG_OPEN_STATE) {
431     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
432     if ($self->{nc} == 0x002F) { # /
433     !!!cp (15);
434     !!!next-input-character;
435     $self->{state} = CLOSE_TAG_OPEN_STATE;
436     redo A;
437     } elsif ($self->{nc} == 0x0021) { # !
438     !!!cp (15.1);
439     $self->{s_kwd} = '<' unless $self->{escape};
440     #
441     } else {
442     !!!cp (16);
443     #
444     }
445    
446     ## reconsume
447     $self->{state} = DATA_STATE;
448 wakaba 1.5 $self->{s_kwd} = '';
449 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN, data => '<',
450     line => $self->{line_prev},
451     column => $self->{column_prev},
452     });
453     redo A;
454     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
455     if ($self->{nc} == 0x0021) { # !
456     !!!cp (17);
457     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
458     !!!next-input-character;
459     redo A;
460     } elsif ($self->{nc} == 0x002F) { # /
461     !!!cp (18);
462     $self->{state} = CLOSE_TAG_OPEN_STATE;
463     !!!next-input-character;
464     redo A;
465     } elsif (0x0041 <= $self->{nc} and
466     $self->{nc} <= 0x005A) { # A..Z
467     !!!cp (19);
468     $self->{ct}
469     = {type => START_TAG_TOKEN,
470 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
471 wakaba 1.1 line => $self->{line_prev},
472     column => $self->{column_prev}};
473     $self->{state} = TAG_NAME_STATE;
474     !!!next-input-character;
475     redo A;
476     } elsif (0x0061 <= $self->{nc} and
477     $self->{nc} <= 0x007A) { # a..z
478     !!!cp (20);
479     $self->{ct} = {type => START_TAG_TOKEN,
480     tag_name => chr ($self->{nc}),
481     line => $self->{line_prev},
482     column => $self->{column_prev}};
483     $self->{state} = TAG_NAME_STATE;
484     !!!next-input-character;
485     redo A;
486     } elsif ($self->{nc} == 0x003E) { # >
487     !!!cp (21);
488     !!!parse-error (type => 'empty start tag',
489     line => $self->{line_prev},
490     column => $self->{column_prev});
491     $self->{state} = DATA_STATE;
492 wakaba 1.5 $self->{s_kwd} = '';
493 wakaba 1.1 !!!next-input-character;
494    
495     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
496     line => $self->{line_prev},
497     column => $self->{column_prev},
498     });
499    
500     redo A;
501     } elsif ($self->{nc} == 0x003F) { # ?
502     !!!cp (22);
503     !!!parse-error (type => 'pio',
504     line => $self->{line_prev},
505     column => $self->{column_prev});
506     $self->{state} = BOGUS_COMMENT_STATE;
507     $self->{ct} = {type => COMMENT_TOKEN, data => '',
508     line => $self->{line_prev},
509     column => $self->{column_prev},
510     };
511     ## $self->{nc} is intentionally left as is
512     redo A;
513     } else {
514     !!!cp (23);
515     !!!parse-error (type => 'bare stago',
516     line => $self->{line_prev},
517     column => $self->{column_prev});
518     $self->{state} = DATA_STATE;
519 wakaba 1.5 $self->{s_kwd} = '';
520 wakaba 1.1 ## reconsume
521    
522     !!!emit ({type => CHARACTER_TOKEN, data => '<',
523     line => $self->{line_prev},
524     column => $self->{column_prev},
525     });
526    
527     redo A;
528     }
529     } else {
530     die "$0: $self->{content_model} in tag open";
531     }
532     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
533     ## NOTE: The "close tag open state" in the spec is implemented as
534     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
535    
536     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
537     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
538     if (defined $self->{last_stag_name}) {
539     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
540     $self->{s_kwd} = '';
541     ## Reconsume.
542     redo A;
543     } else {
544     ## No start tag token has ever been emitted
545     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
546     !!!cp (28);
547     $self->{state} = DATA_STATE;
548 wakaba 1.5 $self->{s_kwd} = '';
549 wakaba 1.1 ## Reconsume.
550     !!!emit ({type => CHARACTER_TOKEN, data => '</',
551     line => $l, column => $c,
552     });
553     redo A;
554     }
555     }
556    
557     if (0x0041 <= $self->{nc} and
558     $self->{nc} <= 0x005A) { # A..Z
559     !!!cp (29);
560     $self->{ct}
561     = {type => END_TAG_TOKEN,
562 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
563 wakaba 1.1 line => $l, column => $c};
564     $self->{state} = TAG_NAME_STATE;
565     !!!next-input-character;
566     redo A;
567     } elsif (0x0061 <= $self->{nc} and
568     $self->{nc} <= 0x007A) { # a..z
569     !!!cp (30);
570     $self->{ct} = {type => END_TAG_TOKEN,
571     tag_name => chr ($self->{nc}),
572     line => $l, column => $c};
573     $self->{state} = TAG_NAME_STATE;
574     !!!next-input-character;
575     redo A;
576     } elsif ($self->{nc} == 0x003E) { # >
577     !!!cp (31);
578     !!!parse-error (type => 'empty end tag',
579     line => $self->{line_prev}, ## "<" in "</>"
580     column => $self->{column_prev} - 1);
581     $self->{state} = DATA_STATE;
582 wakaba 1.5 $self->{s_kwd} = '';
583 wakaba 1.1 !!!next-input-character;
584     redo A;
585     } elsif ($self->{nc} == -1) {
586     !!!cp (32);
587     !!!parse-error (type => 'bare etago');
588 wakaba 1.5 $self->{s_kwd} = '';
589 wakaba 1.1 $self->{state} = DATA_STATE;
590     # reconsume
591    
592     !!!emit ({type => CHARACTER_TOKEN, data => '</',
593     line => $l, column => $c,
594     });
595    
596     redo A;
597     } else {
598     !!!cp (33);
599     !!!parse-error (type => 'bogus end tag');
600     $self->{state} = BOGUS_COMMENT_STATE;
601     $self->{ct} = {type => COMMENT_TOKEN, data => '',
602     line => $self->{line_prev}, # "<" of "</"
603     column => $self->{column_prev} - 1,
604     };
605     ## NOTE: $self->{nc} is intentionally left as is.
606     ## Although the "anything else" case of the spec not explicitly
607     ## states that the next input character is to be reconsumed,
608     ## it will be included to the |data| of the comment token
609     ## generated from the bogus end tag, as defined in the
610     ## "bogus comment state" entry.
611     redo A;
612     }
613     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
614     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
615     if (length $ch) {
616     my $CH = $ch;
617     $ch =~ tr/a-z/A-Z/;
618     my $nch = chr $self->{nc};
619     if ($nch eq $ch or $nch eq $CH) {
620     !!!cp (24);
621     ## Stay in the state.
622     $self->{s_kwd} .= $nch;
623     !!!next-input-character;
624     redo A;
625     } else {
626     !!!cp (25);
627     $self->{state} = DATA_STATE;
628 wakaba 1.5 $self->{s_kwd} = '';
629 wakaba 1.1 ## Reconsume.
630     !!!emit ({type => CHARACTER_TOKEN,
631     data => '</' . $self->{s_kwd},
632     line => $self->{line_prev},
633     column => $self->{column_prev} - 1 - length $self->{s_kwd},
634     });
635     redo A;
636     }
637     } else { # after "<{tag-name}"
638     unless ($is_space->{$self->{nc}} or
639     {
640     0x003E => 1, # >
641     0x002F => 1, # /
642     -1 => 1, # EOF
643     }->{$self->{nc}}) {
644     !!!cp (26);
645     ## Reconsume.
646     $self->{state} = DATA_STATE;
647 wakaba 1.5 $self->{s_kwd} = '';
648 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
649     data => '</' . $self->{s_kwd},
650     line => $self->{line_prev},
651     column => $self->{column_prev} - 1 - length $self->{s_kwd},
652     });
653     redo A;
654     } else {
655     !!!cp (27);
656     $self->{ct}
657     = {type => END_TAG_TOKEN,
658     tag_name => $self->{last_stag_name},
659     line => $self->{line_prev},
660     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
661     $self->{state} = TAG_NAME_STATE;
662     ## Reconsume.
663     redo A;
664     }
665     }
666     } elsif ($self->{state} == TAG_NAME_STATE) {
667     if ($is_space->{$self->{nc}}) {
668     !!!cp (34);
669     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
670     !!!next-input-character;
671     redo A;
672     } elsif ($self->{nc} == 0x003E) { # >
673     if ($self->{ct}->{type} == START_TAG_TOKEN) {
674     !!!cp (35);
675     $self->{last_stag_name} = $self->{ct}->{tag_name};
676     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
677     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
678     #if ($self->{ct}->{attributes}) {
679     # ## NOTE: This should never be reached.
680     # !!! cp (36);
681     # !!! parse-error (type => 'end tag attribute');
682     #} else {
683     !!!cp (37);
684     #}
685     } else {
686     die "$0: $self->{ct}->{type}: Unknown token type";
687     }
688     $self->{state} = DATA_STATE;
689 wakaba 1.5 $self->{s_kwd} = '';
690 wakaba 1.1 !!!next-input-character;
691    
692     !!!emit ($self->{ct}); # start tag or end tag
693    
694     redo A;
695     } elsif (0x0041 <= $self->{nc} and
696     $self->{nc} <= 0x005A) { # A..Z
697     !!!cp (38);
698 wakaba 1.4 $self->{ct}->{tag_name}
699     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
700 wakaba 1.1 # start tag or end tag
701     ## Stay in this state
702     !!!next-input-character;
703     redo A;
704     } elsif ($self->{nc} == -1) {
705     !!!parse-error (type => 'unclosed tag');
706     if ($self->{ct}->{type} == START_TAG_TOKEN) {
707     !!!cp (39);
708     $self->{last_stag_name} = $self->{ct}->{tag_name};
709     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
710     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
711     #if ($self->{ct}->{attributes}) {
712     # ## NOTE: This state should never be reached.
713     # !!! cp (40);
714     # !!! parse-error (type => 'end tag attribute');
715     #} else {
716     !!!cp (41);
717     #}
718     } else {
719     die "$0: $self->{ct}->{type}: Unknown token type";
720     }
721     $self->{state} = DATA_STATE;
722 wakaba 1.5 $self->{s_kwd} = '';
723 wakaba 1.1 # reconsume
724    
725     !!!emit ($self->{ct}); # start tag or end tag
726    
727     redo A;
728     } elsif ($self->{nc} == 0x002F) { # /
729     !!!cp (42);
730     $self->{state} = SELF_CLOSING_START_TAG_STATE;
731     !!!next-input-character;
732     redo A;
733     } else {
734     !!!cp (44);
735     $self->{ct}->{tag_name} .= chr $self->{nc};
736     # start tag or end tag
737     ## Stay in the state
738     !!!next-input-character;
739     redo A;
740     }
741     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
742     if ($is_space->{$self->{nc}}) {
743     !!!cp (45);
744     ## Stay in the state
745     !!!next-input-character;
746     redo A;
747     } elsif ($self->{nc} == 0x003E) { # >
748     if ($self->{ct}->{type} == START_TAG_TOKEN) {
749     !!!cp (46);
750     $self->{last_stag_name} = $self->{ct}->{tag_name};
751     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
752     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
753     if ($self->{ct}->{attributes}) {
754     !!!cp (47);
755     !!!parse-error (type => 'end tag attribute');
756     } else {
757     !!!cp (48);
758     }
759     } else {
760     die "$0: $self->{ct}->{type}: Unknown token type";
761     }
762     $self->{state} = DATA_STATE;
763 wakaba 1.5 $self->{s_kwd} = '';
764 wakaba 1.1 !!!next-input-character;
765    
766     !!!emit ($self->{ct}); # start tag or end tag
767    
768     redo A;
769     } elsif (0x0041 <= $self->{nc} and
770     $self->{nc} <= 0x005A) { # A..Z
771     !!!cp (49);
772     $self->{ca}
773 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
774 wakaba 1.1 value => '',
775     line => $self->{line}, column => $self->{column}};
776     $self->{state} = ATTRIBUTE_NAME_STATE;
777     !!!next-input-character;
778     redo A;
779     } elsif ($self->{nc} == 0x002F) { # /
780     !!!cp (50);
781     $self->{state} = SELF_CLOSING_START_TAG_STATE;
782     !!!next-input-character;
783     redo A;
784     } elsif ($self->{nc} == -1) {
785     !!!parse-error (type => 'unclosed tag');
786     if ($self->{ct}->{type} == START_TAG_TOKEN) {
787     !!!cp (52);
788     $self->{last_stag_name} = $self->{ct}->{tag_name};
789     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
790     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
791     if ($self->{ct}->{attributes}) {
792     !!!cp (53);
793     !!!parse-error (type => 'end tag attribute');
794     } else {
795     !!!cp (54);
796     }
797     } else {
798     die "$0: $self->{ct}->{type}: Unknown token type";
799     }
800     $self->{state} = DATA_STATE;
801 wakaba 1.5 $self->{s_kwd} = '';
802 wakaba 1.1 # reconsume
803    
804     !!!emit ($self->{ct}); # start tag or end tag
805    
806     redo A;
807     } else {
808     if ({
809     0x0022 => 1, # "
810     0x0027 => 1, # '
811     0x003D => 1, # =
812     }->{$self->{nc}}) {
813     !!!cp (55);
814     !!!parse-error (type => 'bad attribute name');
815     } else {
816     !!!cp (56);
817     }
818     $self->{ca}
819     = {name => chr ($self->{nc}),
820     value => '',
821     line => $self->{line}, column => $self->{column}};
822     $self->{state} = ATTRIBUTE_NAME_STATE;
823     !!!next-input-character;
824     redo A;
825     }
826     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
827     my $before_leave = sub {
828     if (exists $self->{ct}->{attributes} # start tag or end tag
829     ->{$self->{ca}->{name}}) { # MUST
830     !!!cp (57);
831     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
832     ## Discard $self->{ca} # MUST
833     } else {
834     !!!cp (58);
835     $self->{ct}->{attributes}->{$self->{ca}->{name}}
836     = $self->{ca};
837     }
838     }; # $before_leave
839    
840     if ($is_space->{$self->{nc}}) {
841     !!!cp (59);
842     $before_leave->();
843     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
844     !!!next-input-character;
845     redo A;
846     } elsif ($self->{nc} == 0x003D) { # =
847     !!!cp (60);
848     $before_leave->();
849     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
850     !!!next-input-character;
851     redo A;
852     } elsif ($self->{nc} == 0x003E) { # >
853     $before_leave->();
854     if ($self->{ct}->{type} == START_TAG_TOKEN) {
855     !!!cp (61);
856     $self->{last_stag_name} = $self->{ct}->{tag_name};
857     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
858     !!!cp (62);
859     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
860     if ($self->{ct}->{attributes}) {
861     !!!parse-error (type => 'end tag attribute');
862     }
863     } else {
864     die "$0: $self->{ct}->{type}: Unknown token type";
865     }
866     $self->{state} = DATA_STATE;
867 wakaba 1.5 $self->{s_kwd} = '';
868 wakaba 1.1 !!!next-input-character;
869    
870     !!!emit ($self->{ct}); # start tag or end tag
871    
872     redo A;
873     } elsif (0x0041 <= $self->{nc} and
874     $self->{nc} <= 0x005A) { # A..Z
875     !!!cp (63);
876 wakaba 1.4 $self->{ca}->{name}
877     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
878 wakaba 1.1 ## Stay in the state
879     !!!next-input-character;
880     redo A;
881     } elsif ($self->{nc} == 0x002F) { # /
882     !!!cp (64);
883     $before_leave->();
884     $self->{state} = SELF_CLOSING_START_TAG_STATE;
885     !!!next-input-character;
886     redo A;
887     } elsif ($self->{nc} == -1) {
888     !!!parse-error (type => 'unclosed tag');
889     $before_leave->();
890     if ($self->{ct}->{type} == START_TAG_TOKEN) {
891     !!!cp (66);
892     $self->{last_stag_name} = $self->{ct}->{tag_name};
893     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
894     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
895     if ($self->{ct}->{attributes}) {
896     !!!cp (67);
897     !!!parse-error (type => 'end tag attribute');
898     } else {
899     ## NOTE: This state should never be reached.
900     !!!cp (68);
901     }
902     } else {
903     die "$0: $self->{ct}->{type}: Unknown token type";
904     }
905     $self->{state} = DATA_STATE;
906 wakaba 1.5 $self->{s_kwd} = '';
907 wakaba 1.1 # reconsume
908    
909     !!!emit ($self->{ct}); # start tag or end tag
910    
911     redo A;
912     } else {
913     if ($self->{nc} == 0x0022 or # "
914     $self->{nc} == 0x0027) { # '
915     !!!cp (69);
916     !!!parse-error (type => 'bad attribute name');
917     } else {
918     !!!cp (70);
919     }
920     $self->{ca}->{name} .= chr ($self->{nc});
921     ## Stay in the state
922     !!!next-input-character;
923     redo A;
924     }
925     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
926     if ($is_space->{$self->{nc}}) {
927     !!!cp (71);
928     ## Stay in the state
929     !!!next-input-character;
930     redo A;
931     } elsif ($self->{nc} == 0x003D) { # =
932     !!!cp (72);
933     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
934     !!!next-input-character;
935     redo A;
936     } elsif ($self->{nc} == 0x003E) { # >
937     if ($self->{ct}->{type} == START_TAG_TOKEN) {
938     !!!cp (73);
939     $self->{last_stag_name} = $self->{ct}->{tag_name};
940     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
941     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
942     if ($self->{ct}->{attributes}) {
943     !!!cp (74);
944     !!!parse-error (type => 'end tag attribute');
945     } else {
946     ## NOTE: This state should never be reached.
947     !!!cp (75);
948     }
949     } else {
950     die "$0: $self->{ct}->{type}: Unknown token type";
951     }
952     $self->{state} = DATA_STATE;
953 wakaba 1.5 $self->{s_kwd} = '';
954 wakaba 1.1 !!!next-input-character;
955    
956     !!!emit ($self->{ct}); # start tag or end tag
957    
958     redo A;
959     } elsif (0x0041 <= $self->{nc} and
960     $self->{nc} <= 0x005A) { # A..Z
961     !!!cp (76);
962     $self->{ca}
963 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
964 wakaba 1.1 value => '',
965     line => $self->{line}, column => $self->{column}};
966     $self->{state} = ATTRIBUTE_NAME_STATE;
967     !!!next-input-character;
968     redo A;
969     } elsif ($self->{nc} == 0x002F) { # /
970     !!!cp (77);
971     $self->{state} = SELF_CLOSING_START_TAG_STATE;
972     !!!next-input-character;
973     redo A;
974     } elsif ($self->{nc} == -1) {
975     !!!parse-error (type => 'unclosed tag');
976     if ($self->{ct}->{type} == START_TAG_TOKEN) {
977     !!!cp (79);
978     $self->{last_stag_name} = $self->{ct}->{tag_name};
979     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
980     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
981     if ($self->{ct}->{attributes}) {
982     !!!cp (80);
983     !!!parse-error (type => 'end tag attribute');
984     } else {
985     ## NOTE: This state should never be reached.
986     !!!cp (81);
987     }
988     } else {
989     die "$0: $self->{ct}->{type}: Unknown token type";
990     }
991 wakaba 1.5 $self->{s_kwd} = '';
992 wakaba 1.1 $self->{state} = DATA_STATE;
993     # reconsume
994    
995     !!!emit ($self->{ct}); # start tag or end tag
996    
997     redo A;
998     } else {
999     if ($self->{nc} == 0x0022 or # "
1000     $self->{nc} == 0x0027) { # '
1001     !!!cp (78);
1002     !!!parse-error (type => 'bad attribute name');
1003     } else {
1004     !!!cp (82);
1005     }
1006     $self->{ca}
1007     = {name => chr ($self->{nc}),
1008     value => '',
1009     line => $self->{line}, column => $self->{column}};
1010     $self->{state} = ATTRIBUTE_NAME_STATE;
1011     !!!next-input-character;
1012     redo A;
1013     }
1014     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016     !!!cp (83);
1017     ## Stay in the state
1018     !!!next-input-character;
1019     redo A;
1020     } elsif ($self->{nc} == 0x0022) { # "
1021     !!!cp (84);
1022     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1023     !!!next-input-character;
1024     redo A;
1025     } elsif ($self->{nc} == 0x0026) { # &
1026     !!!cp (85);
1027     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1028     ## reconsume
1029     redo A;
1030     } elsif ($self->{nc} == 0x0027) { # '
1031     !!!cp (86);
1032     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1033     !!!next-input-character;
1034     redo A;
1035     } elsif ($self->{nc} == 0x003E) { # >
1036     !!!parse-error (type => 'empty unquoted attribute value');
1037     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1038     !!!cp (87);
1039     $self->{last_stag_name} = $self->{ct}->{tag_name};
1040     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1041     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1042     if ($self->{ct}->{attributes}) {
1043     !!!cp (88);
1044     !!!parse-error (type => 'end tag attribute');
1045     } else {
1046     ## NOTE: This state should never be reached.
1047     !!!cp (89);
1048     }
1049     } else {
1050     die "$0: $self->{ct}->{type}: Unknown token type";
1051     }
1052     $self->{state} = DATA_STATE;
1053 wakaba 1.5 $self->{s_kwd} = '';
1054 wakaba 1.1 !!!next-input-character;
1055    
1056     !!!emit ($self->{ct}); # start tag or end tag
1057    
1058     redo A;
1059     } elsif ($self->{nc} == -1) {
1060     !!!parse-error (type => 'unclosed tag');
1061     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1062     !!!cp (90);
1063     $self->{last_stag_name} = $self->{ct}->{tag_name};
1064     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1065     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1066     if ($self->{ct}->{attributes}) {
1067     !!!cp (91);
1068     !!!parse-error (type => 'end tag attribute');
1069     } else {
1070     ## NOTE: This state should never be reached.
1071     !!!cp (92);
1072     }
1073     } else {
1074     die "$0: $self->{ct}->{type}: Unknown token type";
1075     }
1076     $self->{state} = DATA_STATE;
1077 wakaba 1.5 $self->{s_kwd} = '';
1078 wakaba 1.1 ## reconsume
1079    
1080     !!!emit ($self->{ct}); # start tag or end tag
1081    
1082     redo A;
1083     } else {
1084     if ($self->{nc} == 0x003D) { # =
1085     !!!cp (93);
1086     !!!parse-error (type => 'bad attribute value');
1087     } else {
1088     !!!cp (94);
1089     }
1090     $self->{ca}->{value} .= chr ($self->{nc});
1091     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1092     !!!next-input-character;
1093     redo A;
1094     }
1095     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1096     if ($self->{nc} == 0x0022) { # "
1097     !!!cp (95);
1098     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1099     !!!next-input-character;
1100     redo A;
1101     } elsif ($self->{nc} == 0x0026) { # &
1102     !!!cp (96);
1103     ## NOTE: In the spec, the tokenizer is switched to the
1104     ## "entity in attribute value state". In this implementation, the
1105     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1106     ## implementation of the "consume a character reference" algorithm.
1107     $self->{prev_state} = $self->{state};
1108     $self->{entity_add} = 0x0022; # "
1109     $self->{state} = ENTITY_STATE;
1110     !!!next-input-character;
1111     redo A;
1112     } elsif ($self->{nc} == -1) {
1113     !!!parse-error (type => 'unclosed attribute value');
1114     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1115     !!!cp (97);
1116     $self->{last_stag_name} = $self->{ct}->{tag_name};
1117     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1118     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1119     if ($self->{ct}->{attributes}) {
1120     !!!cp (98);
1121     !!!parse-error (type => 'end tag attribute');
1122     } else {
1123     ## NOTE: This state should never be reached.
1124     !!!cp (99);
1125     }
1126     } else {
1127     die "$0: $self->{ct}->{type}: Unknown token type";
1128     }
1129     $self->{state} = DATA_STATE;
1130 wakaba 1.5 $self->{s_kwd} = '';
1131 wakaba 1.1 ## reconsume
1132    
1133     !!!emit ($self->{ct}); # start tag or end tag
1134    
1135     redo A;
1136     } else {
1137     !!!cp (100);
1138     $self->{ca}->{value} .= chr ($self->{nc});
1139     $self->{read_until}->($self->{ca}->{value},
1140     q["&],
1141     length $self->{ca}->{value});
1142    
1143     ## Stay in the state
1144     !!!next-input-character;
1145     redo A;
1146     }
1147     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1148     if ($self->{nc} == 0x0027) { # '
1149     !!!cp (101);
1150     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1151     !!!next-input-character;
1152     redo A;
1153     } elsif ($self->{nc} == 0x0026) { # &
1154     !!!cp (102);
1155     ## NOTE: In the spec, the tokenizer is switched to the
1156     ## "entity in attribute value state". In this implementation, the
1157     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1158     ## implementation of the "consume a character reference" algorithm.
1159     $self->{entity_add} = 0x0027; # '
1160     $self->{prev_state} = $self->{state};
1161     $self->{state} = ENTITY_STATE;
1162     !!!next-input-character;
1163     redo A;
1164     } elsif ($self->{nc} == -1) {
1165     !!!parse-error (type => 'unclosed attribute value');
1166     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1167     !!!cp (103);
1168     $self->{last_stag_name} = $self->{ct}->{tag_name};
1169     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1170     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1171     if ($self->{ct}->{attributes}) {
1172     !!!cp (104);
1173     !!!parse-error (type => 'end tag attribute');
1174     } else {
1175     ## NOTE: This state should never be reached.
1176     !!!cp (105);
1177     }
1178     } else {
1179     die "$0: $self->{ct}->{type}: Unknown token type";
1180     }
1181     $self->{state} = DATA_STATE;
1182 wakaba 1.5 $self->{s_kwd} = '';
1183 wakaba 1.1 ## reconsume
1184    
1185     !!!emit ($self->{ct}); # start tag or end tag
1186    
1187     redo A;
1188     } else {
1189     !!!cp (106);
1190     $self->{ca}->{value} .= chr ($self->{nc});
1191     $self->{read_until}->($self->{ca}->{value},
1192     q['&],
1193     length $self->{ca}->{value});
1194    
1195     ## Stay in the state
1196     !!!next-input-character;
1197     redo A;
1198     }
1199     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1200     if ($is_space->{$self->{nc}}) {
1201     !!!cp (107);
1202     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1203     !!!next-input-character;
1204     redo A;
1205     } elsif ($self->{nc} == 0x0026) { # &
1206     !!!cp (108);
1207     ## NOTE: In the spec, the tokenizer is switched to the
1208     ## "entity in attribute value state". In this implementation, the
1209     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1210     ## implementation of the "consume a character reference" algorithm.
1211     $self->{entity_add} = -1;
1212     $self->{prev_state} = $self->{state};
1213     $self->{state} = ENTITY_STATE;
1214     !!!next-input-character;
1215     redo A;
1216     } elsif ($self->{nc} == 0x003E) { # >
1217     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1218     !!!cp (109);
1219     $self->{last_stag_name} = $self->{ct}->{tag_name};
1220     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1221     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1222     if ($self->{ct}->{attributes}) {
1223     !!!cp (110);
1224     !!!parse-error (type => 'end tag attribute');
1225     } else {
1226     ## NOTE: This state should never be reached.
1227     !!!cp (111);
1228     }
1229     } else {
1230     die "$0: $self->{ct}->{type}: Unknown token type";
1231     }
1232     $self->{state} = DATA_STATE;
1233 wakaba 1.5 $self->{s_kwd} = '';
1234 wakaba 1.1 !!!next-input-character;
1235    
1236     !!!emit ($self->{ct}); # start tag or end tag
1237    
1238     redo A;
1239     } elsif ($self->{nc} == -1) {
1240     !!!parse-error (type => 'unclosed tag');
1241     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1242     !!!cp (112);
1243     $self->{last_stag_name} = $self->{ct}->{tag_name};
1244     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1245     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1246     if ($self->{ct}->{attributes}) {
1247     !!!cp (113);
1248     !!!parse-error (type => 'end tag attribute');
1249     } else {
1250     ## NOTE: This state should never be reached.
1251     !!!cp (114);
1252     }
1253     } else {
1254     die "$0: $self->{ct}->{type}: Unknown token type";
1255     }
1256     $self->{state} = DATA_STATE;
1257 wakaba 1.5 $self->{s_kwd} = '';
1258 wakaba 1.1 ## reconsume
1259    
1260     !!!emit ($self->{ct}); # start tag or end tag
1261    
1262     redo A;
1263     } else {
1264     if ({
1265     0x0022 => 1, # "
1266     0x0027 => 1, # '
1267     0x003D => 1, # =
1268     }->{$self->{nc}}) {
1269     !!!cp (115);
1270     !!!parse-error (type => 'bad attribute value');
1271     } else {
1272     !!!cp (116);
1273     }
1274     $self->{ca}->{value} .= chr ($self->{nc});
1275     $self->{read_until}->($self->{ca}->{value},
1276     q["'=& >],
1277     length $self->{ca}->{value});
1278    
1279     ## Stay in the state
1280     !!!next-input-character;
1281     redo A;
1282     }
1283     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1284     if ($is_space->{$self->{nc}}) {
1285     !!!cp (118);
1286     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1287     !!!next-input-character;
1288     redo A;
1289     } elsif ($self->{nc} == 0x003E) { # >
1290     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1291     !!!cp (119);
1292     $self->{last_stag_name} = $self->{ct}->{tag_name};
1293     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1294     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1295     if ($self->{ct}->{attributes}) {
1296     !!!cp (120);
1297     !!!parse-error (type => 'end tag attribute');
1298     } else {
1299     ## NOTE: This state should never be reached.
1300     !!!cp (121);
1301     }
1302     } else {
1303     die "$0: $self->{ct}->{type}: Unknown token type";
1304     }
1305     $self->{state} = DATA_STATE;
1306 wakaba 1.5 $self->{s_kwd} = '';
1307 wakaba 1.1 !!!next-input-character;
1308    
1309     !!!emit ($self->{ct}); # start tag or end tag
1310    
1311     redo A;
1312     } elsif ($self->{nc} == 0x002F) { # /
1313     !!!cp (122);
1314     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1315     !!!next-input-character;
1316     redo A;
1317     } elsif ($self->{nc} == -1) {
1318     !!!parse-error (type => 'unclosed tag');
1319     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1320     !!!cp (122.3);
1321     $self->{last_stag_name} = $self->{ct}->{tag_name};
1322     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1323     if ($self->{ct}->{attributes}) {
1324     !!!cp (122.1);
1325     !!!parse-error (type => 'end tag attribute');
1326     } else {
1327     ## NOTE: This state should never be reached.
1328     !!!cp (122.2);
1329     }
1330     } else {
1331     die "$0: $self->{ct}->{type}: Unknown token type";
1332     }
1333     $self->{state} = DATA_STATE;
1334 wakaba 1.5 $self->{s_kwd} = '';
1335 wakaba 1.1 ## Reconsume.
1336     !!!emit ($self->{ct}); # start tag or end tag
1337     redo A;
1338     } else {
1339     !!!cp ('124.1');
1340     !!!parse-error (type => 'no space between attributes');
1341     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1342     ## reconsume
1343     redo A;
1344     }
1345     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1346     if ($self->{nc} == 0x003E) { # >
1347     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1348     !!!cp ('124.2');
1349     !!!parse-error (type => 'nestc', token => $self->{ct});
1350     ## TODO: Different type than slash in start tag
1351     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1352     if ($self->{ct}->{attributes}) {
1353     !!!cp ('124.4');
1354     !!!parse-error (type => 'end tag attribute');
1355     } else {
1356     !!!cp ('124.5');
1357     }
1358     ## TODO: Test |<title></title/>|
1359     } else {
1360     !!!cp ('124.3');
1361     $self->{self_closing} = 1;
1362     }
1363    
1364     $self->{state} = DATA_STATE;
1365 wakaba 1.5 $self->{s_kwd} = '';
1366 wakaba 1.1 !!!next-input-character;
1367    
1368     !!!emit ($self->{ct}); # start tag or end tag
1369    
1370     redo A;
1371     } elsif ($self->{nc} == -1) {
1372     !!!parse-error (type => 'unclosed tag');
1373     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1374     !!!cp (124.7);
1375     $self->{last_stag_name} = $self->{ct}->{tag_name};
1376     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1377     if ($self->{ct}->{attributes}) {
1378     !!!cp (124.5);
1379     !!!parse-error (type => 'end tag attribute');
1380     } else {
1381     ## NOTE: This state should never be reached.
1382     !!!cp (124.6);
1383     }
1384     } else {
1385     die "$0: $self->{ct}->{type}: Unknown token type";
1386     }
1387     $self->{state} = DATA_STATE;
1388 wakaba 1.5 $self->{s_kwd} = '';
1389 wakaba 1.1 ## Reconsume.
1390     !!!emit ($self->{ct}); # start tag or end tag
1391     redo A;
1392     } else {
1393     !!!cp ('124.4');
1394     !!!parse-error (type => 'nestc');
1395     ## TODO: This error type is wrong.
1396     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1397     ## Reconsume.
1398     redo A;
1399     }
1400     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1401     ## (only happens in the PCDATA state)
1402    
1403     ## NOTE: Unlike the spec's "bogus comment state", this implementation
1404     ## consumes characters on a one-by-one basis.
1405    
1406     if ($self->{nc} == 0x003E) { # >
1407     !!!cp (124);
1408     $self->{state} = DATA_STATE;
1409 wakaba 1.5 $self->{s_kwd} = '';
1410 wakaba 1.1 !!!next-input-character;
1411    
1412     !!!emit ($self->{ct}); # comment
1413     redo A;
1414     } elsif ($self->{nc} == -1) {
1415     !!!cp (125);
1416     $self->{state} = DATA_STATE;
1417 wakaba 1.5 $self->{s_kwd} = '';
1418 wakaba 1.1 ## reconsume
1419    
1420     !!!emit ($self->{ct}); # comment
1421     redo A;
1422     } else {
1423     !!!cp (126);
1424     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1425     $self->{read_until}->($self->{ct}->{data},
1426     q[>],
1427     length $self->{ct}->{data});
1428    
1429     ## Stay in the state.
1430     !!!next-input-character;
1431     redo A;
1432     }
1433     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1434     ## (only happens in the PCDATA state)
1435    
1436     if ($self->{nc} == 0x002D) { # -
1437     !!!cp (133);
1438     $self->{state} = MD_HYPHEN_STATE;
1439     !!!next-input-character;
1440     redo A;
1441     } elsif ($self->{nc} == 0x0044 or # D
1442     $self->{nc} == 0x0064) { # d
1443     ## ASCII case-insensitive.
1444     !!!cp (130);
1445     $self->{state} = MD_DOCTYPE_STATE;
1446     $self->{s_kwd} = chr $self->{nc};
1447     !!!next-input-character;
1448     redo A;
1449 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1450     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1451     $self->{is_xml}) and
1452 wakaba 1.1 $self->{nc} == 0x005B) { # [
1453     !!!cp (135.4);
1454     $self->{state} = MD_CDATA_STATE;
1455     $self->{s_kwd} = '[';
1456     !!!next-input-character;
1457     redo A;
1458     } else {
1459     !!!cp (136);
1460     }
1461    
1462     !!!parse-error (type => 'bogus comment',
1463     line => $self->{line_prev},
1464     column => $self->{column_prev} - 1);
1465     ## Reconsume.
1466     $self->{state} = BOGUS_COMMENT_STATE;
1467     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1468     line => $self->{line_prev},
1469     column => $self->{column_prev} - 1,
1470     };
1471     redo A;
1472     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1473     if ($self->{nc} == 0x002D) { # -
1474     !!!cp (127);
1475     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1476     line => $self->{line_prev},
1477     column => $self->{column_prev} - 2,
1478     };
1479     $self->{state} = COMMENT_START_STATE;
1480     !!!next-input-character;
1481     redo A;
1482     } else {
1483     !!!cp (128);
1484     !!!parse-error (type => 'bogus comment',
1485     line => $self->{line_prev},
1486     column => $self->{column_prev} - 2);
1487     $self->{state} = BOGUS_COMMENT_STATE;
1488     ## Reconsume.
1489     $self->{ct} = {type => COMMENT_TOKEN,
1490     data => '-',
1491     line => $self->{line_prev},
1492     column => $self->{column_prev} - 2,
1493     };
1494     redo A;
1495     }
1496     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1497     ## ASCII case-insensitive.
1498     if ($self->{nc} == [
1499     undef,
1500     0x004F, # O
1501     0x0043, # C
1502     0x0054, # T
1503     0x0059, # Y
1504     0x0050, # P
1505     ]->[length $self->{s_kwd}] or
1506     $self->{nc} == [
1507     undef,
1508     0x006F, # o
1509     0x0063, # c
1510     0x0074, # t
1511     0x0079, # y
1512     0x0070, # p
1513     ]->[length $self->{s_kwd}]) {
1514     !!!cp (131);
1515     ## Stay in the state.
1516     $self->{s_kwd} .= chr $self->{nc};
1517     !!!next-input-character;
1518     redo A;
1519     } elsif ((length $self->{s_kwd}) == 6 and
1520     ($self->{nc} == 0x0045 or # E
1521     $self->{nc} == 0x0065)) { # e
1522     !!!cp (129);
1523     $self->{state} = DOCTYPE_STATE;
1524     $self->{ct} = {type => DOCTYPE_TOKEN,
1525     quirks => 1,
1526     line => $self->{line_prev},
1527     column => $self->{column_prev} - 7,
1528     };
1529     !!!next-input-character;
1530     redo A;
1531     } else {
1532     !!!cp (132);
1533     !!!parse-error (type => 'bogus comment',
1534     line => $self->{line_prev},
1535     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1536     $self->{state} = BOGUS_COMMENT_STATE;
1537     ## Reconsume.
1538     $self->{ct} = {type => COMMENT_TOKEN,
1539     data => $self->{s_kwd},
1540     line => $self->{line_prev},
1541     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1542     };
1543     redo A;
1544     }
1545     } elsif ($self->{state} == MD_CDATA_STATE) {
1546     if ($self->{nc} == {
1547     '[' => 0x0043, # C
1548     '[C' => 0x0044, # D
1549     '[CD' => 0x0041, # A
1550     '[CDA' => 0x0054, # T
1551     '[CDAT' => 0x0041, # A
1552     }->{$self->{s_kwd}}) {
1553     !!!cp (135.1);
1554     ## Stay in the state.
1555     $self->{s_kwd} .= chr $self->{nc};
1556     !!!next-input-character;
1557     redo A;
1558     } elsif ($self->{s_kwd} eq '[CDATA' and
1559     $self->{nc} == 0x005B) { # [
1560     !!!cp (135.2);
1561 wakaba 1.6
1562     if ($self->{is_xml} and
1563     not $self->{tainted} and
1564     @{$self->{open_elements} or []} == 0) {
1565     !!!parse-error (type => 'cdata outside of root element',
1566     line => $self->{line_prev},
1567     column => $self->{column_prev} - 7);
1568     $self->{tainted} = 1;
1569     }
1570    
1571 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1572     data => '',
1573     line => $self->{line_prev},
1574     column => $self->{column_prev} - 7};
1575     $self->{state} = CDATA_SECTION_STATE;
1576     !!!next-input-character;
1577     redo A;
1578     } else {
1579     !!!cp (135.3);
1580     !!!parse-error (type => 'bogus comment',
1581     line => $self->{line_prev},
1582     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1583     $self->{state} = BOGUS_COMMENT_STATE;
1584     ## Reconsume.
1585     $self->{ct} = {type => COMMENT_TOKEN,
1586     data => $self->{s_kwd},
1587     line => $self->{line_prev},
1588     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1589     };
1590     redo A;
1591     }
1592     } elsif ($self->{state} == COMMENT_START_STATE) {
1593     if ($self->{nc} == 0x002D) { # -
1594     !!!cp (137);
1595     $self->{state} = COMMENT_START_DASH_STATE;
1596     !!!next-input-character;
1597     redo A;
1598     } elsif ($self->{nc} == 0x003E) { # >
1599     !!!cp (138);
1600     !!!parse-error (type => 'bogus comment');
1601     $self->{state} = DATA_STATE;
1602 wakaba 1.5 $self->{s_kwd} = '';
1603 wakaba 1.1 !!!next-input-character;
1604    
1605     !!!emit ($self->{ct}); # comment
1606    
1607     redo A;
1608     } elsif ($self->{nc} == -1) {
1609     !!!cp (139);
1610     !!!parse-error (type => 'unclosed comment');
1611     $self->{state} = DATA_STATE;
1612 wakaba 1.5 $self->{s_kwd} = '';
1613 wakaba 1.1 ## reconsume
1614    
1615     !!!emit ($self->{ct}); # comment
1616    
1617     redo A;
1618     } else {
1619     !!!cp (140);
1620     $self->{ct}->{data} # comment
1621     .= chr ($self->{nc});
1622     $self->{state} = COMMENT_STATE;
1623     !!!next-input-character;
1624     redo A;
1625     }
1626     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1627     if ($self->{nc} == 0x002D) { # -
1628     !!!cp (141);
1629     $self->{state} = COMMENT_END_STATE;
1630     !!!next-input-character;
1631     redo A;
1632     } elsif ($self->{nc} == 0x003E) { # >
1633     !!!cp (142);
1634     !!!parse-error (type => 'bogus comment');
1635     $self->{state} = DATA_STATE;
1636 wakaba 1.5 $self->{s_kwd} = '';
1637 wakaba 1.1 !!!next-input-character;
1638    
1639     !!!emit ($self->{ct}); # comment
1640    
1641     redo A;
1642     } elsif ($self->{nc} == -1) {
1643     !!!cp (143);
1644     !!!parse-error (type => 'unclosed comment');
1645     $self->{state} = DATA_STATE;
1646 wakaba 1.5 $self->{s_kwd} = '';
1647 wakaba 1.1 ## reconsume
1648    
1649     !!!emit ($self->{ct}); # comment
1650    
1651     redo A;
1652     } else {
1653     !!!cp (144);
1654     $self->{ct}->{data} # comment
1655     .= '-' . chr ($self->{nc});
1656     $self->{state} = COMMENT_STATE;
1657     !!!next-input-character;
1658     redo A;
1659     }
1660     } elsif ($self->{state} == COMMENT_STATE) {
1661     if ($self->{nc} == 0x002D) { # -
1662     !!!cp (145);
1663     $self->{state} = COMMENT_END_DASH_STATE;
1664     !!!next-input-character;
1665     redo A;
1666     } elsif ($self->{nc} == -1) {
1667     !!!cp (146);
1668     !!!parse-error (type => 'unclosed comment');
1669     $self->{state} = DATA_STATE;
1670 wakaba 1.5 $self->{s_kwd} = '';
1671 wakaba 1.1 ## reconsume
1672    
1673     !!!emit ($self->{ct}); # comment
1674    
1675     redo A;
1676     } else {
1677     !!!cp (147);
1678     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1679     $self->{read_until}->($self->{ct}->{data},
1680     q[-],
1681     length $self->{ct}->{data});
1682    
1683     ## Stay in the state
1684     !!!next-input-character;
1685     redo A;
1686     }
1687     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1688     if ($self->{nc} == 0x002D) { # -
1689     !!!cp (148);
1690     $self->{state} = COMMENT_END_STATE;
1691     !!!next-input-character;
1692     redo A;
1693     } elsif ($self->{nc} == -1) {
1694     !!!cp (149);
1695     !!!parse-error (type => 'unclosed comment');
1696 wakaba 1.5 $self->{s_kwd} = '';
1697 wakaba 1.1 $self->{state} = DATA_STATE;
1698 wakaba 1.5 $self->{s_kwd} = '';
1699 wakaba 1.1 ## reconsume
1700    
1701     !!!emit ($self->{ct}); # comment
1702    
1703     redo A;
1704     } else {
1705     !!!cp (150);
1706     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1707     $self->{state} = COMMENT_STATE;
1708     !!!next-input-character;
1709     redo A;
1710     }
1711     } elsif ($self->{state} == COMMENT_END_STATE) {
1712     if ($self->{nc} == 0x003E) { # >
1713     !!!cp (151);
1714     $self->{state} = DATA_STATE;
1715 wakaba 1.5 $self->{s_kwd} = '';
1716 wakaba 1.1 !!!next-input-character;
1717    
1718     !!!emit ($self->{ct}); # comment
1719    
1720     redo A;
1721     } elsif ($self->{nc} == 0x002D) { # -
1722     !!!cp (152);
1723     !!!parse-error (type => 'dash in comment',
1724     line => $self->{line_prev},
1725     column => $self->{column_prev});
1726     $self->{ct}->{data} .= '-'; # comment
1727     ## Stay in the state
1728     !!!next-input-character;
1729     redo A;
1730     } elsif ($self->{nc} == -1) {
1731     !!!cp (153);
1732     !!!parse-error (type => 'unclosed comment');
1733     $self->{state} = DATA_STATE;
1734 wakaba 1.5 $self->{s_kwd} = '';
1735 wakaba 1.1 ## reconsume
1736    
1737     !!!emit ($self->{ct}); # comment
1738    
1739     redo A;
1740     } else {
1741     !!!cp (154);
1742     !!!parse-error (type => 'dash in comment',
1743     line => $self->{line_prev},
1744     column => $self->{column_prev});
1745     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1746     $self->{state} = COMMENT_STATE;
1747     !!!next-input-character;
1748     redo A;
1749     }
1750     } elsif ($self->{state} == DOCTYPE_STATE) {
1751     if ($is_space->{$self->{nc}}) {
1752     !!!cp (155);
1753     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1754     !!!next-input-character;
1755     redo A;
1756     } else {
1757     !!!cp (156);
1758     !!!parse-error (type => 'no space before DOCTYPE name');
1759     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1760     ## reconsume
1761     redo A;
1762     }
1763     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1764     if ($is_space->{$self->{nc}}) {
1765     !!!cp (157);
1766     ## Stay in the state
1767     !!!next-input-character;
1768     redo A;
1769     } elsif ($self->{nc} == 0x003E) { # >
1770     !!!cp (158);
1771     !!!parse-error (type => 'no DOCTYPE name');
1772     $self->{state} = DATA_STATE;
1773 wakaba 1.5 $self->{s_kwd} = '';
1774 wakaba 1.1 !!!next-input-character;
1775    
1776     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1777    
1778     redo A;
1779     } elsif ($self->{nc} == -1) {
1780     !!!cp (159);
1781     !!!parse-error (type => 'no DOCTYPE name');
1782     $self->{state} = DATA_STATE;
1783 wakaba 1.5 $self->{s_kwd} = '';
1784 wakaba 1.1 ## reconsume
1785    
1786     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1787    
1788     redo A;
1789     } else {
1790     !!!cp (160);
1791     $self->{ct}->{name} = chr $self->{nc};
1792     delete $self->{ct}->{quirks};
1793     $self->{state} = DOCTYPE_NAME_STATE;
1794     !!!next-input-character;
1795     redo A;
1796     }
1797     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1798     ## ISSUE: Redundant "First," in the spec.
1799     if ($is_space->{$self->{nc}}) {
1800     !!!cp (161);
1801     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1802     !!!next-input-character;
1803     redo A;
1804     } elsif ($self->{nc} == 0x003E) { # >
1805     !!!cp (162);
1806     $self->{state} = DATA_STATE;
1807 wakaba 1.5 $self->{s_kwd} = '';
1808 wakaba 1.1 !!!next-input-character;
1809    
1810     !!!emit ($self->{ct}); # DOCTYPE
1811    
1812     redo A;
1813     } elsif ($self->{nc} == -1) {
1814     !!!cp (163);
1815     !!!parse-error (type => 'unclosed DOCTYPE');
1816     $self->{state} = DATA_STATE;
1817 wakaba 1.5 $self->{s_kwd} = '';
1818 wakaba 1.1 ## reconsume
1819    
1820     $self->{ct}->{quirks} = 1;
1821     !!!emit ($self->{ct}); # DOCTYPE
1822    
1823     redo A;
1824     } else {
1825     !!!cp (164);
1826     $self->{ct}->{name}
1827     .= chr ($self->{nc}); # DOCTYPE
1828     ## Stay in the state
1829     !!!next-input-character;
1830     redo A;
1831     }
1832     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1833     if ($is_space->{$self->{nc}}) {
1834     !!!cp (165);
1835     ## Stay in the state
1836     !!!next-input-character;
1837     redo A;
1838     } elsif ($self->{nc} == 0x003E) { # >
1839     !!!cp (166);
1840     $self->{state} = DATA_STATE;
1841 wakaba 1.5 $self->{s_kwd} = '';
1842 wakaba 1.1 !!!next-input-character;
1843    
1844     !!!emit ($self->{ct}); # DOCTYPE
1845    
1846     redo A;
1847     } elsif ($self->{nc} == -1) {
1848     !!!cp (167);
1849     !!!parse-error (type => 'unclosed DOCTYPE');
1850     $self->{state} = DATA_STATE;
1851 wakaba 1.5 $self->{s_kwd} = '';
1852 wakaba 1.1 ## reconsume
1853    
1854     $self->{ct}->{quirks} = 1;
1855     !!!emit ($self->{ct}); # DOCTYPE
1856    
1857     redo A;
1858     } elsif ($self->{nc} == 0x0050 or # P
1859     $self->{nc} == 0x0070) { # p
1860     $self->{state} = PUBLIC_STATE;
1861     $self->{s_kwd} = chr $self->{nc};
1862     !!!next-input-character;
1863     redo A;
1864     } elsif ($self->{nc} == 0x0053 or # S
1865     $self->{nc} == 0x0073) { # s
1866     $self->{state} = SYSTEM_STATE;
1867     $self->{s_kwd} = chr $self->{nc};
1868     !!!next-input-character;
1869     redo A;
1870     } else {
1871     !!!cp (180);
1872     !!!parse-error (type => 'string after DOCTYPE name');
1873     $self->{ct}->{quirks} = 1;
1874    
1875     $self->{state} = BOGUS_DOCTYPE_STATE;
1876     !!!next-input-character;
1877     redo A;
1878     }
1879     } elsif ($self->{state} == PUBLIC_STATE) {
1880     ## ASCII case-insensitive
1881     if ($self->{nc} == [
1882     undef,
1883     0x0055, # U
1884     0x0042, # B
1885     0x004C, # L
1886     0x0049, # I
1887     ]->[length $self->{s_kwd}] or
1888     $self->{nc} == [
1889     undef,
1890     0x0075, # u
1891     0x0062, # b
1892     0x006C, # l
1893     0x0069, # i
1894     ]->[length $self->{s_kwd}]) {
1895     !!!cp (175);
1896     ## Stay in the state.
1897     $self->{s_kwd} .= chr $self->{nc};
1898     !!!next-input-character;
1899     redo A;
1900     } elsif ((length $self->{s_kwd}) == 5 and
1901     ($self->{nc} == 0x0043 or # C
1902     $self->{nc} == 0x0063)) { # c
1903     !!!cp (168);
1904     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1905     !!!next-input-character;
1906     redo A;
1907     } else {
1908     !!!cp (169);
1909     !!!parse-error (type => 'string after DOCTYPE name',
1910     line => $self->{line_prev},
1911     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1912     $self->{ct}->{quirks} = 1;
1913    
1914     $self->{state} = BOGUS_DOCTYPE_STATE;
1915     ## Reconsume.
1916     redo A;
1917     }
1918     } elsif ($self->{state} == SYSTEM_STATE) {
1919     ## ASCII case-insensitive
1920     if ($self->{nc} == [
1921     undef,
1922     0x0059, # Y
1923     0x0053, # S
1924     0x0054, # T
1925     0x0045, # E
1926     ]->[length $self->{s_kwd}] or
1927     $self->{nc} == [
1928     undef,
1929     0x0079, # y
1930     0x0073, # s
1931     0x0074, # t
1932     0x0065, # e
1933     ]->[length $self->{s_kwd}]) {
1934     !!!cp (170);
1935     ## Stay in the state.
1936     $self->{s_kwd} .= chr $self->{nc};
1937     !!!next-input-character;
1938     redo A;
1939     } elsif ((length $self->{s_kwd}) == 5 and
1940     ($self->{nc} == 0x004D or # M
1941     $self->{nc} == 0x006D)) { # m
1942     !!!cp (171);
1943     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1944     !!!next-input-character;
1945     redo A;
1946     } else {
1947     !!!cp (172);
1948     !!!parse-error (type => 'string after DOCTYPE name',
1949     line => $self->{line_prev},
1950     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1951     $self->{ct}->{quirks} = 1;
1952    
1953     $self->{state} = BOGUS_DOCTYPE_STATE;
1954     ## Reconsume.
1955     redo A;
1956     }
1957     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1958     if ($is_space->{$self->{nc}}) {
1959     !!!cp (181);
1960     ## Stay in the state
1961     !!!next-input-character;
1962     redo A;
1963     } elsif ($self->{nc} eq 0x0022) { # "
1964     !!!cp (182);
1965     $self->{ct}->{pubid} = ''; # DOCTYPE
1966     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1967     !!!next-input-character;
1968     redo A;
1969     } elsif ($self->{nc} eq 0x0027) { # '
1970     !!!cp (183);
1971     $self->{ct}->{pubid} = ''; # DOCTYPE
1972     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1973     !!!next-input-character;
1974     redo A;
1975     } elsif ($self->{nc} eq 0x003E) { # >
1976     !!!cp (184);
1977     !!!parse-error (type => 'no PUBLIC literal');
1978    
1979     $self->{state} = DATA_STATE;
1980 wakaba 1.5 $self->{s_kwd} = '';
1981 wakaba 1.1 !!!next-input-character;
1982    
1983     $self->{ct}->{quirks} = 1;
1984     !!!emit ($self->{ct}); # DOCTYPE
1985    
1986     redo A;
1987     } elsif ($self->{nc} == -1) {
1988     !!!cp (185);
1989     !!!parse-error (type => 'unclosed DOCTYPE');
1990    
1991     $self->{state} = DATA_STATE;
1992 wakaba 1.5 $self->{s_kwd} = '';
1993 wakaba 1.1 ## reconsume
1994    
1995     $self->{ct}->{quirks} = 1;
1996     !!!emit ($self->{ct}); # DOCTYPE
1997    
1998     redo A;
1999     } else {
2000     !!!cp (186);
2001     !!!parse-error (type => 'string after PUBLIC');
2002     $self->{ct}->{quirks} = 1;
2003    
2004     $self->{state} = BOGUS_DOCTYPE_STATE;
2005     !!!next-input-character;
2006     redo A;
2007     }
2008     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2009     if ($self->{nc} == 0x0022) { # "
2010     !!!cp (187);
2011     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2012     !!!next-input-character;
2013     redo A;
2014     } elsif ($self->{nc} == 0x003E) { # >
2015     !!!cp (188);
2016     !!!parse-error (type => 'unclosed PUBLIC literal');
2017    
2018     $self->{state} = DATA_STATE;
2019 wakaba 1.5 $self->{s_kwd} = '';
2020 wakaba 1.1 !!!next-input-character;
2021    
2022     $self->{ct}->{quirks} = 1;
2023     !!!emit ($self->{ct}); # DOCTYPE
2024    
2025     redo A;
2026     } elsif ($self->{nc} == -1) {
2027     !!!cp (189);
2028     !!!parse-error (type => 'unclosed PUBLIC literal');
2029    
2030     $self->{state} = DATA_STATE;
2031 wakaba 1.5 $self->{s_kwd} = '';
2032 wakaba 1.1 ## reconsume
2033    
2034     $self->{ct}->{quirks} = 1;
2035     !!!emit ($self->{ct}); # DOCTYPE
2036    
2037     redo A;
2038     } else {
2039     !!!cp (190);
2040     $self->{ct}->{pubid} # DOCTYPE
2041     .= chr $self->{nc};
2042     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2043     length $self->{ct}->{pubid});
2044    
2045     ## Stay in the state
2046     !!!next-input-character;
2047     redo A;
2048     }
2049     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2050     if ($self->{nc} == 0x0027) { # '
2051     !!!cp (191);
2052     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2053     !!!next-input-character;
2054     redo A;
2055     } elsif ($self->{nc} == 0x003E) { # >
2056     !!!cp (192);
2057     !!!parse-error (type => 'unclosed PUBLIC literal');
2058    
2059     $self->{state} = DATA_STATE;
2060 wakaba 1.5 $self->{s_kwd} = '';
2061 wakaba 1.1 !!!next-input-character;
2062    
2063     $self->{ct}->{quirks} = 1;
2064     !!!emit ($self->{ct}); # DOCTYPE
2065    
2066     redo A;
2067     } elsif ($self->{nc} == -1) {
2068     !!!cp (193);
2069     !!!parse-error (type => 'unclosed PUBLIC literal');
2070    
2071     $self->{state} = DATA_STATE;
2072 wakaba 1.5 $self->{s_kwd} = '';
2073 wakaba 1.1 ## reconsume
2074    
2075     $self->{ct}->{quirks} = 1;
2076     !!!emit ($self->{ct}); # DOCTYPE
2077    
2078     redo A;
2079     } else {
2080     !!!cp (194);
2081     $self->{ct}->{pubid} # DOCTYPE
2082     .= chr $self->{nc};
2083     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2084     length $self->{ct}->{pubid});
2085    
2086     ## Stay in the state
2087     !!!next-input-character;
2088     redo A;
2089     }
2090     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2091     if ($is_space->{$self->{nc}}) {
2092     !!!cp (195);
2093     ## Stay in the state
2094     !!!next-input-character;
2095     redo A;
2096     } elsif ($self->{nc} == 0x0022) { # "
2097     !!!cp (196);
2098     $self->{ct}->{sysid} = ''; # DOCTYPE
2099     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2100     !!!next-input-character;
2101     redo A;
2102     } elsif ($self->{nc} == 0x0027) { # '
2103     !!!cp (197);
2104     $self->{ct}->{sysid} = ''; # DOCTYPE
2105     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2106     !!!next-input-character;
2107     redo A;
2108     } elsif ($self->{nc} == 0x003E) { # >
2109     !!!cp (198);
2110     $self->{state} = DATA_STATE;
2111 wakaba 1.5 $self->{s_kwd} = '';
2112 wakaba 1.1 !!!next-input-character;
2113    
2114     !!!emit ($self->{ct}); # DOCTYPE
2115    
2116     redo A;
2117     } elsif ($self->{nc} == -1) {
2118     !!!cp (199);
2119     !!!parse-error (type => 'unclosed DOCTYPE');
2120    
2121     $self->{state} = DATA_STATE;
2122 wakaba 1.5 $self->{s_kwd} = '';
2123 wakaba 1.1 ## reconsume
2124    
2125     $self->{ct}->{quirks} = 1;
2126     !!!emit ($self->{ct}); # DOCTYPE
2127    
2128     redo A;
2129     } else {
2130     !!!cp (200);
2131     !!!parse-error (type => 'string after PUBLIC literal');
2132     $self->{ct}->{quirks} = 1;
2133    
2134     $self->{state} = BOGUS_DOCTYPE_STATE;
2135     !!!next-input-character;
2136     redo A;
2137     }
2138     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2139     if ($is_space->{$self->{nc}}) {
2140     !!!cp (201);
2141     ## Stay in the state
2142     !!!next-input-character;
2143     redo A;
2144     } elsif ($self->{nc} == 0x0022) { # "
2145     !!!cp (202);
2146     $self->{ct}->{sysid} = ''; # DOCTYPE
2147     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2148     !!!next-input-character;
2149     redo A;
2150     } elsif ($self->{nc} == 0x0027) { # '
2151     !!!cp (203);
2152     $self->{ct}->{sysid} = ''; # DOCTYPE
2153     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2154     !!!next-input-character;
2155     redo A;
2156     } elsif ($self->{nc} == 0x003E) { # >
2157     !!!cp (204);
2158     !!!parse-error (type => 'no SYSTEM literal');
2159     $self->{state} = DATA_STATE;
2160 wakaba 1.5 $self->{s_kwd} = '';
2161 wakaba 1.1 !!!next-input-character;
2162    
2163     $self->{ct}->{quirks} = 1;
2164     !!!emit ($self->{ct}); # DOCTYPE
2165    
2166     redo A;
2167     } elsif ($self->{nc} == -1) {
2168     !!!cp (205);
2169     !!!parse-error (type => 'unclosed DOCTYPE');
2170    
2171     $self->{state} = DATA_STATE;
2172 wakaba 1.5 $self->{s_kwd} = '';
2173 wakaba 1.1 ## reconsume
2174    
2175     $self->{ct}->{quirks} = 1;
2176     !!!emit ($self->{ct}); # DOCTYPE
2177    
2178     redo A;
2179     } else {
2180     !!!cp (206);
2181     !!!parse-error (type => 'string after SYSTEM');
2182     $self->{ct}->{quirks} = 1;
2183    
2184     $self->{state} = BOGUS_DOCTYPE_STATE;
2185     !!!next-input-character;
2186     redo A;
2187     }
2188     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2189     if ($self->{nc} == 0x0022) { # "
2190     !!!cp (207);
2191     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2192     !!!next-input-character;
2193     redo A;
2194     } elsif ($self->{nc} == 0x003E) { # >
2195     !!!cp (208);
2196     !!!parse-error (type => 'unclosed SYSTEM literal');
2197    
2198     $self->{state} = DATA_STATE;
2199 wakaba 1.5 $self->{s_kwd} = '';
2200 wakaba 1.1 !!!next-input-character;
2201    
2202     $self->{ct}->{quirks} = 1;
2203     !!!emit ($self->{ct}); # DOCTYPE
2204    
2205     redo A;
2206     } elsif ($self->{nc} == -1) {
2207     !!!cp (209);
2208     !!!parse-error (type => 'unclosed SYSTEM literal');
2209    
2210     $self->{state} = DATA_STATE;
2211 wakaba 1.5 $self->{s_kwd} = '';
2212 wakaba 1.1 ## reconsume
2213    
2214     $self->{ct}->{quirks} = 1;
2215     !!!emit ($self->{ct}); # DOCTYPE
2216    
2217     redo A;
2218     } else {
2219     !!!cp (210);
2220     $self->{ct}->{sysid} # DOCTYPE
2221     .= chr $self->{nc};
2222     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2223     length $self->{ct}->{sysid});
2224    
2225     ## Stay in the state
2226     !!!next-input-character;
2227     redo A;
2228     }
2229     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2230     if ($self->{nc} == 0x0027) { # '
2231     !!!cp (211);
2232     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2233     !!!next-input-character;
2234     redo A;
2235     } elsif ($self->{nc} == 0x003E) { # >
2236     !!!cp (212);
2237     !!!parse-error (type => 'unclosed SYSTEM literal');
2238    
2239     $self->{state} = DATA_STATE;
2240 wakaba 1.5 $self->{s_kwd} = '';
2241 wakaba 1.1 !!!next-input-character;
2242    
2243     $self->{ct}->{quirks} = 1;
2244     !!!emit ($self->{ct}); # DOCTYPE
2245    
2246     redo A;
2247     } elsif ($self->{nc} == -1) {
2248     !!!cp (213);
2249     !!!parse-error (type => 'unclosed SYSTEM literal');
2250    
2251     $self->{state} = DATA_STATE;
2252 wakaba 1.5 $self->{s_kwd} = '';
2253 wakaba 1.1 ## reconsume
2254    
2255     $self->{ct}->{quirks} = 1;
2256     !!!emit ($self->{ct}); # DOCTYPE
2257    
2258     redo A;
2259     } else {
2260     !!!cp (214);
2261     $self->{ct}->{sysid} # DOCTYPE
2262     .= chr $self->{nc};
2263     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2264     length $self->{ct}->{sysid});
2265    
2266     ## Stay in the state
2267     !!!next-input-character;
2268     redo A;
2269     }
2270     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2271     if ($is_space->{$self->{nc}}) {
2272     !!!cp (215);
2273     ## Stay in the state
2274     !!!next-input-character;
2275     redo A;
2276     } elsif ($self->{nc} == 0x003E) { # >
2277     !!!cp (216);
2278     $self->{state} = DATA_STATE;
2279 wakaba 1.5 $self->{s_kwd} = '';
2280 wakaba 1.1 !!!next-input-character;
2281    
2282     !!!emit ($self->{ct}); # DOCTYPE
2283    
2284     redo A;
2285     } elsif ($self->{nc} == -1) {
2286     !!!cp (217);
2287     !!!parse-error (type => 'unclosed DOCTYPE');
2288     $self->{state} = DATA_STATE;
2289 wakaba 1.5 $self->{s_kwd} = '';
2290 wakaba 1.1 ## reconsume
2291    
2292     $self->{ct}->{quirks} = 1;
2293     !!!emit ($self->{ct}); # DOCTYPE
2294    
2295     redo A;
2296     } else {
2297     !!!cp (218);
2298     !!!parse-error (type => 'string after SYSTEM literal');
2299     #$self->{ct}->{quirks} = 1;
2300    
2301     $self->{state} = BOGUS_DOCTYPE_STATE;
2302     !!!next-input-character;
2303     redo A;
2304     }
2305     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2306     if ($self->{nc} == 0x003E) { # >
2307     !!!cp (219);
2308     $self->{state} = DATA_STATE;
2309 wakaba 1.5 $self->{s_kwd} = '';
2310 wakaba 1.1 !!!next-input-character;
2311    
2312     !!!emit ($self->{ct}); # DOCTYPE
2313    
2314     redo A;
2315     } elsif ($self->{nc} == -1) {
2316     !!!cp (220);
2317     $self->{state} = DATA_STATE;
2318 wakaba 1.5 $self->{s_kwd} = '';
2319 wakaba 1.1 ## reconsume
2320    
2321     !!!emit ($self->{ct}); # DOCTYPE
2322    
2323     redo A;
2324     } else {
2325     !!!cp (221);
2326     my $s = '';
2327     $self->{read_until}->($s, q[>], 0);
2328    
2329     ## Stay in the state
2330     !!!next-input-character;
2331     redo A;
2332     }
2333     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2334     ## NOTE: "CDATA section state" in the spec is jointly implemented
2335     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2336     ## and |CDATA_SECTION_MSE2_STATE|.
2337    
2338     if ($self->{nc} == 0x005D) { # ]
2339     !!!cp (221.1);
2340     $self->{state} = CDATA_SECTION_MSE1_STATE;
2341     !!!next-input-character;
2342     redo A;
2343     } elsif ($self->{nc} == -1) {
2344 wakaba 1.6 if ($self->{is_xml}) {
2345     !!!parse-error (type => 'no mse'); ## TODO: type
2346     }
2347    
2348 wakaba 1.1 $self->{state} = DATA_STATE;
2349 wakaba 1.5 $self->{s_kwd} = '';
2350 wakaba 1.1 !!!next-input-character;
2351     if (length $self->{ct}->{data}) { # character
2352     !!!cp (221.2);
2353     !!!emit ($self->{ct}); # character
2354     } else {
2355     !!!cp (221.3);
2356     ## No token to emit. $self->{ct} is discarded.
2357     }
2358     redo A;
2359     } else {
2360     !!!cp (221.4);
2361     $self->{ct}->{data} .= chr $self->{nc};
2362     $self->{read_until}->($self->{ct}->{data},
2363     q<]>,
2364     length $self->{ct}->{data});
2365    
2366     ## Stay in the state.
2367     !!!next-input-character;
2368     redo A;
2369     }
2370    
2371     ## ISSUE: "text tokens" in spec.
2372     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2373     if ($self->{nc} == 0x005D) { # ]
2374     !!!cp (221.5);
2375     $self->{state} = CDATA_SECTION_MSE2_STATE;
2376     !!!next-input-character;
2377     redo A;
2378     } else {
2379     !!!cp (221.6);
2380     $self->{ct}->{data} .= ']';
2381     $self->{state} = CDATA_SECTION_STATE;
2382     ## Reconsume.
2383     redo A;
2384     }
2385     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2386     if ($self->{nc} == 0x003E) { # >
2387     $self->{state} = DATA_STATE;
2388 wakaba 1.5 $self->{s_kwd} = '';
2389 wakaba 1.1 !!!next-input-character;
2390     if (length $self->{ct}->{data}) { # character
2391     !!!cp (221.7);
2392     !!!emit ($self->{ct}); # character
2393     } else {
2394     !!!cp (221.8);
2395     ## No token to emit. $self->{ct} is discarded.
2396     }
2397     redo A;
2398     } elsif ($self->{nc} == 0x005D) { # ]
2399     !!!cp (221.9); # character
2400     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2401     ## Stay in the state.
2402     !!!next-input-character;
2403     redo A;
2404     } else {
2405     !!!cp (221.11);
2406     $self->{ct}->{data} .= ']]'; # character
2407     $self->{state} = CDATA_SECTION_STATE;
2408     ## Reconsume.
2409     redo A;
2410     }
2411     } elsif ($self->{state} == ENTITY_STATE) {
2412     if ($is_space->{$self->{nc}} or
2413     {
2414     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2415     $self->{entity_add} => 1,
2416     }->{$self->{nc}}) {
2417     !!!cp (1001);
2418     ## Don't consume
2419     ## No error
2420     ## Return nothing.
2421     #
2422     } elsif ($self->{nc} == 0x0023) { # #
2423     !!!cp (999);
2424     $self->{state} = ENTITY_HASH_STATE;
2425     $self->{s_kwd} = '#';
2426     !!!next-input-character;
2427     redo A;
2428     } elsif ((0x0041 <= $self->{nc} and
2429     $self->{nc} <= 0x005A) or # A..Z
2430     (0x0061 <= $self->{nc} and
2431     $self->{nc} <= 0x007A)) { # a..z
2432     !!!cp (998);
2433     require Whatpm::_NamedEntityList;
2434     $self->{state} = ENTITY_NAME_STATE;
2435     $self->{s_kwd} = chr $self->{nc};
2436     $self->{entity__value} = $self->{s_kwd};
2437     $self->{entity__match} = 0;
2438     !!!next-input-character;
2439     redo A;
2440     } else {
2441     !!!cp (1027);
2442     !!!parse-error (type => 'bare ero');
2443     ## Return nothing.
2444     #
2445     }
2446    
2447     ## NOTE: No character is consumed by the "consume a character
2448     ## reference" algorithm. In other words, there is an "&" character
2449     ## that does not introduce a character reference, which would be
2450     ## appended to the parent element or the attribute value in the later
2451     ## processing of the tokenizer.
2452    
2453     if ($self->{prev_state} == DATA_STATE) {
2454     !!!cp (997);
2455     $self->{state} = $self->{prev_state};
2456 wakaba 1.5 $self->{s_kwd} = '';
2457 wakaba 1.1 ## Reconsume.
2458     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2459     line => $self->{line_prev},
2460     column => $self->{column_prev},
2461     });
2462     redo A;
2463     } else {
2464     !!!cp (996);
2465     $self->{ca}->{value} .= '&';
2466     $self->{state} = $self->{prev_state};
2467 wakaba 1.5 $self->{s_kwd} = '';
2468 wakaba 1.1 ## Reconsume.
2469     redo A;
2470     }
2471     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2472     if ($self->{nc} == 0x0078 or # x
2473     $self->{nc} == 0x0058) { # X
2474     !!!cp (995);
2475     $self->{state} = HEXREF_X_STATE;
2476     $self->{s_kwd} .= chr $self->{nc};
2477     !!!next-input-character;
2478     redo A;
2479     } elsif (0x0030 <= $self->{nc} and
2480     $self->{nc} <= 0x0039) { # 0..9
2481     !!!cp (994);
2482     $self->{state} = NCR_NUM_STATE;
2483     $self->{s_kwd} = $self->{nc} - 0x0030;
2484     !!!next-input-character;
2485     redo A;
2486     } else {
2487     !!!parse-error (type => 'bare nero',
2488     line => $self->{line_prev},
2489     column => $self->{column_prev} - 1);
2490    
2491     ## NOTE: According to the spec algorithm, nothing is returned,
2492     ## and then "&#" is appended to the parent element or the attribute
2493     ## value in the later processing.
2494    
2495     if ($self->{prev_state} == DATA_STATE) {
2496     !!!cp (1019);
2497     $self->{state} = $self->{prev_state};
2498 wakaba 1.5 $self->{s_kwd} = '';
2499 wakaba 1.1 ## Reconsume.
2500     !!!emit ({type => CHARACTER_TOKEN,
2501     data => '&#',
2502     line => $self->{line_prev},
2503     column => $self->{column_prev} - 1,
2504     });
2505     redo A;
2506     } else {
2507     !!!cp (993);
2508     $self->{ca}->{value} .= '&#';
2509     $self->{state} = $self->{prev_state};
2510 wakaba 1.5 $self->{s_kwd} = '';
2511 wakaba 1.1 ## Reconsume.
2512     redo A;
2513     }
2514     }
2515     } elsif ($self->{state} == NCR_NUM_STATE) {
2516     if (0x0030 <= $self->{nc} and
2517     $self->{nc} <= 0x0039) { # 0..9
2518     !!!cp (1012);
2519     $self->{s_kwd} *= 10;
2520     $self->{s_kwd} += $self->{nc} - 0x0030;
2521    
2522     ## Stay in the state.
2523     !!!next-input-character;
2524     redo A;
2525     } elsif ($self->{nc} == 0x003B) { # ;
2526     !!!cp (1013);
2527     !!!next-input-character;
2528     #
2529     } else {
2530     !!!cp (1014);
2531     !!!parse-error (type => 'no refc');
2532     ## Reconsume.
2533     #
2534     }
2535    
2536     my $code = $self->{s_kwd};
2537     my $l = $self->{line_prev};
2538     my $c = $self->{column_prev};
2539     if ($charref_map->{$code}) {
2540     !!!cp (1015);
2541     !!!parse-error (type => 'invalid character reference',
2542     text => (sprintf 'U+%04X', $code),
2543     line => $l, column => $c);
2544     $code = $charref_map->{$code};
2545     } elsif ($code > 0x10FFFF) {
2546     !!!cp (1016);
2547     !!!parse-error (type => 'invalid character reference',
2548     text => (sprintf 'U-%08X', $code),
2549     line => $l, column => $c);
2550     $code = 0xFFFD;
2551     }
2552    
2553     if ($self->{prev_state} == DATA_STATE) {
2554     !!!cp (992);
2555     $self->{state} = $self->{prev_state};
2556 wakaba 1.5 $self->{s_kwd} = '';
2557 wakaba 1.1 ## Reconsume.
2558     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2559     line => $l, column => $c,
2560     });
2561     redo A;
2562     } else {
2563     !!!cp (991);
2564     $self->{ca}->{value} .= chr $code;
2565     $self->{ca}->{has_reference} = 1;
2566     $self->{state} = $self->{prev_state};
2567 wakaba 1.5 $self->{s_kwd} = '';
2568 wakaba 1.1 ## Reconsume.
2569     redo A;
2570     }
2571     } elsif ($self->{state} == HEXREF_X_STATE) {
2572     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2573     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2574     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2575     # 0..9, A..F, a..f
2576     !!!cp (990);
2577     $self->{state} = HEXREF_HEX_STATE;
2578     $self->{s_kwd} = 0;
2579     ## Reconsume.
2580     redo A;
2581     } else {
2582     !!!parse-error (type => 'bare hcro',
2583     line => $self->{line_prev},
2584     column => $self->{column_prev} - 2);
2585    
2586     ## NOTE: According to the spec algorithm, nothing is returned,
2587     ## and then "&#" followed by "X" or "x" is appended to the parent
2588     ## element or the attribute value in the later processing.
2589    
2590     if ($self->{prev_state} == DATA_STATE) {
2591     !!!cp (1005);
2592     $self->{state} = $self->{prev_state};
2593 wakaba 1.5 $self->{s_kwd} = '';
2594 wakaba 1.1 ## Reconsume.
2595     !!!emit ({type => CHARACTER_TOKEN,
2596     data => '&' . $self->{s_kwd},
2597     line => $self->{line_prev},
2598     column => $self->{column_prev} - length $self->{s_kwd},
2599     });
2600     redo A;
2601     } else {
2602     !!!cp (989);
2603     $self->{ca}->{value} .= '&' . $self->{s_kwd};
2604     $self->{state} = $self->{prev_state};
2605 wakaba 1.5 $self->{s_kwd} = '';
2606 wakaba 1.1 ## Reconsume.
2607     redo A;
2608     }
2609     }
2610     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2611     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2612     # 0..9
2613     !!!cp (1002);
2614     $self->{s_kwd} *= 0x10;
2615     $self->{s_kwd} += $self->{nc} - 0x0030;
2616     ## Stay in the state.
2617     !!!next-input-character;
2618     redo A;
2619     } elsif (0x0061 <= $self->{nc} and
2620     $self->{nc} <= 0x0066) { # a..f
2621     !!!cp (1003);
2622     $self->{s_kwd} *= 0x10;
2623     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2624     ## Stay in the state.
2625     !!!next-input-character;
2626     redo A;
2627     } elsif (0x0041 <= $self->{nc} and
2628     $self->{nc} <= 0x0046) { # A..F
2629     !!!cp (1004);
2630     $self->{s_kwd} *= 0x10;
2631     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2632     ## Stay in the state.
2633     !!!next-input-character;
2634     redo A;
2635     } elsif ($self->{nc} == 0x003B) { # ;
2636     !!!cp (1006);
2637     !!!next-input-character;
2638     #
2639     } else {
2640     !!!cp (1007);
2641     !!!parse-error (type => 'no refc',
2642     line => $self->{line},
2643     column => $self->{column});
2644     ## Reconsume.
2645     #
2646     }
2647    
2648     my $code = $self->{s_kwd};
2649     my $l = $self->{line_prev};
2650     my $c = $self->{column_prev};
2651     if ($charref_map->{$code}) {
2652     !!!cp (1008);
2653     !!!parse-error (type => 'invalid character reference',
2654     text => (sprintf 'U+%04X', $code),
2655     line => $l, column => $c);
2656     $code = $charref_map->{$code};
2657     } elsif ($code > 0x10FFFF) {
2658     !!!cp (1009);
2659     !!!parse-error (type => 'invalid character reference',
2660     text => (sprintf 'U-%08X', $code),
2661     line => $l, column => $c);
2662     $code = 0xFFFD;
2663     }
2664    
2665     if ($self->{prev_state} == DATA_STATE) {
2666     !!!cp (988);
2667     $self->{state} = $self->{prev_state};
2668 wakaba 1.5 $self->{s_kwd} = '';
2669 wakaba 1.1 ## Reconsume.
2670     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2671     line => $l, column => $c,
2672     });
2673     redo A;
2674     } else {
2675     !!!cp (987);
2676     $self->{ca}->{value} .= chr $code;
2677     $self->{ca}->{has_reference} = 1;
2678     $self->{state} = $self->{prev_state};
2679 wakaba 1.5 $self->{s_kwd} = '';
2680 wakaba 1.1 ## Reconsume.
2681     redo A;
2682     }
2683     } elsif ($self->{state} == ENTITY_NAME_STATE) {
2684     if (length $self->{s_kwd} < 30 and
2685     ## NOTE: Some number greater than the maximum length of entity name
2686     ((0x0041 <= $self->{nc} and # A
2687     $self->{nc} <= 0x005A) or # Z
2688     (0x0061 <= $self->{nc} and # a
2689     $self->{nc} <= 0x007A) or # z
2690     (0x0030 <= $self->{nc} and # 0
2691     $self->{nc} <= 0x0039) or # 9
2692     $self->{nc} == 0x003B)) { # ;
2693     our $EntityChar;
2694     $self->{s_kwd} .= chr $self->{nc};
2695     if (defined $EntityChar->{$self->{s_kwd}}) {
2696     if ($self->{nc} == 0x003B) { # ;
2697     !!!cp (1020);
2698     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2699     $self->{entity__match} = 1;
2700     !!!next-input-character;
2701     #
2702     } else {
2703     !!!cp (1021);
2704     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2705     $self->{entity__match} = -1;
2706     ## Stay in the state.
2707     !!!next-input-character;
2708     redo A;
2709     }
2710     } else {
2711     !!!cp (1022);
2712     $self->{entity__value} .= chr $self->{nc};
2713     $self->{entity__match} *= 2;
2714     ## Stay in the state.
2715     !!!next-input-character;
2716     redo A;
2717     }
2718     }
2719    
2720     my $data;
2721     my $has_ref;
2722     if ($self->{entity__match} > 0) {
2723     !!!cp (1023);
2724     $data = $self->{entity__value};
2725     $has_ref = 1;
2726     #
2727     } elsif ($self->{entity__match} < 0) {
2728     !!!parse-error (type => 'no refc');
2729     if ($self->{prev_state} != DATA_STATE and # in attribute
2730     $self->{entity__match} < -1) {
2731     !!!cp (1024);
2732     $data = '&' . $self->{s_kwd};
2733     #
2734     } else {
2735     !!!cp (1025);
2736     $data = $self->{entity__value};
2737     $has_ref = 1;
2738     #
2739     }
2740     } else {
2741     !!!cp (1026);
2742     !!!parse-error (type => 'bare ero',
2743     line => $self->{line_prev},
2744     column => $self->{column_prev} - length $self->{s_kwd});
2745     $data = '&' . $self->{s_kwd};
2746     #
2747     }
2748    
2749     ## NOTE: In these cases, when a character reference is found,
2750     ## it is consumed and a character token is returned, or, otherwise,
2751     ## nothing is consumed and returned, according to the spec algorithm.
2752     ## In this implementation, anything that has been examined by the
2753     ## tokenizer is appended to the parent element or the attribute value
2754     ## as string, either literal string when no character reference or
2755     ## entity-replaced string otherwise, in this stage, since any characters
2756     ## that would not be consumed are appended in the data state or in an
2757     ## appropriate attribute value state anyway.
2758    
2759     if ($self->{prev_state} == DATA_STATE) {
2760     !!!cp (986);
2761     $self->{state} = $self->{prev_state};
2762 wakaba 1.5 $self->{s_kwd} = '';
2763 wakaba 1.1 ## Reconsume.
2764     !!!emit ({type => CHARACTER_TOKEN,
2765     data => $data,
2766     line => $self->{line_prev},
2767     column => $self->{column_prev} + 1 - length $self->{s_kwd},
2768     });
2769     redo A;
2770     } else {
2771     !!!cp (985);
2772     $self->{ca}->{value} .= $data;
2773     $self->{ca}->{has_reference} = 1 if $has_ref;
2774     $self->{state} = $self->{prev_state};
2775 wakaba 1.5 $self->{s_kwd} = '';
2776 wakaba 1.1 ## Reconsume.
2777     redo A;
2778     }
2779     } else {
2780     die "$0: $self->{state}: Unknown state";
2781     }
2782     } # A
2783    
2784     die "$0: _get_next_token: unexpected case";
2785     } # _get_next_token
2786    
2787     1;
2788 wakaba 1.6 ## $Date: 2008/10/14 14:38:59 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24