/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log


Revision 1.5 - (hide annotations) (download) (as text)
Tue Oct 14 14:38:59 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.4: +109 -9 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	14 Oct 2008 14:21:51 -0000
	* XML-Parser.t: "xml/texts-1.dat" added.

	* tokenizer-test-2.dat: Test for ]]> are added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	14 Oct 2008 14:38:34 -0000
	* doctypes-1.dat: Wrong results fixed.

	* texts-1.dat: New test data file.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 14:22:43 -0000
	* Tokenizer.pm.src: Raise a parse error for XML "]]>" other than
	CDATA section end.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
      ## This package holds the token type constants and offers them for
      ## import, either by name or all together through the |:token| tag.
2     use strict;
      ## The version number is built from the digit groups of the CVS
      ## |Revision| keyword: the first one as-is, each following one
      ## zero-padded to two digits.
3 wakaba 1.5 our $VERSION=do{my @r=(q$Revision: 1.4 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
      ## Runs at compile time.  A later |BEGIN| block in this file calls
      ## |import| on this package, which needs |@ISA| and the export lists
      ## below to be set up already.
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
      ## Only |@EXPORT_OK| is filled in here, so names are exported on
      ## request only.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
      ## The |:token| tag selects the same eight names as |@EXPORT_OK|.
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
      ## Each constant is a sub with an empty prototype returning a literal
      ## number; the tokenizer compares these against the |type| field of a
      ## token.
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
       ## From here on the code is compiled into the |Whatpm::HTML| package.
46    
       ## Import the token type constants at compile time so that the bare
       ## names (e.g. |CHARACTER_TOKEN|) can be used below.
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
       ## The three |CM_*| constants are single bits.  The four
       ## |*_CONTENT_MODEL| constants are combinations of those bits, and
       ## the tokenizer tests |$self->{content_model}| against the bits
       ## with |&|.
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 }
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
       ## Numeric identifiers stored in |$self->{state}| and compared with
       ## |==| by the tokenizer.  The values 1 and 12 are not assigned: the
       ## subs that would carry them are commented out below.
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
       ## The states from here on have no state of their own in the spec;
       ## the trailing comment names the spec state each one is a part of.
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117     ## Tree constructor state constants (see Whatpm::HTML for the full
118     ## list and descriptions)
        ## NOTE: Both literals below denote the same number (2048, i.e. only
        ## bit 11 set); the underscore is merely a digit separator.  Neither
        ## constant is referenced in the part of the file visible here.
119    
120     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121     sub FOREIGN_EL () { 0b1_00000000000 }
122    
123     ## Character reference mappings
        ## |$charref_map| is a hash reference that maps a code point to the
        ## code point replacing it.  It is not referenced in the part of the
        ## file visible here; presumably the numeric character reference
        ## states consult it (TODO confirm).
124    
125     my $charref_map = {
        ## CARRIAGE RETURN is replaced by LINE FEED.
126     0x0D => 0x000A,
        ## 0x80..0x9F: looks like the Windows-1252 reading of these values,
        ## with U+FFFD for the unassigned ones (TODO confirm).
127     0x80 => 0x20AC,
128     0x81 => 0xFFFD,
129     0x82 => 0x201A,
130     0x83 => 0x0192,
131     0x84 => 0x201E,
132     0x85 => 0x2026,
133     0x86 => 0x2020,
134     0x87 => 0x2021,
135     0x88 => 0x02C6,
136     0x89 => 0x2030,
137     0x8A => 0x0160,
138     0x8B => 0x2039,
139     0x8C => 0x0152,
140     0x8D => 0xFFFD,
141     0x8E => 0x017D,
142     0x8F => 0xFFFD,
143     0x90 => 0xFFFD,
144     0x91 => 0x2018,
145     0x92 => 0x2019,
146     0x93 => 0x201C,
147     0x94 => 0x201D,
148     0x95 => 0x2022,
149     0x96 => 0x2013,
150     0x97 => 0x2014,
151     0x98 => 0x02DC,
152     0x99 => 0x2122,
153     0x9A => 0x0161,
154     0x9B => 0x203A,
155     0x9C => 0x0153,
156     0x9D => 0xFFFD,
157     0x9E => 0x017E,
158     0x9F => 0x0178,
159     }; # $charref_map
        ## Every code point listed below is mapped to U+FFFD.  None of them
        ## is a key of the literal above, so no explicit entry is
        ## overwritten by this loop.
160     $charref_map->{$_} = 0xFFFD
161     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168    
169     ## Implementations MUST act as if they used the state machine in the spec
170    
171     sub _initialize_tokenizer ($) {
        ## Resets the tokenizer fields of the object passed as the only
        ## argument and then reads the first input character.  There is no
        ## explicit |return| statement.
172     my $self = shift;
173    
174     ## NOTE: Fields set by |new| constructor:
175     #$self->{level}
176     #$self->{set_nc}
177     #$self->{parse_error}
178 wakaba 1.3 #$self->{is_xml} (if XML)
179 wakaba 1.1
180     $self->{state} = DATA_STATE; # MUST
181 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
182 wakaba 1.1 #$self->{entity__value}; # initialized when used
183     #$self->{entity__match}; # initialized when used
184     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185     undef $self->{ct}; # current token
186     undef $self->{ca}; # current attribute
187     undef $self->{last_stag_name}; # last emitted start tag name
188     #$self->{prev_state}; # initialized when used
189     delete $self->{self_closing};
190     $self->{char_buffer} = '';
191     $self->{char_buffer_pos} = 0;
        ## -1 is the value the tokenizer tests for to detect end of input.
192     $self->{nc} = -1; # next input character
193     #$self->{next_nc}
        ## NOTE: Lines starting with |!!!| are not Perl; presumably they are
        ## macros expanded when the .pm file is generated from this .src
        ## file (the expansion is not visible here).
194     !!!next-input-character;
        ## Queue of pending tokens; |_get_next_token| returns tokens from
        ## this array before it tokenizes any further input.
195     $self->{token} = [];
        ## NOTE: |$self->{escape}| is not reset by this sub.
196     # $self->{escape}
197     } # _initialize_tokenizer
198    
199     ## A token has:
200     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
201     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
202     ## ->{name} (DOCTYPE_TOKEN)
203     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
204     ## ->{pubid} (DOCTYPE_TOKEN)
205     ## ->{sysid} (DOCTYPE_TOKEN)
206     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
207     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
208     ## ->{name}
209     ## ->{value}
210     ## ->{has_reference} == 1 or 0
211     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
212     ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
213     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
214     ## while the token is pushed back to the stack.
215    
216     ## Emitted token MUST immediately be handled by the tree construction state.
217    
218     ## Before each step, UA MAY check to see if either one of the scripts in
219     ## "list of scripts that will execute as soon as possible" or the first
220     ## script in the "list of scripts that will execute asynchronously",
221     ## has completed loading. If one has, then it MUST be executed
222     ## and removed from the list.
223    
224     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
225     ## (This requirement was dropped from HTML5 spec, unfortunately.)
226    
227     my $is_space = {
        ## Set of code points the tokenizer treats as white space; it is
        ## looked up as |$is_space->{$self->{nc}}|.  The entries for VT and
        ## CR are commented out, so those two are not members of the set.
228     0x0009 => 1, # CHARACTER TABULATION (HT)
229     0x000A => 1, # LINE FEED (LF)
230     #0x000B => 0, # LINE TABULATION (VT)
231     0x000C => 1, # FORM FEED (FF)
232     #0x000D => 1, # CARRIAGE RETURN (CR)
233     0x0020 => 1, # SPACE (SP)
234     };
235    
236     sub _get_next_token ($) {
237     my $self = shift;
238    
239     if ($self->{self_closing}) {
240     !!!parse-error (type => 'nestc', token => $self->{ct});
241     ## NOTE: The |self_closing| flag is only set by start tag token.
242     ## In addition, when a start tag token is emitted, it is always set to
243     ## |ct|.
244     delete $self->{self_closing};
245     }
246    
247     if (@{$self->{token}}) {
248     $self->{self_closing} = $self->{token}->[0]->{self_closing};
249     return shift @{$self->{token}};
250     }
251    
252     A: {
253     if ($self->{state} == PCDATA_STATE) {
254     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
255    
256     if ($self->{nc} == 0x0026) { # &
257     !!!cp (0.1);
258     ## NOTE: In the spec, the tokenizer is switched to the
259     ## "entity data state". In this implementation, the tokenizer
260     ## is switched to the |ENTITY_STATE|, which is an implementation
261     ## of the "consume a character reference" algorithm.
262     $self->{entity_add} = -1;
263     $self->{prev_state} = DATA_STATE;
264     $self->{state} = ENTITY_STATE;
265     !!!next-input-character;
266     redo A;
267     } elsif ($self->{nc} == 0x003C) { # <
268     !!!cp (0.2);
269     $self->{state} = TAG_OPEN_STATE;
270     !!!next-input-character;
271     redo A;
272     } elsif ($self->{nc} == -1) {
273     !!!cp (0.3);
274     !!!emit ({type => END_OF_FILE_TOKEN,
275     line => $self->{line}, column => $self->{column}});
276     last A; ## TODO: ok?
277     } else {
278     !!!cp (0.4);
279     #
280     }
281    
282     # Anything else
283     my $token = {type => CHARACTER_TOKEN,
284     data => chr $self->{nc},
285     line => $self->{line}, column => $self->{column},
286     };
287     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
288    
289     ## Stay in the state.
290     !!!next-input-character;
291     !!!emit ($token);
292     redo A;
293     } elsif ($self->{state} == DATA_STATE) {
294     $self->{s_kwd} = '' unless defined $self->{s_kwd};
295     if ($self->{nc} == 0x0026) { # &
296     $self->{s_kwd} = '';
297     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
298     not $self->{escape}) {
299     !!!cp (1);
300     ## NOTE: In the spec, the tokenizer is switched to the
301     ## "entity data state". In this implementation, the tokenizer
302     ## is switched to the |ENTITY_STATE|, which is an implementation
303     ## of the "consume a character reference" algorithm.
304     $self->{entity_add} = -1;
305     $self->{prev_state} = DATA_STATE;
306     $self->{state} = ENTITY_STATE;
307     !!!next-input-character;
308     redo A;
309     } else {
310     !!!cp (2);
311     #
312     }
313     } elsif ($self->{nc} == 0x002D) { # -
314     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
315 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
316 wakaba 1.1 !!!cp (3);
317     $self->{escape} = 1; # unless $self->{escape};
318     $self->{s_kwd} = '--';
319     #
320 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
321 wakaba 1.1 !!!cp (4);
322     $self->{s_kwd} = '--';
323     #
324 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
325     !!!cp (4.1);
326     $self->{s_kwd} .= '-';
327     #
328 wakaba 1.1 } else {
329     !!!cp (5);
330 wakaba 1.5 $self->{s_kwd} = '-';
331 wakaba 1.1 #
332     }
333     }
334    
335     #
336     } elsif ($self->{nc} == 0x0021) { # !
337     if (length $self->{s_kwd}) {
338     !!!cp (5.1);
339     $self->{s_kwd} .= '!';
340     #
341     } else {
342     !!!cp (5.2);
343     #$self->{s_kwd} = '';
344     #
345     }
346     #
347     } elsif ($self->{nc} == 0x003C) { # <
348     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
349     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
350     not $self->{escape})) {
351     !!!cp (6);
352     $self->{state} = TAG_OPEN_STATE;
353     !!!next-input-character;
354     redo A;
355     } else {
356     !!!cp (7);
357     $self->{s_kwd} = '';
358     #
359     }
360     } elsif ($self->{nc} == 0x003E) { # >
361     if ($self->{escape} and
362     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
363     if ($self->{s_kwd} eq '--') {
364     !!!cp (8);
365     delete $self->{escape};
366 wakaba 1.5 #
367 wakaba 1.1 } else {
368     !!!cp (9);
369 wakaba 1.5 #
370 wakaba 1.1 }
371 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
372     !!!cp (9.1);
373     !!!parse-error (type => 'unmatched mse', ## TODO: type
374     line => $self->{line_prev},
375     column => $self->{column_prev} - 1);
376     #
377 wakaba 1.1 } else {
378     !!!cp (10);
379 wakaba 1.5 #
380 wakaba 1.1 }
381    
382     $self->{s_kwd} = '';
383     #
384 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
385     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
386     !!!cp (10.1);
387     $self->{s_kwd} .= ']';
388     } elsif ($self->{s_kwd} eq ']]') {
389     !!!cp (10.2);
390     #
391     } else {
392     !!!cp (10.3);
393     $self->{s_kwd} = '';
394     }
395     #
396 wakaba 1.1 } elsif ($self->{nc} == -1) {
397     !!!cp (11);
398     $self->{s_kwd} = '';
399     !!!emit ({type => END_OF_FILE_TOKEN,
400     line => $self->{line}, column => $self->{column}});
401     last A; ## TODO: ok?
402     } else {
403     !!!cp (12);
404     $self->{s_kwd} = '';
405     #
406     }
407    
408     # Anything else
409     my $token = {type => CHARACTER_TOKEN,
410     data => chr $self->{nc},
411     line => $self->{line}, column => $self->{column},
412     };
413 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
414 wakaba 1.1 length $token->{data})) {
415     $self->{s_kwd} = '';
416     }
417    
418     ## Stay in the data state.
419 wakaba 1.5 if (not $self->{is_xml} and
420     $self->{content_model} == PCDATA_CONTENT_MODEL) {
421 wakaba 1.1 !!!cp (13);
422     $self->{state} = PCDATA_STATE;
423     } else {
424     !!!cp (14);
425     ## Stay in the state.
426     }
427     !!!next-input-character;
428     !!!emit ($token);
429     redo A;
430     } elsif ($self->{state} == TAG_OPEN_STATE) {
431     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
432     if ($self->{nc} == 0x002F) { # /
433     !!!cp (15);
434     !!!next-input-character;
435     $self->{state} = CLOSE_TAG_OPEN_STATE;
436     redo A;
437     } elsif ($self->{nc} == 0x0021) { # !
438     !!!cp (15.1);
439     $self->{s_kwd} = '<' unless $self->{escape};
440     #
441     } else {
442     !!!cp (16);
443     #
444     }
445    
446     ## reconsume
447     $self->{state} = DATA_STATE;
448 wakaba 1.5 $self->{s_kwd} = '';
449 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN, data => '<',
450     line => $self->{line_prev},
451     column => $self->{column_prev},
452     });
453     redo A;
454     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
455     if ($self->{nc} == 0x0021) { # !
456     !!!cp (17);
457     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
458     !!!next-input-character;
459     redo A;
460     } elsif ($self->{nc} == 0x002F) { # /
461     !!!cp (18);
462     $self->{state} = CLOSE_TAG_OPEN_STATE;
463     !!!next-input-character;
464     redo A;
465     } elsif (0x0041 <= $self->{nc} and
466     $self->{nc} <= 0x005A) { # A..Z
467     !!!cp (19);
468     $self->{ct}
469     = {type => START_TAG_TOKEN,
470 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
471 wakaba 1.1 line => $self->{line_prev},
472     column => $self->{column_prev}};
473     $self->{state} = TAG_NAME_STATE;
474     !!!next-input-character;
475     redo A;
476     } elsif (0x0061 <= $self->{nc} and
477     $self->{nc} <= 0x007A) { # a..z
478     !!!cp (20);
479     $self->{ct} = {type => START_TAG_TOKEN,
480     tag_name => chr ($self->{nc}),
481     line => $self->{line_prev},
482     column => $self->{column_prev}};
483     $self->{state} = TAG_NAME_STATE;
484     !!!next-input-character;
485     redo A;
486     } elsif ($self->{nc} == 0x003E) { # >
487     !!!cp (21);
488     !!!parse-error (type => 'empty start tag',
489     line => $self->{line_prev},
490     column => $self->{column_prev});
491     $self->{state} = DATA_STATE;
492 wakaba 1.5 $self->{s_kwd} = '';
493 wakaba 1.1 !!!next-input-character;
494    
495     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
496     line => $self->{line_prev},
497     column => $self->{column_prev},
498     });
499    
500     redo A;
501     } elsif ($self->{nc} == 0x003F) { # ?
502     !!!cp (22);
503     !!!parse-error (type => 'pio',
504     line => $self->{line_prev},
505     column => $self->{column_prev});
506     $self->{state} = BOGUS_COMMENT_STATE;
507     $self->{ct} = {type => COMMENT_TOKEN, data => '',
508     line => $self->{line_prev},
509     column => $self->{column_prev},
510     };
511     ## $self->{nc} is intentionally left as is
512     redo A;
513     } else {
514     !!!cp (23);
515     !!!parse-error (type => 'bare stago',
516     line => $self->{line_prev},
517     column => $self->{column_prev});
518     $self->{state} = DATA_STATE;
519 wakaba 1.5 $self->{s_kwd} = '';
520 wakaba 1.1 ## reconsume
521    
522     !!!emit ({type => CHARACTER_TOKEN, data => '<',
523     line => $self->{line_prev},
524     column => $self->{column_prev},
525     });
526    
527     redo A;
528     }
529     } else {
530     die "$0: $self->{content_model} in tag open";
531     }
532     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
533     ## NOTE: The "close tag open state" in the spec is implemented as
534     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
535    
536     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
537     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
538     if (defined $self->{last_stag_name}) {
539     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
540     $self->{s_kwd} = '';
541     ## Reconsume.
542     redo A;
543     } else {
544     ## No start tag token has ever been emitted
545     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
546     !!!cp (28);
547     $self->{state} = DATA_STATE;
548 wakaba 1.5 $self->{s_kwd} = '';
549 wakaba 1.1 ## Reconsume.
550     !!!emit ({type => CHARACTER_TOKEN, data => '</',
551     line => $l, column => $c,
552     });
553     redo A;
554     }
555     }
556    
557     if (0x0041 <= $self->{nc} and
558     $self->{nc} <= 0x005A) { # A..Z
559     !!!cp (29);
560     $self->{ct}
561     = {type => END_TAG_TOKEN,
562 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
563 wakaba 1.1 line => $l, column => $c};
564     $self->{state} = TAG_NAME_STATE;
565     !!!next-input-character;
566     redo A;
567     } elsif (0x0061 <= $self->{nc} and
568     $self->{nc} <= 0x007A) { # a..z
569     !!!cp (30);
570     $self->{ct} = {type => END_TAG_TOKEN,
571     tag_name => chr ($self->{nc}),
572     line => $l, column => $c};
573     $self->{state} = TAG_NAME_STATE;
574     !!!next-input-character;
575     redo A;
576     } elsif ($self->{nc} == 0x003E) { # >
577     !!!cp (31);
578     !!!parse-error (type => 'empty end tag',
579     line => $self->{line_prev}, ## "<" in "</>"
580     column => $self->{column_prev} - 1);
581     $self->{state} = DATA_STATE;
582 wakaba 1.5 $self->{s_kwd} = '';
583 wakaba 1.1 !!!next-input-character;
584     redo A;
585     } elsif ($self->{nc} == -1) {
586     !!!cp (32);
587     !!!parse-error (type => 'bare etago');
588 wakaba 1.5 $self->{s_kwd} = '';
589 wakaba 1.1 $self->{state} = DATA_STATE;
590     # reconsume
591    
592     !!!emit ({type => CHARACTER_TOKEN, data => '</',
593     line => $l, column => $c,
594     });
595    
596     redo A;
597     } else {
598     !!!cp (33);
599     !!!parse-error (type => 'bogus end tag');
600     $self->{state} = BOGUS_COMMENT_STATE;
601     $self->{ct} = {type => COMMENT_TOKEN, data => '',
602     line => $self->{line_prev}, # "<" of "</"
603     column => $self->{column_prev} - 1,
604     };
605     ## NOTE: $self->{nc} is intentionally left as is.
606     ## Although the "anything else" case of the spec not explicitly
607     ## states that the next input character is to be reconsumed,
608     ## it will be included to the |data| of the comment token
609     ## generated from the bogus end tag, as defined in the
610     ## "bogus comment state" entry.
611     redo A;
612     }
613     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
614     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
615     if (length $ch) {
616     my $CH = $ch;
617     $ch =~ tr/a-z/A-Z/;
618     my $nch = chr $self->{nc};
619     if ($nch eq $ch or $nch eq $CH) {
620     !!!cp (24);
621     ## Stay in the state.
622     $self->{s_kwd} .= $nch;
623     !!!next-input-character;
624     redo A;
625     } else {
626     !!!cp (25);
627     $self->{state} = DATA_STATE;
628 wakaba 1.5 $self->{s_kwd} = '';
629 wakaba 1.1 ## Reconsume.
630     !!!emit ({type => CHARACTER_TOKEN,
631     data => '</' . $self->{s_kwd},
632     line => $self->{line_prev},
633     column => $self->{column_prev} - 1 - length $self->{s_kwd},
634     });
635     redo A;
636     }
637     } else { # after "<{tag-name}"
638     unless ($is_space->{$self->{nc}} or
639     {
640     0x003E => 1, # >
641     0x002F => 1, # /
642     -1 => 1, # EOF
643     }->{$self->{nc}}) {
644     !!!cp (26);
645     ## Reconsume.
646     $self->{state} = DATA_STATE;
647 wakaba 1.5 $self->{s_kwd} = '';
648 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
649     data => '</' . $self->{s_kwd},
650     line => $self->{line_prev},
651     column => $self->{column_prev} - 1 - length $self->{s_kwd},
652     });
653     redo A;
654     } else {
655     !!!cp (27);
656     $self->{ct}
657     = {type => END_TAG_TOKEN,
658     tag_name => $self->{last_stag_name},
659     line => $self->{line_prev},
660     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
661     $self->{state} = TAG_NAME_STATE;
662     ## Reconsume.
663     redo A;
664     }
665     }
666     } elsif ($self->{state} == TAG_NAME_STATE) {
667     if ($is_space->{$self->{nc}}) {
668     !!!cp (34);
669     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
670     !!!next-input-character;
671     redo A;
672     } elsif ($self->{nc} == 0x003E) { # >
673     if ($self->{ct}->{type} == START_TAG_TOKEN) {
674     !!!cp (35);
675     $self->{last_stag_name} = $self->{ct}->{tag_name};
676     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
677     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
678     #if ($self->{ct}->{attributes}) {
679     # ## NOTE: This should never be reached.
680     # !!! cp (36);
681     # !!! parse-error (type => 'end tag attribute');
682     #} else {
683     !!!cp (37);
684     #}
685     } else {
686     die "$0: $self->{ct}->{type}: Unknown token type";
687     }
688     $self->{state} = DATA_STATE;
689 wakaba 1.5 $self->{s_kwd} = '';
690 wakaba 1.1 !!!next-input-character;
691    
692     !!!emit ($self->{ct}); # start tag or end tag
693    
694     redo A;
695     } elsif (0x0041 <= $self->{nc} and
696     $self->{nc} <= 0x005A) { # A..Z
697     !!!cp (38);
698 wakaba 1.4 $self->{ct}->{tag_name}
699     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
700 wakaba 1.1 # start tag or end tag
701     ## Stay in this state
702     !!!next-input-character;
703     redo A;
704     } elsif ($self->{nc} == -1) {
705     !!!parse-error (type => 'unclosed tag');
706     if ($self->{ct}->{type} == START_TAG_TOKEN) {
707     !!!cp (39);
708     $self->{last_stag_name} = $self->{ct}->{tag_name};
709     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
710     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
711     #if ($self->{ct}->{attributes}) {
712     # ## NOTE: This state should never be reached.
713     # !!! cp (40);
714     # !!! parse-error (type => 'end tag attribute');
715     #} else {
716     !!!cp (41);
717     #}
718     } else {
719     die "$0: $self->{ct}->{type}: Unknown token type";
720     }
721     $self->{state} = DATA_STATE;
722 wakaba 1.5 $self->{s_kwd} = '';
723 wakaba 1.1 # reconsume
724    
725     !!!emit ($self->{ct}); # start tag or end tag
726    
727     redo A;
728     } elsif ($self->{nc} == 0x002F) { # /
729     !!!cp (42);
730     $self->{state} = SELF_CLOSING_START_TAG_STATE;
731     !!!next-input-character;
732     redo A;
733     } else {
734     !!!cp (44);
735     $self->{ct}->{tag_name} .= chr $self->{nc};
736     # start tag or end tag
737     ## Stay in the state
738     !!!next-input-character;
739     redo A;
740     }
741     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
742     if ($is_space->{$self->{nc}}) {
743     !!!cp (45);
744     ## Stay in the state
745     !!!next-input-character;
746     redo A;
747     } elsif ($self->{nc} == 0x003E) { # >
748     if ($self->{ct}->{type} == START_TAG_TOKEN) {
749     !!!cp (46);
750     $self->{last_stag_name} = $self->{ct}->{tag_name};
751     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
752     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
753     if ($self->{ct}->{attributes}) {
754     !!!cp (47);
755     !!!parse-error (type => 'end tag attribute');
756     } else {
757     !!!cp (48);
758     }
759     } else {
760     die "$0: $self->{ct}->{type}: Unknown token type";
761     }
762     $self->{state} = DATA_STATE;
763 wakaba 1.5 $self->{s_kwd} = '';
764 wakaba 1.1 !!!next-input-character;
765    
766     !!!emit ($self->{ct}); # start tag or end tag
767    
768     redo A;
769     } elsif (0x0041 <= $self->{nc} and
770     $self->{nc} <= 0x005A) { # A..Z
771     !!!cp (49);
772     $self->{ca}
773 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
774 wakaba 1.1 value => '',
775     line => $self->{line}, column => $self->{column}};
776     $self->{state} = ATTRIBUTE_NAME_STATE;
777     !!!next-input-character;
778     redo A;
779     } elsif ($self->{nc} == 0x002F) { # /
780     !!!cp (50);
781     $self->{state} = SELF_CLOSING_START_TAG_STATE;
782     !!!next-input-character;
783     redo A;
784     } elsif ($self->{nc} == -1) {
785     !!!parse-error (type => 'unclosed tag');
786     if ($self->{ct}->{type} == START_TAG_TOKEN) {
787     !!!cp (52);
788     $self->{last_stag_name} = $self->{ct}->{tag_name};
789     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
790     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
791     if ($self->{ct}->{attributes}) {
792     !!!cp (53);
793     !!!parse-error (type => 'end tag attribute');
794     } else {
795     !!!cp (54);
796     }
797     } else {
798     die "$0: $self->{ct}->{type}: Unknown token type";
799     }
800     $self->{state} = DATA_STATE;
801 wakaba 1.5 $self->{s_kwd} = '';
802 wakaba 1.1 # reconsume
803    
804     !!!emit ($self->{ct}); # start tag or end tag
805    
806     redo A;
807     } else {
808     if ({
809     0x0022 => 1, # "
810     0x0027 => 1, # '
811     0x003D => 1, # =
812     }->{$self->{nc}}) {
813     !!!cp (55);
814     !!!parse-error (type => 'bad attribute name');
815     } else {
816     !!!cp (56);
817     }
818     $self->{ca}
819     = {name => chr ($self->{nc}),
820     value => '',
821     line => $self->{line}, column => $self->{column}};
822     $self->{state} = ATTRIBUTE_NAME_STATE;
823     !!!next-input-character;
824     redo A;
825     }
826     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
827     my $before_leave = sub {
828     if (exists $self->{ct}->{attributes} # start tag or end tag
829     ->{$self->{ca}->{name}}) { # MUST
830     !!!cp (57);
831     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
832     ## Discard $self->{ca} # MUST
833     } else {
834     !!!cp (58);
835     $self->{ct}->{attributes}->{$self->{ca}->{name}}
836     = $self->{ca};
837     }
838     }; # $before_leave
839    
840     if ($is_space->{$self->{nc}}) {
841     !!!cp (59);
842     $before_leave->();
843     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
844     !!!next-input-character;
845     redo A;
846     } elsif ($self->{nc} == 0x003D) { # =
847     !!!cp (60);
848     $before_leave->();
849     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
850     !!!next-input-character;
851     redo A;
852     } elsif ($self->{nc} == 0x003E) { # >
853     $before_leave->();
854     if ($self->{ct}->{type} == START_TAG_TOKEN) {
855     !!!cp (61);
856     $self->{last_stag_name} = $self->{ct}->{tag_name};
857     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
858     !!!cp (62);
859     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
860     if ($self->{ct}->{attributes}) {
861     !!!parse-error (type => 'end tag attribute');
862     }
863     } else {
864     die "$0: $self->{ct}->{type}: Unknown token type";
865     }
866     $self->{state} = DATA_STATE;
867 wakaba 1.5 $self->{s_kwd} = '';
868 wakaba 1.1 !!!next-input-character;
869    
870     !!!emit ($self->{ct}); # start tag or end tag
871    
872     redo A;
873     } elsif (0x0041 <= $self->{nc} and
874     $self->{nc} <= 0x005A) { # A..Z
875     !!!cp (63);
876 wakaba 1.4 $self->{ca}->{name}
877     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
878 wakaba 1.1 ## Stay in the state
879     !!!next-input-character;
880     redo A;
881     } elsif ($self->{nc} == 0x002F) { # /
882     !!!cp (64);
883     $before_leave->();
884     $self->{state} = SELF_CLOSING_START_TAG_STATE;
885     !!!next-input-character;
886     redo A;
887     } elsif ($self->{nc} == -1) {
888     !!!parse-error (type => 'unclosed tag');
889     $before_leave->();
890     if ($self->{ct}->{type} == START_TAG_TOKEN) {
891     !!!cp (66);
892     $self->{last_stag_name} = $self->{ct}->{tag_name};
893     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
894     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
895     if ($self->{ct}->{attributes}) {
896     !!!cp (67);
897     !!!parse-error (type => 'end tag attribute');
898     } else {
899     ## NOTE: This state should never be reached.
900     !!!cp (68);
901     }
902     } else {
903     die "$0: $self->{ct}->{type}: Unknown token type";
904     }
905     $self->{state} = DATA_STATE;
906 wakaba 1.5 $self->{s_kwd} = '';
907 wakaba 1.1 # reconsume
908    
909     !!!emit ($self->{ct}); # start tag or end tag
910    
911     redo A;
912     } else {
913     if ($self->{nc} == 0x0022 or # "
914     $self->{nc} == 0x0027) { # '
915     !!!cp (69);
916     !!!parse-error (type => 'bad attribute name');
917     } else {
918     !!!cp (70);
919     }
920     $self->{ca}->{name} .= chr ($self->{nc});
921     ## Stay in the state
922     !!!next-input-character;
923     redo A;
924     }
925     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
926     if ($is_space->{$self->{nc}}) {
927     !!!cp (71);
928     ## Stay in the state
929     !!!next-input-character;
930     redo A;
931     } elsif ($self->{nc} == 0x003D) { # =
932     !!!cp (72);
933     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
934     !!!next-input-character;
935     redo A;
936     } elsif ($self->{nc} == 0x003E) { # >
937     if ($self->{ct}->{type} == START_TAG_TOKEN) {
938     !!!cp (73);
939     $self->{last_stag_name} = $self->{ct}->{tag_name};
940     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
941     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
942     if ($self->{ct}->{attributes}) {
943     !!!cp (74);
944     !!!parse-error (type => 'end tag attribute');
945     } else {
946     ## NOTE: This state should never be reached.
947     !!!cp (75);
948     }
949     } else {
950     die "$0: $self->{ct}->{type}: Unknown token type";
951     }
952     $self->{state} = DATA_STATE;
953 wakaba 1.5 $self->{s_kwd} = '';
954 wakaba 1.1 !!!next-input-character;
955    
956     !!!emit ($self->{ct}); # start tag or end tag
957    
958     redo A;
959     } elsif (0x0041 <= $self->{nc} and
960     $self->{nc} <= 0x005A) { # A..Z
961     !!!cp (76);
962     $self->{ca}
963 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
964 wakaba 1.1 value => '',
965     line => $self->{line}, column => $self->{column}};
966     $self->{state} = ATTRIBUTE_NAME_STATE;
967     !!!next-input-character;
968     redo A;
969     } elsif ($self->{nc} == 0x002F) { # /
970     !!!cp (77);
971     $self->{state} = SELF_CLOSING_START_TAG_STATE;
972     !!!next-input-character;
973     redo A;
974     } elsif ($self->{nc} == -1) {
975     !!!parse-error (type => 'unclosed tag');
976     if ($self->{ct}->{type} == START_TAG_TOKEN) {
977     !!!cp (79);
978     $self->{last_stag_name} = $self->{ct}->{tag_name};
979     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
980     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
981     if ($self->{ct}->{attributes}) {
982     !!!cp (80);
983     !!!parse-error (type => 'end tag attribute');
984     } else {
985     ## NOTE: This state should never be reached.
986     !!!cp (81);
987     }
988     } else {
989     die "$0: $self->{ct}->{type}: Unknown token type";
990     }
991 wakaba 1.5 $self->{s_kwd} = '';
992 wakaba 1.1 $self->{state} = DATA_STATE;
993     # reconsume
994    
995     !!!emit ($self->{ct}); # start tag or end tag
996    
997     redo A;
998     } else {
999     if ($self->{nc} == 0x0022 or # "
1000     $self->{nc} == 0x0027) { # '
1001     !!!cp (78);
1002     !!!parse-error (type => 'bad attribute name');
1003     } else {
1004     !!!cp (82);
1005     }
1006     $self->{ca}
1007     = {name => chr ($self->{nc}),
1008     value => '',
1009     line => $self->{line}, column => $self->{column}};
1010     $self->{state} = ATTRIBUTE_NAME_STATE;
1011     !!!next-input-character;
1012     redo A;
1013     }
1014     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1015     if ($is_space->{$self->{nc}}) {
1016     !!!cp (83);
1017     ## Stay in the state
1018     !!!next-input-character;
1019     redo A;
1020     } elsif ($self->{nc} == 0x0022) { # "
1021     !!!cp (84);
1022     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1023     !!!next-input-character;
1024     redo A;
1025     } elsif ($self->{nc} == 0x0026) { # &
1026     !!!cp (85);
1027     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1028     ## reconsume
1029     redo A;
1030     } elsif ($self->{nc} == 0x0027) { # '
1031     !!!cp (86);
1032     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1033     !!!next-input-character;
1034     redo A;
1035     } elsif ($self->{nc} == 0x003E) { # >
1036     !!!parse-error (type => 'empty unquoted attribute value');
1037     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1038     !!!cp (87);
1039     $self->{last_stag_name} = $self->{ct}->{tag_name};
1040     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1041     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1042     if ($self->{ct}->{attributes}) {
1043     !!!cp (88);
1044     !!!parse-error (type => 'end tag attribute');
1045     } else {
1046     ## NOTE: This state should never be reached.
1047     !!!cp (89);
1048     }
1049     } else {
1050     die "$0: $self->{ct}->{type}: Unknown token type";
1051     }
1052     $self->{state} = DATA_STATE;
1053 wakaba 1.5 $self->{s_kwd} = '';
1054 wakaba 1.1 !!!next-input-character;
1055    
1056     !!!emit ($self->{ct}); # start tag or end tag
1057    
1058     redo A;
1059     } elsif ($self->{nc} == -1) {
1060     !!!parse-error (type => 'unclosed tag');
1061     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1062     !!!cp (90);
1063     $self->{last_stag_name} = $self->{ct}->{tag_name};
1064     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1065     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1066     if ($self->{ct}->{attributes}) {
1067     !!!cp (91);
1068     !!!parse-error (type => 'end tag attribute');
1069     } else {
1070     ## NOTE: This state should never be reached.
1071     !!!cp (92);
1072     }
1073     } else {
1074     die "$0: $self->{ct}->{type}: Unknown token type";
1075     }
1076     $self->{state} = DATA_STATE;
1077 wakaba 1.5 $self->{s_kwd} = '';
1078 wakaba 1.1 ## reconsume
1079    
1080     !!!emit ($self->{ct}); # start tag or end tag
1081    
1082     redo A;
1083     } else {
1084     if ($self->{nc} == 0x003D) { # =
1085     !!!cp (93);
1086     !!!parse-error (type => 'bad attribute value');
1087     } else {
1088     !!!cp (94);
1089     }
1090     $self->{ca}->{value} .= chr ($self->{nc});
1091     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1092     !!!next-input-character;
1093     redo A;
1094     }
1095     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1096     if ($self->{nc} == 0x0022) { # "
1097     !!!cp (95);
1098     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1099     !!!next-input-character;
1100     redo A;
1101     } elsif ($self->{nc} == 0x0026) { # &
1102     !!!cp (96);
1103     ## NOTE: In the spec, the tokenizer is switched to the
1104     ## "entity in attribute value state". In this implementation, the
1105     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1106     ## implementation of the "consume a character reference" algorithm.
1107     $self->{prev_state} = $self->{state};
1108     $self->{entity_add} = 0x0022; # "
1109     $self->{state} = ENTITY_STATE;
1110     !!!next-input-character;
1111     redo A;
1112     } elsif ($self->{nc} == -1) {
1113     !!!parse-error (type => 'unclosed attribute value');
1114     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1115     !!!cp (97);
1116     $self->{last_stag_name} = $self->{ct}->{tag_name};
1117     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1118     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1119     if ($self->{ct}->{attributes}) {
1120     !!!cp (98);
1121     !!!parse-error (type => 'end tag attribute');
1122     } else {
1123     ## NOTE: This state should never be reached.
1124     !!!cp (99);
1125     }
1126     } else {
1127     die "$0: $self->{ct}->{type}: Unknown token type";
1128     }
1129     $self->{state} = DATA_STATE;
1130 wakaba 1.5 $self->{s_kwd} = '';
1131 wakaba 1.1 ## reconsume
1132    
1133     !!!emit ($self->{ct}); # start tag or end tag
1134    
1135     redo A;
1136     } else {
1137     !!!cp (100);
1138     $self->{ca}->{value} .= chr ($self->{nc});
1139     $self->{read_until}->($self->{ca}->{value},
1140     q["&],
1141     length $self->{ca}->{value});
1142    
1143     ## Stay in the state
1144     !!!next-input-character;
1145     redo A;
1146     }
1147     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1148     if ($self->{nc} == 0x0027) { # '
1149     !!!cp (101);
1150     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1151     !!!next-input-character;
1152     redo A;
1153     } elsif ($self->{nc} == 0x0026) { # &
1154     !!!cp (102);
1155     ## NOTE: In the spec, the tokenizer is switched to the
1156     ## "entity in attribute value state". In this implementation, the
1157     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1158     ## implementation of the "consume a character reference" algorithm.
1159     $self->{entity_add} = 0x0027; # '
1160     $self->{prev_state} = $self->{state};
1161     $self->{state} = ENTITY_STATE;
1162     !!!next-input-character;
1163     redo A;
1164     } elsif ($self->{nc} == -1) {
1165     !!!parse-error (type => 'unclosed attribute value');
1166     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1167     !!!cp (103);
1168     $self->{last_stag_name} = $self->{ct}->{tag_name};
1169     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1170     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1171     if ($self->{ct}->{attributes}) {
1172     !!!cp (104);
1173     !!!parse-error (type => 'end tag attribute');
1174     } else {
1175     ## NOTE: This state should never be reached.
1176     !!!cp (105);
1177     }
1178     } else {
1179     die "$0: $self->{ct}->{type}: Unknown token type";
1180     }
1181     $self->{state} = DATA_STATE;
1182 wakaba 1.5 $self->{s_kwd} = '';
1183 wakaba 1.1 ## reconsume
1184    
1185     !!!emit ($self->{ct}); # start tag or end tag
1186    
1187     redo A;
1188     } else {
1189     !!!cp (106);
1190     $self->{ca}->{value} .= chr ($self->{nc});
1191     $self->{read_until}->($self->{ca}->{value},
1192     q['&],
1193     length $self->{ca}->{value});
1194    
1195     ## Stay in the state
1196     !!!next-input-character;
1197     redo A;
1198     }
1199     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1200     if ($is_space->{$self->{nc}}) {
1201     !!!cp (107);
1202     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1203     !!!next-input-character;
1204     redo A;
1205     } elsif ($self->{nc} == 0x0026) { # &
1206     !!!cp (108);
1207     ## NOTE: In the spec, the tokenizer is switched to the
1208     ## "entity in attribute value state". In this implementation, the
1209     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1210     ## implementation of the "consume a character reference" algorithm.
1211     $self->{entity_add} = -1;
1212     $self->{prev_state} = $self->{state};
1213     $self->{state} = ENTITY_STATE;
1214     !!!next-input-character;
1215     redo A;
1216     } elsif ($self->{nc} == 0x003E) { # >
1217     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1218     !!!cp (109);
1219     $self->{last_stag_name} = $self->{ct}->{tag_name};
1220     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1221     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1222     if ($self->{ct}->{attributes}) {
1223     !!!cp (110);
1224     !!!parse-error (type => 'end tag attribute');
1225     } else {
1226     ## NOTE: This state should never be reached.
1227     !!!cp (111);
1228     }
1229     } else {
1230     die "$0: $self->{ct}->{type}: Unknown token type";
1231     }
1232     $self->{state} = DATA_STATE;
1233 wakaba 1.5 $self->{s_kwd} = '';
1234 wakaba 1.1 !!!next-input-character;
1235    
1236     !!!emit ($self->{ct}); # start tag or end tag
1237    
1238     redo A;
1239     } elsif ($self->{nc} == -1) {
1240     !!!parse-error (type => 'unclosed tag');
1241     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1242     !!!cp (112);
1243     $self->{last_stag_name} = $self->{ct}->{tag_name};
1244     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1245     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1246     if ($self->{ct}->{attributes}) {
1247     !!!cp (113);
1248     !!!parse-error (type => 'end tag attribute');
1249     } else {
1250     ## NOTE: This state should never be reached.
1251     !!!cp (114);
1252     }
1253     } else {
1254     die "$0: $self->{ct}->{type}: Unknown token type";
1255     }
1256     $self->{state} = DATA_STATE;
1257 wakaba 1.5 $self->{s_kwd} = '';
1258 wakaba 1.1 ## reconsume
1259    
1260     !!!emit ($self->{ct}); # start tag or end tag
1261    
1262     redo A;
1263     } else {
1264     if ({
1265     0x0022 => 1, # "
1266     0x0027 => 1, # '
1267     0x003D => 1, # =
1268     }->{$self->{nc}}) {
1269     !!!cp (115);
1270     !!!parse-error (type => 'bad attribute value');
1271     } else {
1272     !!!cp (116);
1273     }
1274     $self->{ca}->{value} .= chr ($self->{nc});
1275     $self->{read_until}->($self->{ca}->{value},
1276     q["'=& >],
1277     length $self->{ca}->{value});
1278    
1279     ## Stay in the state
1280     !!!next-input-character;
1281     redo A;
1282     }
1283     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1284     if ($is_space->{$self->{nc}}) {
1285     !!!cp (118);
1286     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1287     !!!next-input-character;
1288     redo A;
1289     } elsif ($self->{nc} == 0x003E) { # >
1290     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1291     !!!cp (119);
1292     $self->{last_stag_name} = $self->{ct}->{tag_name};
1293     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1294     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1295     if ($self->{ct}->{attributes}) {
1296     !!!cp (120);
1297     !!!parse-error (type => 'end tag attribute');
1298     } else {
1299     ## NOTE: This state should never be reached.
1300     !!!cp (121);
1301     }
1302     } else {
1303     die "$0: $self->{ct}->{type}: Unknown token type";
1304     }
1305     $self->{state} = DATA_STATE;
1306 wakaba 1.5 $self->{s_kwd} = '';
1307 wakaba 1.1 !!!next-input-character;
1308    
1309     !!!emit ($self->{ct}); # start tag or end tag
1310    
1311     redo A;
1312     } elsif ($self->{nc} == 0x002F) { # /
1313     !!!cp (122);
1314     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1315     !!!next-input-character;
1316     redo A;
1317     } elsif ($self->{nc} == -1) {
1318     !!!parse-error (type => 'unclosed tag');
1319     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1320     !!!cp (122.3);
1321     $self->{last_stag_name} = $self->{ct}->{tag_name};
1322     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1323     if ($self->{ct}->{attributes}) {
1324     !!!cp (122.1);
1325     !!!parse-error (type => 'end tag attribute');
1326     } else {
1327     ## NOTE: This state should never be reached.
1328     !!!cp (122.2);
1329     }
1330     } else {
1331     die "$0: $self->{ct}->{type}: Unknown token type";
1332     }
1333     $self->{state} = DATA_STATE;
1334 wakaba 1.5 $self->{s_kwd} = '';
1335 wakaba 1.1 ## Reconsume.
1336     !!!emit ($self->{ct}); # start tag or end tag
1337     redo A;
1338     } else {
1339     !!!cp ('124.1');
1340     !!!parse-error (type => 'no space between attributes');
1341     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1342     ## reconsume
1343     redo A;
1344     }
1345     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1346     if ($self->{nc} == 0x003E) { # >
1347     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1348     !!!cp ('124.2');
1349     !!!parse-error (type => 'nestc', token => $self->{ct});
1350     ## TODO: Different type than slash in start tag
1351     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1352     if ($self->{ct}->{attributes}) {
1353     !!!cp ('124.4');
1354     !!!parse-error (type => 'end tag attribute');
1355     } else {
1356     !!!cp ('124.5');
1357     }
1358     ## TODO: Test |<title></title/>|
1359     } else {
1360     !!!cp ('124.3');
1361     $self->{self_closing} = 1;
1362     }
1363    
1364     $self->{state} = DATA_STATE;
1365 wakaba 1.5 $self->{s_kwd} = '';
1366 wakaba 1.1 !!!next-input-character;
1367    
1368     !!!emit ($self->{ct}); # start tag or end tag
1369    
1370     redo A;
1371     } elsif ($self->{nc} == -1) {
1372     !!!parse-error (type => 'unclosed tag');
1373     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1374     !!!cp (124.7);
1375     $self->{last_stag_name} = $self->{ct}->{tag_name};
1376     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1377     if ($self->{ct}->{attributes}) {
1378     !!!cp (124.5);
1379     !!!parse-error (type => 'end tag attribute');
1380     } else {
1381     ## NOTE: This state should never be reached.
1382     !!!cp (124.6);
1383     }
1384     } else {
1385     die "$0: $self->{ct}->{type}: Unknown token type";
1386     }
1387     $self->{state} = DATA_STATE;
1388 wakaba 1.5 $self->{s_kwd} = '';
1389 wakaba 1.1 ## Reconsume.
1390     !!!emit ($self->{ct}); # start tag or end tag
1391     redo A;
1392     } else {
1393     !!!cp ('124.4');
1394     !!!parse-error (type => 'nestc');
1395     ## TODO: This error type is wrong.
1396     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1397     ## Reconsume.
1398     redo A;
1399     }
1400     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1401     ## (only happens in the PCDATA state)
1402    
1403     ## NOTE: Unlike the spec's "bogus comment state", this implementation
1404     ## consumes characters on a one-by-one basis.
1405    
1406     if ($self->{nc} == 0x003E) { # >
1407     !!!cp (124);
1408     $self->{state} = DATA_STATE;
1409 wakaba 1.5 $self->{s_kwd} = '';
1410 wakaba 1.1 !!!next-input-character;
1411    
1412     !!!emit ($self->{ct}); # comment
1413     redo A;
1414     } elsif ($self->{nc} == -1) {
1415     !!!cp (125);
1416     $self->{state} = DATA_STATE;
1417 wakaba 1.5 $self->{s_kwd} = '';
1418 wakaba 1.1 ## reconsume
1419    
1420     !!!emit ($self->{ct}); # comment
1421     redo A;
1422     } else {
1423     !!!cp (126);
1424     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1425     $self->{read_until}->($self->{ct}->{data},
1426     q[>],
1427     length $self->{ct}->{data});
1428    
1429     ## Stay in the state.
1430     !!!next-input-character;
1431     redo A;
1432     }
1433     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1434     ## (only happens in the PCDATA state)
1435    
1436     if ($self->{nc} == 0x002D) { # -
1437     !!!cp (133);
1438     $self->{state} = MD_HYPHEN_STATE;
1439     !!!next-input-character;
1440     redo A;
1441     } elsif ($self->{nc} == 0x0044 or # D
1442     $self->{nc} == 0x0064) { # d
1443     ## ASCII case-insensitive.
1444     !!!cp (130);
1445     $self->{state} = MD_DOCTYPE_STATE;
1446     $self->{s_kwd} = chr $self->{nc};
1447     !!!next-input-character;
1448     redo A;
1449 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1450     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1451     $self->{is_xml}) and
1452 wakaba 1.1 $self->{nc} == 0x005B) { # [
1453     !!!cp (135.4);
1454     $self->{state} = MD_CDATA_STATE;
1455     $self->{s_kwd} = '[';
1456     !!!next-input-character;
1457     redo A;
1458     } else {
1459     !!!cp (136);
1460     }
1461    
1462     !!!parse-error (type => 'bogus comment',
1463     line => $self->{line_prev},
1464     column => $self->{column_prev} - 1);
1465     ## Reconsume.
1466     $self->{state} = BOGUS_COMMENT_STATE;
1467     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1468     line => $self->{line_prev},
1469     column => $self->{column_prev} - 1,
1470     };
1471     redo A;
1472     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1473     if ($self->{nc} == 0x002D) { # -
1474     !!!cp (127);
1475     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1476     line => $self->{line_prev},
1477     column => $self->{column_prev} - 2,
1478     };
1479     $self->{state} = COMMENT_START_STATE;
1480     !!!next-input-character;
1481     redo A;
1482     } else {
1483     !!!cp (128);
1484     !!!parse-error (type => 'bogus comment',
1485     line => $self->{line_prev},
1486     column => $self->{column_prev} - 2);
1487     $self->{state} = BOGUS_COMMENT_STATE;
1488     ## Reconsume.
1489     $self->{ct} = {type => COMMENT_TOKEN,
1490     data => '-',
1491     line => $self->{line_prev},
1492     column => $self->{column_prev} - 2,
1493     };
1494     redo A;
1495     }
1496     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1497     ## ASCII case-insensitive.
1498     if ($self->{nc} == [
1499     undef,
1500     0x004F, # O
1501     0x0043, # C
1502     0x0054, # T
1503     0x0059, # Y
1504     0x0050, # P
1505     ]->[length $self->{s_kwd}] or
1506     $self->{nc} == [
1507     undef,
1508     0x006F, # o
1509     0x0063, # c
1510     0x0074, # t
1511     0x0079, # y
1512     0x0070, # p
1513     ]->[length $self->{s_kwd}]) {
1514     !!!cp (131);
1515     ## Stay in the state.
1516     $self->{s_kwd} .= chr $self->{nc};
1517     !!!next-input-character;
1518     redo A;
1519     } elsif ((length $self->{s_kwd}) == 6 and
1520     ($self->{nc} == 0x0045 or # E
1521     $self->{nc} == 0x0065)) { # e
1522     !!!cp (129);
1523     $self->{state} = DOCTYPE_STATE;
1524     $self->{ct} = {type => DOCTYPE_TOKEN,
1525     quirks => 1,
1526     line => $self->{line_prev},
1527     column => $self->{column_prev} - 7,
1528     };
1529     !!!next-input-character;
1530     redo A;
1531     } else {
1532     !!!cp (132);
1533     !!!parse-error (type => 'bogus comment',
1534     line => $self->{line_prev},
1535     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1536     $self->{state} = BOGUS_COMMENT_STATE;
1537     ## Reconsume.
1538     $self->{ct} = {type => COMMENT_TOKEN,
1539     data => $self->{s_kwd},
1540     line => $self->{line_prev},
1541     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1542     };
1543     redo A;
1544     }
1545     } elsif ($self->{state} == MD_CDATA_STATE) {
1546     if ($self->{nc} == {
1547     '[' => 0x0043, # C
1548     '[C' => 0x0044, # D
1549     '[CD' => 0x0041, # A
1550     '[CDA' => 0x0054, # T
1551     '[CDAT' => 0x0041, # A
1552     }->{$self->{s_kwd}}) {
1553     !!!cp (135.1);
1554     ## Stay in the state.
1555     $self->{s_kwd} .= chr $self->{nc};
1556     !!!next-input-character;
1557     redo A;
1558     } elsif ($self->{s_kwd} eq '[CDATA' and
1559     $self->{nc} == 0x005B) { # [
1560     !!!cp (135.2);
1561     $self->{ct} = {type => CHARACTER_TOKEN,
1562     data => '',
1563     line => $self->{line_prev},
1564     column => $self->{column_prev} - 7};
1565     $self->{state} = CDATA_SECTION_STATE;
1566     !!!next-input-character;
1567     redo A;
1568     } else {
1569     !!!cp (135.3);
1570     !!!parse-error (type => 'bogus comment',
1571     line => $self->{line_prev},
1572     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1573     $self->{state} = BOGUS_COMMENT_STATE;
1574     ## Reconsume.
1575     $self->{ct} = {type => COMMENT_TOKEN,
1576     data => $self->{s_kwd},
1577     line => $self->{line_prev},
1578     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1579     };
1580     redo A;
1581     }
1582     } elsif ($self->{state} == COMMENT_START_STATE) {
1583     if ($self->{nc} == 0x002D) { # -
1584     !!!cp (137);
1585     $self->{state} = COMMENT_START_DASH_STATE;
1586     !!!next-input-character;
1587     redo A;
1588     } elsif ($self->{nc} == 0x003E) { # >
1589     !!!cp (138);
1590     !!!parse-error (type => 'bogus comment');
1591     $self->{state} = DATA_STATE;
1592 wakaba 1.5 $self->{s_kwd} = '';
1593 wakaba 1.1 !!!next-input-character;
1594    
1595     !!!emit ($self->{ct}); # comment
1596    
1597     redo A;
1598     } elsif ($self->{nc} == -1) {
1599     !!!cp (139);
1600     !!!parse-error (type => 'unclosed comment');
1601     $self->{state} = DATA_STATE;
1602 wakaba 1.5 $self->{s_kwd} = '';
1603 wakaba 1.1 ## reconsume
1604    
1605     !!!emit ($self->{ct}); # comment
1606    
1607     redo A;
1608     } else {
1609     !!!cp (140);
1610     $self->{ct}->{data} # comment
1611     .= chr ($self->{nc});
1612     $self->{state} = COMMENT_STATE;
1613     !!!next-input-character;
1614     redo A;
1615     }
1616     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1617     if ($self->{nc} == 0x002D) { # -
1618     !!!cp (141);
1619     $self->{state} = COMMENT_END_STATE;
1620     !!!next-input-character;
1621     redo A;
1622     } elsif ($self->{nc} == 0x003E) { # >
1623     !!!cp (142);
1624     !!!parse-error (type => 'bogus comment');
1625     $self->{state} = DATA_STATE;
1626 wakaba 1.5 $self->{s_kwd} = '';
1627 wakaba 1.1 !!!next-input-character;
1628    
1629     !!!emit ($self->{ct}); # comment
1630    
1631     redo A;
1632     } elsif ($self->{nc} == -1) {
1633     !!!cp (143);
1634     !!!parse-error (type => 'unclosed comment');
1635     $self->{state} = DATA_STATE;
1636 wakaba 1.5 $self->{s_kwd} = '';
1637 wakaba 1.1 ## reconsume
1638    
1639     !!!emit ($self->{ct}); # comment
1640    
1641     redo A;
1642     } else {
1643     !!!cp (144);
1644     $self->{ct}->{data} # comment
1645     .= '-' . chr ($self->{nc});
1646     $self->{state} = COMMENT_STATE;
1647     !!!next-input-character;
1648     redo A;
1649     }
1650     } elsif ($self->{state} == COMMENT_STATE) {
1651     if ($self->{nc} == 0x002D) { # -
1652     !!!cp (145);
1653     $self->{state} = COMMENT_END_DASH_STATE;
1654     !!!next-input-character;
1655     redo A;
1656     } elsif ($self->{nc} == -1) {
1657     !!!cp (146);
1658     !!!parse-error (type => 'unclosed comment');
1659     $self->{state} = DATA_STATE;
1660 wakaba 1.5 $self->{s_kwd} = '';
1661 wakaba 1.1 ## reconsume
1662    
1663     !!!emit ($self->{ct}); # comment
1664    
1665     redo A;
1666     } else {
1667     !!!cp (147);
1668     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1669     $self->{read_until}->($self->{ct}->{data},
1670     q[-],
1671     length $self->{ct}->{data});
1672    
1673     ## Stay in the state
1674     !!!next-input-character;
1675     redo A;
1676     }
1677     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1678     if ($self->{nc} == 0x002D) { # -
1679     !!!cp (148);
1680     $self->{state} = COMMENT_END_STATE;
1681     !!!next-input-character;
1682     redo A;
1683     } elsif ($self->{nc} == -1) {
1684     !!!cp (149);
1685     !!!parse-error (type => 'unclosed comment');
1686 wakaba 1.5 $self->{s_kwd} = '';
1687 wakaba 1.1 $self->{state} = DATA_STATE;
1688 wakaba 1.5 $self->{s_kwd} = '';
1689 wakaba 1.1 ## reconsume
1690    
1691     !!!emit ($self->{ct}); # comment
1692    
1693     redo A;
1694     } else {
1695     !!!cp (150);
1696     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1697     $self->{state} = COMMENT_STATE;
1698     !!!next-input-character;
1699     redo A;
1700     }
1701     } elsif ($self->{state} == COMMENT_END_STATE) {
1702     if ($self->{nc} == 0x003E) { # >
1703     !!!cp (151);
1704     $self->{state} = DATA_STATE;
1705 wakaba 1.5 $self->{s_kwd} = '';
1706 wakaba 1.1 !!!next-input-character;
1707    
1708     !!!emit ($self->{ct}); # comment
1709    
1710     redo A;
1711     } elsif ($self->{nc} == 0x002D) { # -
1712     !!!cp (152);
1713     !!!parse-error (type => 'dash in comment',
1714     line => $self->{line_prev},
1715     column => $self->{column_prev});
1716     $self->{ct}->{data} .= '-'; # comment
1717     ## Stay in the state
1718     !!!next-input-character;
1719     redo A;
1720     } elsif ($self->{nc} == -1) {
1721     !!!cp (153);
1722     !!!parse-error (type => 'unclosed comment');
1723     $self->{state} = DATA_STATE;
1724 wakaba 1.5 $self->{s_kwd} = '';
1725 wakaba 1.1 ## reconsume
1726    
1727     !!!emit ($self->{ct}); # comment
1728    
1729     redo A;
1730     } else {
1731     !!!cp (154);
1732     !!!parse-error (type => 'dash in comment',
1733     line => $self->{line_prev},
1734     column => $self->{column_prev});
1735     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1736     $self->{state} = COMMENT_STATE;
1737     !!!next-input-character;
1738     redo A;
1739     }
1740     } elsif ($self->{state} == DOCTYPE_STATE) {
1741     if ($is_space->{$self->{nc}}) {
1742     !!!cp (155);
1743     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1744     !!!next-input-character;
1745     redo A;
1746     } else {
1747     !!!cp (156);
1748     !!!parse-error (type => 'no space before DOCTYPE name');
1749     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1750     ## reconsume
1751     redo A;
1752     }
1753     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1754     if ($is_space->{$self->{nc}}) {
1755     !!!cp (157);
1756     ## Stay in the state
1757     !!!next-input-character;
1758     redo A;
1759     } elsif ($self->{nc} == 0x003E) { # >
1760     !!!cp (158);
1761     !!!parse-error (type => 'no DOCTYPE name');
1762     $self->{state} = DATA_STATE;
1763 wakaba 1.5 $self->{s_kwd} = '';
1764 wakaba 1.1 !!!next-input-character;
1765    
1766     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1767    
1768     redo A;
1769     } elsif ($self->{nc} == -1) {
1770     !!!cp (159);
1771     !!!parse-error (type => 'no DOCTYPE name');
1772     $self->{state} = DATA_STATE;
1773 wakaba 1.5 $self->{s_kwd} = '';
1774 wakaba 1.1 ## reconsume
1775    
1776     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1777    
1778     redo A;
1779     } else {
1780     !!!cp (160);
1781     $self->{ct}->{name} = chr $self->{nc};
1782     delete $self->{ct}->{quirks};
1783     $self->{state} = DOCTYPE_NAME_STATE;
1784     !!!next-input-character;
1785     redo A;
1786     }
1787     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1788     ## ISSUE: Redundant "First," in the spec.
1789     if ($is_space->{$self->{nc}}) {
1790     !!!cp (161);
1791     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1792     !!!next-input-character;
1793     redo A;
1794     } elsif ($self->{nc} == 0x003E) { # >
1795     !!!cp (162);
1796     $self->{state} = DATA_STATE;
1797 wakaba 1.5 $self->{s_kwd} = '';
1798 wakaba 1.1 !!!next-input-character;
1799    
1800     !!!emit ($self->{ct}); # DOCTYPE
1801    
1802     redo A;
1803     } elsif ($self->{nc} == -1) {
1804     !!!cp (163);
1805     !!!parse-error (type => 'unclosed DOCTYPE');
1806     $self->{state} = DATA_STATE;
1807 wakaba 1.5 $self->{s_kwd} = '';
1808 wakaba 1.1 ## reconsume
1809    
1810     $self->{ct}->{quirks} = 1;
1811     !!!emit ($self->{ct}); # DOCTYPE
1812    
1813     redo A;
1814     } else {
1815     !!!cp (164);
1816     $self->{ct}->{name}
1817     .= chr ($self->{nc}); # DOCTYPE
1818     ## Stay in the state
1819     !!!next-input-character;
1820     redo A;
1821     }
1822     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1823     if ($is_space->{$self->{nc}}) {
1824     !!!cp (165);
1825     ## Stay in the state
1826     !!!next-input-character;
1827     redo A;
1828     } elsif ($self->{nc} == 0x003E) { # >
1829     !!!cp (166);
1830     $self->{state} = DATA_STATE;
1831 wakaba 1.5 $self->{s_kwd} = '';
1832 wakaba 1.1 !!!next-input-character;
1833    
1834     !!!emit ($self->{ct}); # DOCTYPE
1835    
1836     redo A;
1837     } elsif ($self->{nc} == -1) {
1838     !!!cp (167);
1839     !!!parse-error (type => 'unclosed DOCTYPE');
1840     $self->{state} = DATA_STATE;
1841 wakaba 1.5 $self->{s_kwd} = '';
1842 wakaba 1.1 ## reconsume
1843    
1844     $self->{ct}->{quirks} = 1;
1845     !!!emit ($self->{ct}); # DOCTYPE
1846    
1847     redo A;
1848     } elsif ($self->{nc} == 0x0050 or # P
1849     $self->{nc} == 0x0070) { # p
1850     $self->{state} = PUBLIC_STATE;
1851     $self->{s_kwd} = chr $self->{nc};
1852     !!!next-input-character;
1853     redo A;
1854     } elsif ($self->{nc} == 0x0053 or # S
1855     $self->{nc} == 0x0073) { # s
1856     $self->{state} = SYSTEM_STATE;
1857     $self->{s_kwd} = chr $self->{nc};
1858     !!!next-input-character;
1859     redo A;
1860     } else {
1861     !!!cp (180);
1862     !!!parse-error (type => 'string after DOCTYPE name');
1863     $self->{ct}->{quirks} = 1;
1864    
1865     $self->{state} = BOGUS_DOCTYPE_STATE;
1866     !!!next-input-character;
1867     redo A;
1868     }
1869     } elsif ($self->{state} == PUBLIC_STATE) {
1870     ## ASCII case-insensitive
1871     if ($self->{nc} == [
1872     undef,
1873     0x0055, # U
1874     0x0042, # B
1875     0x004C, # L
1876     0x0049, # I
1877     ]->[length $self->{s_kwd}] or
1878     $self->{nc} == [
1879     undef,
1880     0x0075, # u
1881     0x0062, # b
1882     0x006C, # l
1883     0x0069, # i
1884     ]->[length $self->{s_kwd}]) {
1885     !!!cp (175);
1886     ## Stay in the state.
1887     $self->{s_kwd} .= chr $self->{nc};
1888     !!!next-input-character;
1889     redo A;
1890     } elsif ((length $self->{s_kwd}) == 5 and
1891     ($self->{nc} == 0x0043 or # C
1892     $self->{nc} == 0x0063)) { # c
1893     !!!cp (168);
1894     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1895     !!!next-input-character;
1896     redo A;
1897     } else {
1898     !!!cp (169);
1899     !!!parse-error (type => 'string after DOCTYPE name',
1900     line => $self->{line_prev},
1901     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1902     $self->{ct}->{quirks} = 1;
1903    
1904     $self->{state} = BOGUS_DOCTYPE_STATE;
1905     ## Reconsume.
1906     redo A;
1907     }
1908     } elsif ($self->{state} == SYSTEM_STATE) {
1909     ## ASCII case-insensitive
1910     if ($self->{nc} == [
1911     undef,
1912     0x0059, # Y
1913     0x0053, # S
1914     0x0054, # T
1915     0x0045, # E
1916     ]->[length $self->{s_kwd}] or
1917     $self->{nc} == [
1918     undef,
1919     0x0079, # y
1920     0x0073, # s
1921     0x0074, # t
1922     0x0065, # e
1923     ]->[length $self->{s_kwd}]) {
1924     !!!cp (170);
1925     ## Stay in the state.
1926     $self->{s_kwd} .= chr $self->{nc};
1927     !!!next-input-character;
1928     redo A;
1929     } elsif ((length $self->{s_kwd}) == 5 and
1930     ($self->{nc} == 0x004D or # M
1931     $self->{nc} == 0x006D)) { # m
1932     !!!cp (171);
1933     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1934     !!!next-input-character;
1935     redo A;
1936     } else {
1937     !!!cp (172);
1938     !!!parse-error (type => 'string after DOCTYPE name',
1939     line => $self->{line_prev},
1940     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1941     $self->{ct}->{quirks} = 1;
1942    
1943     $self->{state} = BOGUS_DOCTYPE_STATE;
1944     ## Reconsume.
1945     redo A;
1946     }
1947     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1948     if ($is_space->{$self->{nc}}) {
1949     !!!cp (181);
1950     ## Stay in the state
1951     !!!next-input-character;
1952     redo A;
1953     } elsif ($self->{nc} eq 0x0022) { # "
1954     !!!cp (182);
1955     $self->{ct}->{pubid} = ''; # DOCTYPE
1956     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1957     !!!next-input-character;
1958     redo A;
1959     } elsif ($self->{nc} eq 0x0027) { # '
1960     !!!cp (183);
1961     $self->{ct}->{pubid} = ''; # DOCTYPE
1962     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1963     !!!next-input-character;
1964     redo A;
1965     } elsif ($self->{nc} eq 0x003E) { # >
1966     !!!cp (184);
1967     !!!parse-error (type => 'no PUBLIC literal');
1968    
1969     $self->{state} = DATA_STATE;
1970 wakaba 1.5 $self->{s_kwd} = '';
1971 wakaba 1.1 !!!next-input-character;
1972    
1973     $self->{ct}->{quirks} = 1;
1974     !!!emit ($self->{ct}); # DOCTYPE
1975    
1976     redo A;
1977     } elsif ($self->{nc} == -1) {
1978     !!!cp (185);
1979     !!!parse-error (type => 'unclosed DOCTYPE');
1980    
1981     $self->{state} = DATA_STATE;
1982 wakaba 1.5 $self->{s_kwd} = '';
1983 wakaba 1.1 ## reconsume
1984    
1985     $self->{ct}->{quirks} = 1;
1986     !!!emit ($self->{ct}); # DOCTYPE
1987    
1988     redo A;
1989     } else {
1990     !!!cp (186);
1991     !!!parse-error (type => 'string after PUBLIC');
1992     $self->{ct}->{quirks} = 1;
1993    
1994     $self->{state} = BOGUS_DOCTYPE_STATE;
1995     !!!next-input-character;
1996     redo A;
1997     }
1998     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1999     if ($self->{nc} == 0x0022) { # "
2000     !!!cp (187);
2001     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2002     !!!next-input-character;
2003     redo A;
2004     } elsif ($self->{nc} == 0x003E) { # >
2005     !!!cp (188);
2006     !!!parse-error (type => 'unclosed PUBLIC literal');
2007    
2008     $self->{state} = DATA_STATE;
2009 wakaba 1.5 $self->{s_kwd} = '';
2010 wakaba 1.1 !!!next-input-character;
2011    
2012     $self->{ct}->{quirks} = 1;
2013     !!!emit ($self->{ct}); # DOCTYPE
2014    
2015     redo A;
2016     } elsif ($self->{nc} == -1) {
2017     !!!cp (189);
2018     !!!parse-error (type => 'unclosed PUBLIC literal');
2019    
2020     $self->{state} = DATA_STATE;
2021 wakaba 1.5 $self->{s_kwd} = '';
2022 wakaba 1.1 ## reconsume
2023    
2024     $self->{ct}->{quirks} = 1;
2025     !!!emit ($self->{ct}); # DOCTYPE
2026    
2027     redo A;
2028     } else {
2029     !!!cp (190);
2030     $self->{ct}->{pubid} # DOCTYPE
2031     .= chr $self->{nc};
2032     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2033     length $self->{ct}->{pubid});
2034    
2035     ## Stay in the state
2036     !!!next-input-character;
2037     redo A;
2038     }
2039     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2040     if ($self->{nc} == 0x0027) { # '
2041     !!!cp (191);
2042     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2043     !!!next-input-character;
2044     redo A;
2045     } elsif ($self->{nc} == 0x003E) { # >
2046     !!!cp (192);
2047     !!!parse-error (type => 'unclosed PUBLIC literal');
2048    
2049     $self->{state} = DATA_STATE;
2050 wakaba 1.5 $self->{s_kwd} = '';
2051 wakaba 1.1 !!!next-input-character;
2052    
2053     $self->{ct}->{quirks} = 1;
2054     !!!emit ($self->{ct}); # DOCTYPE
2055    
2056     redo A;
2057     } elsif ($self->{nc} == -1) {
2058     !!!cp (193);
2059     !!!parse-error (type => 'unclosed PUBLIC literal');
2060    
2061     $self->{state} = DATA_STATE;
2062 wakaba 1.5 $self->{s_kwd} = '';
2063 wakaba 1.1 ## reconsume
2064    
2065     $self->{ct}->{quirks} = 1;
2066     !!!emit ($self->{ct}); # DOCTYPE
2067    
2068     redo A;
2069     } else {
2070     !!!cp (194);
2071     $self->{ct}->{pubid} # DOCTYPE
2072     .= chr $self->{nc};
2073     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2074     length $self->{ct}->{pubid});
2075    
2076     ## Stay in the state
2077     !!!next-input-character;
2078     redo A;
2079     }
2080     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2081     if ($is_space->{$self->{nc}}) {
2082     !!!cp (195);
2083     ## Stay in the state
2084     !!!next-input-character;
2085     redo A;
2086     } elsif ($self->{nc} == 0x0022) { # "
2087     !!!cp (196);
2088     $self->{ct}->{sysid} = ''; # DOCTYPE
2089     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2090     !!!next-input-character;
2091     redo A;
2092     } elsif ($self->{nc} == 0x0027) { # '
2093     !!!cp (197);
2094     $self->{ct}->{sysid} = ''; # DOCTYPE
2095     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2096     !!!next-input-character;
2097     redo A;
2098     } elsif ($self->{nc} == 0x003E) { # >
2099     !!!cp (198);
2100     $self->{state} = DATA_STATE;
2101 wakaba 1.5 $self->{s_kwd} = '';
2102 wakaba 1.1 !!!next-input-character;
2103    
2104     !!!emit ($self->{ct}); # DOCTYPE
2105    
2106     redo A;
2107     } elsif ($self->{nc} == -1) {
2108     !!!cp (199);
2109     !!!parse-error (type => 'unclosed DOCTYPE');
2110    
2111     $self->{state} = DATA_STATE;
2112 wakaba 1.5 $self->{s_kwd} = '';
2113 wakaba 1.1 ## reconsume
2114    
2115     $self->{ct}->{quirks} = 1;
2116     !!!emit ($self->{ct}); # DOCTYPE
2117    
2118     redo A;
2119     } else {
2120     !!!cp (200);
2121     !!!parse-error (type => 'string after PUBLIC literal');
2122     $self->{ct}->{quirks} = 1;
2123    
2124     $self->{state} = BOGUS_DOCTYPE_STATE;
2125     !!!next-input-character;
2126     redo A;
2127     }
2128     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2129     if ($is_space->{$self->{nc}}) {
2130     !!!cp (201);
2131     ## Stay in the state
2132     !!!next-input-character;
2133     redo A;
2134     } elsif ($self->{nc} == 0x0022) { # "
2135     !!!cp (202);
2136     $self->{ct}->{sysid} = ''; # DOCTYPE
2137     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2138     !!!next-input-character;
2139     redo A;
2140     } elsif ($self->{nc} == 0x0027) { # '
2141     !!!cp (203);
2142     $self->{ct}->{sysid} = ''; # DOCTYPE
2143     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2144     !!!next-input-character;
2145     redo A;
2146     } elsif ($self->{nc} == 0x003E) { # >
2147     !!!cp (204);
2148     !!!parse-error (type => 'no SYSTEM literal');
2149     $self->{state} = DATA_STATE;
2150 wakaba 1.5 $self->{s_kwd} = '';
2151 wakaba 1.1 !!!next-input-character;
2152    
2153     $self->{ct}->{quirks} = 1;
2154     !!!emit ($self->{ct}); # DOCTYPE
2155    
2156     redo A;
2157     } elsif ($self->{nc} == -1) {
2158     !!!cp (205);
2159     !!!parse-error (type => 'unclosed DOCTYPE');
2160    
2161     $self->{state} = DATA_STATE;
2162 wakaba 1.5 $self->{s_kwd} = '';
2163 wakaba 1.1 ## reconsume
2164    
2165     $self->{ct}->{quirks} = 1;
2166     !!!emit ($self->{ct}); # DOCTYPE
2167    
2168     redo A;
2169     } else {
2170     !!!cp (206);
2171     !!!parse-error (type => 'string after SYSTEM');
2172     $self->{ct}->{quirks} = 1;
2173    
2174     $self->{state} = BOGUS_DOCTYPE_STATE;
2175     !!!next-input-character;
2176     redo A;
2177     }
2178     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2179     if ($self->{nc} == 0x0022) { # "
2180     !!!cp (207);
2181     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2182     !!!next-input-character;
2183     redo A;
2184     } elsif ($self->{nc} == 0x003E) { # >
2185     !!!cp (208);
2186     !!!parse-error (type => 'unclosed SYSTEM literal');
2187    
2188     $self->{state} = DATA_STATE;
2189 wakaba 1.5 $self->{s_kwd} = '';
2190 wakaba 1.1 !!!next-input-character;
2191    
2192     $self->{ct}->{quirks} = 1;
2193     !!!emit ($self->{ct}); # DOCTYPE
2194    
2195     redo A;
2196     } elsif ($self->{nc} == -1) {
2197     !!!cp (209);
2198     !!!parse-error (type => 'unclosed SYSTEM literal');
2199    
2200     $self->{state} = DATA_STATE;
2201 wakaba 1.5 $self->{s_kwd} = '';
2202 wakaba 1.1 ## reconsume
2203    
2204     $self->{ct}->{quirks} = 1;
2205     !!!emit ($self->{ct}); # DOCTYPE
2206    
2207     redo A;
2208     } else {
2209     !!!cp (210);
2210     $self->{ct}->{sysid} # DOCTYPE
2211     .= chr $self->{nc};
2212     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2213     length $self->{ct}->{sysid});
2214    
2215     ## Stay in the state
2216     !!!next-input-character;
2217     redo A;
2218     }
2219     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2220     if ($self->{nc} == 0x0027) { # '
2221     !!!cp (211);
2222     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2223     !!!next-input-character;
2224     redo A;
2225     } elsif ($self->{nc} == 0x003E) { # >
2226     !!!cp (212);
2227     !!!parse-error (type => 'unclosed SYSTEM literal');
2228    
2229     $self->{state} = DATA_STATE;
2230 wakaba 1.5 $self->{s_kwd} = '';
2231 wakaba 1.1 !!!next-input-character;
2232    
2233     $self->{ct}->{quirks} = 1;
2234     !!!emit ($self->{ct}); # DOCTYPE
2235    
2236     redo A;
2237     } elsif ($self->{nc} == -1) {
2238     !!!cp (213);
2239     !!!parse-error (type => 'unclosed SYSTEM literal');
2240    
2241     $self->{state} = DATA_STATE;
2242 wakaba 1.5 $self->{s_kwd} = '';
2243 wakaba 1.1 ## reconsume
2244    
2245     $self->{ct}->{quirks} = 1;
2246     !!!emit ($self->{ct}); # DOCTYPE
2247    
2248     redo A;
2249     } else {
2250     !!!cp (214);
2251     $self->{ct}->{sysid} # DOCTYPE
2252     .= chr $self->{nc};
2253     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2254     length $self->{ct}->{sysid});
2255    
2256     ## Stay in the state
2257     !!!next-input-character;
2258     redo A;
2259     }
2260     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2261     if ($is_space->{$self->{nc}}) {
2262     !!!cp (215);
2263     ## Stay in the state
2264     !!!next-input-character;
2265     redo A;
2266     } elsif ($self->{nc} == 0x003E) { # >
2267     !!!cp (216);
2268     $self->{state} = DATA_STATE;
2269 wakaba 1.5 $self->{s_kwd} = '';
2270 wakaba 1.1 !!!next-input-character;
2271    
2272     !!!emit ($self->{ct}); # DOCTYPE
2273    
2274     redo A;
2275     } elsif ($self->{nc} == -1) {
2276     !!!cp (217);
2277     !!!parse-error (type => 'unclosed DOCTYPE');
2278     $self->{state} = DATA_STATE;
2279 wakaba 1.5 $self->{s_kwd} = '';
2280 wakaba 1.1 ## reconsume
2281    
2282     $self->{ct}->{quirks} = 1;
2283     !!!emit ($self->{ct}); # DOCTYPE
2284    
2285     redo A;
2286     } else {
2287     !!!cp (218);
2288     !!!parse-error (type => 'string after SYSTEM literal');
2289     #$self->{ct}->{quirks} = 1;
2290    
2291     $self->{state} = BOGUS_DOCTYPE_STATE;
2292     !!!next-input-character;
2293     redo A;
2294     }
2295     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2296     if ($self->{nc} == 0x003E) { # >
2297     !!!cp (219);
2298     $self->{state} = DATA_STATE;
2299 wakaba 1.5 $self->{s_kwd} = '';
2300 wakaba 1.1 !!!next-input-character;
2301    
2302     !!!emit ($self->{ct}); # DOCTYPE
2303    
2304     redo A;
2305     } elsif ($self->{nc} == -1) {
2306     !!!cp (220);
2307     $self->{state} = DATA_STATE;
2308 wakaba 1.5 $self->{s_kwd} = '';
2309 wakaba 1.1 ## reconsume
2310    
2311     !!!emit ($self->{ct}); # DOCTYPE
2312    
2313     redo A;
2314     } else {
2315     !!!cp (221);
2316     my $s = '';
2317     $self->{read_until}->($s, q[>], 0);
2318    
2319     ## Stay in the state
2320     !!!next-input-character;
2321     redo A;
2322     }
2323     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2324     ## NOTE: "CDATA section state" in the spec is jointly implemented
2325     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2326     ## and |CDATA_SECTION_MSE2_STATE|.
2327    
2328     if ($self->{nc} == 0x005D) { # ]
2329     !!!cp (221.1);
2330     $self->{state} = CDATA_SECTION_MSE1_STATE;
2331     !!!next-input-character;
2332     redo A;
2333     } elsif ($self->{nc} == -1) {
2334     $self->{state} = DATA_STATE;
2335 wakaba 1.5 $self->{s_kwd} = '';
2336 wakaba 1.1 !!!next-input-character;
2337     if (length $self->{ct}->{data}) { # character
2338     !!!cp (221.2);
2339     !!!emit ($self->{ct}); # character
2340     } else {
2341     !!!cp (221.3);
2342     ## No token to emit. $self->{ct} is discarded.
2343     }
2344     redo A;
2345     } else {
2346     !!!cp (221.4);
2347     $self->{ct}->{data} .= chr $self->{nc};
2348     $self->{read_until}->($self->{ct}->{data},
2349     q<]>,
2350     length $self->{ct}->{data});
2351    
2352     ## Stay in the state.
2353     !!!next-input-character;
2354     redo A;
2355     }
2356    
2357     ## ISSUE: "text tokens" in spec.
2358     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2359     if ($self->{nc} == 0x005D) { # ]
2360     !!!cp (221.5);
2361     $self->{state} = CDATA_SECTION_MSE2_STATE;
2362     !!!next-input-character;
2363     redo A;
2364     } else {
2365     !!!cp (221.6);
2366     $self->{ct}->{data} .= ']';
2367     $self->{state} = CDATA_SECTION_STATE;
2368     ## Reconsume.
2369     redo A;
2370     }
2371     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2372     if ($self->{nc} == 0x003E) { # >
2373     $self->{state} = DATA_STATE;
2374 wakaba 1.5 $self->{s_kwd} = '';
2375 wakaba 1.1 !!!next-input-character;
2376     if (length $self->{ct}->{data}) { # character
2377     !!!cp (221.7);
2378     !!!emit ($self->{ct}); # character
2379     } else {
2380     !!!cp (221.8);
2381     ## No token to emit. $self->{ct} is discarded.
2382     }
2383     redo A;
2384     } elsif ($self->{nc} == 0x005D) { # ]
2385     !!!cp (221.9); # character
2386     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2387     ## Stay in the state.
2388     !!!next-input-character;
2389     redo A;
2390     } else {
2391     !!!cp (221.11);
2392     $self->{ct}->{data} .= ']]'; # character
2393     $self->{state} = CDATA_SECTION_STATE;
2394     ## Reconsume.
2395     redo A;
2396     }
2397     } elsif ($self->{state} == ENTITY_STATE) {
2398     if ($is_space->{$self->{nc}} or
2399     {
2400     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2401     $self->{entity_add} => 1,
2402     }->{$self->{nc}}) {
2403     !!!cp (1001);
2404     ## Don't consume
2405     ## No error
2406     ## Return nothing.
2407     #
2408     } elsif ($self->{nc} == 0x0023) { # #
2409     !!!cp (999);
2410     $self->{state} = ENTITY_HASH_STATE;
2411     $self->{s_kwd} = '#';
2412     !!!next-input-character;
2413     redo A;
2414     } elsif ((0x0041 <= $self->{nc} and
2415     $self->{nc} <= 0x005A) or # A..Z
2416     (0x0061 <= $self->{nc} and
2417     $self->{nc} <= 0x007A)) { # a..z
2418     !!!cp (998);
2419     require Whatpm::_NamedEntityList;
2420     $self->{state} = ENTITY_NAME_STATE;
2421     $self->{s_kwd} = chr $self->{nc};
2422     $self->{entity__value} = $self->{s_kwd};
2423     $self->{entity__match} = 0;
2424     !!!next-input-character;
2425     redo A;
2426     } else {
2427     !!!cp (1027);
2428     !!!parse-error (type => 'bare ero');
2429     ## Return nothing.
2430     #
2431     }
2432    
2433     ## NOTE: No character is consumed by the "consume a character
2434     ## reference" algorithm. In other words, there is an "&" character
2435     ## that does not introduce a character reference, which would be
2436     ## appended to the parent element or the attribute value in later
2437     ## process of the tokenizer.
2438    
2439     if ($self->{prev_state} == DATA_STATE) {
2440     !!!cp (997);
2441     $self->{state} = $self->{prev_state};
2442 wakaba 1.5 $self->{s_kwd} = '';
2443 wakaba 1.1 ## Reconsume.
2444     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2445     line => $self->{line_prev},
2446     column => $self->{column_prev},
2447     });
2448     redo A;
2449     } else {
2450     !!!cp (996);
2451     $self->{ca}->{value} .= '&';
2452     $self->{state} = $self->{prev_state};
2453 wakaba 1.5 $self->{s_kwd} = '';
2454 wakaba 1.1 ## Reconsume.
2455     redo A;
2456     }
2457     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2458     if ($self->{nc} == 0x0078 or # x
2459     $self->{nc} == 0x0058) { # X
2460     !!!cp (995);
2461     $self->{state} = HEXREF_X_STATE;
2462     $self->{s_kwd} .= chr $self->{nc};
2463     !!!next-input-character;
2464     redo A;
2465     } elsif (0x0030 <= $self->{nc} and
2466     $self->{nc} <= 0x0039) { # 0..9
2467     !!!cp (994);
2468     $self->{state} = NCR_NUM_STATE;
2469     $self->{s_kwd} = $self->{nc} - 0x0030;
2470     !!!next-input-character;
2471     redo A;
2472     } else {
2473     !!!parse-error (type => 'bare nero',
2474     line => $self->{line_prev},
2475     column => $self->{column_prev} - 1);
2476    
2477     ## NOTE: According to the spec algorithm, nothing is returned,
2478     ## and then "&#" is appended to the parent element or the attribute
2479     ## value in the later processing.
2480    
2481     if ($self->{prev_state} == DATA_STATE) {
2482     !!!cp (1019);
2483     $self->{state} = $self->{prev_state};
2484 wakaba 1.5 $self->{s_kwd} = '';
2485 wakaba 1.1 ## Reconsume.
2486     !!!emit ({type => CHARACTER_TOKEN,
2487     data => '&#',
2488     line => $self->{line_prev},
2489     column => $self->{column_prev} - 1,
2490     });
2491     redo A;
2492     } else {
2493     !!!cp (993);
2494     $self->{ca}->{value} .= '&#';
2495     $self->{state} = $self->{prev_state};
2496 wakaba 1.5 $self->{s_kwd} = '';
2497 wakaba 1.1 ## Reconsume.
2498     redo A;
2499     }
2500     }
2501     } elsif ($self->{state} == NCR_NUM_STATE) {
2502     if (0x0030 <= $self->{nc} and
2503     $self->{nc} <= 0x0039) { # 0..9
2504     !!!cp (1012);
2505     $self->{s_kwd} *= 10;
2506     $self->{s_kwd} += $self->{nc} - 0x0030;
2507    
2508     ## Stay in the state.
2509     !!!next-input-character;
2510     redo A;
2511     } elsif ($self->{nc} == 0x003B) { # ;
2512     !!!cp (1013);
2513     !!!next-input-character;
2514     #
2515     } else {
2516     !!!cp (1014);
2517     !!!parse-error (type => 'no refc');
2518     ## Reconsume.
2519     #
2520     }
2521    
2522     my $code = $self->{s_kwd};
2523     my $l = $self->{line_prev};
2524     my $c = $self->{column_prev};
2525     if ($charref_map->{$code}) {
2526     !!!cp (1015);
2527     !!!parse-error (type => 'invalid character reference',
2528     text => (sprintf 'U+%04X', $code),
2529     line => $l, column => $c);
2530     $code = $charref_map->{$code};
2531     } elsif ($code > 0x10FFFF) {
2532     !!!cp (1016);
2533     !!!parse-error (type => 'invalid character reference',
2534     text => (sprintf 'U-%08X', $code),
2535     line => $l, column => $c);
2536     $code = 0xFFFD;
2537     }
2538    
2539     if ($self->{prev_state} == DATA_STATE) {
2540     !!!cp (992);
2541     $self->{state} = $self->{prev_state};
2542 wakaba 1.5 $self->{s_kwd} = '';
2543 wakaba 1.1 ## Reconsume.
2544     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2545     line => $l, column => $c,
2546     });
2547     redo A;
2548     } else {
2549     !!!cp (991);
2550     $self->{ca}->{value} .= chr $code;
2551     $self->{ca}->{has_reference} = 1;
2552     $self->{state} = $self->{prev_state};
2553 wakaba 1.5 $self->{s_kwd} = '';
2554 wakaba 1.1 ## Reconsume.
2555     redo A;
2556     }
2557     } elsif ($self->{state} == HEXREF_X_STATE) {
2558     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2559     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2560     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2561     # 0..9, A..F, a..f
2562     !!!cp (990);
2563     $self->{state} = HEXREF_HEX_STATE;
2564     $self->{s_kwd} = 0;
2565     ## Reconsume.
2566     redo A;
2567     } else {
2568     !!!parse-error (type => 'bare hcro',
2569     line => $self->{line_prev},
2570     column => $self->{column_prev} - 2);
2571    
2572     ## NOTE: According to the spec algorithm, nothing is returned,
2573     ## and then "&#" followed by "X" or "x" is appended to the parent
2574     ## element or the attribute value in the later processing.
2575    
2576     if ($self->{prev_state} == DATA_STATE) {
2577     !!!cp (1005);
2578     $self->{state} = $self->{prev_state};
2579 wakaba 1.5 $self->{s_kwd} = '';
2580 wakaba 1.1 ## Reconsume.
2581     !!!emit ({type => CHARACTER_TOKEN,
2582     data => '&' . $self->{s_kwd},
2583     line => $self->{line_prev},
2584     column => $self->{column_prev} - length $self->{s_kwd},
2585     });
2586     redo A;
2587     } else {
2588     !!!cp (989);
2589     $self->{ca}->{value} .= '&' . $self->{s_kwd};
2590     $self->{state} = $self->{prev_state};
2591 wakaba 1.5 $self->{s_kwd} = '';
2592 wakaba 1.1 ## Reconsume.
2593     redo A;
2594     }
2595     }
2596     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2597     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2598     # 0..9
2599     !!!cp (1002);
2600     $self->{s_kwd} *= 0x10;
2601     $self->{s_kwd} += $self->{nc} - 0x0030;
2602     ## Stay in the state.
2603     !!!next-input-character;
2604     redo A;
2605     } elsif (0x0061 <= $self->{nc} and
2606     $self->{nc} <= 0x0066) { # a..f
2607     !!!cp (1003);
2608     $self->{s_kwd} *= 0x10;
2609     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2610     ## Stay in the state.
2611     !!!next-input-character;
2612     redo A;
2613     } elsif (0x0041 <= $self->{nc} and
2614     $self->{nc} <= 0x0046) { # A..F
2615     !!!cp (1004);
2616     $self->{s_kwd} *= 0x10;
2617     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2618     ## Stay in the state.
2619     !!!next-input-character;
2620     redo A;
2621     } elsif ($self->{nc} == 0x003B) { # ;
2622     !!!cp (1006);
2623     !!!next-input-character;
2624     #
2625     } else {
2626     !!!cp (1007);
2627     !!!parse-error (type => 'no refc',
2628     line => $self->{line},
2629     column => $self->{column});
2630     ## Reconsume.
2631     #
2632     }
2633    
2634     my $code = $self->{s_kwd};
2635     my $l = $self->{line_prev};
2636     my $c = $self->{column_prev};
2637     if ($charref_map->{$code}) {
2638     !!!cp (1008);
2639     !!!parse-error (type => 'invalid character reference',
2640     text => (sprintf 'U+%04X', $code),
2641     line => $l, column => $c);
2642     $code = $charref_map->{$code};
2643     } elsif ($code > 0x10FFFF) {
2644     !!!cp (1009);
2645     !!!parse-error (type => 'invalid character reference',
2646     text => (sprintf 'U-%08X', $code),
2647     line => $l, column => $c);
2648     $code = 0xFFFD;
2649     }
2650    
2651     if ($self->{prev_state} == DATA_STATE) {
2652     !!!cp (988);
2653     $self->{state} = $self->{prev_state};
2654 wakaba 1.5 $self->{s_kwd} = '';
2655 wakaba 1.1 ## Reconsume.
2656     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2657     line => $l, column => $c,
2658     });
2659     redo A;
2660     } else {
2661     !!!cp (987);
2662     $self->{ca}->{value} .= chr $code;
2663     $self->{ca}->{has_reference} = 1;
2664     $self->{state} = $self->{prev_state};
2665 wakaba 1.5 $self->{s_kwd} = '';
2666 wakaba 1.1 ## Reconsume.
2667     redo A;
2668     }
2669     } elsif ($self->{state} == ENTITY_NAME_STATE) {
2670     if (length $self->{s_kwd} < 30 and
2671     ## NOTE: Some number greater than the maximum length of entity name
2672     ((0x0041 <= $self->{nc} and # a
2673     $self->{nc} <= 0x005A) or # x
2674     (0x0061 <= $self->{nc} and # a
2675     $self->{nc} <= 0x007A) or # z
2676     (0x0030 <= $self->{nc} and # 0
2677     $self->{nc} <= 0x0039) or # 9
2678     $self->{nc} == 0x003B)) { # ;
2679     our $EntityChar;
2680     $self->{s_kwd} .= chr $self->{nc};
2681     if (defined $EntityChar->{$self->{s_kwd}}) {
2682     if ($self->{nc} == 0x003B) { # ;
2683     !!!cp (1020);
2684     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2685     $self->{entity__match} = 1;
2686     !!!next-input-character;
2687     #
2688     } else {
2689     !!!cp (1021);
2690     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2691     $self->{entity__match} = -1;
2692     ## Stay in the state.
2693     !!!next-input-character;
2694     redo A;
2695     }
2696     } else {
2697     !!!cp (1022);
2698     $self->{entity__value} .= chr $self->{nc};
2699     $self->{entity__match} *= 2;
2700     ## Stay in the state.
2701     !!!next-input-character;
2702     redo A;
2703     }
2704     }
2705    
2706     my $data;
2707     my $has_ref;
2708     if ($self->{entity__match} > 0) {
2709     !!!cp (1023);
2710     $data = $self->{entity__value};
2711     $has_ref = 1;
2712     #
2713     } elsif ($self->{entity__match} < 0) {
2714     !!!parse-error (type => 'no refc');
2715     if ($self->{prev_state} != DATA_STATE and # in attribute
2716     $self->{entity__match} < -1) {
2717     !!!cp (1024);
2718     $data = '&' . $self->{s_kwd};
2719     #
2720     } else {
2721     !!!cp (1025);
2722     $data = $self->{entity__value};
2723     $has_ref = 1;
2724     #
2725     }
2726     } else {
2727     !!!cp (1026);
2728     !!!parse-error (type => 'bare ero',
2729     line => $self->{line_prev},
2730     column => $self->{column_prev} - length $self->{s_kwd});
2731     $data = '&' . $self->{s_kwd};
2732     #
2733     }
2734    
2735     ## NOTE: According to the spec algorithm, when a character reference
2736     ## is found, it is consumed and a character token is returned;
2737     ## otherwise, nothing is consumed and nothing is returned.
2738     ## In this implementation, everything that has been examined by the
2739     ## tokenizer is appended at this stage to the parent element or the
2740     ## attribute value as a string (the literal string if there is no
2741     ## character reference, the entity-replaced string otherwise), since
2742     ## any characters that would not be consumed are appended in the data
2743     ## state or in an appropriate attribute value state anyway.
2744    
2745     if ($self->{prev_state} == DATA_STATE) {
2746     !!!cp (986);
2747     $self->{state} = $self->{prev_state};
2748 wakaba 1.5 $self->{s_kwd} = '';
2749 wakaba 1.1 ## Reconsume.
2750     !!!emit ({type => CHARACTER_TOKEN,
2751     data => $data,
2752     line => $self->{line_prev},
2753     column => $self->{column_prev} + 1 - length $self->{s_kwd},
2754     });
2755     redo A;
2756     } else {
2757     !!!cp (985);
2758     $self->{ca}->{value} .= $data;
2759     $self->{ca}->{has_reference} = 1 if $has_ref;
2760     $self->{state} = $self->{prev_state};
2761 wakaba 1.5 $self->{s_kwd} = '';
2762 wakaba 1.1 ## Reconsume.
2763     redo A;
2764     }
2765     } else {
2766     die "$0: $self->{state}: Unknown state";
2767     }
2768     } # A
2769    
2770     die "$0: _get_next_token: unexpected case";
2771     } # _get_next_token
2772    
2773     1;
2774 wakaba 1.5 ## $Date: 2008/10/14 11:46:57 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24