/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log


Revision 1.7 - (hide annotations) (download) (as text)
Tue Oct 14 15:25:50 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.6: +6 -2 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	14 Oct 2008 15:23:30 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/charref-1.dat" added.

++ whatpm/t/xml/ChangeLog	14 Oct 2008 15:23:49 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* charref-1.dat: New test data file.

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 15:24:42 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: Mark CHARACTER_TOKEN with character reference
	as such, for the support of XML parse error.

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 15:25:35 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Raise a parse error for white space character
	generated by a character reference outside of the root element.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.7 our $VERSION=do{my @r=(q$Revision: 1.6 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
      ## Compile-time Exporter setup.  The |BEGIN| block of package
      ## |Whatpm::HTML| further down this file imports the |:token| tag,
      ## so |@ISA| and |%EXPORT_TAGS| have to be populated at compile time.
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
      ## Each token type constant can be imported by name ...
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
      ## ... or all together through the |:token| tag (same eight names).
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
       ## These are the values stored in a token's |->{type}| field.  The
       ## empty prototype |()| lets perl inline each sub as a constant.
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
              ## The |CM_*| subs are single-bit flags naming which markup is
              ## recognized in character data.  Each |*_CONTENT_MODEL| sub is a
              ## combination of those bits; the tokenizer keeps one of them in
              ## |$self->{content_model}| and tests it with |&| or |==|.
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 }
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
       ## Values of |$self->{state}|, which |_get_next_token| dispatches on
       ## with |==|.  Numbers 1 and 12 are retired (their subs are commented
       ## out below).  States 36..43 subdivide single states of the spec, as
       ## their trailing comments indicate.
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117     ## Tree constructor state constants (see Whatpm::HTML for the full
118     ## list and descriptions)
        ## NOTE(review): both subs below evaluate to the same number
        ## (0b100000000000 == 2048), only written in two notations.
        ## Presumably they are flags in two unrelated bit sets -- confirm
        ## against the definitions in Whatpm::HTML.
119    
120     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121     sub FOREIGN_EL () { 0b1_00000000000 }
122    
123     ## Character reference mappings
        ## |$charref_map| maps a code point to the code point that replaces
        ## it: U+000D becomes U+000A, 0x80..0x9F become their Windows-1252
        ## characters (U+FFFD where Windows-1252 leaves the byte unassigned),
        ## and the code points listed after the hash become U+FFFD.
        ## The code that consults the map is not in this part of the file.
124    
125     my $charref_map = {
126     0x0D => 0x000A,
127     0x80 => 0x20AC,
128     0x81 => 0xFFFD,
129     0x82 => 0x201A,
130     0x83 => 0x0192,
131     0x84 => 0x201E,
132     0x85 => 0x2026,
133     0x86 => 0x2020,
134     0x87 => 0x2021,
135     0x88 => 0x02C6,
136     0x89 => 0x2030,
137     0x8A => 0x0160,
138     0x8B => 0x2039,
139     0x8C => 0x0152,
140     0x8D => 0xFFFD,
141     0x8E => 0x017D,
142     0x8F => 0xFFFD,
143     0x90 => 0xFFFD,
144     0x91 => 0x2018,
145     0x92 => 0x2019,
146     0x93 => 0x201C,
147     0x94 => 0x201D,
148     0x95 => 0x2022,
149     0x96 => 0x2013,
150     0x97 => 0x2014,
151     0x98 => 0x02DC,
152     0x99 => 0x2122,
153     0x9A => 0x0161,
154     0x9B => 0x203A,
155     0x9C => 0x0153,
156     0x9D => 0xFFFD,
157     0x9E => 0x017E,
158     0x9F => 0x0178,
159     }; # $charref_map
        ## Control characters (other than HT, LF, FF and CR), DEL, the
        ## surrogate range, and noncharacters are all replaced by U+FFFD.
        ## NOTE(review): the range below ends at 0xFDDF although Unicode's
        ## noncharacter block runs to 0xFDEF (see the ISSUE marker) -- confirm
        ## which bound the targeted spec revision requires before changing it.
160     $charref_map->{$_} = 0xFFFD
161     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168    
169     ## Implementations MUST act as if they used the state machine in the spec.
170    
171     sub _initialize_tokenizer ($) {
        ## Resets the tokenizer fields of the parser object (its only
        ## argument) to their start-of-input values and then loads the
        ## first input character.
        ## NOTE(review): |!!!next-input-character| below is not Perl; as
        ## this is a |.src| file it is presumably expanded by a build step.
172     my $self = shift;
173    
174     ## NOTE: Fields set by |new| constructor:
175     #$self->{level}
176     #$self->{set_nc}
177     #$self->{parse_error}
178 wakaba 1.3 #$self->{is_xml} (if XML)
179 wakaba 1.1
180     $self->{state} = DATA_STATE; # MUST
181 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
182 wakaba 1.1 #$self->{entity__value}; # initialized when used
183     #$self->{entity__match}; # initialized when used
184     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185     undef $self->{ct}; # current token
186     undef $self->{ca}; # current attribute
187     undef $self->{last_stag_name}; # last emitted start tag name
188     #$self->{prev_state}; # initialized when used
189     delete $self->{self_closing};
        ## Empty input buffer; |nc| == -1 is the value the tokenizer
        ## compares against to detect end of file.
190     $self->{char_buffer} = '';
191     $self->{char_buffer_pos} = 0;
192     $self->{nc} = -1; # next input character
193     #$self->{next_nc}
194     !!!next-input-character;
        ## Queue of already-built tokens; |_get_next_token| shifts from it
        ## before running the state machine.
195     $self->{token} = [];
        ## NOTE(review): |{escape}| is only mentioned, not reset, here --
        ## confirm that a re-initialized object cannot carry a stale value.
196     # $self->{escape}
197     } # _initialize_tokenizer
198    
199     ## A token has:
200     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
201     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
202     ## ->{name} (DOCTYPE_TOKEN)
203     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
204     ## ->{pubid} (DOCTYPE_TOKEN)
205     ## ->{sysid} (DOCTYPE_TOKEN)
206     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
207     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
208     ## ->{name}
209     ## ->{value}
210     ## ->{has_reference} == 1 or 0
211     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
212 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
213 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
214     ## A token's own |->{self_closing}| field saves the value of
215     ## |$self->{self_closing}| while the token is pushed back to the stack.
216    
217     ## Emitted token MUST immediately be handled by the tree construction state.
218    
219     ## Before each step, UA MAY check to see if either one of the scripts in
220     ## "list of scripts that will execute as soon as possible" or the first
221     ## script in the "list of scripts that will execute asynchronously",
222     ## has completed loading. If one has, then it MUST be executed
223     ## and removed from the list.
224    
225     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
226     ## (This requirement was dropped from HTML5 spec, unfortunately.)
227    
228     my $is_space = {
        ## Code points the tokenizer treats as white space, looked up as
        ## |$is_space->{$self->{nc}}|.  The VT and CR entries are commented
        ## out, so those two do not count as white space here.
229     0x0009 => 1, # CHARACTER TABULATION (HT)
230     0x000A => 1, # LINE FEED (LF)
231     #0x000B => 0, # LINE TABULATION (VT)
232     0x000C => 1, # FORM FEED (FF)
233     #0x000D => 1, # CARRIAGE RETURN (CR)
234     0x0020 => 1, # SPACE (SP)
235     };
236    
237     sub _get_next_token ($) {
238     my $self = shift;
239    
240     if ($self->{self_closing}) {
241     !!!parse-error (type => 'nestc', token => $self->{ct});
242     ## NOTE: The |self_closing| flag is only set by start tag token.
243     ## In addition, when a start tag token is emitted, it is always set to
244     ## |ct|.
245     delete $self->{self_closing};
246     }
247    
248     if (@{$self->{token}}) {
249     $self->{self_closing} = $self->{token}->[0]->{self_closing};
250     return shift @{$self->{token}};
251     }
252    
253     A: {
254     if ($self->{state} == PCDATA_STATE) {
255     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
256    
257     if ($self->{nc} == 0x0026) { # &
258     !!!cp (0.1);
259     ## NOTE: In the spec, the tokenizer is switched to the
260     ## "entity data state". In this implementation, the tokenizer
261     ## is switched to the |ENTITY_STATE|, which is an implementation
262     ## of the "consume a character reference" algorithm.
263     $self->{entity_add} = -1;
264     $self->{prev_state} = DATA_STATE;
265     $self->{state} = ENTITY_STATE;
266     !!!next-input-character;
267     redo A;
268     } elsif ($self->{nc} == 0x003C) { # <
269     !!!cp (0.2);
270     $self->{state} = TAG_OPEN_STATE;
271     !!!next-input-character;
272     redo A;
273     } elsif ($self->{nc} == -1) {
274     !!!cp (0.3);
275     !!!emit ({type => END_OF_FILE_TOKEN,
276     line => $self->{line}, column => $self->{column}});
277     last A; ## TODO: ok?
278     } else {
279     !!!cp (0.4);
280     #
281     }
282    
283     # Anything else
284     my $token = {type => CHARACTER_TOKEN,
285     data => chr $self->{nc},
286     line => $self->{line}, column => $self->{column},
287     };
288     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
289    
290     ## Stay in the state.
291     !!!next-input-character;
292     !!!emit ($token);
293     redo A;
294     } elsif ($self->{state} == DATA_STATE) {
295     $self->{s_kwd} = '' unless defined $self->{s_kwd};
296     if ($self->{nc} == 0x0026) { # &
297     $self->{s_kwd} = '';
298     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
299     not $self->{escape}) {
300     !!!cp (1);
301     ## NOTE: In the spec, the tokenizer is switched to the
302     ## "entity data state". In this implementation, the tokenizer
303     ## is switched to the |ENTITY_STATE|, which is an implementation
304     ## of the "consume a character reference" algorithm.
305     $self->{entity_add} = -1;
306     $self->{prev_state} = DATA_STATE;
307     $self->{state} = ENTITY_STATE;
308     !!!next-input-character;
309     redo A;
310     } else {
311     !!!cp (2);
312     #
313     }
314     } elsif ($self->{nc} == 0x002D) { # -
315     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
316 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
317 wakaba 1.1 !!!cp (3);
318     $self->{escape} = 1; # unless $self->{escape};
319     $self->{s_kwd} = '--';
320     #
321 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
322 wakaba 1.1 !!!cp (4);
323     $self->{s_kwd} = '--';
324     #
325 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
326     !!!cp (4.1);
327     $self->{s_kwd} .= '-';
328     #
329 wakaba 1.1 } else {
330     !!!cp (5);
331 wakaba 1.5 $self->{s_kwd} = '-';
332 wakaba 1.1 #
333     }
334     }
335    
336     #
337     } elsif ($self->{nc} == 0x0021) { # !
338     if (length $self->{s_kwd}) {
339     !!!cp (5.1);
340     $self->{s_kwd} .= '!';
341     #
342     } else {
343     !!!cp (5.2);
344     #$self->{s_kwd} = '';
345     #
346     }
347     #
348     } elsif ($self->{nc} == 0x003C) { # <
349     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
350     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
351     not $self->{escape})) {
352     !!!cp (6);
353     $self->{state} = TAG_OPEN_STATE;
354     !!!next-input-character;
355     redo A;
356     } else {
357     !!!cp (7);
358     $self->{s_kwd} = '';
359     #
360     }
361     } elsif ($self->{nc} == 0x003E) { # >
362     if ($self->{escape} and
363     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
364     if ($self->{s_kwd} eq '--') {
365     !!!cp (8);
366     delete $self->{escape};
367 wakaba 1.5 #
368 wakaba 1.1 } else {
369     !!!cp (9);
370 wakaba 1.5 #
371 wakaba 1.1 }
372 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
373     !!!cp (9.1);
374     !!!parse-error (type => 'unmatched mse', ## TODO: type
375     line => $self->{line_prev},
376     column => $self->{column_prev} - 1);
377     #
378 wakaba 1.1 } else {
379     !!!cp (10);
380 wakaba 1.5 #
381 wakaba 1.1 }
382    
383     $self->{s_kwd} = '';
384     #
385 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
386     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
387     !!!cp (10.1);
388     $self->{s_kwd} .= ']';
389     } elsif ($self->{s_kwd} eq ']]') {
390     !!!cp (10.2);
391     #
392     } else {
393     !!!cp (10.3);
394     $self->{s_kwd} = '';
395     }
396     #
397 wakaba 1.1 } elsif ($self->{nc} == -1) {
398     !!!cp (11);
399     $self->{s_kwd} = '';
400     !!!emit ({type => END_OF_FILE_TOKEN,
401     line => $self->{line}, column => $self->{column}});
402     last A; ## TODO: ok?
403     } else {
404     !!!cp (12);
405     $self->{s_kwd} = '';
406     #
407     }
408    
409     # Anything else
410     my $token = {type => CHARACTER_TOKEN,
411     data => chr $self->{nc},
412     line => $self->{line}, column => $self->{column},
413     };
414 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
415 wakaba 1.1 length $token->{data})) {
416     $self->{s_kwd} = '';
417     }
418    
419     ## Stay in the data state.
420 wakaba 1.5 if (not $self->{is_xml} and
421     $self->{content_model} == PCDATA_CONTENT_MODEL) {
422 wakaba 1.1 !!!cp (13);
423     $self->{state} = PCDATA_STATE;
424     } else {
425     !!!cp (14);
426     ## Stay in the state.
427     }
428     !!!next-input-character;
429     !!!emit ($token);
430     redo A;
431     } elsif ($self->{state} == TAG_OPEN_STATE) {
432     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
433     if ($self->{nc} == 0x002F) { # /
434     !!!cp (15);
435     !!!next-input-character;
436     $self->{state} = CLOSE_TAG_OPEN_STATE;
437     redo A;
438     } elsif ($self->{nc} == 0x0021) { # !
439     !!!cp (15.1);
440     $self->{s_kwd} = '<' unless $self->{escape};
441     #
442     } else {
443     !!!cp (16);
444     #
445     }
446    
447     ## reconsume
448     $self->{state} = DATA_STATE;
449 wakaba 1.5 $self->{s_kwd} = '';
450 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN, data => '<',
451     line => $self->{line_prev},
452     column => $self->{column_prev},
453     });
454     redo A;
455     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
456     if ($self->{nc} == 0x0021) { # !
457     !!!cp (17);
458     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
459     !!!next-input-character;
460     redo A;
461     } elsif ($self->{nc} == 0x002F) { # /
462     !!!cp (18);
463     $self->{state} = CLOSE_TAG_OPEN_STATE;
464     !!!next-input-character;
465     redo A;
466     } elsif (0x0041 <= $self->{nc} and
467     $self->{nc} <= 0x005A) { # A..Z
468     !!!cp (19);
469     $self->{ct}
470     = {type => START_TAG_TOKEN,
471 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
472 wakaba 1.1 line => $self->{line_prev},
473     column => $self->{column_prev}};
474     $self->{state} = TAG_NAME_STATE;
475     !!!next-input-character;
476     redo A;
477     } elsif (0x0061 <= $self->{nc} and
478     $self->{nc} <= 0x007A) { # a..z
479     !!!cp (20);
480     $self->{ct} = {type => START_TAG_TOKEN,
481     tag_name => chr ($self->{nc}),
482     line => $self->{line_prev},
483     column => $self->{column_prev}};
484     $self->{state} = TAG_NAME_STATE;
485     !!!next-input-character;
486     redo A;
487     } elsif ($self->{nc} == 0x003E) { # >
488     !!!cp (21);
489     !!!parse-error (type => 'empty start tag',
490     line => $self->{line_prev},
491     column => $self->{column_prev});
492     $self->{state} = DATA_STATE;
493 wakaba 1.5 $self->{s_kwd} = '';
494 wakaba 1.1 !!!next-input-character;
495    
496     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
497     line => $self->{line_prev},
498     column => $self->{column_prev},
499     });
500    
501     redo A;
502     } elsif ($self->{nc} == 0x003F) { # ?
503     !!!cp (22);
504     !!!parse-error (type => 'pio',
505     line => $self->{line_prev},
506     column => $self->{column_prev});
507     $self->{state} = BOGUS_COMMENT_STATE;
508     $self->{ct} = {type => COMMENT_TOKEN, data => '',
509     line => $self->{line_prev},
510     column => $self->{column_prev},
511     };
512     ## $self->{nc} is intentionally left as is
513     redo A;
514     } else {
515     !!!cp (23);
516     !!!parse-error (type => 'bare stago',
517     line => $self->{line_prev},
518     column => $self->{column_prev});
519     $self->{state} = DATA_STATE;
520 wakaba 1.5 $self->{s_kwd} = '';
521 wakaba 1.1 ## reconsume
522    
523     !!!emit ({type => CHARACTER_TOKEN, data => '<',
524     line => $self->{line_prev},
525     column => $self->{column_prev},
526     });
527    
528     redo A;
529     }
530     } else {
531     die "$0: $self->{content_model} in tag open";
532     }
533     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
534     ## NOTE: The "close tag open state" in the spec is implemented as
535     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
536    
537     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
538     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
539     if (defined $self->{last_stag_name}) {
540     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
541     $self->{s_kwd} = '';
542     ## Reconsume.
543     redo A;
544     } else {
545     ## No start tag token has ever been emitted
546     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
547     !!!cp (28);
548     $self->{state} = DATA_STATE;
549 wakaba 1.5 $self->{s_kwd} = '';
550 wakaba 1.1 ## Reconsume.
551     !!!emit ({type => CHARACTER_TOKEN, data => '</',
552     line => $l, column => $c,
553     });
554     redo A;
555     }
556     }
557    
558     if (0x0041 <= $self->{nc} and
559     $self->{nc} <= 0x005A) { # A..Z
560     !!!cp (29);
561     $self->{ct}
562     = {type => END_TAG_TOKEN,
563 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
564 wakaba 1.1 line => $l, column => $c};
565     $self->{state} = TAG_NAME_STATE;
566     !!!next-input-character;
567     redo A;
568     } elsif (0x0061 <= $self->{nc} and
569     $self->{nc} <= 0x007A) { # a..z
570     !!!cp (30);
571     $self->{ct} = {type => END_TAG_TOKEN,
572     tag_name => chr ($self->{nc}),
573     line => $l, column => $c};
574     $self->{state} = TAG_NAME_STATE;
575     !!!next-input-character;
576     redo A;
577     } elsif ($self->{nc} == 0x003E) { # >
578     !!!cp (31);
579     !!!parse-error (type => 'empty end tag',
580     line => $self->{line_prev}, ## "<" in "</>"
581     column => $self->{column_prev} - 1);
582     $self->{state} = DATA_STATE;
583 wakaba 1.5 $self->{s_kwd} = '';
584 wakaba 1.1 !!!next-input-character;
585     redo A;
586     } elsif ($self->{nc} == -1) {
587     !!!cp (32);
588     !!!parse-error (type => 'bare etago');
589 wakaba 1.5 $self->{s_kwd} = '';
590 wakaba 1.1 $self->{state} = DATA_STATE;
591     # reconsume
592    
593     !!!emit ({type => CHARACTER_TOKEN, data => '</',
594     line => $l, column => $c,
595     });
596    
597     redo A;
598     } else {
599     !!!cp (33);
600     !!!parse-error (type => 'bogus end tag');
601     $self->{state} = BOGUS_COMMENT_STATE;
602     $self->{ct} = {type => COMMENT_TOKEN, data => '',
603     line => $self->{line_prev}, # "<" of "</"
604     column => $self->{column_prev} - 1,
605     };
606     ## NOTE: $self->{nc} is intentionally left as is.
607     ## Although the "anything else" case of the spec not explicitly
608     ## states that the next input character is to be reconsumed,
609     ## it will be included to the |data| of the comment token
610     ## generated from the bogus end tag, as defined in the
611     ## "bogus comment state" entry.
612     redo A;
613     }
614     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
615     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
616     if (length $ch) {
617     my $CH = $ch;
618     $ch =~ tr/a-z/A-Z/;
619     my $nch = chr $self->{nc};
620     if ($nch eq $ch or $nch eq $CH) {
621     !!!cp (24);
622     ## Stay in the state.
623     $self->{s_kwd} .= $nch;
624     !!!next-input-character;
625     redo A;
626     } else {
627     !!!cp (25);
628     $self->{state} = DATA_STATE;
629 wakaba 1.5 $self->{s_kwd} = '';
630 wakaba 1.1 ## Reconsume.
631     !!!emit ({type => CHARACTER_TOKEN,
632     data => '</' . $self->{s_kwd},
633     line => $self->{line_prev},
634     column => $self->{column_prev} - 1 - length $self->{s_kwd},
635     });
636     redo A;
637     }
638     } else { # after "<{tag-name}"
639     unless ($is_space->{$self->{nc}} or
640     {
641     0x003E => 1, # >
642     0x002F => 1, # /
643     -1 => 1, # EOF
644     }->{$self->{nc}}) {
645     !!!cp (26);
646     ## Reconsume.
647     $self->{state} = DATA_STATE;
648 wakaba 1.5 $self->{s_kwd} = '';
649 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
650     data => '</' . $self->{s_kwd},
651     line => $self->{line_prev},
652     column => $self->{column_prev} - 1 - length $self->{s_kwd},
653     });
654     redo A;
655     } else {
656     !!!cp (27);
657     $self->{ct}
658     = {type => END_TAG_TOKEN,
659     tag_name => $self->{last_stag_name},
660     line => $self->{line_prev},
661     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
662     $self->{state} = TAG_NAME_STATE;
663     ## Reconsume.
664     redo A;
665     }
666     }
667     } elsif ($self->{state} == TAG_NAME_STATE) {
668     if ($is_space->{$self->{nc}}) {
669     !!!cp (34);
670     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
671     !!!next-input-character;
672     redo A;
673     } elsif ($self->{nc} == 0x003E) { # >
674     if ($self->{ct}->{type} == START_TAG_TOKEN) {
675     !!!cp (35);
676     $self->{last_stag_name} = $self->{ct}->{tag_name};
677     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
678     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
679     #if ($self->{ct}->{attributes}) {
680     # ## NOTE: This should never be reached.
681     # !!! cp (36);
682     # !!! parse-error (type => 'end tag attribute');
683     #} else {
684     !!!cp (37);
685     #}
686     } else {
687     die "$0: $self->{ct}->{type}: Unknown token type";
688     }
689     $self->{state} = DATA_STATE;
690 wakaba 1.5 $self->{s_kwd} = '';
691 wakaba 1.1 !!!next-input-character;
692    
693     !!!emit ($self->{ct}); # start tag or end tag
694    
695     redo A;
696     } elsif (0x0041 <= $self->{nc} and
697     $self->{nc} <= 0x005A) { # A..Z
698     !!!cp (38);
699 wakaba 1.4 $self->{ct}->{tag_name}
700     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
701 wakaba 1.1 # start tag or end tag
702     ## Stay in this state
703     !!!next-input-character;
704     redo A;
705     } elsif ($self->{nc} == -1) {
706     !!!parse-error (type => 'unclosed tag');
707     if ($self->{ct}->{type} == START_TAG_TOKEN) {
708     !!!cp (39);
709     $self->{last_stag_name} = $self->{ct}->{tag_name};
710     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
711     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
712     #if ($self->{ct}->{attributes}) {
713     # ## NOTE: This state should never be reached.
714     # !!! cp (40);
715     # !!! parse-error (type => 'end tag attribute');
716     #} else {
717     !!!cp (41);
718     #}
719     } else {
720     die "$0: $self->{ct}->{type}: Unknown token type";
721     }
722     $self->{state} = DATA_STATE;
723 wakaba 1.5 $self->{s_kwd} = '';
724 wakaba 1.1 # reconsume
725    
726     !!!emit ($self->{ct}); # start tag or end tag
727    
728     redo A;
729     } elsif ($self->{nc} == 0x002F) { # /
730     !!!cp (42);
731     $self->{state} = SELF_CLOSING_START_TAG_STATE;
732     !!!next-input-character;
733     redo A;
734     } else {
735     !!!cp (44);
736     $self->{ct}->{tag_name} .= chr $self->{nc};
737     # start tag or end tag
738     ## Stay in the state
739     !!!next-input-character;
740     redo A;
741     }
742     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
743     if ($is_space->{$self->{nc}}) {
744     !!!cp (45);
745     ## Stay in the state
746     !!!next-input-character;
747     redo A;
748     } elsif ($self->{nc} == 0x003E) { # >
749     if ($self->{ct}->{type} == START_TAG_TOKEN) {
750     !!!cp (46);
751     $self->{last_stag_name} = $self->{ct}->{tag_name};
752     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
753     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
754     if ($self->{ct}->{attributes}) {
755     !!!cp (47);
756     !!!parse-error (type => 'end tag attribute');
757     } else {
758     !!!cp (48);
759     }
760     } else {
761     die "$0: $self->{ct}->{type}: Unknown token type";
762     }
763     $self->{state} = DATA_STATE;
764 wakaba 1.5 $self->{s_kwd} = '';
765 wakaba 1.1 !!!next-input-character;
766    
767     !!!emit ($self->{ct}); # start tag or end tag
768    
769     redo A;
770     } elsif (0x0041 <= $self->{nc} and
771     $self->{nc} <= 0x005A) { # A..Z
772     !!!cp (49);
773     $self->{ca}
774 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
775 wakaba 1.1 value => '',
776     line => $self->{line}, column => $self->{column}};
777     $self->{state} = ATTRIBUTE_NAME_STATE;
778     !!!next-input-character;
779     redo A;
780     } elsif ($self->{nc} == 0x002F) { # /
781     !!!cp (50);
782     $self->{state} = SELF_CLOSING_START_TAG_STATE;
783     !!!next-input-character;
784     redo A;
785     } elsif ($self->{nc} == -1) {
786     !!!parse-error (type => 'unclosed tag');
787     if ($self->{ct}->{type} == START_TAG_TOKEN) {
788     !!!cp (52);
789     $self->{last_stag_name} = $self->{ct}->{tag_name};
790     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
791     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
792     if ($self->{ct}->{attributes}) {
793     !!!cp (53);
794     !!!parse-error (type => 'end tag attribute');
795     } else {
796     !!!cp (54);
797     }
798     } else {
799     die "$0: $self->{ct}->{type}: Unknown token type";
800     }
801     $self->{state} = DATA_STATE;
802 wakaba 1.5 $self->{s_kwd} = '';
803 wakaba 1.1 # reconsume
804    
805     !!!emit ($self->{ct}); # start tag or end tag
806    
807     redo A;
808     } else {
809     if ({
810     0x0022 => 1, # "
811     0x0027 => 1, # '
812     0x003D => 1, # =
813     }->{$self->{nc}}) {
814     !!!cp (55);
815     !!!parse-error (type => 'bad attribute name');
816     } else {
817     !!!cp (56);
818     }
819     $self->{ca}
820     = {name => chr ($self->{nc}),
821     value => '',
822     line => $self->{line}, column => $self->{column}};
823     $self->{state} = ATTRIBUTE_NAME_STATE;
824     !!!next-input-character;
825     redo A;
826     }
827     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
828     my $before_leave = sub {
829     if (exists $self->{ct}->{attributes} # start tag or end tag
830     ->{$self->{ca}->{name}}) { # MUST
831     !!!cp (57);
832     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
833     ## Discard $self->{ca} # MUST
834     } else {
835     !!!cp (58);
836     $self->{ct}->{attributes}->{$self->{ca}->{name}}
837     = $self->{ca};
838     }
839     }; # $before_leave
840    
841     if ($is_space->{$self->{nc}}) {
842     !!!cp (59);
843     $before_leave->();
844     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
845     !!!next-input-character;
846     redo A;
847     } elsif ($self->{nc} == 0x003D) { # =
848     !!!cp (60);
849     $before_leave->();
850     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
851     !!!next-input-character;
852     redo A;
853     } elsif ($self->{nc} == 0x003E) { # >
854     $before_leave->();
855     if ($self->{ct}->{type} == START_TAG_TOKEN) {
856     !!!cp (61);
857     $self->{last_stag_name} = $self->{ct}->{tag_name};
858     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
859     !!!cp (62);
860     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
861     if ($self->{ct}->{attributes}) {
862     !!!parse-error (type => 'end tag attribute');
863     }
864     } else {
865     die "$0: $self->{ct}->{type}: Unknown token type";
866     }
867     $self->{state} = DATA_STATE;
868 wakaba 1.5 $self->{s_kwd} = '';
869 wakaba 1.1 !!!next-input-character;
870    
871     !!!emit ($self->{ct}); # start tag or end tag
872    
873     redo A;
874     } elsif (0x0041 <= $self->{nc} and
875     $self->{nc} <= 0x005A) { # A..Z
876     !!!cp (63);
877 wakaba 1.4 $self->{ca}->{name}
878     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
879 wakaba 1.1 ## Stay in the state
880     !!!next-input-character;
881     redo A;
882     } elsif ($self->{nc} == 0x002F) { # /
883     !!!cp (64);
884     $before_leave->();
885     $self->{state} = SELF_CLOSING_START_TAG_STATE;
886     !!!next-input-character;
887     redo A;
888     } elsif ($self->{nc} == -1) {
889     !!!parse-error (type => 'unclosed tag');
890     $before_leave->();
891     if ($self->{ct}->{type} == START_TAG_TOKEN) {
892     !!!cp (66);
893     $self->{last_stag_name} = $self->{ct}->{tag_name};
894     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
895     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
896     if ($self->{ct}->{attributes}) {
897     !!!cp (67);
898     !!!parse-error (type => 'end tag attribute');
899     } else {
900     ## NOTE: This state should never be reached.
901     !!!cp (68);
902     }
903     } else {
904     die "$0: $self->{ct}->{type}: Unknown token type";
905     }
906     $self->{state} = DATA_STATE;
907 wakaba 1.5 $self->{s_kwd} = '';
908 wakaba 1.1 # reconsume
909    
910     !!!emit ($self->{ct}); # start tag or end tag
911    
912     redo A;
913     } else {
914     if ($self->{nc} == 0x0022 or # "
915     $self->{nc} == 0x0027) { # '
916     !!!cp (69);
917     !!!parse-error (type => 'bad attribute name');
918     } else {
919     !!!cp (70);
920     }
921     $self->{ca}->{name} .= chr ($self->{nc});
922     ## Stay in the state
923     !!!next-input-character;
924     redo A;
925     }
926     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
927     if ($is_space->{$self->{nc}}) {
928     !!!cp (71);
929     ## Stay in the state
930     !!!next-input-character;
931     redo A;
932     } elsif ($self->{nc} == 0x003D) { # =
933     !!!cp (72);
934     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
935     !!!next-input-character;
936     redo A;
937     } elsif ($self->{nc} == 0x003E) { # >
938     if ($self->{ct}->{type} == START_TAG_TOKEN) {
939     !!!cp (73);
940     $self->{last_stag_name} = $self->{ct}->{tag_name};
941     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
942     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
943     if ($self->{ct}->{attributes}) {
944     !!!cp (74);
945     !!!parse-error (type => 'end tag attribute');
946     } else {
947     ## NOTE: This state should never be reached.
948     !!!cp (75);
949     }
950     } else {
951     die "$0: $self->{ct}->{type}: Unknown token type";
952     }
953     $self->{state} = DATA_STATE;
954 wakaba 1.5 $self->{s_kwd} = '';
955 wakaba 1.1 !!!next-input-character;
956    
957     !!!emit ($self->{ct}); # start tag or end tag
958    
959     redo A;
960     } elsif (0x0041 <= $self->{nc} and
961     $self->{nc} <= 0x005A) { # A..Z
962     !!!cp (76);
963     $self->{ca}
964 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
965 wakaba 1.1 value => '',
966     line => $self->{line}, column => $self->{column}};
967     $self->{state} = ATTRIBUTE_NAME_STATE;
968     !!!next-input-character;
969     redo A;
970     } elsif ($self->{nc} == 0x002F) { # /
971     !!!cp (77);
972     $self->{state} = SELF_CLOSING_START_TAG_STATE;
973     !!!next-input-character;
974     redo A;
975     } elsif ($self->{nc} == -1) {
976     !!!parse-error (type => 'unclosed tag');
977     if ($self->{ct}->{type} == START_TAG_TOKEN) {
978     !!!cp (79);
979     $self->{last_stag_name} = $self->{ct}->{tag_name};
980     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
981     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
982     if ($self->{ct}->{attributes}) {
983     !!!cp (80);
984     !!!parse-error (type => 'end tag attribute');
985     } else {
986     ## NOTE: This state should never be reached.
987     !!!cp (81);
988     }
989     } else {
990     die "$0: $self->{ct}->{type}: Unknown token type";
991     }
992 wakaba 1.5 $self->{s_kwd} = '';
993 wakaba 1.1 $self->{state} = DATA_STATE;
994     # reconsume
995    
996     !!!emit ($self->{ct}); # start tag or end tag
997    
998     redo A;
999     } else {
1000     if ($self->{nc} == 0x0022 or # "
1001     $self->{nc} == 0x0027) { # '
1002     !!!cp (78);
1003     !!!parse-error (type => 'bad attribute name');
1004     } else {
1005     !!!cp (82);
1006     }
1007     $self->{ca}
1008     = {name => chr ($self->{nc}),
1009     value => '',
1010     line => $self->{line}, column => $self->{column}};
1011     $self->{state} = ATTRIBUTE_NAME_STATE;
1012     !!!next-input-character;
1013     redo A;
1014     }
1015     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1016     if ($is_space->{$self->{nc}}) {
1017     !!!cp (83);
1018     ## Stay in the state
1019     !!!next-input-character;
1020     redo A;
1021     } elsif ($self->{nc} == 0x0022) { # "
1022     !!!cp (84);
1023     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1024     !!!next-input-character;
1025     redo A;
1026     } elsif ($self->{nc} == 0x0026) { # &
1027     !!!cp (85);
1028     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1029     ## reconsume
1030     redo A;
1031     } elsif ($self->{nc} == 0x0027) { # '
1032     !!!cp (86);
1033     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1034     !!!next-input-character;
1035     redo A;
1036     } elsif ($self->{nc} == 0x003E) { # >
1037     !!!parse-error (type => 'empty unquoted attribute value');
1038     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1039     !!!cp (87);
1040     $self->{last_stag_name} = $self->{ct}->{tag_name};
1041     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1042     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1043     if ($self->{ct}->{attributes}) {
1044     !!!cp (88);
1045     !!!parse-error (type => 'end tag attribute');
1046     } else {
1047     ## NOTE: This state should never be reached.
1048     !!!cp (89);
1049     }
1050     } else {
1051     die "$0: $self->{ct}->{type}: Unknown token type";
1052     }
1053     $self->{state} = DATA_STATE;
1054 wakaba 1.5 $self->{s_kwd} = '';
1055 wakaba 1.1 !!!next-input-character;
1056    
1057     !!!emit ($self->{ct}); # start tag or end tag
1058    
1059     redo A;
1060     } elsif ($self->{nc} == -1) {
1061     !!!parse-error (type => 'unclosed tag');
1062     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1063     !!!cp (90);
1064     $self->{last_stag_name} = $self->{ct}->{tag_name};
1065     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1066     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1067     if ($self->{ct}->{attributes}) {
1068     !!!cp (91);
1069     !!!parse-error (type => 'end tag attribute');
1070     } else {
1071     ## NOTE: This state should never be reached.
1072     !!!cp (92);
1073     }
1074     } else {
1075     die "$0: $self->{ct}->{type}: Unknown token type";
1076     }
1077     $self->{state} = DATA_STATE;
1078 wakaba 1.5 $self->{s_kwd} = '';
1079 wakaba 1.1 ## reconsume
1080    
1081     !!!emit ($self->{ct}); # start tag or end tag
1082    
1083     redo A;
1084     } else {
1085     if ($self->{nc} == 0x003D) { # =
1086     !!!cp (93);
1087     !!!parse-error (type => 'bad attribute value');
1088     } else {
1089     !!!cp (94);
1090     }
1091     $self->{ca}->{value} .= chr ($self->{nc});
1092     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1093     !!!next-input-character;
1094     redo A;
1095     }
1096     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1097     if ($self->{nc} == 0x0022) { # "
1098     !!!cp (95);
1099     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1100     !!!next-input-character;
1101     redo A;
1102     } elsif ($self->{nc} == 0x0026) { # &
1103     !!!cp (96);
1104     ## NOTE: In the spec, the tokenizer is switched to the
1105     ## "entity in attribute value state". In this implementation, the
1106     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1107     ## implementation of the "consume a character reference" algorithm.
1108     $self->{prev_state} = $self->{state};
1109     $self->{entity_add} = 0x0022; # "
1110     $self->{state} = ENTITY_STATE;
1111     !!!next-input-character;
1112     redo A;
1113     } elsif ($self->{nc} == -1) {
1114     !!!parse-error (type => 'unclosed attribute value');
1115     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1116     !!!cp (97);
1117     $self->{last_stag_name} = $self->{ct}->{tag_name};
1118     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1119     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1120     if ($self->{ct}->{attributes}) {
1121     !!!cp (98);
1122     !!!parse-error (type => 'end tag attribute');
1123     } else {
1124     ## NOTE: This state should never be reached.
1125     !!!cp (99);
1126     }
1127     } else {
1128     die "$0: $self->{ct}->{type}: Unknown token type";
1129     }
1130     $self->{state} = DATA_STATE;
1131 wakaba 1.5 $self->{s_kwd} = '';
1132 wakaba 1.1 ## reconsume
1133    
1134     !!!emit ($self->{ct}); # start tag or end tag
1135    
1136     redo A;
1137     } else {
1138     !!!cp (100);
1139     $self->{ca}->{value} .= chr ($self->{nc});
1140     $self->{read_until}->($self->{ca}->{value},
1141     q["&],
1142     length $self->{ca}->{value});
1143    
1144     ## Stay in the state
1145     !!!next-input-character;
1146     redo A;
1147     }
1148     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1149     if ($self->{nc} == 0x0027) { # '
1150     !!!cp (101);
1151     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1152     !!!next-input-character;
1153     redo A;
1154     } elsif ($self->{nc} == 0x0026) { # &
1155     !!!cp (102);
1156     ## NOTE: In the spec, the tokenizer is switched to the
1157     ## "entity in attribute value state". In this implementation, the
1158     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1159     ## implementation of the "consume a character reference" algorithm.
1160     $self->{entity_add} = 0x0027; # '
1161     $self->{prev_state} = $self->{state};
1162     $self->{state} = ENTITY_STATE;
1163     !!!next-input-character;
1164     redo A;
1165     } elsif ($self->{nc} == -1) {
1166     !!!parse-error (type => 'unclosed attribute value');
1167     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1168     !!!cp (103);
1169     $self->{last_stag_name} = $self->{ct}->{tag_name};
1170     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1171     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1172     if ($self->{ct}->{attributes}) {
1173     !!!cp (104);
1174     !!!parse-error (type => 'end tag attribute');
1175     } else {
1176     ## NOTE: This state should never be reached.
1177     !!!cp (105);
1178     }
1179     } else {
1180     die "$0: $self->{ct}->{type}: Unknown token type";
1181     }
1182     $self->{state} = DATA_STATE;
1183 wakaba 1.5 $self->{s_kwd} = '';
1184 wakaba 1.1 ## reconsume
1185    
1186     !!!emit ($self->{ct}); # start tag or end tag
1187    
1188     redo A;
1189     } else {
1190     !!!cp (106);
1191     $self->{ca}->{value} .= chr ($self->{nc});
1192     $self->{read_until}->($self->{ca}->{value},
1193     q['&],
1194     length $self->{ca}->{value});
1195    
1196     ## Stay in the state
1197     !!!next-input-character;
1198     redo A;
1199     }
1200     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1201     if ($is_space->{$self->{nc}}) {
1202     !!!cp (107);
1203     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1204     !!!next-input-character;
1205     redo A;
1206     } elsif ($self->{nc} == 0x0026) { # &
1207     !!!cp (108);
1208     ## NOTE: In the spec, the tokenizer is switched to the
1209     ## "entity in attribute value state". In this implementation, the
1210     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1211     ## implementation of the "consume a character reference" algorithm.
1212     $self->{entity_add} = -1;
1213     $self->{prev_state} = $self->{state};
1214     $self->{state} = ENTITY_STATE;
1215     !!!next-input-character;
1216     redo A;
1217     } elsif ($self->{nc} == 0x003E) { # >
1218     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1219     !!!cp (109);
1220     $self->{last_stag_name} = $self->{ct}->{tag_name};
1221     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1222     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1223     if ($self->{ct}->{attributes}) {
1224     !!!cp (110);
1225     !!!parse-error (type => 'end tag attribute');
1226     } else {
1227     ## NOTE: This state should never be reached.
1228     !!!cp (111);
1229     }
1230     } else {
1231     die "$0: $self->{ct}->{type}: Unknown token type";
1232     }
1233     $self->{state} = DATA_STATE;
1234 wakaba 1.5 $self->{s_kwd} = '';
1235 wakaba 1.1 !!!next-input-character;
1236    
1237     !!!emit ($self->{ct}); # start tag or end tag
1238    
1239     redo A;
1240     } elsif ($self->{nc} == -1) {
1241     !!!parse-error (type => 'unclosed tag');
1242     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1243     !!!cp (112);
1244     $self->{last_stag_name} = $self->{ct}->{tag_name};
1245     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1246     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1247     if ($self->{ct}->{attributes}) {
1248     !!!cp (113);
1249     !!!parse-error (type => 'end tag attribute');
1250     } else {
1251     ## NOTE: This state should never be reached.
1252     !!!cp (114);
1253     }
1254     } else {
1255     die "$0: $self->{ct}->{type}: Unknown token type";
1256     }
1257     $self->{state} = DATA_STATE;
1258 wakaba 1.5 $self->{s_kwd} = '';
1259 wakaba 1.1 ## reconsume
1260    
1261     !!!emit ($self->{ct}); # start tag or end tag
1262    
1263     redo A;
1264     } else {
1265     if ({
1266     0x0022 => 1, # "
1267     0x0027 => 1, # '
1268     0x003D => 1, # =
1269     }->{$self->{nc}}) {
1270     !!!cp (115);
1271     !!!parse-error (type => 'bad attribute value');
1272     } else {
1273     !!!cp (116);
1274     }
1275     $self->{ca}->{value} .= chr ($self->{nc});
1276     $self->{read_until}->($self->{ca}->{value},
1277     q["'=& >],
1278     length $self->{ca}->{value});
1279    
1280     ## Stay in the state
1281     !!!next-input-character;
1282     redo A;
1283     }
1284     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1285     if ($is_space->{$self->{nc}}) {
1286     !!!cp (118);
1287     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1288     !!!next-input-character;
1289     redo A;
1290     } elsif ($self->{nc} == 0x003E) { # >
1291     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1292     !!!cp (119);
1293     $self->{last_stag_name} = $self->{ct}->{tag_name};
1294     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1295     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1296     if ($self->{ct}->{attributes}) {
1297     !!!cp (120);
1298     !!!parse-error (type => 'end tag attribute');
1299     } else {
1300     ## NOTE: This state should never be reached.
1301     !!!cp (121);
1302     }
1303     } else {
1304     die "$0: $self->{ct}->{type}: Unknown token type";
1305     }
1306     $self->{state} = DATA_STATE;
1307 wakaba 1.5 $self->{s_kwd} = '';
1308 wakaba 1.1 !!!next-input-character;
1309    
1310     !!!emit ($self->{ct}); # start tag or end tag
1311    
1312     redo A;
1313     } elsif ($self->{nc} == 0x002F) { # /
1314     !!!cp (122);
1315     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1316     !!!next-input-character;
1317     redo A;
1318     } elsif ($self->{nc} == -1) {
1319     !!!parse-error (type => 'unclosed tag');
1320     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1321     !!!cp (122.3);
1322     $self->{last_stag_name} = $self->{ct}->{tag_name};
1323     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1324     if ($self->{ct}->{attributes}) {
1325     !!!cp (122.1);
1326     !!!parse-error (type => 'end tag attribute');
1327     } else {
1328     ## NOTE: This state should never be reached.
1329     !!!cp (122.2);
1330     }
1331     } else {
1332     die "$0: $self->{ct}->{type}: Unknown token type";
1333     }
1334     $self->{state} = DATA_STATE;
1335 wakaba 1.5 $self->{s_kwd} = '';
1336 wakaba 1.1 ## Reconsume.
1337     !!!emit ($self->{ct}); # start tag or end tag
1338     redo A;
1339     } else {
1340     !!!cp ('124.1');
1341     !!!parse-error (type => 'no space between attributes');
1342     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1343     ## reconsume
1344     redo A;
1345     }
1346     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1347     if ($self->{nc} == 0x003E) { # >
1348     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1349     !!!cp ('124.2');
1350     !!!parse-error (type => 'nestc', token => $self->{ct});
1351     ## TODO: Different type than slash in start tag
1352     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1353     if ($self->{ct}->{attributes}) {
1354     !!!cp ('124.4');
1355     !!!parse-error (type => 'end tag attribute');
1356     } else {
1357     !!!cp ('124.5');
1358     }
1359     ## TODO: Test |<title></title/>|
1360     } else {
1361     !!!cp ('124.3');
1362     $self->{self_closing} = 1;
1363     }
1364    
1365     $self->{state} = DATA_STATE;
1366 wakaba 1.5 $self->{s_kwd} = '';
1367 wakaba 1.1 !!!next-input-character;
1368    
1369     !!!emit ($self->{ct}); # start tag or end tag
1370    
1371     redo A;
1372     } elsif ($self->{nc} == -1) {
1373     !!!parse-error (type => 'unclosed tag');
1374     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1375     !!!cp (124.7);
1376     $self->{last_stag_name} = $self->{ct}->{tag_name};
1377     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1378     if ($self->{ct}->{attributes}) {
1379     !!!cp (124.5);
1380     !!!parse-error (type => 'end tag attribute');
1381     } else {
1382     ## NOTE: This state should never be reached.
1383     !!!cp (124.6);
1384     }
1385     } else {
1386     die "$0: $self->{ct}->{type}: Unknown token type";
1387     }
1388     $self->{state} = DATA_STATE;
1389 wakaba 1.5 $self->{s_kwd} = '';
1390 wakaba 1.1 ## Reconsume.
1391     !!!emit ($self->{ct}); # start tag or end tag
1392     redo A;
1393     } else {
1394     !!!cp ('124.4');
1395     !!!parse-error (type => 'nestc');
1396     ## TODO: This error type is wrong.
1397     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1398     ## Reconsume.
1399     redo A;
1400     }
1401     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1402     ## (only happens in the PCDATA state)
1403    
1404     ## NOTE: Unlike the spec's "bogus comment state", this implementation
1405     ## consumes characters on a one-by-one basis.
1406    
1407     if ($self->{nc} == 0x003E) { # >
1408     !!!cp (124);
1409     $self->{state} = DATA_STATE;
1410 wakaba 1.5 $self->{s_kwd} = '';
1411 wakaba 1.1 !!!next-input-character;
1412    
1413     !!!emit ($self->{ct}); # comment
1414     redo A;
1415     } elsif ($self->{nc} == -1) {
1416     !!!cp (125);
1417     $self->{state} = DATA_STATE;
1418 wakaba 1.5 $self->{s_kwd} = '';
1419 wakaba 1.1 ## reconsume
1420    
1421     !!!emit ($self->{ct}); # comment
1422     redo A;
1423     } else {
1424     !!!cp (126);
1425     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1426     $self->{read_until}->($self->{ct}->{data},
1427     q[>],
1428     length $self->{ct}->{data});
1429    
1430     ## Stay in the state.
1431     !!!next-input-character;
1432     redo A;
1433     }
1434     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1435     ## (only happens in the PCDATA state)
1436    
1437     if ($self->{nc} == 0x002D) { # -
1438     !!!cp (133);
1439     $self->{state} = MD_HYPHEN_STATE;
1440     !!!next-input-character;
1441     redo A;
1442     } elsif ($self->{nc} == 0x0044 or # D
1443     $self->{nc} == 0x0064) { # d
1444     ## ASCII case-insensitive.
1445     !!!cp (130);
1446     $self->{state} = MD_DOCTYPE_STATE;
1447     $self->{s_kwd} = chr $self->{nc};
1448     !!!next-input-character;
1449     redo A;
1450 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1451     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1452     $self->{is_xml}) and
1453 wakaba 1.1 $self->{nc} == 0x005B) { # [
1454     !!!cp (135.4);
1455     $self->{state} = MD_CDATA_STATE;
1456     $self->{s_kwd} = '[';
1457     !!!next-input-character;
1458     redo A;
1459     } else {
1460     !!!cp (136);
1461     }
1462    
1463     !!!parse-error (type => 'bogus comment',
1464     line => $self->{line_prev},
1465     column => $self->{column_prev} - 1);
1466     ## Reconsume.
1467     $self->{state} = BOGUS_COMMENT_STATE;
1468     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1469     line => $self->{line_prev},
1470     column => $self->{column_prev} - 1,
1471     };
1472     redo A;
1473     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1474     if ($self->{nc} == 0x002D) { # -
1475     !!!cp (127);
1476     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1477     line => $self->{line_prev},
1478     column => $self->{column_prev} - 2,
1479     };
1480     $self->{state} = COMMENT_START_STATE;
1481     !!!next-input-character;
1482     redo A;
1483     } else {
1484     !!!cp (128);
1485     !!!parse-error (type => 'bogus comment',
1486     line => $self->{line_prev},
1487     column => $self->{column_prev} - 2);
1488     $self->{state} = BOGUS_COMMENT_STATE;
1489     ## Reconsume.
1490     $self->{ct} = {type => COMMENT_TOKEN,
1491     data => '-',
1492     line => $self->{line_prev},
1493     column => $self->{column_prev} - 2,
1494     };
1495     redo A;
1496     }
1497     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1498     ## ASCII case-insensitive.
1499     if ($self->{nc} == [
1500     undef,
1501     0x004F, # O
1502     0x0043, # C
1503     0x0054, # T
1504     0x0059, # Y
1505     0x0050, # P
1506     ]->[length $self->{s_kwd}] or
1507     $self->{nc} == [
1508     undef,
1509     0x006F, # o
1510     0x0063, # c
1511     0x0074, # t
1512     0x0079, # y
1513     0x0070, # p
1514     ]->[length $self->{s_kwd}]) {
1515     !!!cp (131);
1516     ## Stay in the state.
1517     $self->{s_kwd} .= chr $self->{nc};
1518     !!!next-input-character;
1519     redo A;
1520     } elsif ((length $self->{s_kwd}) == 6 and
1521     ($self->{nc} == 0x0045 or # E
1522     $self->{nc} == 0x0065)) { # e
1523     !!!cp (129);
1524     $self->{state} = DOCTYPE_STATE;
1525     $self->{ct} = {type => DOCTYPE_TOKEN,
1526     quirks => 1,
1527     line => $self->{line_prev},
1528     column => $self->{column_prev} - 7,
1529     };
1530     !!!next-input-character;
1531     redo A;
1532     } else {
1533     !!!cp (132);
1534     !!!parse-error (type => 'bogus comment',
1535     line => $self->{line_prev},
1536     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1537     $self->{state} = BOGUS_COMMENT_STATE;
1538     ## Reconsume.
1539     $self->{ct} = {type => COMMENT_TOKEN,
1540     data => $self->{s_kwd},
1541     line => $self->{line_prev},
1542     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1543     };
1544     redo A;
1545     }
1546     } elsif ($self->{state} == MD_CDATA_STATE) {
1547     if ($self->{nc} == {
1548     '[' => 0x0043, # C
1549     '[C' => 0x0044, # D
1550     '[CD' => 0x0041, # A
1551     '[CDA' => 0x0054, # T
1552     '[CDAT' => 0x0041, # A
1553     }->{$self->{s_kwd}}) {
1554     !!!cp (135.1);
1555     ## Stay in the state.
1556     $self->{s_kwd} .= chr $self->{nc};
1557     !!!next-input-character;
1558     redo A;
1559     } elsif ($self->{s_kwd} eq '[CDATA' and
1560     $self->{nc} == 0x005B) { # [
1561     !!!cp (135.2);
1562 wakaba 1.6
1563     if ($self->{is_xml} and
1564     not $self->{tainted} and
1565     @{$self->{open_elements} or []} == 0) {
1566     !!!parse-error (type => 'cdata outside of root element',
1567     line => $self->{line_prev},
1568     column => $self->{column_prev} - 7);
1569     $self->{tainted} = 1;
1570     }
1571    
1572 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1573     data => '',
1574     line => $self->{line_prev},
1575     column => $self->{column_prev} - 7};
1576     $self->{state} = CDATA_SECTION_STATE;
1577     !!!next-input-character;
1578     redo A;
1579     } else {
1580     !!!cp (135.3);
1581     !!!parse-error (type => 'bogus comment',
1582     line => $self->{line_prev},
1583     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1584     $self->{state} = BOGUS_COMMENT_STATE;
1585     ## Reconsume.
1586     $self->{ct} = {type => COMMENT_TOKEN,
1587     data => $self->{s_kwd},
1588     line => $self->{line_prev},
1589     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1590     };
1591     redo A;
1592     }
1593     } elsif ($self->{state} == COMMENT_START_STATE) {
1594     if ($self->{nc} == 0x002D) { # -
1595     !!!cp (137);
1596     $self->{state} = COMMENT_START_DASH_STATE;
1597     !!!next-input-character;
1598     redo A;
1599     } elsif ($self->{nc} == 0x003E) { # >
1600     !!!cp (138);
1601     !!!parse-error (type => 'bogus comment');
1602     $self->{state} = DATA_STATE;
1603 wakaba 1.5 $self->{s_kwd} = '';
1604 wakaba 1.1 !!!next-input-character;
1605    
1606     !!!emit ($self->{ct}); # comment
1607    
1608     redo A;
1609     } elsif ($self->{nc} == -1) {
1610     !!!cp (139);
1611     !!!parse-error (type => 'unclosed comment');
1612     $self->{state} = DATA_STATE;
1613 wakaba 1.5 $self->{s_kwd} = '';
1614 wakaba 1.1 ## reconsume
1615    
1616     !!!emit ($self->{ct}); # comment
1617    
1618     redo A;
1619     } else {
1620     !!!cp (140);
1621     $self->{ct}->{data} # comment
1622     .= chr ($self->{nc});
1623     $self->{state} = COMMENT_STATE;
1624     !!!next-input-character;
1625     redo A;
1626     }
1627     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1628     if ($self->{nc} == 0x002D) { # -
1629     !!!cp (141);
1630     $self->{state} = COMMENT_END_STATE;
1631     !!!next-input-character;
1632     redo A;
1633     } elsif ($self->{nc} == 0x003E) { # >
1634     !!!cp (142);
1635     !!!parse-error (type => 'bogus comment');
1636     $self->{state} = DATA_STATE;
1637 wakaba 1.5 $self->{s_kwd} = '';
1638 wakaba 1.1 !!!next-input-character;
1639    
1640     !!!emit ($self->{ct}); # comment
1641    
1642     redo A;
1643     } elsif ($self->{nc} == -1) {
1644     !!!cp (143);
1645     !!!parse-error (type => 'unclosed comment');
1646     $self->{state} = DATA_STATE;
1647 wakaba 1.5 $self->{s_kwd} = '';
1648 wakaba 1.1 ## reconsume
1649    
1650     !!!emit ($self->{ct}); # comment
1651    
1652     redo A;
1653     } else {
1654     !!!cp (144);
1655     $self->{ct}->{data} # comment
1656     .= '-' . chr ($self->{nc});
1657     $self->{state} = COMMENT_STATE;
1658     !!!next-input-character;
1659     redo A;
1660     }
1661     } elsif ($self->{state} == COMMENT_STATE) {
1662     if ($self->{nc} == 0x002D) { # -
1663     !!!cp (145);
1664     $self->{state} = COMMENT_END_DASH_STATE;
1665     !!!next-input-character;
1666     redo A;
1667     } elsif ($self->{nc} == -1) {
1668     !!!cp (146);
1669     !!!parse-error (type => 'unclosed comment');
1670     $self->{state} = DATA_STATE;
1671 wakaba 1.5 $self->{s_kwd} = '';
1672 wakaba 1.1 ## reconsume
1673    
1674     !!!emit ($self->{ct}); # comment
1675    
1676     redo A;
1677     } else {
1678     !!!cp (147);
1679     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1680     $self->{read_until}->($self->{ct}->{data},
1681     q[-],
1682     length $self->{ct}->{data});
1683    
1684     ## Stay in the state
1685     !!!next-input-character;
1686     redo A;
1687     }
1688     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1689     if ($self->{nc} == 0x002D) { # -
1690     !!!cp (148);
1691     $self->{state} = COMMENT_END_STATE;
1692     !!!next-input-character;
1693     redo A;
1694     } elsif ($self->{nc} == -1) {
1695     !!!cp (149);
1696     !!!parse-error (type => 'unclosed comment');
1697 wakaba 1.5 $self->{s_kwd} = '';
1698 wakaba 1.1 $self->{state} = DATA_STATE;
1699 wakaba 1.5 $self->{s_kwd} = '';
1700 wakaba 1.1 ## reconsume
1701    
1702     !!!emit ($self->{ct}); # comment
1703    
1704     redo A;
1705     } else {
1706     !!!cp (150);
1707     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1708     $self->{state} = COMMENT_STATE;
1709     !!!next-input-character;
1710     redo A;
1711     }
1712     } elsif ($self->{state} == COMMENT_END_STATE) {
1713     if ($self->{nc} == 0x003E) { # >
1714     !!!cp (151);
1715     $self->{state} = DATA_STATE;
1716 wakaba 1.5 $self->{s_kwd} = '';
1717 wakaba 1.1 !!!next-input-character;
1718    
1719     !!!emit ($self->{ct}); # comment
1720    
1721     redo A;
1722     } elsif ($self->{nc} == 0x002D) { # -
1723     !!!cp (152);
1724     !!!parse-error (type => 'dash in comment',
1725     line => $self->{line_prev},
1726     column => $self->{column_prev});
1727     $self->{ct}->{data} .= '-'; # comment
1728     ## Stay in the state
1729     !!!next-input-character;
1730     redo A;
1731     } elsif ($self->{nc} == -1) {
1732     !!!cp (153);
1733     !!!parse-error (type => 'unclosed comment');
1734     $self->{state} = DATA_STATE;
1735 wakaba 1.5 $self->{s_kwd} = '';
1736 wakaba 1.1 ## reconsume
1737    
1738     !!!emit ($self->{ct}); # comment
1739    
1740     redo A;
1741     } else {
1742     !!!cp (154);
1743     !!!parse-error (type => 'dash in comment',
1744     line => $self->{line_prev},
1745     column => $self->{column_prev});
1746     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1747     $self->{state} = COMMENT_STATE;
1748     !!!next-input-character;
1749     redo A;
1750     }
1751     } elsif ($self->{state} == DOCTYPE_STATE) {
1752     if ($is_space->{$self->{nc}}) {
1753     !!!cp (155);
1754     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1755     !!!next-input-character;
1756     redo A;
1757     } else {
1758     !!!cp (156);
1759     !!!parse-error (type => 'no space before DOCTYPE name');
1760     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1761     ## reconsume
1762     redo A;
1763     }
1764     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1765     if ($is_space->{$self->{nc}}) {
1766     !!!cp (157);
1767     ## Stay in the state
1768     !!!next-input-character;
1769     redo A;
1770     } elsif ($self->{nc} == 0x003E) { # >
1771     !!!cp (158);
1772     !!!parse-error (type => 'no DOCTYPE name');
1773     $self->{state} = DATA_STATE;
1774 wakaba 1.5 $self->{s_kwd} = '';
1775 wakaba 1.1 !!!next-input-character;
1776    
1777     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1778    
1779     redo A;
1780     } elsif ($self->{nc} == -1) {
1781     !!!cp (159);
1782     !!!parse-error (type => 'no DOCTYPE name');
1783     $self->{state} = DATA_STATE;
1784 wakaba 1.5 $self->{s_kwd} = '';
1785 wakaba 1.1 ## reconsume
1786    
1787     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1788    
1789     redo A;
1790     } else {
1791     !!!cp (160);
1792     $self->{ct}->{name} = chr $self->{nc};
1793     delete $self->{ct}->{quirks};
1794     $self->{state} = DOCTYPE_NAME_STATE;
1795     !!!next-input-character;
1796     redo A;
1797     }
1798     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1799     ## ISSUE: Redundant "First," in the spec.
1800     if ($is_space->{$self->{nc}}) {
1801     !!!cp (161);
1802     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1803     !!!next-input-character;
1804     redo A;
1805     } elsif ($self->{nc} == 0x003E) { # >
1806     !!!cp (162);
1807     $self->{state} = DATA_STATE;
1808 wakaba 1.5 $self->{s_kwd} = '';
1809 wakaba 1.1 !!!next-input-character;
1810    
1811     !!!emit ($self->{ct}); # DOCTYPE
1812    
1813     redo A;
1814     } elsif ($self->{nc} == -1) {
1815     !!!cp (163);
1816     !!!parse-error (type => 'unclosed DOCTYPE');
1817     $self->{state} = DATA_STATE;
1818 wakaba 1.5 $self->{s_kwd} = '';
1819 wakaba 1.1 ## reconsume
1820    
1821     $self->{ct}->{quirks} = 1;
1822     !!!emit ($self->{ct}); # DOCTYPE
1823    
1824     redo A;
1825     } else {
1826     !!!cp (164);
1827     $self->{ct}->{name}
1828     .= chr ($self->{nc}); # DOCTYPE
1829     ## Stay in the state
1830     !!!next-input-character;
1831     redo A;
1832     }
1833     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1834     if ($is_space->{$self->{nc}}) {
1835     !!!cp (165);
1836     ## Stay in the state
1837     !!!next-input-character;
1838     redo A;
1839     } elsif ($self->{nc} == 0x003E) { # >
1840     !!!cp (166);
1841     $self->{state} = DATA_STATE;
1842 wakaba 1.5 $self->{s_kwd} = '';
1843 wakaba 1.1 !!!next-input-character;
1844    
1845     !!!emit ($self->{ct}); # DOCTYPE
1846    
1847     redo A;
1848     } elsif ($self->{nc} == -1) {
1849     !!!cp (167);
1850     !!!parse-error (type => 'unclosed DOCTYPE');
1851     $self->{state} = DATA_STATE;
1852 wakaba 1.5 $self->{s_kwd} = '';
1853 wakaba 1.1 ## reconsume
1854    
1855     $self->{ct}->{quirks} = 1;
1856     !!!emit ($self->{ct}); # DOCTYPE
1857    
1858     redo A;
1859     } elsif ($self->{nc} == 0x0050 or # P
1860     $self->{nc} == 0x0070) { # p
1861     $self->{state} = PUBLIC_STATE;
1862     $self->{s_kwd} = chr $self->{nc};
1863     !!!next-input-character;
1864     redo A;
1865     } elsif ($self->{nc} == 0x0053 or # S
1866     $self->{nc} == 0x0073) { # s
1867     $self->{state} = SYSTEM_STATE;
1868     $self->{s_kwd} = chr $self->{nc};
1869     !!!next-input-character;
1870     redo A;
1871     } else {
1872     !!!cp (180);
1873     !!!parse-error (type => 'string after DOCTYPE name');
1874     $self->{ct}->{quirks} = 1;
1875    
1876     $self->{state} = BOGUS_DOCTYPE_STATE;
1877     !!!next-input-character;
1878     redo A;
1879     }
1880     } elsif ($self->{state} == PUBLIC_STATE) {
1881     ## ASCII case-insensitive
1882     if ($self->{nc} == [
1883     undef,
1884     0x0055, # U
1885     0x0042, # B
1886     0x004C, # L
1887     0x0049, # I
1888     ]->[length $self->{s_kwd}] or
1889     $self->{nc} == [
1890     undef,
1891     0x0075, # u
1892     0x0062, # b
1893     0x006C, # l
1894     0x0069, # i
1895     ]->[length $self->{s_kwd}]) {
1896     !!!cp (175);
1897     ## Stay in the state.
1898     $self->{s_kwd} .= chr $self->{nc};
1899     !!!next-input-character;
1900     redo A;
1901     } elsif ((length $self->{s_kwd}) == 5 and
1902     ($self->{nc} == 0x0043 or # C
1903     $self->{nc} == 0x0063)) { # c
1904     !!!cp (168);
1905     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1906     !!!next-input-character;
1907     redo A;
1908     } else {
1909     !!!cp (169);
1910     !!!parse-error (type => 'string after DOCTYPE name',
1911     line => $self->{line_prev},
1912     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1913     $self->{ct}->{quirks} = 1;
1914    
1915     $self->{state} = BOGUS_DOCTYPE_STATE;
1916     ## Reconsume.
1917     redo A;
1918     }
1919     } elsif ($self->{state} == SYSTEM_STATE) {
1920     ## ASCII case-insensitive
1921     if ($self->{nc} == [
1922     undef,
1923     0x0059, # Y
1924     0x0053, # S
1925     0x0054, # T
1926     0x0045, # E
1927     ]->[length $self->{s_kwd}] or
1928     $self->{nc} == [
1929     undef,
1930     0x0079, # y
1931     0x0073, # s
1932     0x0074, # t
1933     0x0065, # e
1934     ]->[length $self->{s_kwd}]) {
1935     !!!cp (170);
1936     ## Stay in the state.
1937     $self->{s_kwd} .= chr $self->{nc};
1938     !!!next-input-character;
1939     redo A;
1940     } elsif ((length $self->{s_kwd}) == 5 and
1941     ($self->{nc} == 0x004D or # M
1942     $self->{nc} == 0x006D)) { # m
1943     !!!cp (171);
1944     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1945     !!!next-input-character;
1946     redo A;
1947     } else {
1948     !!!cp (172);
1949     !!!parse-error (type => 'string after DOCTYPE name',
1950     line => $self->{line_prev},
1951     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1952     $self->{ct}->{quirks} = 1;
1953    
1954     $self->{state} = BOGUS_DOCTYPE_STATE;
1955     ## Reconsume.
1956     redo A;
1957     }
1958     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1959     if ($is_space->{$self->{nc}}) {
1960     !!!cp (181);
1961     ## Stay in the state
1962     !!!next-input-character;
1963     redo A;
1964     } elsif ($self->{nc} eq 0x0022) { # "
1965     !!!cp (182);
1966     $self->{ct}->{pubid} = ''; # DOCTYPE
1967     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1968     !!!next-input-character;
1969     redo A;
1970     } elsif ($self->{nc} eq 0x0027) { # '
1971     !!!cp (183);
1972     $self->{ct}->{pubid} = ''; # DOCTYPE
1973     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1974     !!!next-input-character;
1975     redo A;
1976     } elsif ($self->{nc} eq 0x003E) { # >
1977     !!!cp (184);
1978     !!!parse-error (type => 'no PUBLIC literal');
1979    
1980     $self->{state} = DATA_STATE;
1981 wakaba 1.5 $self->{s_kwd} = '';
1982 wakaba 1.1 !!!next-input-character;
1983    
1984     $self->{ct}->{quirks} = 1;
1985     !!!emit ($self->{ct}); # DOCTYPE
1986    
1987     redo A;
1988     } elsif ($self->{nc} == -1) {
1989     !!!cp (185);
1990     !!!parse-error (type => 'unclosed DOCTYPE');
1991    
1992     $self->{state} = DATA_STATE;
1993 wakaba 1.5 $self->{s_kwd} = '';
1994 wakaba 1.1 ## reconsume
1995    
1996     $self->{ct}->{quirks} = 1;
1997     !!!emit ($self->{ct}); # DOCTYPE
1998    
1999     redo A;
2000     } else {
2001     !!!cp (186);
2002     !!!parse-error (type => 'string after PUBLIC');
2003     $self->{ct}->{quirks} = 1;
2004    
2005     $self->{state} = BOGUS_DOCTYPE_STATE;
2006     !!!next-input-character;
2007     redo A;
2008     }
2009     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2010     if ($self->{nc} == 0x0022) { # "
2011     !!!cp (187);
2012     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2013     !!!next-input-character;
2014     redo A;
2015     } elsif ($self->{nc} == 0x003E) { # >
2016     !!!cp (188);
2017     !!!parse-error (type => 'unclosed PUBLIC literal');
2018    
2019     $self->{state} = DATA_STATE;
2020 wakaba 1.5 $self->{s_kwd} = '';
2021 wakaba 1.1 !!!next-input-character;
2022    
2023     $self->{ct}->{quirks} = 1;
2024     !!!emit ($self->{ct}); # DOCTYPE
2025    
2026     redo A;
2027     } elsif ($self->{nc} == -1) {
2028     !!!cp (189);
2029     !!!parse-error (type => 'unclosed PUBLIC literal');
2030    
2031     $self->{state} = DATA_STATE;
2032 wakaba 1.5 $self->{s_kwd} = '';
2033 wakaba 1.1 ## reconsume
2034    
2035     $self->{ct}->{quirks} = 1;
2036     !!!emit ($self->{ct}); # DOCTYPE
2037    
2038     redo A;
2039     } else {
2040     !!!cp (190);
2041     $self->{ct}->{pubid} # DOCTYPE
2042     .= chr $self->{nc};
2043     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2044     length $self->{ct}->{pubid});
2045    
2046     ## Stay in the state
2047     !!!next-input-character;
2048     redo A;
2049     }
2050     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2051     if ($self->{nc} == 0x0027) { # '
2052     !!!cp (191);
2053     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2054     !!!next-input-character;
2055     redo A;
2056     } elsif ($self->{nc} == 0x003E) { # >
2057     !!!cp (192);
2058     !!!parse-error (type => 'unclosed PUBLIC literal');
2059    
2060     $self->{state} = DATA_STATE;
2061 wakaba 1.5 $self->{s_kwd} = '';
2062 wakaba 1.1 !!!next-input-character;
2063    
2064     $self->{ct}->{quirks} = 1;
2065     !!!emit ($self->{ct}); # DOCTYPE
2066    
2067     redo A;
2068     } elsif ($self->{nc} == -1) {
2069     !!!cp (193);
2070     !!!parse-error (type => 'unclosed PUBLIC literal');
2071    
2072     $self->{state} = DATA_STATE;
2073 wakaba 1.5 $self->{s_kwd} = '';
2074 wakaba 1.1 ## reconsume
2075    
2076     $self->{ct}->{quirks} = 1;
2077     !!!emit ($self->{ct}); # DOCTYPE
2078    
2079     redo A;
2080     } else {
2081     !!!cp (194);
2082     $self->{ct}->{pubid} # DOCTYPE
2083     .= chr $self->{nc};
2084     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2085     length $self->{ct}->{pubid});
2086    
2087     ## Stay in the state
2088     !!!next-input-character;
2089     redo A;
2090     }
2091     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2092     if ($is_space->{$self->{nc}}) {
2093     !!!cp (195);
2094     ## Stay in the state
2095     !!!next-input-character;
2096     redo A;
2097     } elsif ($self->{nc} == 0x0022) { # "
2098     !!!cp (196);
2099     $self->{ct}->{sysid} = ''; # DOCTYPE
2100     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2101     !!!next-input-character;
2102     redo A;
2103     } elsif ($self->{nc} == 0x0027) { # '
2104     !!!cp (197);
2105     $self->{ct}->{sysid} = ''; # DOCTYPE
2106     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2107     !!!next-input-character;
2108     redo A;
2109     } elsif ($self->{nc} == 0x003E) { # >
2110     !!!cp (198);
2111     $self->{state} = DATA_STATE;
2112 wakaba 1.5 $self->{s_kwd} = '';
2113 wakaba 1.1 !!!next-input-character;
2114    
2115     !!!emit ($self->{ct}); # DOCTYPE
2116    
2117     redo A;
2118     } elsif ($self->{nc} == -1) {
2119     !!!cp (199);
2120     !!!parse-error (type => 'unclosed DOCTYPE');
2121    
2122     $self->{state} = DATA_STATE;
2123 wakaba 1.5 $self->{s_kwd} = '';
2124 wakaba 1.1 ## reconsume
2125    
2126     $self->{ct}->{quirks} = 1;
2127     !!!emit ($self->{ct}); # DOCTYPE
2128    
2129     redo A;
2130     } else {
2131     !!!cp (200);
2132     !!!parse-error (type => 'string after PUBLIC literal');
2133     $self->{ct}->{quirks} = 1;
2134    
2135     $self->{state} = BOGUS_DOCTYPE_STATE;
2136     !!!next-input-character;
2137     redo A;
2138     }
2139     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2140     if ($is_space->{$self->{nc}}) {
2141     !!!cp (201);
2142     ## Stay in the state
2143     !!!next-input-character;
2144     redo A;
2145     } elsif ($self->{nc} == 0x0022) { # "
2146     !!!cp (202);
2147     $self->{ct}->{sysid} = ''; # DOCTYPE
2148     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2149     !!!next-input-character;
2150     redo A;
2151     } elsif ($self->{nc} == 0x0027) { # '
2152     !!!cp (203);
2153     $self->{ct}->{sysid} = ''; # DOCTYPE
2154     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2155     !!!next-input-character;
2156     redo A;
2157     } elsif ($self->{nc} == 0x003E) { # >
2158     !!!cp (204);
2159     !!!parse-error (type => 'no SYSTEM literal');
2160     $self->{state} = DATA_STATE;
2161 wakaba 1.5 $self->{s_kwd} = '';
2162 wakaba 1.1 !!!next-input-character;
2163    
2164     $self->{ct}->{quirks} = 1;
2165     !!!emit ($self->{ct}); # DOCTYPE
2166    
2167     redo A;
2168     } elsif ($self->{nc} == -1) {
2169     !!!cp (205);
2170     !!!parse-error (type => 'unclosed DOCTYPE');
2171    
2172     $self->{state} = DATA_STATE;
2173 wakaba 1.5 $self->{s_kwd} = '';
2174 wakaba 1.1 ## reconsume
2175    
2176     $self->{ct}->{quirks} = 1;
2177     !!!emit ($self->{ct}); # DOCTYPE
2178    
2179     redo A;
2180     } else {
2181     !!!cp (206);
2182     !!!parse-error (type => 'string after SYSTEM');
2183     $self->{ct}->{quirks} = 1;
2184    
2185     $self->{state} = BOGUS_DOCTYPE_STATE;
2186     !!!next-input-character;
2187     redo A;
2188     }
2189     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2190     if ($self->{nc} == 0x0022) { # "
2191     !!!cp (207);
2192     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2193     !!!next-input-character;
2194     redo A;
2195     } elsif ($self->{nc} == 0x003E) { # >
2196     !!!cp (208);
2197     !!!parse-error (type => 'unclosed SYSTEM literal');
2198    
2199     $self->{state} = DATA_STATE;
2200 wakaba 1.5 $self->{s_kwd} = '';
2201 wakaba 1.1 !!!next-input-character;
2202    
2203     $self->{ct}->{quirks} = 1;
2204     !!!emit ($self->{ct}); # DOCTYPE
2205    
2206     redo A;
2207     } elsif ($self->{nc} == -1) {
2208     !!!cp (209);
2209     !!!parse-error (type => 'unclosed SYSTEM literal');
2210    
2211     $self->{state} = DATA_STATE;
2212 wakaba 1.5 $self->{s_kwd} = '';
2213 wakaba 1.1 ## reconsume
2214    
2215     $self->{ct}->{quirks} = 1;
2216     !!!emit ($self->{ct}); # DOCTYPE
2217    
2218     redo A;
2219     } else {
2220     !!!cp (210);
2221     $self->{ct}->{sysid} # DOCTYPE
2222     .= chr $self->{nc};
2223     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2224     length $self->{ct}->{sysid});
2225    
2226     ## Stay in the state
2227     !!!next-input-character;
2228     redo A;
2229     }
2230     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2231     if ($self->{nc} == 0x0027) { # '
2232     !!!cp (211);
2233     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2234     !!!next-input-character;
2235     redo A;
2236     } elsif ($self->{nc} == 0x003E) { # >
2237     !!!cp (212);
2238     !!!parse-error (type => 'unclosed SYSTEM literal');
2239    
2240     $self->{state} = DATA_STATE;
2241 wakaba 1.5 $self->{s_kwd} = '';
2242 wakaba 1.1 !!!next-input-character;
2243    
2244     $self->{ct}->{quirks} = 1;
2245     !!!emit ($self->{ct}); # DOCTYPE
2246    
2247     redo A;
2248     } elsif ($self->{nc} == -1) {
2249     !!!cp (213);
2250     !!!parse-error (type => 'unclosed SYSTEM literal');
2251    
2252     $self->{state} = DATA_STATE;
2253 wakaba 1.5 $self->{s_kwd} = '';
2254 wakaba 1.1 ## reconsume
2255    
2256     $self->{ct}->{quirks} = 1;
2257     !!!emit ($self->{ct}); # DOCTYPE
2258    
2259     redo A;
2260     } else {
2261     !!!cp (214);
2262     $self->{ct}->{sysid} # DOCTYPE
2263     .= chr $self->{nc};
2264     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2265     length $self->{ct}->{sysid});
2266    
2267     ## Stay in the state
2268     !!!next-input-character;
2269     redo A;
2270     }
2271     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2272     if ($is_space->{$self->{nc}}) {
2273     !!!cp (215);
2274     ## Stay in the state
2275     !!!next-input-character;
2276     redo A;
2277     } elsif ($self->{nc} == 0x003E) { # >
2278     !!!cp (216);
2279     $self->{state} = DATA_STATE;
2280 wakaba 1.5 $self->{s_kwd} = '';
2281 wakaba 1.1 !!!next-input-character;
2282    
2283     !!!emit ($self->{ct}); # DOCTYPE
2284    
2285     redo A;
2286     } elsif ($self->{nc} == -1) {
2287     !!!cp (217);
2288     !!!parse-error (type => 'unclosed DOCTYPE');
2289     $self->{state} = DATA_STATE;
2290 wakaba 1.5 $self->{s_kwd} = '';
2291 wakaba 1.1 ## reconsume
2292    
2293     $self->{ct}->{quirks} = 1;
2294     !!!emit ($self->{ct}); # DOCTYPE
2295    
2296     redo A;
2297     } else {
2298     !!!cp (218);
2299     !!!parse-error (type => 'string after SYSTEM literal');
2300     #$self->{ct}->{quirks} = 1;
2301    
2302     $self->{state} = BOGUS_DOCTYPE_STATE;
2303     !!!next-input-character;
2304     redo A;
2305     }
2306     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2307     if ($self->{nc} == 0x003E) { # >
2308     !!!cp (219);
2309     $self->{state} = DATA_STATE;
2310 wakaba 1.5 $self->{s_kwd} = '';
2311 wakaba 1.1 !!!next-input-character;
2312    
2313     !!!emit ($self->{ct}); # DOCTYPE
2314    
2315     redo A;
2316     } elsif ($self->{nc} == -1) {
2317     !!!cp (220);
2318     $self->{state} = DATA_STATE;
2319 wakaba 1.5 $self->{s_kwd} = '';
2320 wakaba 1.1 ## reconsume
2321    
2322     !!!emit ($self->{ct}); # DOCTYPE
2323    
2324     redo A;
2325     } else {
2326     !!!cp (221);
2327     my $s = '';
2328     $self->{read_until}->($s, q[>], 0);
2329    
2330     ## Stay in the state
2331     !!!next-input-character;
2332     redo A;
2333     }
2334     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2335     ## NOTE: "CDATA section state" in the spec is jointly implemented
2336     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2337     ## and |CDATA_SECTION_MSE2_STATE|.
2338    
2339     if ($self->{nc} == 0x005D) { # ]
2340     !!!cp (221.1);
2341     $self->{state} = CDATA_SECTION_MSE1_STATE;
2342     !!!next-input-character;
2343     redo A;
2344     } elsif ($self->{nc} == -1) {
2345 wakaba 1.6 if ($self->{is_xml}) {
2346     !!!parse-error (type => 'no mse'); ## TODO: type
2347     }
2348    
2349 wakaba 1.1 $self->{state} = DATA_STATE;
2350 wakaba 1.5 $self->{s_kwd} = '';
2351 wakaba 1.1 !!!next-input-character;
2352     if (length $self->{ct}->{data}) { # character
2353     !!!cp (221.2);
2354     !!!emit ($self->{ct}); # character
2355     } else {
2356     !!!cp (221.3);
2357     ## No token to emit. $self->{ct} is discarded.
2358     }
2359     redo A;
2360     } else {
2361     !!!cp (221.4);
2362     $self->{ct}->{data} .= chr $self->{nc};
2363     $self->{read_until}->($self->{ct}->{data},
2364     q<]>,
2365     length $self->{ct}->{data});
2366    
2367     ## Stay in the state.
2368     !!!next-input-character;
2369     redo A;
2370     }
2371    
2372     ## ISSUE: "text tokens" in spec.
2373     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2374     if ($self->{nc} == 0x005D) { # ]
2375     !!!cp (221.5);
2376     $self->{state} = CDATA_SECTION_MSE2_STATE;
2377     !!!next-input-character;
2378     redo A;
2379     } else {
2380     !!!cp (221.6);
2381     $self->{ct}->{data} .= ']';
2382     $self->{state} = CDATA_SECTION_STATE;
2383     ## Reconsume.
2384     redo A;
2385     }
2386     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2387     if ($self->{nc} == 0x003E) { # >
2388     $self->{state} = DATA_STATE;
2389 wakaba 1.5 $self->{s_kwd} = '';
2390 wakaba 1.1 !!!next-input-character;
2391     if (length $self->{ct}->{data}) { # character
2392     !!!cp (221.7);
2393     !!!emit ($self->{ct}); # character
2394     } else {
2395     !!!cp (221.8);
2396     ## No token to emit. $self->{ct} is discarded.
2397     }
2398     redo A;
2399     } elsif ($self->{nc} == 0x005D) { # ]
2400     !!!cp (221.9); # character
2401     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2402     ## Stay in the state.
2403     !!!next-input-character;
2404     redo A;
2405     } else {
2406     !!!cp (221.11);
2407     $self->{ct}->{data} .= ']]'; # character
2408     $self->{state} = CDATA_SECTION_STATE;
2409     ## Reconsume.
2410     redo A;
2411     }
2412     } elsif ($self->{state} == ENTITY_STATE) {
2413     if ($is_space->{$self->{nc}} or
2414     {
2415     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2416     $self->{entity_add} => 1,
2417     }->{$self->{nc}}) {
2418     !!!cp (1001);
2419     ## Don't consume
2420     ## No error
2421     ## Return nothing.
2422     #
2423     } elsif ($self->{nc} == 0x0023) { # #
2424     !!!cp (999);
2425     $self->{state} = ENTITY_HASH_STATE;
2426     $self->{s_kwd} = '#';
2427     !!!next-input-character;
2428     redo A;
2429     } elsif ((0x0041 <= $self->{nc} and
2430     $self->{nc} <= 0x005A) or # A..Z
2431     (0x0061 <= $self->{nc} and
2432     $self->{nc} <= 0x007A)) { # a..z
2433     !!!cp (998);
2434     require Whatpm::_NamedEntityList;
2435     $self->{state} = ENTITY_NAME_STATE;
2436     $self->{s_kwd} = chr $self->{nc};
2437     $self->{entity__value} = $self->{s_kwd};
2438     $self->{entity__match} = 0;
2439     !!!next-input-character;
2440     redo A;
2441     } else {
2442     !!!cp (1027);
2443     !!!parse-error (type => 'bare ero');
2444     ## Return nothing.
2445     #
2446     }
2447    
2448     ## NOTE: No character is consumed by the "consume a character
2449     ## reference" algorithm. In other words, there is an "&" character
2450     ## that does not introduce a character reference, which will be
2451     ## appended to the parent element or the attribute value in later
2452     ## processing by the tokenizer.
2453    
2454     if ($self->{prev_state} == DATA_STATE) {
2455     !!!cp (997);
2456     $self->{state} = $self->{prev_state};
2457 wakaba 1.5 $self->{s_kwd} = '';
2458 wakaba 1.1 ## Reconsume.
2459     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2460     line => $self->{line_prev},
2461     column => $self->{column_prev},
2462     });
2463     redo A;
2464     } else {
2465     !!!cp (996);
2466     $self->{ca}->{value} .= '&';
2467     $self->{state} = $self->{prev_state};
2468 wakaba 1.5 $self->{s_kwd} = '';
2469 wakaba 1.1 ## Reconsume.
2470     redo A;
2471     }
2472     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2473     if ($self->{nc} == 0x0078 or # x
2474     $self->{nc} == 0x0058) { # X
2475     !!!cp (995);
2476     $self->{state} = HEXREF_X_STATE;
2477     $self->{s_kwd} .= chr $self->{nc};
2478     !!!next-input-character;
2479     redo A;
2480     } elsif (0x0030 <= $self->{nc} and
2481     $self->{nc} <= 0x0039) { # 0..9
2482     !!!cp (994);
2483     $self->{state} = NCR_NUM_STATE;
2484     $self->{s_kwd} = $self->{nc} - 0x0030;
2485     !!!next-input-character;
2486     redo A;
2487     } else {
2488     !!!parse-error (type => 'bare nero',
2489     line => $self->{line_prev},
2490     column => $self->{column_prev} - 1);
2491    
2492     ## NOTE: According to the spec algorithm, nothing is returned,
2493     ## and then "&#" is appended to the parent element or the attribute
2494     ## value in the later processing.
2495    
2496     if ($self->{prev_state} == DATA_STATE) {
2497     !!!cp (1019);
2498     $self->{state} = $self->{prev_state};
2499 wakaba 1.5 $self->{s_kwd} = '';
2500 wakaba 1.1 ## Reconsume.
2501     !!!emit ({type => CHARACTER_TOKEN,
2502     data => '&#',
2503     line => $self->{line_prev},
2504     column => $self->{column_prev} - 1,
2505     });
2506     redo A;
2507     } else {
2508     !!!cp (993);
2509     $self->{ca}->{value} .= '&#';
2510     $self->{state} = $self->{prev_state};
2511 wakaba 1.5 $self->{s_kwd} = '';
2512 wakaba 1.1 ## Reconsume.
2513     redo A;
2514     }
2515     }
2516     } elsif ($self->{state} == NCR_NUM_STATE) {
2517     if (0x0030 <= $self->{nc} and
2518     $self->{nc} <= 0x0039) { # 0..9
2519     !!!cp (1012);
2520     $self->{s_kwd} *= 10;
2521     $self->{s_kwd} += $self->{nc} - 0x0030;
2522    
2523     ## Stay in the state.
2524     !!!next-input-character;
2525     redo A;
2526     } elsif ($self->{nc} == 0x003B) { # ;
2527     !!!cp (1013);
2528     !!!next-input-character;
2529     #
2530     } else {
2531     !!!cp (1014);
2532     !!!parse-error (type => 'no refc');
2533     ## Reconsume.
2534     #
2535     }
2536    
2537     my $code = $self->{s_kwd};
2538     my $l = $self->{line_prev};
2539     my $c = $self->{column_prev};
2540     if ($charref_map->{$code}) {
2541     !!!cp (1015);
2542     !!!parse-error (type => 'invalid character reference',
2543     text => (sprintf 'U+%04X', $code),
2544     line => $l, column => $c);
2545     $code = $charref_map->{$code};
2546     } elsif ($code > 0x10FFFF) {
2547     !!!cp (1016);
2548     !!!parse-error (type => 'invalid character reference',
2549     text => (sprintf 'U-%08X', $code),
2550     line => $l, column => $c);
2551     $code = 0xFFFD;
2552     }
2553    
2554     if ($self->{prev_state} == DATA_STATE) {
2555     !!!cp (992);
2556     $self->{state} = $self->{prev_state};
2557 wakaba 1.5 $self->{s_kwd} = '';
2558 wakaba 1.1 ## Reconsume.
2559     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2560 wakaba 1.7 has_reference => 1,
2561 wakaba 1.1 line => $l, column => $c,
2562     });
2563     redo A;
2564     } else {
2565     !!!cp (991);
2566     $self->{ca}->{value} .= chr $code;
2567     $self->{ca}->{has_reference} = 1;
2568     $self->{state} = $self->{prev_state};
2569 wakaba 1.5 $self->{s_kwd} = '';
2570 wakaba 1.1 ## Reconsume.
2571     redo A;
2572     }
2573     } elsif ($self->{state} == HEXREF_X_STATE) {
2574     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2575     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2576     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2577     # 0..9, A..F, a..f
2578     !!!cp (990);
2579     $self->{state} = HEXREF_HEX_STATE;
2580     $self->{s_kwd} = 0;
2581     ## Reconsume.
2582     redo A;
2583     } else {
2584     !!!parse-error (type => 'bare hcro',
2585     line => $self->{line_prev},
2586     column => $self->{column_prev} - 2);
2587    
2588     ## NOTE: According to the spec algorithm, nothing is returned,
2589     ## and then "&#" followed by "X" or "x" is appended to the parent
2590     ## element or the attribute value in the later processing.
2591    
2592     if ($self->{prev_state} == DATA_STATE) {
2593     !!!cp (1005);
2594     $self->{state} = $self->{prev_state};
2595 wakaba 1.5 $self->{s_kwd} = '';
2596 wakaba 1.1 ## Reconsume.
2597     !!!emit ({type => CHARACTER_TOKEN,
2598     data => '&' . $self->{s_kwd},
2599     line => $self->{line_prev},
2600     column => $self->{column_prev} - length $self->{s_kwd},
2601     });
2602     redo A;
2603     } else {
2604     !!!cp (989);
2605     $self->{ca}->{value} .= '&' . $self->{s_kwd};
2606     $self->{state} = $self->{prev_state};
2607 wakaba 1.5 $self->{s_kwd} = '';
2608 wakaba 1.1 ## Reconsume.
2609     redo A;
2610     }
2611     }
2612     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2613     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2614     # 0..9
2615     !!!cp (1002);
2616     $self->{s_kwd} *= 0x10;
2617     $self->{s_kwd} += $self->{nc} - 0x0030;
2618     ## Stay in the state.
2619     !!!next-input-character;
2620     redo A;
2621     } elsif (0x0061 <= $self->{nc} and
2622     $self->{nc} <= 0x0066) { # a..f
2623     !!!cp (1003);
2624     $self->{s_kwd} *= 0x10;
2625     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2626     ## Stay in the state.
2627     !!!next-input-character;
2628     redo A;
2629     } elsif (0x0041 <= $self->{nc} and
2630     $self->{nc} <= 0x0046) { # A..F
2631     !!!cp (1004);
2632     $self->{s_kwd} *= 0x10;
2633     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2634     ## Stay in the state.
2635     !!!next-input-character;
2636     redo A;
2637     } elsif ($self->{nc} == 0x003B) { # ;
2638     !!!cp (1006);
2639     !!!next-input-character;
2640     #
2641     } else {
2642     !!!cp (1007);
2643     !!!parse-error (type => 'no refc',
2644     line => $self->{line},
2645     column => $self->{column});
2646     ## Reconsume.
2647     #
2648     }
2649    
2650     my $code = $self->{s_kwd};
2651     my $l = $self->{line_prev};
2652     my $c = $self->{column_prev};
2653     if ($charref_map->{$code}) {
2654     !!!cp (1008);
2655     !!!parse-error (type => 'invalid character reference',
2656     text => (sprintf 'U+%04X', $code),
2657     line => $l, column => $c);
2658     $code = $charref_map->{$code};
2659     } elsif ($code > 0x10FFFF) {
2660     !!!cp (1009);
2661     !!!parse-error (type => 'invalid character reference',
2662     text => (sprintf 'U-%08X', $code),
2663     line => $l, column => $c);
2664     $code = 0xFFFD;
2665     }
2666    
2667     if ($self->{prev_state} == DATA_STATE) {
2668     !!!cp (988);
2669     $self->{state} = $self->{prev_state};
2670 wakaba 1.5 $self->{s_kwd} = '';
2671 wakaba 1.1 ## Reconsume.
2672     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2673 wakaba 1.7 has_reference => 1,
2674 wakaba 1.1 line => $l, column => $c,
2675     });
2676     redo A;
2677     } else {
2678     !!!cp (987);
2679     $self->{ca}->{value} .= chr $code;
2680     $self->{ca}->{has_reference} = 1;
2681     $self->{state} = $self->{prev_state};
2682 wakaba 1.5 $self->{s_kwd} = '';
2683 wakaba 1.1 ## Reconsume.
2684     redo A;
2685     }
2686     } elsif ($self->{state} == ENTITY_NAME_STATE) {
2687     if (length $self->{s_kwd} < 30 and
2688     ## NOTE: Some number greater than the maximum length of entity name
2689     ((0x0041 <= $self->{nc} and # a
2690     $self->{nc} <= 0x005A) or # x
2691     (0x0061 <= $self->{nc} and # a
2692     $self->{nc} <= 0x007A) or # z
2693     (0x0030 <= $self->{nc} and # 0
2694     $self->{nc} <= 0x0039) or # 9
2695     $self->{nc} == 0x003B)) { # ;
2696     our $EntityChar;
2697     $self->{s_kwd} .= chr $self->{nc};
2698     if (defined $EntityChar->{$self->{s_kwd}}) {
2699     if ($self->{nc} == 0x003B) { # ;
2700     !!!cp (1020);
2701     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2702     $self->{entity__match} = 1;
2703     !!!next-input-character;
2704     #
2705     } else {
2706     !!!cp (1021);
2707     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2708     $self->{entity__match} = -1;
2709     ## Stay in the state.
2710     !!!next-input-character;
2711     redo A;
2712     }
2713     } else {
2714     !!!cp (1022);
2715     $self->{entity__value} .= chr $self->{nc};
2716     $self->{entity__match} *= 2;
2717     ## Stay in the state.
2718     !!!next-input-character;
2719     redo A;
2720     }
2721     }
2722    
2723     my $data;
2724     my $has_ref;
2725     if ($self->{entity__match} > 0) {
2726     !!!cp (1023);
2727     $data = $self->{entity__value};
2728     $has_ref = 1;
2729     #
2730     } elsif ($self->{entity__match} < 0) {
2731     !!!parse-error (type => 'no refc');
2732     if ($self->{prev_state} != DATA_STATE and # in attribute
2733     $self->{entity__match} < -1) {
2734     !!!cp (1024);
2735     $data = '&' . $self->{s_kwd};
2736     #
2737     } else {
2738     !!!cp (1025);
2739     $data = $self->{entity__value};
2740     $has_ref = 1;
2741     #
2742     }
2743     } else {
2744     !!!cp (1026);
2745     !!!parse-error (type => 'bare ero',
2746     line => $self->{line_prev},
2747     column => $self->{column_prev} - length $self->{s_kwd});
2748     $data = '&' . $self->{s_kwd};
2749     #
2750     }
2751    
2752     ## NOTE: In these cases, when a character reference is found,
2753     ## it is consumed and a character token is returned, or, otherwise,
2754     ## nothing is consumed and returned, according to the spec algorithm.
2755     ## In this implementation, anything that has been examined by the
2756     ## tokenizer is appended to the parent element or the attribute value
2757     ## as string, either literal string when no character reference or
2758     ## entity-replaced string otherwise, in this stage, since any characters
2759     ## that would not be consumed are appended in the data state or in an
2760     ## appropriate attribute value state anyway.
2761    
2762     if ($self->{prev_state} == DATA_STATE) {
2763     !!!cp (986);
2764     $self->{state} = $self->{prev_state};
2765 wakaba 1.5 $self->{s_kwd} = '';
2766 wakaba 1.1 ## Reconsume.
2767     !!!emit ({type => CHARACTER_TOKEN,
2768     data => $data,
2769 wakaba 1.7 has_reference => $has_ref,
2770 wakaba 1.1 line => $self->{line_prev},
2771     column => $self->{column_prev} + 1 - length $self->{s_kwd},
2772     });
2773     redo A;
2774     } else {
2775     !!!cp (985);
2776     $self->{ca}->{value} .= $data;
2777     $self->{ca}->{has_reference} = 1 if $has_ref;
2778     $self->{state} = $self->{prev_state};
2779 wakaba 1.5 $self->{s_kwd} = '';
2780 wakaba 1.1 ## Reconsume.
2781     redo A;
2782     }
2783     } else {
2784     die "$0: $self->{state}: Unknown state";
2785     }
2786     } # A
2787    
2788     die "$0: _get_next_token: unexpected case";
2789     } # _get_next_token
2790    
2791     1;
2792 wakaba 1.7 ## $Date: 2008/10/14 14:57:52 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24