/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (show annotations) (download) (as text)
Tue Oct 14 14:38:59 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.4: +109 -9 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	14 Oct 2008 14:21:51 -0000
	* XML-Parser.t: "xml/texts-1.dat" added.

	* tokenizer-test-2.dat: Test for ]]> are added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	14 Oct 2008 14:38:34 -0000
	* doctypes-1.dat: Wrong results fixed.

	* texts-1.dat: New test data file.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 14:22:43 -0000
	* Tokenizer.pm.src: Raise a parse error for XML "]]>" other than
	CDATA section end.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.4 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 ## The version string above is built from the digits of the CVS revision keyword: "1.4" yields "1.04".
5 BEGIN { # runs at compile time, so Exporter is wired up before the import further down this file is compiled
6 require Exporter;
7 push our @ISA, 'Exporter';
8 ## Only @EXPORT_OK is populated in this block, so a constant must be requested by name or through the tag below.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 );
19 ## The ':token' tag bundles every token-type constant listed in @EXPORT_OK.
20 our %EXPORT_TAGS = (
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 }
33
34 ## Token types: the values stored in the |type| field of a token (see "A token has:" further down).
35 ## Each is declared with an empty prototype and a constant body, so Perl can inline it as a constant.
36 sub DOCTYPE_TOKEN () { 1 } # carries |name|, |pubid|, |sysid| and |quirks|
37 sub COMMENT_TOKEN () { 2 } # carries |data|
38 sub START_TAG_TOKEN () { 3 } # carries |tag_name| and |attributes|
39 sub END_TAG_TOKEN () { 4 } # carries |tag_name| and |attributes|
40 sub END_OF_FILE_TOKEN () { 5 } # emitted when the next input character is -1
41 sub CHARACTER_TOKEN () { 6 } # carries |data|
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
45 package Whatpm::HTML; # the definitions that follow in this listing are compiled into Whatpm::HTML
46 ## Import the token-type constants at compile time so that their bare names resolve in the code below.
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48
49 ## Content model flags: single bits that are OR-ed together to form the content models below.
50 ## The tokenizer tests them with a bitwise AND, e.g. |$self->{content_model} & CM_ENTITY|.
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54 ## Content models, as combinations of the CM_* bits above.
55 sub PLAINTEXT_CONTENT_MODEL () { 0 } # 0b000: no bit set
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # 0b010
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # 0b011
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # 0b101
59
60 ## Tokenizer states: the values stored in |$self->{state}| and compared with == in |_get_next_token|.
61 ## Values 1 and 12 are unused: those two spec states are commented out and handled by the ENTITY_* group below.
62 sub DATA_STATE () { 0 }
63 #sub ENTITY_DATA_STATE () { 1 }
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## Tree constructor state constants (see Whatpm::HTML for the full
118 ## list and descriptions)
119 ## NOTE(review): both constants below evaluate to the same number (bit 11, 2048); presumably they belong to separate flag sets - confirm in Whatpm::HTML.
120 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121 sub FOREIGN_EL () { 0b1_00000000000 }
122
123 ## Character reference mappings
124 ## Code point => replacement code point; presumably consulted when a numeric character reference is resolved (that code is not in view).
125 my $charref_map = {
126 0x0D => 0x000A, # CR is mapped to LF
127 0x80 => 0x20AC, # 0x80..0x9F: the targets appear to follow Windows-1252, with U+FFFD for the unassigned slots
128 0x81 => 0xFFFD,
129 0x82 => 0x201A,
130 0x83 => 0x0192,
131 0x84 => 0x201E,
132 0x85 => 0x2026,
133 0x86 => 0x2020,
134 0x87 => 0x2021,
135 0x88 => 0x02C6,
136 0x89 => 0x2030,
137 0x8A => 0x0160,
138 0x8B => 0x2039,
139 0x8C => 0x0152,
140 0x8D => 0xFFFD,
141 0x8E => 0x017D,
142 0x8F => 0xFFFD,
143 0x90 => 0xFFFD,
144 0x91 => 0x2018,
145 0x92 => 0x2019,
146 0x93 => 0x201C,
147 0x94 => 0x201D,
148 0x95 => 0x2022,
149 0x96 => 0x2013,
150 0x97 => 0x2014,
151 0x98 => 0x02DC,
152 0x99 => 0x2122,
153 0x9A => 0x0161,
154 0x9B => 0x203A,
155 0x9C => 0x0153,
156 0x9D => 0xFFFD,
157 0x9E => 0x017E,
158 0x9F => 0x0178,
159 }; # $charref_map
160 $charref_map->{$_} = 0xFFFD # every code point in the list below is mapped to U+FFFD
161 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (NOTE(review): the Unicode noncharacter block runs through U+FDEF, but this range stops at U+FDDF - confirm against the spec revision being implemented)
163 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168
169 ## Implementations MUST act as if they used the state machine in the spec.
170
171 sub _initialize_tokenizer ($) { # puts the tokenizer-related fields of the parser object into their initial values
172 my $self = shift;
173 ## Argument: the parser object, whose fields are reset in place.
174 ## NOTE: Fields set by |new| constructor:
175 #$self->{level}
176 #$self->{set_nc}
177 #$self->{parse_error}
178 #$self->{is_xml} (if XML)
179 ## Tokenizing starts in the data state with the PCDATA content model.
180 $self->{state} = DATA_STATE; # MUST
181 $self->{s_kwd} = ''; # state keyword
182 #$self->{entity__value}; # initialized when used
183 #$self->{entity__match}; # initialized when used
184 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185 undef $self->{ct}; # current token
186 undef $self->{ca}; # current attribute
187 undef $self->{last_stag_name}; # last emitted start tag name
188 #$self->{prev_state}; # initialized when used
189 delete $self->{self_closing}; # drop any pending "self-closing flag"
190 $self->{char_buffer} = ''; # presumably the not-yet-consumed input read by the directive below - confirm in the preprocessor
191 $self->{char_buffer_pos} = 0; # presumably the read position within |char_buffer| - confirm
192 $self->{nc} = -1; # next input character; -1 is the value the tokenizer treats as EOF
193 #$self->{next_nc} ## The next line is a preprocessor directive of this .pm.src template, not Perl; it presumably loads the first character into |nc|.
194 !!!next-input-character;
195 $self->{token} = []; # queue of already-built tokens; |_get_next_token| shifts from it before running the state machine
196 # $self->{escape} ## NOTE(review): |escape| is not assigned in this sub.
197 } # _initialize_tokenizer
198
199 ## A token has:
200 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
201 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
202 ## ->{name} (DOCTYPE_TOKEN)
203 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
204 ## ->{pubid} (DOCTYPE_TOKEN)
205 ## ->{sysid} (DOCTYPE_TOKEN)
206 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
207 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
208 ## ->{name}
209 ## ->{value}
210 ## ->{has_reference} == 1 or 0
211 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
212 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
213 ## The token's own |->{self_closing}| is used to save the value of |$self->{self_closing}|
214 ## while the token is pushed back to the stack.
215
216 ## Emitted token MUST immediately be handled by the tree construction state.
217
218 ## Before each step, UA MAY check to see if either one of the scripts in
219 ## "list of scripts that will execute as soon as possible" or the first
220 ## script in the "list of scripts that will execute asynchronously",
221 ## has completed loading. If one has, then it MUST be executed
222 ## and removed from the list.
223
224 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
225 ## (This requirement was dropped from HTML5 spec, unfortunately.)
226
227 my $is_space = { # code point => 1 for each character the tokenizer treats as white space (looked up as |$is_space->{$self->{nc}}|)
228 0x0009 => 1, # CHARACTER TABULATION (HT)
229 0x000A => 1, # LINE FEED (LF)
230 #0x000B => 0, # LINE TABULATION (VT) - deliberately not a space character
231 0x000C => 1, # FORM FEED (FF)
232 #0x000D => 1, # CARRIAGE RETURN (CR) - disabled; presumably CR never reaches the tokenizer (TODO confirm)
233 0x0020 => 1, # SPACE (SP)
234 };
235
236 sub _get_next_token ($) {
237 my $self = shift;
238
239 if ($self->{self_closing}) {
240 !!!parse-error (type => 'nestc', token => $self->{ct});
241 ## NOTE: The |self_closing| flag is only set by start tag token.
242 ## In addition, when a start tag token is emitted, it is always set to
243 ## |ct|.
244 delete $self->{self_closing};
245 }
246
247 if (@{$self->{token}}) {
248 $self->{self_closing} = $self->{token}->[0]->{self_closing};
249 return shift @{$self->{token}};
250 }
251
252 A: {
253 if ($self->{state} == PCDATA_STATE) {
254 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
255
256 if ($self->{nc} == 0x0026) { # &
257 !!!cp (0.1);
258 ## NOTE: In the spec, the tokenizer is switched to the
259 ## "entity data state". In this implementation, the tokenizer
260 ## is switched to the |ENTITY_STATE|, which is an implementation
261 ## of the "consume a character reference" algorithm.
262 $self->{entity_add} = -1;
263 $self->{prev_state} = DATA_STATE;
264 $self->{state} = ENTITY_STATE;
265 !!!next-input-character;
266 redo A;
267 } elsif ($self->{nc} == 0x003C) { # <
268 !!!cp (0.2);
269 $self->{state} = TAG_OPEN_STATE;
270 !!!next-input-character;
271 redo A;
272 } elsif ($self->{nc} == -1) {
273 !!!cp (0.3);
274 !!!emit ({type => END_OF_FILE_TOKEN,
275 line => $self->{line}, column => $self->{column}});
276 last A; ## TODO: ok?
277 } else {
278 !!!cp (0.4);
279 #
280 }
281
282 # Anything else
283 my $token = {type => CHARACTER_TOKEN,
284 data => chr $self->{nc},
285 line => $self->{line}, column => $self->{column},
286 };
287 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
288
289 ## Stay in the state.
290 !!!next-input-character;
291 !!!emit ($token);
292 redo A;
293 } elsif ($self->{state} == DATA_STATE) {
294 $self->{s_kwd} = '' unless defined $self->{s_kwd};
295 if ($self->{nc} == 0x0026) { # &
296 $self->{s_kwd} = '';
297 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
298 not $self->{escape}) {
299 !!!cp (1);
300 ## NOTE: In the spec, the tokenizer is switched to the
301 ## "entity data state". In this implementation, the tokenizer
302 ## is switched to the |ENTITY_STATE|, which is an implementation
303 ## of the "consume a character reference" algorithm.
304 $self->{entity_add} = -1;
305 $self->{prev_state} = DATA_STATE;
306 $self->{state} = ENTITY_STATE;
307 !!!next-input-character;
308 redo A;
309 } else {
310 !!!cp (2);
311 #
312 }
313 } elsif ($self->{nc} == 0x002D) { # -
314 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
315 if ($self->{s_kwd} eq '<!-') {
316 !!!cp (3);
317 $self->{escape} = 1; # unless $self->{escape};
318 $self->{s_kwd} = '--';
319 #
320 } elsif ($self->{s_kwd} eq '-') {
321 !!!cp (4);
322 $self->{s_kwd} = '--';
323 #
324 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
325 !!!cp (4.1);
326 $self->{s_kwd} .= '-';
327 #
328 } else {
329 !!!cp (5);
330 $self->{s_kwd} = '-';
331 #
332 }
333 }
334
335 #
336 } elsif ($self->{nc} == 0x0021) { # !
337 if (length $self->{s_kwd}) {
338 !!!cp (5.1);
339 $self->{s_kwd} .= '!';
340 #
341 } else {
342 !!!cp (5.2);
343 #$self->{s_kwd} = '';
344 #
345 }
346 #
347 } elsif ($self->{nc} == 0x003C) { # <
348 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
349 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
350 not $self->{escape})) {
351 !!!cp (6);
352 $self->{state} = TAG_OPEN_STATE;
353 !!!next-input-character;
354 redo A;
355 } else {
356 !!!cp (7);
357 $self->{s_kwd} = '';
358 #
359 }
360 } elsif ($self->{nc} == 0x003E) { # >
361 if ($self->{escape} and
362 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
363 if ($self->{s_kwd} eq '--') {
364 !!!cp (8);
365 delete $self->{escape};
366 #
367 } else {
368 !!!cp (9);
369 #
370 }
371 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
372 !!!cp (9.1);
373 !!!parse-error (type => 'unmatched mse', ## TODO: type
374 line => $self->{line_prev},
375 column => $self->{column_prev} - 1);
376 #
377 } else {
378 !!!cp (10);
379 #
380 }
381
382 $self->{s_kwd} = '';
383 #
384 } elsif ($self->{nc} == 0x005D) { # ]
385 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
386 !!!cp (10.1);
387 $self->{s_kwd} .= ']';
388 } elsif ($self->{s_kwd} eq ']]') {
389 !!!cp (10.2);
390 #
391 } else {
392 !!!cp (10.3);
393 $self->{s_kwd} = '';
394 }
395 #
396 } elsif ($self->{nc} == -1) {
397 !!!cp (11);
398 $self->{s_kwd} = '';
399 !!!emit ({type => END_OF_FILE_TOKEN,
400 line => $self->{line}, column => $self->{column}});
401 last A; ## TODO: ok?
402 } else {
403 !!!cp (12);
404 $self->{s_kwd} = '';
405 #
406 }
407
408 # Anything else
409 my $token = {type => CHARACTER_TOKEN,
410 data => chr $self->{nc},
411 line => $self->{line}, column => $self->{column},
412 };
413 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
414 length $token->{data})) {
415 $self->{s_kwd} = '';
416 }
417
418 ## Stay in the data state.
419 if (not $self->{is_xml} and
420 $self->{content_model} == PCDATA_CONTENT_MODEL) {
421 !!!cp (13);
422 $self->{state} = PCDATA_STATE;
423 } else {
424 !!!cp (14);
425 ## Stay in the state.
426 }
427 !!!next-input-character;
428 !!!emit ($token);
429 redo A;
430 } elsif ($self->{state} == TAG_OPEN_STATE) {
431 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
432 if ($self->{nc} == 0x002F) { # /
433 !!!cp (15);
434 !!!next-input-character;
435 $self->{state} = CLOSE_TAG_OPEN_STATE;
436 redo A;
437 } elsif ($self->{nc} == 0x0021) { # !
438 !!!cp (15.1);
439 $self->{s_kwd} = '<' unless $self->{escape};
440 #
441 } else {
442 !!!cp (16);
443 #
444 }
445
446 ## reconsume
447 $self->{state} = DATA_STATE;
448 $self->{s_kwd} = '';
449 !!!emit ({type => CHARACTER_TOKEN, data => '<',
450 line => $self->{line_prev},
451 column => $self->{column_prev},
452 });
453 redo A;
454 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
455 if ($self->{nc} == 0x0021) { # !
456 !!!cp (17);
457 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
458 !!!next-input-character;
459 redo A;
460 } elsif ($self->{nc} == 0x002F) { # /
461 !!!cp (18);
462 $self->{state} = CLOSE_TAG_OPEN_STATE;
463 !!!next-input-character;
464 redo A;
465 } elsif (0x0041 <= $self->{nc} and
466 $self->{nc} <= 0x005A) { # A..Z
467 !!!cp (19);
468 $self->{ct}
469 = {type => START_TAG_TOKEN,
470 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
471 line => $self->{line_prev},
472 column => $self->{column_prev}};
473 $self->{state} = TAG_NAME_STATE;
474 !!!next-input-character;
475 redo A;
476 } elsif (0x0061 <= $self->{nc} and
477 $self->{nc} <= 0x007A) { # a..z
478 !!!cp (20);
479 $self->{ct} = {type => START_TAG_TOKEN,
480 tag_name => chr ($self->{nc}),
481 line => $self->{line_prev},
482 column => $self->{column_prev}};
483 $self->{state} = TAG_NAME_STATE;
484 !!!next-input-character;
485 redo A;
486 } elsif ($self->{nc} == 0x003E) { # >
487 !!!cp (21);
488 !!!parse-error (type => 'empty start tag',
489 line => $self->{line_prev},
490 column => $self->{column_prev});
491 $self->{state} = DATA_STATE;
492 $self->{s_kwd} = '';
493 !!!next-input-character;
494
495 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
496 line => $self->{line_prev},
497 column => $self->{column_prev},
498 });
499
500 redo A;
501 } elsif ($self->{nc} == 0x003F) { # ?
502 !!!cp (22);
503 !!!parse-error (type => 'pio',
504 line => $self->{line_prev},
505 column => $self->{column_prev});
506 $self->{state} = BOGUS_COMMENT_STATE;
507 $self->{ct} = {type => COMMENT_TOKEN, data => '',
508 line => $self->{line_prev},
509 column => $self->{column_prev},
510 };
511 ## $self->{nc} is intentionally left as is
512 redo A;
513 } else {
514 !!!cp (23);
515 !!!parse-error (type => 'bare stago',
516 line => $self->{line_prev},
517 column => $self->{column_prev});
518 $self->{state} = DATA_STATE;
519 $self->{s_kwd} = '';
520 ## reconsume
521
522 !!!emit ({type => CHARACTER_TOKEN, data => '<',
523 line => $self->{line_prev},
524 column => $self->{column_prev},
525 });
526
527 redo A;
528 }
529 } else {
530 die "$0: $self->{content_model} in tag open";
531 }
532 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
533 ## NOTE: The "close tag open state" in the spec is implemented as
534 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
535
536 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
537 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
538 if (defined $self->{last_stag_name}) {
539 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
540 $self->{s_kwd} = '';
541 ## Reconsume.
542 redo A;
543 } else {
544 ## No start tag token has ever been emitted
545 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
546 !!!cp (28);
547 $self->{state} = DATA_STATE;
548 $self->{s_kwd} = '';
549 ## Reconsume.
550 !!!emit ({type => CHARACTER_TOKEN, data => '</',
551 line => $l, column => $c,
552 });
553 redo A;
554 }
555 }
556
557 if (0x0041 <= $self->{nc} and
558 $self->{nc} <= 0x005A) { # A..Z
559 !!!cp (29);
560 $self->{ct}
561 = {type => END_TAG_TOKEN,
562 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
563 line => $l, column => $c};
564 $self->{state} = TAG_NAME_STATE;
565 !!!next-input-character;
566 redo A;
567 } elsif (0x0061 <= $self->{nc} and
568 $self->{nc} <= 0x007A) { # a..z
569 !!!cp (30);
570 $self->{ct} = {type => END_TAG_TOKEN,
571 tag_name => chr ($self->{nc}),
572 line => $l, column => $c};
573 $self->{state} = TAG_NAME_STATE;
574 !!!next-input-character;
575 redo A;
576 } elsif ($self->{nc} == 0x003E) { # >
577 !!!cp (31);
578 !!!parse-error (type => 'empty end tag',
579 line => $self->{line_prev}, ## "<" in "</>"
580 column => $self->{column_prev} - 1);
581 $self->{state} = DATA_STATE;
582 $self->{s_kwd} = '';
583 !!!next-input-character;
584 redo A;
585 } elsif ($self->{nc} == -1) {
586 !!!cp (32);
587 !!!parse-error (type => 'bare etago');
588 $self->{s_kwd} = '';
589 $self->{state} = DATA_STATE;
590 # reconsume
591
592 !!!emit ({type => CHARACTER_TOKEN, data => '</',
593 line => $l, column => $c,
594 });
595
596 redo A;
597 } else {
598 !!!cp (33);
599 !!!parse-error (type => 'bogus end tag');
600 $self->{state} = BOGUS_COMMENT_STATE;
601 $self->{ct} = {type => COMMENT_TOKEN, data => '',
602 line => $self->{line_prev}, # "<" of "</"
603 column => $self->{column_prev} - 1,
604 };
605 ## NOTE: $self->{nc} is intentionally left as is.
606 ## Although the "anything else" case of the spec not explicitly
607 ## states that the next input character is to be reconsumed,
608 ## it will be included to the |data| of the comment token
609 ## generated from the bogus end tag, as defined in the
610 ## "bogus comment state" entry.
611 redo A;
612 }
613 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
614 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
615 if (length $ch) {
616 my $CH = $ch;
617 $ch =~ tr/a-z/A-Z/;
618 my $nch = chr $self->{nc};
619 if ($nch eq $ch or $nch eq $CH) {
620 !!!cp (24);
621 ## Stay in the state.
622 $self->{s_kwd} .= $nch;
623 !!!next-input-character;
624 redo A;
625 } else {
626 !!!cp (25);
627 $self->{state} = DATA_STATE;
628 $self->{s_kwd} = '';
629 ## Reconsume.
630 !!!emit ({type => CHARACTER_TOKEN,
631 data => '</' . $self->{s_kwd},
632 line => $self->{line_prev},
633 column => $self->{column_prev} - 1 - length $self->{s_kwd},
634 });
635 redo A;
636 }
637 } else { # after "<{tag-name}"
638 unless ($is_space->{$self->{nc}} or
639 {
640 0x003E => 1, # >
641 0x002F => 1, # /
642 -1 => 1, # EOF
643 }->{$self->{nc}}) {
644 !!!cp (26);
645 ## Reconsume.
646 $self->{state} = DATA_STATE;
647 $self->{s_kwd} = '';
648 !!!emit ({type => CHARACTER_TOKEN,
649 data => '</' . $self->{s_kwd},
650 line => $self->{line_prev},
651 column => $self->{column_prev} - 1 - length $self->{s_kwd},
652 });
653 redo A;
654 } else {
655 !!!cp (27);
656 $self->{ct}
657 = {type => END_TAG_TOKEN,
658 tag_name => $self->{last_stag_name},
659 line => $self->{line_prev},
660 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
661 $self->{state} = TAG_NAME_STATE;
662 ## Reconsume.
663 redo A;
664 }
665 }
666 } elsif ($self->{state} == TAG_NAME_STATE) {
667 if ($is_space->{$self->{nc}}) {
668 !!!cp (34);
669 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
670 !!!next-input-character;
671 redo A;
672 } elsif ($self->{nc} == 0x003E) { # >
673 if ($self->{ct}->{type} == START_TAG_TOKEN) {
674 !!!cp (35);
675 $self->{last_stag_name} = $self->{ct}->{tag_name};
676 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
677 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
678 #if ($self->{ct}->{attributes}) {
679 # ## NOTE: This should never be reached.
680 # !!! cp (36);
681 # !!! parse-error (type => 'end tag attribute');
682 #} else {
683 !!!cp (37);
684 #}
685 } else {
686 die "$0: $self->{ct}->{type}: Unknown token type";
687 }
688 $self->{state} = DATA_STATE;
689 $self->{s_kwd} = '';
690 !!!next-input-character;
691
692 !!!emit ($self->{ct}); # start tag or end tag
693
694 redo A;
695 } elsif (0x0041 <= $self->{nc} and
696 $self->{nc} <= 0x005A) { # A..Z
697 !!!cp (38);
698 $self->{ct}->{tag_name}
699 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
700 # start tag or end tag
701 ## Stay in this state
702 !!!next-input-character;
703 redo A;
704 } elsif ($self->{nc} == -1) {
705 !!!parse-error (type => 'unclosed tag');
706 if ($self->{ct}->{type} == START_TAG_TOKEN) {
707 !!!cp (39);
708 $self->{last_stag_name} = $self->{ct}->{tag_name};
709 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
710 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
711 #if ($self->{ct}->{attributes}) {
712 # ## NOTE: This state should never be reached.
713 # !!! cp (40);
714 # !!! parse-error (type => 'end tag attribute');
715 #} else {
716 !!!cp (41);
717 #}
718 } else {
719 die "$0: $self->{ct}->{type}: Unknown token type";
720 }
721 $self->{state} = DATA_STATE;
722 $self->{s_kwd} = '';
723 # reconsume
724
725 !!!emit ($self->{ct}); # start tag or end tag
726
727 redo A;
728 } elsif ($self->{nc} == 0x002F) { # /
729 !!!cp (42);
730 $self->{state} = SELF_CLOSING_START_TAG_STATE;
731 !!!next-input-character;
732 redo A;
733 } else {
734 !!!cp (44);
735 $self->{ct}->{tag_name} .= chr $self->{nc};
736 # start tag or end tag
737 ## Stay in the state
738 !!!next-input-character;
739 redo A;
740 }
741 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
742 if ($is_space->{$self->{nc}}) {
743 !!!cp (45);
744 ## Stay in the state
745 !!!next-input-character;
746 redo A;
747 } elsif ($self->{nc} == 0x003E) { # >
748 if ($self->{ct}->{type} == START_TAG_TOKEN) {
749 !!!cp (46);
750 $self->{last_stag_name} = $self->{ct}->{tag_name};
751 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
752 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
753 if ($self->{ct}->{attributes}) {
754 !!!cp (47);
755 !!!parse-error (type => 'end tag attribute');
756 } else {
757 !!!cp (48);
758 }
759 } else {
760 die "$0: $self->{ct}->{type}: Unknown token type";
761 }
762 $self->{state} = DATA_STATE;
763 $self->{s_kwd} = '';
764 !!!next-input-character;
765
766 !!!emit ($self->{ct}); # start tag or end tag
767
768 redo A;
769 } elsif (0x0041 <= $self->{nc} and
770 $self->{nc} <= 0x005A) { # A..Z
771 !!!cp (49);
772 $self->{ca}
773 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
774 value => '',
775 line => $self->{line}, column => $self->{column}};
776 $self->{state} = ATTRIBUTE_NAME_STATE;
777 !!!next-input-character;
778 redo A;
779 } elsif ($self->{nc} == 0x002F) { # /
780 !!!cp (50);
781 $self->{state} = SELF_CLOSING_START_TAG_STATE;
782 !!!next-input-character;
783 redo A;
784 } elsif ($self->{nc} == -1) {
785 !!!parse-error (type => 'unclosed tag');
786 if ($self->{ct}->{type} == START_TAG_TOKEN) {
787 !!!cp (52);
788 $self->{last_stag_name} = $self->{ct}->{tag_name};
789 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
790 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
791 if ($self->{ct}->{attributes}) {
792 !!!cp (53);
793 !!!parse-error (type => 'end tag attribute');
794 } else {
795 !!!cp (54);
796 }
797 } else {
798 die "$0: $self->{ct}->{type}: Unknown token type";
799 }
800 $self->{state} = DATA_STATE;
801 $self->{s_kwd} = '';
802 # reconsume
803
804 !!!emit ($self->{ct}); # start tag or end tag
805
806 redo A;
807 } else {
808 if ({
809 0x0022 => 1, # "
810 0x0027 => 1, # '
811 0x003D => 1, # =
812 }->{$self->{nc}}) {
813 !!!cp (55);
814 !!!parse-error (type => 'bad attribute name');
815 } else {
816 !!!cp (56);
817 }
818 $self->{ca}
819 = {name => chr ($self->{nc}),
820 value => '',
821 line => $self->{line}, column => $self->{column}};
822 $self->{state} = ATTRIBUTE_NAME_STATE;
823 !!!next-input-character;
824 redo A;
825 }
826 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
827 my $before_leave = sub {
828 if (exists $self->{ct}->{attributes} # start tag or end tag
829 ->{$self->{ca}->{name}}) { # MUST
830 !!!cp (57);
831 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
832 ## Discard $self->{ca} # MUST
833 } else {
834 !!!cp (58);
835 $self->{ct}->{attributes}->{$self->{ca}->{name}}
836 = $self->{ca};
837 }
838 }; # $before_leave
839
840 if ($is_space->{$self->{nc}}) {
841 !!!cp (59);
842 $before_leave->();
843 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
844 !!!next-input-character;
845 redo A;
846 } elsif ($self->{nc} == 0x003D) { # =
847 !!!cp (60);
848 $before_leave->();
849 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
850 !!!next-input-character;
851 redo A;
852 } elsif ($self->{nc} == 0x003E) { # >
853 $before_leave->();
854 if ($self->{ct}->{type} == START_TAG_TOKEN) {
855 !!!cp (61);
856 $self->{last_stag_name} = $self->{ct}->{tag_name};
857 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
858 !!!cp (62);
859 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
860 if ($self->{ct}->{attributes}) {
861 !!!parse-error (type => 'end tag attribute');
862 }
863 } else {
864 die "$0: $self->{ct}->{type}: Unknown token type";
865 }
866 $self->{state} = DATA_STATE;
867 $self->{s_kwd} = '';
868 !!!next-input-character;
869
870 !!!emit ($self->{ct}); # start tag or end tag
871
872 redo A;
873 } elsif (0x0041 <= $self->{nc} and
874 $self->{nc} <= 0x005A) { # A..Z
875 !!!cp (63);
876 $self->{ca}->{name}
877 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
878 ## Stay in the state
879 !!!next-input-character;
880 redo A;
881 } elsif ($self->{nc} == 0x002F) { # /
882 !!!cp (64);
883 $before_leave->();
884 $self->{state} = SELF_CLOSING_START_TAG_STATE;
885 !!!next-input-character;
886 redo A;
887 } elsif ($self->{nc} == -1) {
888 !!!parse-error (type => 'unclosed tag');
889 $before_leave->();
890 if ($self->{ct}->{type} == START_TAG_TOKEN) {
891 !!!cp (66);
892 $self->{last_stag_name} = $self->{ct}->{tag_name};
893 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
894 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
895 if ($self->{ct}->{attributes}) {
896 !!!cp (67);
897 !!!parse-error (type => 'end tag attribute');
898 } else {
899 ## NOTE: This state should never be reached.
900 !!!cp (68);
901 }
902 } else {
903 die "$0: $self->{ct}->{type}: Unknown token type";
904 }
905 $self->{state} = DATA_STATE;
906 $self->{s_kwd} = '';
907 # reconsume
908
909 !!!emit ($self->{ct}); # start tag or end tag
910
911 redo A;
912 } else {
913 if ($self->{nc} == 0x0022 or # "
914 $self->{nc} == 0x0027) { # '
915 !!!cp (69);
916 !!!parse-error (type => 'bad attribute name');
917 } else {
918 !!!cp (70);
919 }
920 $self->{ca}->{name} .= chr ($self->{nc});
921 ## Stay in the state
922 !!!next-input-character;
923 redo A;
924 }
925 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
926 if ($is_space->{$self->{nc}}) {
927 !!!cp (71);
928 ## Stay in the state
929 !!!next-input-character;
930 redo A;
931 } elsif ($self->{nc} == 0x003D) { # =
932 !!!cp (72);
933 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
934 !!!next-input-character;
935 redo A;
936 } elsif ($self->{nc} == 0x003E) { # >
937 if ($self->{ct}->{type} == START_TAG_TOKEN) {
938 !!!cp (73);
939 $self->{last_stag_name} = $self->{ct}->{tag_name};
940 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
941 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
942 if ($self->{ct}->{attributes}) {
943 !!!cp (74);
944 !!!parse-error (type => 'end tag attribute');
945 } else {
946 ## NOTE: This state should never be reached.
947 !!!cp (75);
948 }
949 } else {
950 die "$0: $self->{ct}->{type}: Unknown token type";
951 }
952 $self->{state} = DATA_STATE;
953 $self->{s_kwd} = '';
954 !!!next-input-character;
955
956 !!!emit ($self->{ct}); # start tag or end tag
957
958 redo A;
959 } elsif (0x0041 <= $self->{nc} and
960 $self->{nc} <= 0x005A) { # A..Z
961 !!!cp (76);
962 $self->{ca}
963 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
964 value => '',
965 line => $self->{line}, column => $self->{column}};
966 $self->{state} = ATTRIBUTE_NAME_STATE;
967 !!!next-input-character;
968 redo A;
969 } elsif ($self->{nc} == 0x002F) { # /
970 !!!cp (77);
971 $self->{state} = SELF_CLOSING_START_TAG_STATE;
972 !!!next-input-character;
973 redo A;
974 } elsif ($self->{nc} == -1) {
975 !!!parse-error (type => 'unclosed tag');
976 if ($self->{ct}->{type} == START_TAG_TOKEN) {
977 !!!cp (79);
978 $self->{last_stag_name} = $self->{ct}->{tag_name};
979 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
980 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
981 if ($self->{ct}->{attributes}) {
982 !!!cp (80);
983 !!!parse-error (type => 'end tag attribute');
984 } else {
985 ## NOTE: This state should never be reached.
986 !!!cp (81);
987 }
988 } else {
989 die "$0: $self->{ct}->{type}: Unknown token type";
990 }
991 $self->{s_kwd} = '';
992 $self->{state} = DATA_STATE;
993 # reconsume
994
995 !!!emit ($self->{ct}); # start tag or end tag
996
997 redo A;
998 } else {
999 if ($self->{nc} == 0x0022 or # "
1000 $self->{nc} == 0x0027) { # '
1001 !!!cp (78);
1002 !!!parse-error (type => 'bad attribute name');
1003 } else {
1004 !!!cp (82);
1005 }
1006 $self->{ca}
1007 = {name => chr ($self->{nc}),
1008 value => '',
1009 line => $self->{line}, column => $self->{column}};
1010 $self->{state} = ATTRIBUTE_NAME_STATE;
1011 !!!next-input-character;
1012 redo A;
1013 }
1014 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1015 if ($is_space->{$self->{nc}}) {
1016 !!!cp (83);
1017 ## Stay in the state
1018 !!!next-input-character;
1019 redo A;
1020 } elsif ($self->{nc} == 0x0022) { # "
1021 !!!cp (84);
1022 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1023 !!!next-input-character;
1024 redo A;
1025 } elsif ($self->{nc} == 0x0026) { # &
1026 !!!cp (85);
1027 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1028 ## reconsume
1029 redo A;
1030 } elsif ($self->{nc} == 0x0027) { # '
1031 !!!cp (86);
1032 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1033 !!!next-input-character;
1034 redo A;
1035 } elsif ($self->{nc} == 0x003E) { # >
1036 !!!parse-error (type => 'empty unquoted attribute value');
1037 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1038 !!!cp (87);
1039 $self->{last_stag_name} = $self->{ct}->{tag_name};
1040 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1041 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1042 if ($self->{ct}->{attributes}) {
1043 !!!cp (88);
1044 !!!parse-error (type => 'end tag attribute');
1045 } else {
1046 ## NOTE: This state should never be reached.
1047 !!!cp (89);
1048 }
1049 } else {
1050 die "$0: $self->{ct}->{type}: Unknown token type";
1051 }
1052 $self->{state} = DATA_STATE;
1053 $self->{s_kwd} = '';
1054 !!!next-input-character;
1055
1056 !!!emit ($self->{ct}); # start tag or end tag
1057
1058 redo A;
1059 } elsif ($self->{nc} == -1) {
1060 !!!parse-error (type => 'unclosed tag');
1061 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1062 !!!cp (90);
1063 $self->{last_stag_name} = $self->{ct}->{tag_name};
1064 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1065 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1066 if ($self->{ct}->{attributes}) {
1067 !!!cp (91);
1068 !!!parse-error (type => 'end tag attribute');
1069 } else {
1070 ## NOTE: This state should never be reached.
1071 !!!cp (92);
1072 }
1073 } else {
1074 die "$0: $self->{ct}->{type}: Unknown token type";
1075 }
1076 $self->{state} = DATA_STATE;
1077 $self->{s_kwd} = '';
1078 ## reconsume
1079
1080 !!!emit ($self->{ct}); # start tag or end tag
1081
1082 redo A;
1083 } else {
1084 if ($self->{nc} == 0x003D) { # =
1085 !!!cp (93);
1086 !!!parse-error (type => 'bad attribute value');
1087 } else {
1088 !!!cp (94);
1089 }
1090 $self->{ca}->{value} .= chr ($self->{nc});
1091 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1092 !!!next-input-character;
1093 redo A;
1094 }
1095 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1096 if ($self->{nc} == 0x0022) { # "
1097 !!!cp (95);
1098 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1099 !!!next-input-character;
1100 redo A;
1101 } elsif ($self->{nc} == 0x0026) { # &
1102 !!!cp (96);
1103 ## NOTE: In the spec, the tokenizer is switched to the
1104 ## "entity in attribute value state". In this implementation, the
1105 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1106 ## implementation of the "consume a character reference" algorithm.
1107 $self->{prev_state} = $self->{state};
1108 $self->{entity_add} = 0x0022; # "
1109 $self->{state} = ENTITY_STATE;
1110 !!!next-input-character;
1111 redo A;
1112 } elsif ($self->{nc} == -1) {
1113 !!!parse-error (type => 'unclosed attribute value');
1114 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1115 !!!cp (97);
1116 $self->{last_stag_name} = $self->{ct}->{tag_name};
1117 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1118 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1119 if ($self->{ct}->{attributes}) {
1120 !!!cp (98);
1121 !!!parse-error (type => 'end tag attribute');
1122 } else {
1123 ## NOTE: This state should never be reached.
1124 !!!cp (99);
1125 }
1126 } else {
1127 die "$0: $self->{ct}->{type}: Unknown token type";
1128 }
1129 $self->{state} = DATA_STATE;
1130 $self->{s_kwd} = '';
1131 ## reconsume
1132
1133 !!!emit ($self->{ct}); # start tag or end tag
1134
1135 redo A;
1136 } else {
1137 !!!cp (100);
1138 $self->{ca}->{value} .= chr ($self->{nc});
1139 $self->{read_until}->($self->{ca}->{value},
1140 q["&],
1141 length $self->{ca}->{value});
1142
1143 ## Stay in the state
1144 !!!next-input-character;
1145 redo A;
1146 }
1147 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1148 if ($self->{nc} == 0x0027) { # '
1149 !!!cp (101);
1150 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1151 !!!next-input-character;
1152 redo A;
1153 } elsif ($self->{nc} == 0x0026) { # &
1154 !!!cp (102);
1155 ## NOTE: In the spec, the tokenizer is switched to the
1156 ## "entity in attribute value state". In this implementation, the
1157 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1158 ## implementation of the "consume a character reference" algorithm.
1159 $self->{entity_add} = 0x0027; # '
1160 $self->{prev_state} = $self->{state};
1161 $self->{state} = ENTITY_STATE;
1162 !!!next-input-character;
1163 redo A;
1164 } elsif ($self->{nc} == -1) {
1165 !!!parse-error (type => 'unclosed attribute value');
1166 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1167 !!!cp (103);
1168 $self->{last_stag_name} = $self->{ct}->{tag_name};
1169 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1170 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1171 if ($self->{ct}->{attributes}) {
1172 !!!cp (104);
1173 !!!parse-error (type => 'end tag attribute');
1174 } else {
1175 ## NOTE: This state should never be reached.
1176 !!!cp (105);
1177 }
1178 } else {
1179 die "$0: $self->{ct}->{type}: Unknown token type";
1180 }
1181 $self->{state} = DATA_STATE;
1182 $self->{s_kwd} = '';
1183 ## reconsume
1184
1185 !!!emit ($self->{ct}); # start tag or end tag
1186
1187 redo A;
1188 } else {
1189 !!!cp (106);
1190 $self->{ca}->{value} .= chr ($self->{nc});
1191 $self->{read_until}->($self->{ca}->{value},
1192 q['&],
1193 length $self->{ca}->{value});
1194
1195 ## Stay in the state
1196 !!!next-input-character;
1197 redo A;
1198 }
1199 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1200 if ($is_space->{$self->{nc}}) {
1201 !!!cp (107);
1202 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1203 !!!next-input-character;
1204 redo A;
1205 } elsif ($self->{nc} == 0x0026) { # &
1206 !!!cp (108);
1207 ## NOTE: In the spec, the tokenizer is switched to the
1208 ## "entity in attribute value state". In this implementation, the
1209 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1210 ## implementation of the "consume a character reference" algorithm.
1211 $self->{entity_add} = -1;
1212 $self->{prev_state} = $self->{state};
1213 $self->{state} = ENTITY_STATE;
1214 !!!next-input-character;
1215 redo A;
1216 } elsif ($self->{nc} == 0x003E) { # >
1217 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1218 !!!cp (109);
1219 $self->{last_stag_name} = $self->{ct}->{tag_name};
1220 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1221 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1222 if ($self->{ct}->{attributes}) {
1223 !!!cp (110);
1224 !!!parse-error (type => 'end tag attribute');
1225 } else {
1226 ## NOTE: This state should never be reached.
1227 !!!cp (111);
1228 }
1229 } else {
1230 die "$0: $self->{ct}->{type}: Unknown token type";
1231 }
1232 $self->{state} = DATA_STATE;
1233 $self->{s_kwd} = '';
1234 !!!next-input-character;
1235
1236 !!!emit ($self->{ct}); # start tag or end tag
1237
1238 redo A;
1239 } elsif ($self->{nc} == -1) {
1240 !!!parse-error (type => 'unclosed tag');
1241 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1242 !!!cp (112);
1243 $self->{last_stag_name} = $self->{ct}->{tag_name};
1244 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1245 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1246 if ($self->{ct}->{attributes}) {
1247 !!!cp (113);
1248 !!!parse-error (type => 'end tag attribute');
1249 } else {
1250 ## NOTE: This state should never be reached.
1251 !!!cp (114);
1252 }
1253 } else {
1254 die "$0: $self->{ct}->{type}: Unknown token type";
1255 }
1256 $self->{state} = DATA_STATE;
1257 $self->{s_kwd} = '';
1258 ## reconsume
1259
1260 !!!emit ($self->{ct}); # start tag or end tag
1261
1262 redo A;
1263 } else {
1264 if ({
1265 0x0022 => 1, # "
1266 0x0027 => 1, # '
1267 0x003D => 1, # =
1268 }->{$self->{nc}}) {
1269 !!!cp (115);
1270 !!!parse-error (type => 'bad attribute value');
1271 } else {
1272 !!!cp (116);
1273 }
1274 $self->{ca}->{value} .= chr ($self->{nc});
1275 $self->{read_until}->($self->{ca}->{value},
1276 q["'=& >],
1277 length $self->{ca}->{value});
1278
1279 ## Stay in the state
1280 !!!next-input-character;
1281 redo A;
1282 }
} elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
  ## Entered right after the closing quote of an attribute value.
  ## White space, |>| and |/| are the expected continuations; any
  ## other character is reported as a missing space and reprocessed
  ## in the |BEFORE_ATTRIBUTE_NAME_STATE|.
  if ($is_space->{$self->{nc}}) {
    !!!cp (118);
    $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{nc} == 0x003E) { # >
    if ($self->{ct}->{type} == START_TAG_TOKEN) {
      !!!cp (119);
      $self->{last_stag_name} = $self->{ct}->{tag_name};
    } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
      $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
      if ($self->{ct}->{attributes}) {
        !!!cp (120);
        !!!parse-error (type => 'end tag attribute');
      } else {
        ## NOTE: This state should never be reached.
        !!!cp (121);
      }
    } else {
      die "$0: $self->{ct}->{type}: Unknown token type";
    }
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    !!!next-input-character;

    !!!emit ($self->{ct}); # start tag or end tag

    redo A;
  } elsif ($self->{nc} == 0x002F) { # /
    !!!cp (122);
    $self->{state} = SELF_CLOSING_START_TAG_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{nc} == -1) {
    ## No more input: the tag is unclosed, but the current token is
    ## still emitted.
    !!!parse-error (type => 'unclosed tag');
    if ($self->{ct}->{type} == START_TAG_TOKEN) {
      !!!cp (122.3);
      $self->{last_stag_name} = $self->{ct}->{tag_name};
    } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
      ## An end tag is emitted below, so the content model is reset
      ## here just as in every other branch that emits an end tag.
      $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
      if ($self->{ct}->{attributes}) {
        !!!cp (122.1);
        !!!parse-error (type => 'end tag attribute');
      } else {
        ## NOTE: This state should never be reached.
        !!!cp (122.2);
      }
    } else {
      die "$0: $self->{ct}->{type}: Unknown token type";
    }
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## Reconsume.
    !!!emit ($self->{ct}); # start tag or end tag
    redo A;
  } else {
    !!!cp ('124.1');
    !!!parse-error (type => 'no space between attributes');
    $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
    ## reconsume
    redo A;
  }
} elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
  ## Entered after a |/| inside a tag.  |>| completes the tag: a start
  ## tag gets the |self_closing| flag, while an end tag only gets a
  ## parse error.  Any other character is reprocessed as the start of
  ## an attribute.
  ##
  ## NOTE(review): The checkpoint ids '124.4' and 124.5 are each used
  ## twice in this state.  They are left as is because test data may
  ## refer to them; confirm before renumbering.
  if ($self->{nc} == 0x003E) { # >
    if ($self->{ct}->{type} == END_TAG_TOKEN) {
      !!!cp ('124.2');
      !!!parse-error (type => 'nestc', token => $self->{ct});
      ## TODO: Different type than slash in start tag
      $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
      if ($self->{ct}->{attributes}) {
        !!!cp ('124.4');
        !!!parse-error (type => 'end tag attribute');
      } else {
        !!!cp ('124.5');
      }
      ## TODO: Test |<title></title/>|
    } else {
      !!!cp ('124.3');
      $self->{self_closing} = 1;
    }

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    !!!next-input-character;

    !!!emit ($self->{ct}); # start tag or end tag

    redo A;
  } elsif ($self->{nc} == -1) {
    ## No more input: the tag is unclosed, but the current token is
    ## still emitted.
    !!!parse-error (type => 'unclosed tag');
    if ($self->{ct}->{type} == START_TAG_TOKEN) {
      !!!cp (124.7);
      $self->{last_stag_name} = $self->{ct}->{tag_name};
    } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
      ## An end tag is emitted below, so the content model is reset
      ## here just as in the |>| branch of this state.
      $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
      if ($self->{ct}->{attributes}) {
        !!!cp (124.5);
        !!!parse-error (type => 'end tag attribute');
      } else {
        ## NOTE: This state should never be reached.
        !!!cp (124.6);
      }
    } else {
      die "$0: $self->{ct}->{type}: Unknown token type";
    }
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## Reconsume.
    !!!emit ($self->{ct}); # start tag or end tag
    redo A;
  } else {
    !!!cp ('124.4');
    !!!parse-error (type => 'nestc');
    ## TODO: This error type is wrong.
    $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
    ## Reconsume.
    redo A;
  }
1400 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1401 ## (only happen if PCDATA state)
1402
1403 ## NOTE: Unlike spec's "bogus comment state", this implementation
1404 ## consumes characters one-by-one basis.
1405
1406 if ($self->{nc} == 0x003E) { # >
1407 !!!cp (124);
1408 $self->{state} = DATA_STATE;
1409 $self->{s_kwd} = '';
1410 !!!next-input-character;
1411
1412 !!!emit ($self->{ct}); # comment
1413 redo A;
1414 } elsif ($self->{nc} == -1) {
1415 !!!cp (125);
1416 $self->{state} = DATA_STATE;
1417 $self->{s_kwd} = '';
1418 ## reconsume
1419
1420 !!!emit ($self->{ct}); # comment
1421 redo A;
1422 } else {
1423 !!!cp (126);
1424 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1425 $self->{read_until}->($self->{ct}->{data},
1426 q[>],
1427 length $self->{ct}->{data});
1428
1429 ## Stay in the state.
1430 !!!next-input-character;
1431 redo A;
1432 }
1433 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1434 ## (only happen if PCDATA state)
1435
1436 if ($self->{nc} == 0x002D) { # -
1437 !!!cp (133);
1438 $self->{state} = MD_HYPHEN_STATE;
1439 !!!next-input-character;
1440 redo A;
1441 } elsif ($self->{nc} == 0x0044 or # D
1442 $self->{nc} == 0x0064) { # d
1443 ## ASCII case-insensitive.
1444 !!!cp (130);
1445 $self->{state} = MD_DOCTYPE_STATE;
1446 $self->{s_kwd} = chr $self->{nc};
1447 !!!next-input-character;
1448 redo A;
1449 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1450 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1451 $self->{is_xml}) and
1452 $self->{nc} == 0x005B) { # [
1453 !!!cp (135.4);
1454 $self->{state} = MD_CDATA_STATE;
1455 $self->{s_kwd} = '[';
1456 !!!next-input-character;
1457 redo A;
1458 } else {
1459 !!!cp (136);
1460 }
1461
1462 !!!parse-error (type => 'bogus comment',
1463 line => $self->{line_prev},
1464 column => $self->{column_prev} - 1);
1465 ## Reconsume.
1466 $self->{state} = BOGUS_COMMENT_STATE;
1467 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1468 line => $self->{line_prev},
1469 column => $self->{column_prev} - 1,
1470 };
1471 redo A;
1472 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1473 if ($self->{nc} == 0x002D) { # -
1474 !!!cp (127);
1475 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1476 line => $self->{line_prev},
1477 column => $self->{column_prev} - 2,
1478 };
1479 $self->{state} = COMMENT_START_STATE;
1480 !!!next-input-character;
1481 redo A;
1482 } else {
1483 !!!cp (128);
1484 !!!parse-error (type => 'bogus comment',
1485 line => $self->{line_prev},
1486 column => $self->{column_prev} - 2);
1487 $self->{state} = BOGUS_COMMENT_STATE;
1488 ## Reconsume.
1489 $self->{ct} = {type => COMMENT_TOKEN,
1490 data => '-',
1491 line => $self->{line_prev},
1492 column => $self->{column_prev} - 2,
1493 };
1494 redo A;
1495 }
1496 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1497 ## ASCII case-insensitive.
1498 if ($self->{nc} == [
1499 undef,
1500 0x004F, # O
1501 0x0043, # C
1502 0x0054, # T
1503 0x0059, # Y
1504 0x0050, # P
1505 ]->[length $self->{s_kwd}] or
1506 $self->{nc} == [
1507 undef,
1508 0x006F, # o
1509 0x0063, # c
1510 0x0074, # t
1511 0x0079, # y
1512 0x0070, # p
1513 ]->[length $self->{s_kwd}]) {
1514 !!!cp (131);
1515 ## Stay in the state.
1516 $self->{s_kwd} .= chr $self->{nc};
1517 !!!next-input-character;
1518 redo A;
1519 } elsif ((length $self->{s_kwd}) == 6 and
1520 ($self->{nc} == 0x0045 or # E
1521 $self->{nc} == 0x0065)) { # e
1522 !!!cp (129);
1523 $self->{state} = DOCTYPE_STATE;
1524 $self->{ct} = {type => DOCTYPE_TOKEN,
1525 quirks => 1,
1526 line => $self->{line_prev},
1527 column => $self->{column_prev} - 7,
1528 };
1529 !!!next-input-character;
1530 redo A;
1531 } else {
1532 !!!cp (132);
1533 !!!parse-error (type => 'bogus comment',
1534 line => $self->{line_prev},
1535 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1536 $self->{state} = BOGUS_COMMENT_STATE;
1537 ## Reconsume.
1538 $self->{ct} = {type => COMMENT_TOKEN,
1539 data => $self->{s_kwd},
1540 line => $self->{line_prev},
1541 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1542 };
1543 redo A;
1544 }
1545 } elsif ($self->{state} == MD_CDATA_STATE) {
1546 if ($self->{nc} == {
1547 '[' => 0x0043, # C
1548 '[C' => 0x0044, # D
1549 '[CD' => 0x0041, # A
1550 '[CDA' => 0x0054, # T
1551 '[CDAT' => 0x0041, # A
1552 }->{$self->{s_kwd}}) {
1553 !!!cp (135.1);
1554 ## Stay in the state.
1555 $self->{s_kwd} .= chr $self->{nc};
1556 !!!next-input-character;
1557 redo A;
1558 } elsif ($self->{s_kwd} eq '[CDATA' and
1559 $self->{nc} == 0x005B) { # [
1560 !!!cp (135.2);
1561 $self->{ct} = {type => CHARACTER_TOKEN,
1562 data => '',
1563 line => $self->{line_prev},
1564 column => $self->{column_prev} - 7};
1565 $self->{state} = CDATA_SECTION_STATE;
1566 !!!next-input-character;
1567 redo A;
1568 } else {
1569 !!!cp (135.3);
1570 !!!parse-error (type => 'bogus comment',
1571 line => $self->{line_prev},
1572 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1573 $self->{state} = BOGUS_COMMENT_STATE;
1574 ## Reconsume.
1575 $self->{ct} = {type => COMMENT_TOKEN,
1576 data => $self->{s_kwd},
1577 line => $self->{line_prev},
1578 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1579 };
1580 redo A;
1581 }
1582 } elsif ($self->{state} == COMMENT_START_STATE) {
1583 if ($self->{nc} == 0x002D) { # -
1584 !!!cp (137);
1585 $self->{state} = COMMENT_START_DASH_STATE;
1586 !!!next-input-character;
1587 redo A;
1588 } elsif ($self->{nc} == 0x003E) { # >
1589 !!!cp (138);
1590 !!!parse-error (type => 'bogus comment');
1591 $self->{state} = DATA_STATE;
1592 $self->{s_kwd} = '';
1593 !!!next-input-character;
1594
1595 !!!emit ($self->{ct}); # comment
1596
1597 redo A;
1598 } elsif ($self->{nc} == -1) {
1599 !!!cp (139);
1600 !!!parse-error (type => 'unclosed comment');
1601 $self->{state} = DATA_STATE;
1602 $self->{s_kwd} = '';
1603 ## reconsume
1604
1605 !!!emit ($self->{ct}); # comment
1606
1607 redo A;
1608 } else {
1609 !!!cp (140);
1610 $self->{ct}->{data} # comment
1611 .= chr ($self->{nc});
1612 $self->{state} = COMMENT_STATE;
1613 !!!next-input-character;
1614 redo A;
1615 }
1616 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1617 if ($self->{nc} == 0x002D) { # -
1618 !!!cp (141);
1619 $self->{state} = COMMENT_END_STATE;
1620 !!!next-input-character;
1621 redo A;
1622 } elsif ($self->{nc} == 0x003E) { # >
1623 !!!cp (142);
1624 !!!parse-error (type => 'bogus comment');
1625 $self->{state} = DATA_STATE;
1626 $self->{s_kwd} = '';
1627 !!!next-input-character;
1628
1629 !!!emit ($self->{ct}); # comment
1630
1631 redo A;
1632 } elsif ($self->{nc} == -1) {
1633 !!!cp (143);
1634 !!!parse-error (type => 'unclosed comment');
1635 $self->{state} = DATA_STATE;
1636 $self->{s_kwd} = '';
1637 ## reconsume
1638
1639 !!!emit ($self->{ct}); # comment
1640
1641 redo A;
1642 } else {
1643 !!!cp (144);
1644 $self->{ct}->{data} # comment
1645 .= '-' . chr ($self->{nc});
1646 $self->{state} = COMMENT_STATE;
1647 !!!next-input-character;
1648 redo A;
1649 }
1650 } elsif ($self->{state} == COMMENT_STATE) {
1651 if ($self->{nc} == 0x002D) { # -
1652 !!!cp (145);
1653 $self->{state} = COMMENT_END_DASH_STATE;
1654 !!!next-input-character;
1655 redo A;
1656 } elsif ($self->{nc} == -1) {
1657 !!!cp (146);
1658 !!!parse-error (type => 'unclosed comment');
1659 $self->{state} = DATA_STATE;
1660 $self->{s_kwd} = '';
1661 ## reconsume
1662
1663 !!!emit ($self->{ct}); # comment
1664
1665 redo A;
1666 } else {
1667 !!!cp (147);
1668 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1669 $self->{read_until}->($self->{ct}->{data},
1670 q[-],
1671 length $self->{ct}->{data});
1672
1673 ## Stay in the state
1674 !!!next-input-character;
1675 redo A;
1676 }
} elsif ($self->{state} == COMMENT_END_DASH_STATE) {
  ## A single |-| has been seen inside a comment.  A second |-| moves
  ## to the |COMMENT_END_STATE|; any other character means the dash
  ## was ordinary comment data, so it is appended together with that
  ## character.
  if ($self->{nc} == 0x002D) { # -
    !!!cp (148);
    $self->{state} = COMMENT_END_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{nc} == -1) {
    !!!cp (149);
    !!!parse-error (type => 'unclosed comment');
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## reconsume

    !!!emit ($self->{ct}); # comment

    redo A;
  } else {
    !!!cp (150);
    $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
    $self->{state} = COMMENT_STATE;
    !!!next-input-character;
    redo A;
  }
1701 } elsif ($self->{state} == COMMENT_END_STATE) {
1702 if ($self->{nc} == 0x003E) { # >
1703 !!!cp (151);
1704 $self->{state} = DATA_STATE;
1705 $self->{s_kwd} = '';
1706 !!!next-input-character;
1707
1708 !!!emit ($self->{ct}); # comment
1709
1710 redo A;
1711 } elsif ($self->{nc} == 0x002D) { # -
1712 !!!cp (152);
1713 !!!parse-error (type => 'dash in comment',
1714 line => $self->{line_prev},
1715 column => $self->{column_prev});
1716 $self->{ct}->{data} .= '-'; # comment
1717 ## Stay in the state
1718 !!!next-input-character;
1719 redo A;
1720 } elsif ($self->{nc} == -1) {
1721 !!!cp (153);
1722 !!!parse-error (type => 'unclosed comment');
1723 $self->{state} = DATA_STATE;
1724 $self->{s_kwd} = '';
1725 ## reconsume
1726
1727 !!!emit ($self->{ct}); # comment
1728
1729 redo A;
1730 } else {
1731 !!!cp (154);
1732 !!!parse-error (type => 'dash in comment',
1733 line => $self->{line_prev},
1734 column => $self->{column_prev});
1735 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1736 $self->{state} = COMMENT_STATE;
1737 !!!next-input-character;
1738 redo A;
1739 }
1740 } elsif ($self->{state} == DOCTYPE_STATE) {
1741 if ($is_space->{$self->{nc}}) {
1742 !!!cp (155);
1743 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1744 !!!next-input-character;
1745 redo A;
1746 } else {
1747 !!!cp (156);
1748 !!!parse-error (type => 'no space before DOCTYPE name');
1749 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1750 ## reconsume
1751 redo A;
1752 }
1753 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1754 if ($is_space->{$self->{nc}}) {
1755 !!!cp (157);
1756 ## Stay in the state
1757 !!!next-input-character;
1758 redo A;
1759 } elsif ($self->{nc} == 0x003E) { # >
1760 !!!cp (158);
1761 !!!parse-error (type => 'no DOCTYPE name');
1762 $self->{state} = DATA_STATE;
1763 $self->{s_kwd} = '';
1764 !!!next-input-character;
1765
1766 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1767
1768 redo A;
1769 } elsif ($self->{nc} == -1) {
1770 !!!cp (159);
1771 !!!parse-error (type => 'no DOCTYPE name');
1772 $self->{state} = DATA_STATE;
1773 $self->{s_kwd} = '';
1774 ## reconsume
1775
1776 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1777
1778 redo A;
1779 } else {
1780 !!!cp (160);
1781 $self->{ct}->{name} = chr $self->{nc};
1782 delete $self->{ct}->{quirks};
1783 $self->{state} = DOCTYPE_NAME_STATE;
1784 !!!next-input-character;
1785 redo A;
1786 }
1787 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1788 ## ISSUE: Redundant "First," in the spec.
1789 if ($is_space->{$self->{nc}}) {
1790 !!!cp (161);
1791 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1792 !!!next-input-character;
1793 redo A;
1794 } elsif ($self->{nc} == 0x003E) { # >
1795 !!!cp (162);
1796 $self->{state} = DATA_STATE;
1797 $self->{s_kwd} = '';
1798 !!!next-input-character;
1799
1800 !!!emit ($self->{ct}); # DOCTYPE
1801
1802 redo A;
1803 } elsif ($self->{nc} == -1) {
1804 !!!cp (163);
1805 !!!parse-error (type => 'unclosed DOCTYPE');
1806 $self->{state} = DATA_STATE;
1807 $self->{s_kwd} = '';
1808 ## reconsume
1809
1810 $self->{ct}->{quirks} = 1;
1811 !!!emit ($self->{ct}); # DOCTYPE
1812
1813 redo A;
1814 } else {
1815 !!!cp (164);
1816 $self->{ct}->{name}
1817 .= chr ($self->{nc}); # DOCTYPE
1818 ## Stay in the state
1819 !!!next-input-character;
1820 redo A;
1821 }
1822 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1823 if ($is_space->{$self->{nc}}) {
1824 !!!cp (165);
1825 ## Stay in the state
1826 !!!next-input-character;
1827 redo A;
1828 } elsif ($self->{nc} == 0x003E) { # >
1829 !!!cp (166);
1830 $self->{state} = DATA_STATE;
1831 $self->{s_kwd} = '';
1832 !!!next-input-character;
1833
1834 !!!emit ($self->{ct}); # DOCTYPE
1835
1836 redo A;
1837 } elsif ($self->{nc} == -1) {
1838 !!!cp (167);
1839 !!!parse-error (type => 'unclosed DOCTYPE');
1840 $self->{state} = DATA_STATE;
1841 $self->{s_kwd} = '';
1842 ## reconsume
1843
1844 $self->{ct}->{quirks} = 1;
1845 !!!emit ($self->{ct}); # DOCTYPE
1846
1847 redo A;
1848 } elsif ($self->{nc} == 0x0050 or # P
1849 $self->{nc} == 0x0070) { # p
1850 $self->{state} = PUBLIC_STATE;
1851 $self->{s_kwd} = chr $self->{nc};
1852 !!!next-input-character;
1853 redo A;
1854 } elsif ($self->{nc} == 0x0053 or # S
1855 $self->{nc} == 0x0073) { # s
1856 $self->{state} = SYSTEM_STATE;
1857 $self->{s_kwd} = chr $self->{nc};
1858 !!!next-input-character;
1859 redo A;
1860 } else {
1861 !!!cp (180);
1862 !!!parse-error (type => 'string after DOCTYPE name');
1863 $self->{ct}->{quirks} = 1;
1864
1865 $self->{state} = BOGUS_DOCTYPE_STATE;
1866 !!!next-input-character;
1867 redo A;
1868 }
1869 } elsif ($self->{state} == PUBLIC_STATE) {
1870 ## ASCII case-insensitive
1871 if ($self->{nc} == [
1872 undef,
1873 0x0055, # U
1874 0x0042, # B
1875 0x004C, # L
1876 0x0049, # I
1877 ]->[length $self->{s_kwd}] or
1878 $self->{nc} == [
1879 undef,
1880 0x0075, # u
1881 0x0062, # b
1882 0x006C, # l
1883 0x0069, # i
1884 ]->[length $self->{s_kwd}]) {
1885 !!!cp (175);
1886 ## Stay in the state.
1887 $self->{s_kwd} .= chr $self->{nc};
1888 !!!next-input-character;
1889 redo A;
1890 } elsif ((length $self->{s_kwd}) == 5 and
1891 ($self->{nc} == 0x0043 or # C
1892 $self->{nc} == 0x0063)) { # c
1893 !!!cp (168);
1894 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1895 !!!next-input-character;
1896 redo A;
1897 } else {
1898 !!!cp (169);
1899 !!!parse-error (type => 'string after DOCTYPE name',
1900 line => $self->{line_prev},
1901 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1902 $self->{ct}->{quirks} = 1;
1903
1904 $self->{state} = BOGUS_DOCTYPE_STATE;
1905 ## Reconsume.
1906 redo A;
1907 }
1908 } elsif ($self->{state} == SYSTEM_STATE) {
1909 ## ASCII case-insensitive
1910 if ($self->{nc} == [
1911 undef,
1912 0x0059, # Y
1913 0x0053, # S
1914 0x0054, # T
1915 0x0045, # E
1916 ]->[length $self->{s_kwd}] or
1917 $self->{nc} == [
1918 undef,
1919 0x0079, # y
1920 0x0073, # s
1921 0x0074, # t
1922 0x0065, # e
1923 ]->[length $self->{s_kwd}]) {
1924 !!!cp (170);
1925 ## Stay in the state.
1926 $self->{s_kwd} .= chr $self->{nc};
1927 !!!next-input-character;
1928 redo A;
1929 } elsif ((length $self->{s_kwd}) == 5 and
1930 ($self->{nc} == 0x004D or # M
1931 $self->{nc} == 0x006D)) { # m
1932 !!!cp (171);
1933 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1934 !!!next-input-character;
1935 redo A;
1936 } else {
1937 !!!cp (172);
1938 !!!parse-error (type => 'string after DOCTYPE name',
1939 line => $self->{line_prev},
1940 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1941 $self->{ct}->{quirks} = 1;
1942
1943 $self->{state} = BOGUS_DOCTYPE_STATE;
1944 ## Reconsume.
1945 redo A;
1946 }
} elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
  ## Entered once the |PUBLIC| keyword has been matched.  White space
  ## is skipped; a quotation mark opens the public identifier literal.
  ## Every other outcome sets the |quirks| flag on the DOCTYPE token.
  ##
  ## |$self->{nc}| is compared numerically (|==|), as in every other
  ## state, since it holds a code point and not a string.
  if ($is_space->{$self->{nc}}) {
    !!!cp (181);
    ## Stay in the state
    !!!next-input-character;
    redo A;
  } elsif ($self->{nc} == 0x0022) { # "
    !!!cp (182);
    $self->{ct}->{pubid} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{nc} == 0x0027) { # '
    !!!cp (183);
    $self->{ct}->{pubid} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{nc} == 0x003E) { # >
    !!!cp (184);
    !!!parse-error (type => 'no PUBLIC literal');

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    !!!next-input-character;

    $self->{ct}->{quirks} = 1;
    !!!emit ($self->{ct}); # DOCTYPE

    redo A;
  } elsif ($self->{nc} == -1) {
    !!!cp (185);
    !!!parse-error (type => 'unclosed DOCTYPE');

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## reconsume

    $self->{ct}->{quirks} = 1;
    !!!emit ($self->{ct}); # DOCTYPE

    redo A;
  } else {
    !!!cp (186);
    !!!parse-error (type => 'string after PUBLIC');
    $self->{ct}->{quirks} = 1;

    $self->{state} = BOGUS_DOCTYPE_STATE;
    !!!next-input-character;
    redo A;
  }
1998 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1999 if ($self->{nc} == 0x0022) { # "
2000 !!!cp (187);
2001 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2002 !!!next-input-character;
2003 redo A;
2004 } elsif ($self->{nc} == 0x003E) { # >
2005 !!!cp (188);
2006 !!!parse-error (type => 'unclosed PUBLIC literal');
2007
2008 $self->{state} = DATA_STATE;
2009 $self->{s_kwd} = '';
2010 !!!next-input-character;
2011
2012 $self->{ct}->{quirks} = 1;
2013 !!!emit ($self->{ct}); # DOCTYPE
2014
2015 redo A;
2016 } elsif ($self->{nc} == -1) {
2017 !!!cp (189);
2018 !!!parse-error (type => 'unclosed PUBLIC literal');
2019
2020 $self->{state} = DATA_STATE;
2021 $self->{s_kwd} = '';
2022 ## reconsume
2023
2024 $self->{ct}->{quirks} = 1;
2025 !!!emit ($self->{ct}); # DOCTYPE
2026
2027 redo A;
2028 } else {
2029 !!!cp (190);
2030 $self->{ct}->{pubid} # DOCTYPE
2031 .= chr $self->{nc};
2032 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2033 length $self->{ct}->{pubid});
2034
2035 ## Stay in the state
2036 !!!next-input-character;
2037 redo A;
2038 }
2039 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2040 if ($self->{nc} == 0x0027) { # '
2041 !!!cp (191);
2042 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2043 !!!next-input-character;
2044 redo A;
2045 } elsif ($self->{nc} == 0x003E) { # >
2046 !!!cp (192);
2047 !!!parse-error (type => 'unclosed PUBLIC literal');
2048
2049 $self->{state} = DATA_STATE;
2050 $self->{s_kwd} = '';
2051 !!!next-input-character;
2052
2053 $self->{ct}->{quirks} = 1;
2054 !!!emit ($self->{ct}); # DOCTYPE
2055
2056 redo A;
2057 } elsif ($self->{nc} == -1) {
2058 !!!cp (193);
2059 !!!parse-error (type => 'unclosed PUBLIC literal');
2060
2061 $self->{state} = DATA_STATE;
2062 $self->{s_kwd} = '';
2063 ## reconsume
2064
2065 $self->{ct}->{quirks} = 1;
2066 !!!emit ($self->{ct}); # DOCTYPE
2067
2068 redo A;
2069 } else {
2070 !!!cp (194);
2071 $self->{ct}->{pubid} # DOCTYPE
2072 .= chr $self->{nc};
2073 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2074 length $self->{ct}->{pubid});
2075
2076 ## Stay in the state
2077 !!!next-input-character;
2078 redo A;
2079 }
2080 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2081 if ($is_space->{$self->{nc}}) {
2082 !!!cp (195);
2083 ## Stay in the state
2084 !!!next-input-character;
2085 redo A;
2086 } elsif ($self->{nc} == 0x0022) { # "
2087 !!!cp (196);
2088 $self->{ct}->{sysid} = ''; # DOCTYPE
2089 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2090 !!!next-input-character;
2091 redo A;
2092 } elsif ($self->{nc} == 0x0027) { # '
2093 !!!cp (197);
2094 $self->{ct}->{sysid} = ''; # DOCTYPE
2095 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2096 !!!next-input-character;
2097 redo A;
2098 } elsif ($self->{nc} == 0x003E) { # >
2099 !!!cp (198);
2100 $self->{state} = DATA_STATE;
2101 $self->{s_kwd} = '';
2102 !!!next-input-character;
2103
2104 !!!emit ($self->{ct}); # DOCTYPE
2105
2106 redo A;
2107 } elsif ($self->{nc} == -1) {
2108 !!!cp (199);
2109 !!!parse-error (type => 'unclosed DOCTYPE');
2110
2111 $self->{state} = DATA_STATE;
2112 $self->{s_kwd} = '';
2113 ## reconsume
2114
2115 $self->{ct}->{quirks} = 1;
2116 !!!emit ($self->{ct}); # DOCTYPE
2117
2118 redo A;
2119 } else {
2120 !!!cp (200);
2121 !!!parse-error (type => 'string after PUBLIC literal');
2122 $self->{ct}->{quirks} = 1;
2123
2124 $self->{state} = BOGUS_DOCTYPE_STATE;
2125 !!!next-input-character;
2126 redo A;
2127 }
2128 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2129 if ($is_space->{$self->{nc}}) {
2130 !!!cp (201);
2131 ## Stay in the state
2132 !!!next-input-character;
2133 redo A;
2134 } elsif ($self->{nc} == 0x0022) { # "
2135 !!!cp (202);
2136 $self->{ct}->{sysid} = ''; # DOCTYPE
2137 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2138 !!!next-input-character;
2139 redo A;
2140 } elsif ($self->{nc} == 0x0027) { # '
2141 !!!cp (203);
2142 $self->{ct}->{sysid} = ''; # DOCTYPE
2143 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2144 !!!next-input-character;
2145 redo A;
2146 } elsif ($self->{nc} == 0x003E) { # >
2147 !!!cp (204);
2148 !!!parse-error (type => 'no SYSTEM literal');
2149 $self->{state} = DATA_STATE;
2150 $self->{s_kwd} = '';
2151 !!!next-input-character;
2152
2153 $self->{ct}->{quirks} = 1;
2154 !!!emit ($self->{ct}); # DOCTYPE
2155
2156 redo A;
2157 } elsif ($self->{nc} == -1) {
2158 !!!cp (205);
2159 !!!parse-error (type => 'unclosed DOCTYPE');
2160
2161 $self->{state} = DATA_STATE;
2162 $self->{s_kwd} = '';
2163 ## reconsume
2164
2165 $self->{ct}->{quirks} = 1;
2166 !!!emit ($self->{ct}); # DOCTYPE
2167
2168 redo A;
2169 } else {
2170 !!!cp (206);
2171 !!!parse-error (type => 'string after SYSTEM');
2172 $self->{ct}->{quirks} = 1;
2173
2174 $self->{state} = BOGUS_DOCTYPE_STATE;
2175 !!!next-input-character;
2176 redo A;
2177 }
2178 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2179 if ($self->{nc} == 0x0022) { # "
2180 !!!cp (207);
2181 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2182 !!!next-input-character;
2183 redo A;
2184 } elsif ($self->{nc} == 0x003E) { # >
2185 !!!cp (208);
2186 !!!parse-error (type => 'unclosed SYSTEM literal');
2187
2188 $self->{state} = DATA_STATE;
2189 $self->{s_kwd} = '';
2190 !!!next-input-character;
2191
2192 $self->{ct}->{quirks} = 1;
2193 !!!emit ($self->{ct}); # DOCTYPE
2194
2195 redo A;
2196 } elsif ($self->{nc} == -1) {
2197 !!!cp (209);
2198 !!!parse-error (type => 'unclosed SYSTEM literal');
2199
2200 $self->{state} = DATA_STATE;
2201 $self->{s_kwd} = '';
2202 ## reconsume
2203
2204 $self->{ct}->{quirks} = 1;
2205 !!!emit ($self->{ct}); # DOCTYPE
2206
2207 redo A;
2208 } else {
2209 !!!cp (210);
2210 $self->{ct}->{sysid} # DOCTYPE
2211 .= chr $self->{nc};
2212 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2213 length $self->{ct}->{sysid});
2214
2215 ## Stay in the state
2216 !!!next-input-character;
2217 redo A;
2218 }
2219 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2220 if ($self->{nc} == 0x0027) { # '
2221 !!!cp (211);
2222 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2223 !!!next-input-character;
2224 redo A;
2225 } elsif ($self->{nc} == 0x003E) { # >
2226 !!!cp (212);
2227 !!!parse-error (type => 'unclosed SYSTEM literal');
2228
2229 $self->{state} = DATA_STATE;
2230 $self->{s_kwd} = '';
2231 !!!next-input-character;
2232
2233 $self->{ct}->{quirks} = 1;
2234 !!!emit ($self->{ct}); # DOCTYPE
2235
2236 redo A;
2237 } elsif ($self->{nc} == -1) {
2238 !!!cp (213);
2239 !!!parse-error (type => 'unclosed SYSTEM literal');
2240
2241 $self->{state} = DATA_STATE;
2242 $self->{s_kwd} = '';
2243 ## reconsume
2244
2245 $self->{ct}->{quirks} = 1;
2246 !!!emit ($self->{ct}); # DOCTYPE
2247
2248 redo A;
2249 } else {
2250 !!!cp (214);
2251 $self->{ct}->{sysid} # DOCTYPE
2252 .= chr $self->{nc};
2253 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2254 length $self->{ct}->{sysid});
2255
2256 ## Stay in the state
2257 !!!next-input-character;
2258 redo A;
2259 }
2260 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2261 if ($is_space->{$self->{nc}}) {
2262 !!!cp (215);
2263 ## Stay in the state
2264 !!!next-input-character;
2265 redo A;
2266 } elsif ($self->{nc} == 0x003E) { # >
2267 !!!cp (216);
2268 $self->{state} = DATA_STATE;
2269 $self->{s_kwd} = '';
2270 !!!next-input-character;
2271
2272 !!!emit ($self->{ct}); # DOCTYPE
2273
2274 redo A;
2275 } elsif ($self->{nc} == -1) {
2276 !!!cp (217);
2277 !!!parse-error (type => 'unclosed DOCTYPE');
2278 $self->{state} = DATA_STATE;
2279 $self->{s_kwd} = '';
2280 ## reconsume
2281
2282 $self->{ct}->{quirks} = 1;
2283 !!!emit ($self->{ct}); # DOCTYPE
2284
2285 redo A;
2286 } else {
2287 !!!cp (218);
2288 !!!parse-error (type => 'string after SYSTEM literal');
2289 #$self->{ct}->{quirks} = 1;
2290
2291 $self->{state} = BOGUS_DOCTYPE_STATE;
2292 !!!next-input-character;
2293 redo A;
2294 }
2295 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2296 if ($self->{nc} == 0x003E) { # >
2297 !!!cp (219);
2298 $self->{state} = DATA_STATE;
2299 $self->{s_kwd} = '';
2300 !!!next-input-character;
2301
2302 !!!emit ($self->{ct}); # DOCTYPE
2303
2304 redo A;
2305 } elsif ($self->{nc} == -1) {
2306 !!!cp (220);
2307 $self->{state} = DATA_STATE;
2308 $self->{s_kwd} = '';
2309 ## reconsume
2310
2311 !!!emit ($self->{ct}); # DOCTYPE
2312
2313 redo A;
2314 } else {
2315 !!!cp (221);
2316 my $s = '';
2317 $self->{read_until}->($s, q[>], 0);
2318
2319 ## Stay in the state
2320 !!!next-input-character;
2321 redo A;
2322 }
2323 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2324 ## NOTE: "CDATA section state" in the spec is jointly implemented
2325 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2326 ## and |CDATA_SECTION_MSE2_STATE|.
2327
2328 if ($self->{nc} == 0x005D) { # ]
2329 !!!cp (221.1);
2330 $self->{state} = CDATA_SECTION_MSE1_STATE;
2331 !!!next-input-character;
2332 redo A;
2333 } elsif ($self->{nc} == -1) {
2334 $self->{state} = DATA_STATE;
2335 $self->{s_kwd} = '';
2336 !!!next-input-character;
2337 if (length $self->{ct}->{data}) { # character
2338 !!!cp (221.2);
2339 !!!emit ($self->{ct}); # character
2340 } else {
2341 !!!cp (221.3);
2342 ## No token to emit. $self->{ct} is discarded.
2343 }
2344 redo A;
2345 } else {
2346 !!!cp (221.4);
2347 $self->{ct}->{data} .= chr $self->{nc};
2348 $self->{read_until}->($self->{ct}->{data},
2349 q<]>,
2350 length $self->{ct}->{data});
2351
2352 ## Stay in the state.
2353 !!!next-input-character;
2354 redo A;
2355 }
2356
2357 ## ISSUE: "text tokens" in spec.
2358 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2359 if ($self->{nc} == 0x005D) { # ]
2360 !!!cp (221.5);
2361 $self->{state} = CDATA_SECTION_MSE2_STATE;
2362 !!!next-input-character;
2363 redo A;
2364 } else {
2365 !!!cp (221.6);
2366 $self->{ct}->{data} .= ']';
2367 $self->{state} = CDATA_SECTION_STATE;
2368 ## Reconsume.
2369 redo A;
2370 }
2371 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2372 if ($self->{nc} == 0x003E) { # >
2373 $self->{state} = DATA_STATE;
2374 $self->{s_kwd} = '';
2375 !!!next-input-character;
2376 if (length $self->{ct}->{data}) { # character
2377 !!!cp (221.7);
2378 !!!emit ($self->{ct}); # character
2379 } else {
2380 !!!cp (221.8);
2381 ## No token to emit. $self->{ct} is discarded.
2382 }
2383 redo A;
2384 } elsif ($self->{nc} == 0x005D) { # ]
2385 !!!cp (221.9); # character
2386 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2387 ## Stay in the state.
2388 !!!next-input-character;
2389 redo A;
2390 } else {
2391 !!!cp (221.11);
2392 $self->{ct}->{data} .= ']]'; # character
2393 $self->{state} = CDATA_SECTION_STATE;
2394 ## Reconsume.
2395 redo A;
2396 }
2397 } elsif ($self->{state} == ENTITY_STATE) {
2398 if ($is_space->{$self->{nc}} or
2399 {
2400 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2401 $self->{entity_add} => 1,
2402 }->{$self->{nc}}) {
2403 !!!cp (1001);
2404 ## Don't consume
2405 ## No error
2406 ## Return nothing.
2407 #
2408 } elsif ($self->{nc} == 0x0023) { # #
2409 !!!cp (999);
2410 $self->{state} = ENTITY_HASH_STATE;
2411 $self->{s_kwd} = '#';
2412 !!!next-input-character;
2413 redo A;
2414 } elsif ((0x0041 <= $self->{nc} and
2415 $self->{nc} <= 0x005A) or # A..Z
2416 (0x0061 <= $self->{nc} and
2417 $self->{nc} <= 0x007A)) { # a..z
2418 !!!cp (998);
2419 require Whatpm::_NamedEntityList;
2420 $self->{state} = ENTITY_NAME_STATE;
2421 $self->{s_kwd} = chr $self->{nc};
2422 $self->{entity__value} = $self->{s_kwd};
2423 $self->{entity__match} = 0;
2424 !!!next-input-character;
2425 redo A;
2426 } else {
2427 !!!cp (1027);
2428 !!!parse-error (type => 'bare ero');
2429 ## Return nothing.
2430 #
2431 }
2432
2433 ## NOTE: No character is consumed by the "consume a character
2434 ## reference" algorithm. In other words, there is an "&" character
2435 ## that does not introduce a character reference, which would be
2436 ## appended to the parent element or the attribute value in later
2437 ## process of the tokenizer.
2438
2439 if ($self->{prev_state} == DATA_STATE) {
2440 !!!cp (997);
2441 $self->{state} = $self->{prev_state};
2442 $self->{s_kwd} = '';
2443 ## Reconsume.
2444 !!!emit ({type => CHARACTER_TOKEN, data => '&',
2445 line => $self->{line_prev},
2446 column => $self->{column_prev},
2447 });
2448 redo A;
2449 } else {
2450 !!!cp (996);
2451 $self->{ca}->{value} .= '&';
2452 $self->{state} = $self->{prev_state};
2453 $self->{s_kwd} = '';
2454 ## Reconsume.
2455 redo A;
2456 }
2457 } elsif ($self->{state} == ENTITY_HASH_STATE) {
2458 if ($self->{nc} == 0x0078 or # x
2459 $self->{nc} == 0x0058) { # X
2460 !!!cp (995);
2461 $self->{state} = HEXREF_X_STATE;
2462 $self->{s_kwd} .= chr $self->{nc};
2463 !!!next-input-character;
2464 redo A;
2465 } elsif (0x0030 <= $self->{nc} and
2466 $self->{nc} <= 0x0039) { # 0..9
2467 !!!cp (994);
2468 $self->{state} = NCR_NUM_STATE;
2469 $self->{s_kwd} = $self->{nc} - 0x0030;
2470 !!!next-input-character;
2471 redo A;
2472 } else {
2473 !!!parse-error (type => 'bare nero',
2474 line => $self->{line_prev},
2475 column => $self->{column_prev} - 1);
2476
2477 ## NOTE: According to the spec algorithm, nothing is returned,
2478 ## and then "&#" is appended to the parent element or the attribute
2479 ## value in the later processing.
2480
2481 if ($self->{prev_state} == DATA_STATE) {
2482 !!!cp (1019);
2483 $self->{state} = $self->{prev_state};
2484 $self->{s_kwd} = '';
2485 ## Reconsume.
2486 !!!emit ({type => CHARACTER_TOKEN,
2487 data => '&#',
2488 line => $self->{line_prev},
2489 column => $self->{column_prev} - 1,
2490 });
2491 redo A;
2492 } else {
2493 !!!cp (993);
2494 $self->{ca}->{value} .= '&#';
2495 $self->{state} = $self->{prev_state};
2496 $self->{s_kwd} = '';
2497 ## Reconsume.
2498 redo A;
2499 }
2500 }
2501 } elsif ($self->{state} == NCR_NUM_STATE) {
2502 if (0x0030 <= $self->{nc} and
2503 $self->{nc} <= 0x0039) { # 0..9
2504 !!!cp (1012);
2505 $self->{s_kwd} *= 10;
2506 $self->{s_kwd} += $self->{nc} - 0x0030;
2507
2508 ## Stay in the state.
2509 !!!next-input-character;
2510 redo A;
2511 } elsif ($self->{nc} == 0x003B) { # ;
2512 !!!cp (1013);
2513 !!!next-input-character;
2514 #
2515 } else {
2516 !!!cp (1014);
2517 !!!parse-error (type => 'no refc');
2518 ## Reconsume.
2519 #
2520 }
2521
2522 my $code = $self->{s_kwd};
2523 my $l = $self->{line_prev};
2524 my $c = $self->{column_prev};
2525 if ($charref_map->{$code}) {
2526 !!!cp (1015);
2527 !!!parse-error (type => 'invalid character reference',
2528 text => (sprintf 'U+%04X', $code),
2529 line => $l, column => $c);
2530 $code = $charref_map->{$code};
2531 } elsif ($code > 0x10FFFF) {
2532 !!!cp (1016);
2533 !!!parse-error (type => 'invalid character reference',
2534 text => (sprintf 'U-%08X', $code),
2535 line => $l, column => $c);
2536 $code = 0xFFFD;
2537 }
2538
2539 if ($self->{prev_state} == DATA_STATE) {
2540 !!!cp (992);
2541 $self->{state} = $self->{prev_state};
2542 $self->{s_kwd} = '';
2543 ## Reconsume.
2544 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2545 line => $l, column => $c,
2546 });
2547 redo A;
2548 } else {
2549 !!!cp (991);
2550 $self->{ca}->{value} .= chr $code;
2551 $self->{ca}->{has_reference} = 1;
2552 $self->{state} = $self->{prev_state};
2553 $self->{s_kwd} = '';
2554 ## Reconsume.
2555 redo A;
2556 }
2557 } elsif ($self->{state} == HEXREF_X_STATE) {
2558 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2559 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2560 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2561 # 0..9, A..F, a..f
2562 !!!cp (990);
2563 $self->{state} = HEXREF_HEX_STATE;
2564 $self->{s_kwd} = 0;
2565 ## Reconsume.
2566 redo A;
2567 } else {
2568 !!!parse-error (type => 'bare hcro',
2569 line => $self->{line_prev},
2570 column => $self->{column_prev} - 2);
2571
2572 ## NOTE: According to the spec algorithm, nothing is returned,
2573 ## and then "&#" followed by "X" or "x" is appended to the parent
2574 ## element or the attribute value in the later processing.
2575
2576 if ($self->{prev_state} == DATA_STATE) {
2577 !!!cp (1005);
2578 $self->{state} = $self->{prev_state};
2579 $self->{s_kwd} = '';
2580 ## Reconsume.
2581 !!!emit ({type => CHARACTER_TOKEN,
2582 data => '&' . $self->{s_kwd},
2583 line => $self->{line_prev},
2584 column => $self->{column_prev} - length $self->{s_kwd},
2585 });
2586 redo A;
2587 } else {
2588 !!!cp (989);
2589 $self->{ca}->{value} .= '&' . $self->{s_kwd};
2590 $self->{state} = $self->{prev_state};
2591 $self->{s_kwd} = '';
2592 ## Reconsume.
2593 redo A;
2594 }
2595 }
2596 } elsif ($self->{state} == HEXREF_HEX_STATE) {
2597 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2598 # 0..9
2599 !!!cp (1002);
2600 $self->{s_kwd} *= 0x10;
2601 $self->{s_kwd} += $self->{nc} - 0x0030;
2602 ## Stay in the state.
2603 !!!next-input-character;
2604 redo A;
2605 } elsif (0x0061 <= $self->{nc} and
2606 $self->{nc} <= 0x0066) { # a..f
2607 !!!cp (1003);
2608 $self->{s_kwd} *= 0x10;
2609 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2610 ## Stay in the state.
2611 !!!next-input-character;
2612 redo A;
2613 } elsif (0x0041 <= $self->{nc} and
2614 $self->{nc} <= 0x0046) { # A..F
2615 !!!cp (1004);
2616 $self->{s_kwd} *= 0x10;
2617 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2618 ## Stay in the state.
2619 !!!next-input-character;
2620 redo A;
2621 } elsif ($self->{nc} == 0x003B) { # ;
2622 !!!cp (1006);
2623 !!!next-input-character;
2624 #
2625 } else {
2626 !!!cp (1007);
2627 !!!parse-error (type => 'no refc',
2628 line => $self->{line},
2629 column => $self->{column});
2630 ## Reconsume.
2631 #
2632 }
2633
2634 my $code = $self->{s_kwd};
2635 my $l = $self->{line_prev};
2636 my $c = $self->{column_prev};
2637 if ($charref_map->{$code}) {
2638 !!!cp (1008);
2639 !!!parse-error (type => 'invalid character reference',
2640 text => (sprintf 'U+%04X', $code),
2641 line => $l, column => $c);
2642 $code = $charref_map->{$code};
2643 } elsif ($code > 0x10FFFF) {
2644 !!!cp (1009);
2645 !!!parse-error (type => 'invalid character reference',
2646 text => (sprintf 'U-%08X', $code),
2647 line => $l, column => $c);
2648 $code = 0xFFFD;
2649 }
2650
2651 if ($self->{prev_state} == DATA_STATE) {
2652 !!!cp (988);
2653 $self->{state} = $self->{prev_state};
2654 $self->{s_kwd} = '';
2655 ## Reconsume.
2656 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2657 line => $l, column => $c,
2658 });
2659 redo A;
2660 } else {
2661 !!!cp (987);
2662 $self->{ca}->{value} .= chr $code;
2663 $self->{ca}->{has_reference} = 1;
2664 $self->{state} = $self->{prev_state};
2665 $self->{s_kwd} = '';
2666 ## Reconsume.
2667 redo A;
2668 }
2669 } elsif ($self->{state} == ENTITY_NAME_STATE) {
2670 if (length $self->{s_kwd} < 30 and
2671 ## NOTE: Some number greater than the maximum length of entity name
2672 ((0x0041 <= $self->{nc} and # a
2673 $self->{nc} <= 0x005A) or # x
2674 (0x0061 <= $self->{nc} and # a
2675 $self->{nc} <= 0x007A) or # z
2676 (0x0030 <= $self->{nc} and # 0
2677 $self->{nc} <= 0x0039) or # 9
2678 $self->{nc} == 0x003B)) { # ;
2679 our $EntityChar;
2680 $self->{s_kwd} .= chr $self->{nc};
2681 if (defined $EntityChar->{$self->{s_kwd}}) {
2682 if ($self->{nc} == 0x003B) { # ;
2683 !!!cp (1020);
2684 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2685 $self->{entity__match} = 1;
2686 !!!next-input-character;
2687 #
2688 } else {
2689 !!!cp (1021);
2690 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2691 $self->{entity__match} = -1;
2692 ## Stay in the state.
2693 !!!next-input-character;
2694 redo A;
2695 }
2696 } else {
2697 !!!cp (1022);
2698 $self->{entity__value} .= chr $self->{nc};
2699 $self->{entity__match} *= 2;
2700 ## Stay in the state.
2701 !!!next-input-character;
2702 redo A;
2703 }
2704 }
2705
2706 my $data;
2707 my $has_ref;
2708 if ($self->{entity__match} > 0) {
2709 !!!cp (1023);
2710 $data = $self->{entity__value};
2711 $has_ref = 1;
2712 #
2713 } elsif ($self->{entity__match} < 0) {
2714 !!!parse-error (type => 'no refc');
2715 if ($self->{prev_state} != DATA_STATE and # in attribute
2716 $self->{entity__match} < -1) {
2717 !!!cp (1024);
2718 $data = '&' . $self->{s_kwd};
2719 #
2720 } else {
2721 !!!cp (1025);
2722 $data = $self->{entity__value};
2723 $has_ref = 1;
2724 #
2725 }
2726 } else {
2727 !!!cp (1026);
2728 !!!parse-error (type => 'bare ero',
2729 line => $self->{line_prev},
2730 column => $self->{column_prev} - length $self->{s_kwd});
2731 $data = '&' . $self->{s_kwd};
2732 #
2733 }
2734
2735 ## NOTE: In these cases, when a character reference is found,
2736 ## it is consumed and a character token is returned, or, otherwise,
2737 ## nothing is consumed and returned, according to the spec algorithm.
2738 ## In this implementation, anything that has been examined by the
2739 ## tokenizer is appended to the parent element or the attribute value
2740 ## as string, either literal string when no character reference or
2741 ## entity-replaced string otherwise, in this stage, since any characters
2742 ## that would not be consumed are appended in the data state or in an
2743 ## appropriate attribute value state anyway.
2744
2745 if ($self->{prev_state} == DATA_STATE) {
2746 !!!cp (986);
2747 $self->{state} = $self->{prev_state};
2748 $self->{s_kwd} = '';
2749 ## Reconsume.
2750 !!!emit ({type => CHARACTER_TOKEN,
2751 data => $data,
2752 line => $self->{line_prev},
2753 column => $self->{column_prev} + 1 - length $self->{s_kwd},
2754 });
2755 redo A;
2756 } else {
2757 !!!cp (985);
2758 $self->{ca}->{value} .= $data;
2759 $self->{ca}->{has_reference} = 1 if $has_ref;
2760 $self->{state} = $self->{prev_state};
2761 $self->{s_kwd} = '';
2762 ## Reconsume.
2763 redo A;
2764 }
2765 } else {
2766 die "$0: $self->{state}: Unknown state";
2767 }
2768 } # A
2769
2770 die "$0: _get_next_token: unexpected case";
2771 } # _get_next_token
2772
2773 1;
2774 ## $Date: 2008/10/14 11:46:57 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24