/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (show annotations) (download) (as text)
Tue Oct 14 14:57:52 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.5: +16 -2 lines
File MIME type: application/x-wais-source
++ whatpm/t/xml/ChangeLog	14 Oct 2008 14:56:52 -0000
	* cdata-1.dat: Tests on CDATA section outside of the root element
	added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 14:57:28 -0000
	* Tokenizer.pm.src: Parse error if CDATA section is not closed or
	is placed outside of the root element.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
## Compile-time export setup for Whatpm::HTML::Tokenizer: makes the package
## an Exporter subclass and lists the token-type constants that callers may
## import on request, either by name or all at once via the |:token| tag.
## Only @EXPORT_OK is populated in this block, so nothing listed here is
## exported unless it is asked for.
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
## Names that may be imported individually.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 );
19
## |:token| tag: the same eight names, importable as a group.
20 our %EXPORT_TAGS = (
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 }
33
34 ## Token types
35
## Each constant is a sub with an empty prototype that returns a literal,
## which lets perl inline the value at compile time.  The numbers serve as
## identifiers stored in (and compared against) a token's |type| field.
36 sub DOCTYPE_TOKEN () { 1 }
37 sub COMMENT_TOKEN () { 2 }
38 sub START_TAG_TOKEN () { 3 }
39 sub END_TAG_TOKEN () { 4 }
40 sub END_OF_FILE_TOKEN () { 5 }
41 sub CHARACTER_TOKEN () { 6 }
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
45 package Whatpm::HTML;
46
## Pull the token-type constants into the current package (Whatpm::HTML) at
## compile time, via the |:token| export tag, so that the code below can use
## them as bare names.
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48
49 ## Content model flags
50
## CM_* are single-bit flags.  The *_CONTENT_MODEL constants below are the
## combinations stored in |$self->{content_model}|; the tokenizer tests that
## field with |&| against the individual CM_* bits.
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54
## PLAINTEXT: no bit set, so neither "&" nor "<" is treated as markup.
55 sub PLAINTEXT_CONTENT_MODEL () { 0 }
## CDATA: limited "<" markup only.
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
## RCDATA: "&" references plus limited "<" markup.
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
## PCDATA: "&" references plus full "<" markup.
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59
60 ## Tokenizer states
61
## Numeric identifiers for the tokenizer's states.  |$self->{state}| holds
## one of these and |_get_next_token| dispatches on it with |==|.  Values 1
## and 12 are unassigned: their former states are commented out below.
## Trailing comments on states 36..43 and 50 name the spec state that the
## implementation state is a part of.
62 sub DATA_STATE () { 0 }
63 #sub ENTITY_DATA_STATE () { 1 }
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## Tree constructor state constants (see Whatpm::HTML for the full
118 ## list and descriptions)
119
## NOTE(review): both literals below are a 1 followed by eleven zero bits
## (the "_" is only a digit separator), so the two constants have the same
## numeric value.  Presumably they belong to two independent flag sets
## (insertion modes vs. element categories) -- confirm in Whatpm::HTML.
120 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121 sub FOREIGN_EL () { 0b1_00000000000 }
122
123 ## Character reference mappings
124
## Replacement table for numeric character references: key is the code
## point written in the reference, value is the code point to use instead.
## The code that consults this table is outside the visible part of the file.
##   - 0x0D (CR) is replaced by 0x0A (LF).
##   - The 0x80..0x9F entries appear to follow the Windows-1252 code page,
##     with the bytes that code page leaves unassigned mapped to U+FFFD.
125 my $charref_map = {
126 0x0D => 0x000A,
127 0x80 => 0x20AC,
128 0x81 => 0xFFFD,
129 0x82 => 0x201A,
130 0x83 => 0x0192,
131 0x84 => 0x201E,
132 0x85 => 0x2026,
133 0x86 => 0x2020,
134 0x87 => 0x2021,
135 0x88 => 0x02C6,
136 0x89 => 0x2030,
137 0x8A => 0x0160,
138 0x8B => 0x2039,
139 0x8C => 0x0152,
140 0x8D => 0xFFFD,
141 0x8E => 0x017D,
142 0x8F => 0xFFFD,
143 0x90 => 0xFFFD,
144 0x91 => 0x2018,
145 0x92 => 0x2019,
146 0x93 => 0x201C,
147 0x94 => 0x201D,
148 0x95 => 0x2022,
149 0x96 => 0x2013,
150 0x97 => 0x2014,
151 0x98 => 0x02DC,
152 0x99 => 0x2122,
153 0x9A => 0x0161,
154 0x9B => 0x203A,
155 0x9C => 0x0153,
156 0x9D => 0xFFFD,
157 0x9E => 0x017E,
158 0x9F => 0x0178,
159 }; # $charref_map
## Additionally map to U+FFFD: the C0 controls other than HT, LF, FF and CR;
## DEL; the surrogate range; 0xFDD0..0xFDDF; and the last two code points
## (xFFFE, xFFFF) of each of the 17 planes.
## NOTE(review): the author's "ISSUE: 0xFDEF" marker presumably refers to
## the Unicode noncharacter block running through U+FDEF, i.e. 0xFDE0..0xFDEF
## are not covered here -- confirm against the spec revision being followed
## before widening the range.
160 $charref_map->{$_} = 0xFFFD
161 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168
169 ## Implementations MUST act as if they used the state machine in the spec.
170
## Reset every tokenizer field to its start-of-input value and prime the
## first input character.
##   $self - the parser object; per the NOTE below, its |level|, |set_nc|,
##           |parse_error| and (for XML) |is_xml| fields are set by |new|.
## There is no explicit return statement.  The |($)| prototype has no effect
## if this is invoked with method-call syntax, since perl ignores prototypes
## on method calls.
## NOTE(review): the trailing "# MUST" / "# be" comments look as if they
## were meant to be read together ("MUST be"), presumably marking the
## initial values required by the spec -- confirm.
171 sub _initialize_tokenizer ($) {
172 my $self = shift;
173
174 ## NOTE: Fields set by |new| constructor:
175 #$self->{level}
176 #$self->{set_nc}
177 #$self->{parse_error}
178 #$self->{is_xml} (if XML)
179
180 $self->{state} = DATA_STATE; # MUST
181 $self->{s_kwd} = ''; # state keyword
182 #$self->{entity__value}; # initialized when used
183 #$self->{entity__match}; # initialized when used
184 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185 undef $self->{ct}; # current token
186 undef $self->{ca}; # current attribute
187 undef $self->{last_stag_name}; # last emitted start tag name
188 #$self->{prev_state}; # initialized when used
189 delete $self->{self_closing};
190 $self->{char_buffer} = '';
191 $self->{char_buffer_pos} = 0;
192 $self->{nc} = -1; # next input character
193 #$self->{next_nc}
## |!!!next-input-character| is not Perl syntax; it is presumably a macro
## expanded by a build step when the .pm file is generated from this .src
## template, loading the first input character into |$self->{nc}| -- TODO
## confirm against the macro definition.
194 !!!next-input-character;
## Queue of already-built tokens; |_get_next_token| shifts from it first.
195 $self->{token} = [];
196 # $self->{escape}
197 } # _initialize_tokenizer
198
199 ## A token has:
200 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
201 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
202 ## ->{name} (DOCTYPE_TOKEN)
203 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
204 ## ->{pubid} (DOCTYPE_TOKEN)
205 ## ->{sysid} (DOCTYPE_TOKEN)
206 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
207 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
208 ## ->{name}
209 ## ->{value}
210 ## ->{has_reference} == 1 or 0
211 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
212 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
213 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
214 ## while the token is pushed back to the stack.
215
216 ## Emitted token MUST immediately be handled by the tree construction state.
217
218 ## Before each step, UA MAY check to see if either one of the scripts in
219 ## "list of scripts that will execute as soon as possible" or the first
220 ## script in the "list of scripts that will execute asynchronously",
221 ## has completed loading. If one has, then it MUST be executed
222 ## and removed from the list.
223
224 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
225 ## (This requirement was dropped from HTML5 spec, unfortunately.)
226
## Lookup table keyed by code point: true for the characters the tokenizer
## treats as white space (queried as |$is_space->{$self->{nc}}|).  VT and CR
## are left out (their entries are commented out); for CR this presumably
## relies on newline normalization happening before tokenization -- confirm.
227 my $is_space = {
228 0x0009 => 1, # CHARACTER TABULATION (HT)
229 0x000A => 1, # LINE FEED (LF)
230 #0x000B => 0, # LINE TABULATION (VT)
231 0x000C => 1, # FORM FEED (FF)
232 #0x000D => 1, # CARRIAGE RETURN (CR)
233 0x0020 => 1, # SPACE (SP)
234 };
235
236 sub _get_next_token ($) {
237 my $self = shift;
238
239 if ($self->{self_closing}) {
240 !!!parse-error (type => 'nestc', token => $self->{ct});
241 ## NOTE: The |self_closing| flag is only set by start tag token.
242 ## In addition, when a start tag token is emitted, it is always set to
243 ## |ct|.
244 delete $self->{self_closing};
245 }
246
247 if (@{$self->{token}}) {
248 $self->{self_closing} = $self->{token}->[0]->{self_closing};
249 return shift @{$self->{token}};
250 }
251
252 A: {
253 if ($self->{state} == PCDATA_STATE) {
254 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
255
256 if ($self->{nc} == 0x0026) { # &
257 !!!cp (0.1);
258 ## NOTE: In the spec, the tokenizer is switched to the
259 ## "entity data state". In this implementation, the tokenizer
260 ## is switched to the |ENTITY_STATE|, which is an implementation
261 ## of the "consume a character reference" algorithm.
262 $self->{entity_add} = -1;
263 $self->{prev_state} = DATA_STATE;
264 $self->{state} = ENTITY_STATE;
265 !!!next-input-character;
266 redo A;
267 } elsif ($self->{nc} == 0x003C) { # <
268 !!!cp (0.2);
269 $self->{state} = TAG_OPEN_STATE;
270 !!!next-input-character;
271 redo A;
272 } elsif ($self->{nc} == -1) {
273 !!!cp (0.3);
274 !!!emit ({type => END_OF_FILE_TOKEN,
275 line => $self->{line}, column => $self->{column}});
276 last A; ## TODO: ok?
277 } else {
278 !!!cp (0.4);
279 #
280 }
281
282 # Anything else
283 my $token = {type => CHARACTER_TOKEN,
284 data => chr $self->{nc},
285 line => $self->{line}, column => $self->{column},
286 };
287 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
288
289 ## Stay in the state.
290 !!!next-input-character;
291 !!!emit ($token);
292 redo A;
293 } elsif ($self->{state} == DATA_STATE) {
294 $self->{s_kwd} = '' unless defined $self->{s_kwd};
295 if ($self->{nc} == 0x0026) { # &
296 $self->{s_kwd} = '';
297 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
298 not $self->{escape}) {
299 !!!cp (1);
300 ## NOTE: In the spec, the tokenizer is switched to the
301 ## "entity data state". In this implementation, the tokenizer
302 ## is switched to the |ENTITY_STATE|, which is an implementation
303 ## of the "consume a character reference" algorithm.
304 $self->{entity_add} = -1;
305 $self->{prev_state} = DATA_STATE;
306 $self->{state} = ENTITY_STATE;
307 !!!next-input-character;
308 redo A;
309 } else {
310 !!!cp (2);
311 #
312 }
313 } elsif ($self->{nc} == 0x002D) { # -
314 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
315 if ($self->{s_kwd} eq '<!-') {
316 !!!cp (3);
317 $self->{escape} = 1; # unless $self->{escape};
318 $self->{s_kwd} = '--';
319 #
320 } elsif ($self->{s_kwd} eq '-') {
321 !!!cp (4);
322 $self->{s_kwd} = '--';
323 #
324 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
325 !!!cp (4.1);
326 $self->{s_kwd} .= '-';
327 #
328 } else {
329 !!!cp (5);
330 $self->{s_kwd} = '-';
331 #
332 }
333 }
334
335 #
336 } elsif ($self->{nc} == 0x0021) { # !
337 if (length $self->{s_kwd}) {
338 !!!cp (5.1);
339 $self->{s_kwd} .= '!';
340 #
341 } else {
342 !!!cp (5.2);
343 #$self->{s_kwd} = '';
344 #
345 }
346 #
347 } elsif ($self->{nc} == 0x003C) { # <
348 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
349 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
350 not $self->{escape})) {
351 !!!cp (6);
352 $self->{state} = TAG_OPEN_STATE;
353 !!!next-input-character;
354 redo A;
355 } else {
356 !!!cp (7);
357 $self->{s_kwd} = '';
358 #
359 }
360 } elsif ($self->{nc} == 0x003E) { # >
361 if ($self->{escape} and
362 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
363 if ($self->{s_kwd} eq '--') {
364 !!!cp (8);
365 delete $self->{escape};
366 #
367 } else {
368 !!!cp (9);
369 #
370 }
371 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
372 !!!cp (9.1);
373 !!!parse-error (type => 'unmatched mse', ## TODO: type
374 line => $self->{line_prev},
375 column => $self->{column_prev} - 1);
376 #
377 } else {
378 !!!cp (10);
379 #
380 }
381
382 $self->{s_kwd} = '';
383 #
384 } elsif ($self->{nc} == 0x005D) { # ]
385 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
386 !!!cp (10.1);
387 $self->{s_kwd} .= ']';
388 } elsif ($self->{s_kwd} eq ']]') {
389 !!!cp (10.2);
390 #
391 } else {
392 !!!cp (10.3);
393 $self->{s_kwd} = '';
394 }
395 #
396 } elsif ($self->{nc} == -1) {
397 !!!cp (11);
398 $self->{s_kwd} = '';
399 !!!emit ({type => END_OF_FILE_TOKEN,
400 line => $self->{line}, column => $self->{column}});
401 last A; ## TODO: ok?
402 } else {
403 !!!cp (12);
404 $self->{s_kwd} = '';
405 #
406 }
407
408 # Anything else
409 my $token = {type => CHARACTER_TOKEN,
410 data => chr $self->{nc},
411 line => $self->{line}, column => $self->{column},
412 };
413 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
414 length $token->{data})) {
415 $self->{s_kwd} = '';
416 }
417
418 ## Stay in the data state.
419 if (not $self->{is_xml} and
420 $self->{content_model} == PCDATA_CONTENT_MODEL) {
421 !!!cp (13);
422 $self->{state} = PCDATA_STATE;
423 } else {
424 !!!cp (14);
425 ## Stay in the state.
426 }
427 !!!next-input-character;
428 !!!emit ($token);
429 redo A;
430 } elsif ($self->{state} == TAG_OPEN_STATE) {
431 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
432 if ($self->{nc} == 0x002F) { # /
433 !!!cp (15);
434 !!!next-input-character;
435 $self->{state} = CLOSE_TAG_OPEN_STATE;
436 redo A;
437 } elsif ($self->{nc} == 0x0021) { # !
438 !!!cp (15.1);
439 $self->{s_kwd} = '<' unless $self->{escape};
440 #
441 } else {
442 !!!cp (16);
443 #
444 }
445
446 ## reconsume
447 $self->{state} = DATA_STATE;
448 $self->{s_kwd} = '';
449 !!!emit ({type => CHARACTER_TOKEN, data => '<',
450 line => $self->{line_prev},
451 column => $self->{column_prev},
452 });
453 redo A;
454 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
455 if ($self->{nc} == 0x0021) { # !
456 !!!cp (17);
457 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
458 !!!next-input-character;
459 redo A;
460 } elsif ($self->{nc} == 0x002F) { # /
461 !!!cp (18);
462 $self->{state} = CLOSE_TAG_OPEN_STATE;
463 !!!next-input-character;
464 redo A;
465 } elsif (0x0041 <= $self->{nc} and
466 $self->{nc} <= 0x005A) { # A..Z
467 !!!cp (19);
468 $self->{ct}
469 = {type => START_TAG_TOKEN,
470 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
471 line => $self->{line_prev},
472 column => $self->{column_prev}};
473 $self->{state} = TAG_NAME_STATE;
474 !!!next-input-character;
475 redo A;
476 } elsif (0x0061 <= $self->{nc} and
477 $self->{nc} <= 0x007A) { # a..z
478 !!!cp (20);
479 $self->{ct} = {type => START_TAG_TOKEN,
480 tag_name => chr ($self->{nc}),
481 line => $self->{line_prev},
482 column => $self->{column_prev}};
483 $self->{state} = TAG_NAME_STATE;
484 !!!next-input-character;
485 redo A;
486 } elsif ($self->{nc} == 0x003E) { # >
487 !!!cp (21);
488 !!!parse-error (type => 'empty start tag',
489 line => $self->{line_prev},
490 column => $self->{column_prev});
491 $self->{state} = DATA_STATE;
492 $self->{s_kwd} = '';
493 !!!next-input-character;
494
495 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
496 line => $self->{line_prev},
497 column => $self->{column_prev},
498 });
499
500 redo A;
501 } elsif ($self->{nc} == 0x003F) { # ?
502 !!!cp (22);
503 !!!parse-error (type => 'pio',
504 line => $self->{line_prev},
505 column => $self->{column_prev});
506 $self->{state} = BOGUS_COMMENT_STATE;
507 $self->{ct} = {type => COMMENT_TOKEN, data => '',
508 line => $self->{line_prev},
509 column => $self->{column_prev},
510 };
511 ## $self->{nc} is intentionally left as is
512 redo A;
513 } else {
514 !!!cp (23);
515 !!!parse-error (type => 'bare stago',
516 line => $self->{line_prev},
517 column => $self->{column_prev});
518 $self->{state} = DATA_STATE;
519 $self->{s_kwd} = '';
520 ## reconsume
521
522 !!!emit ({type => CHARACTER_TOKEN, data => '<',
523 line => $self->{line_prev},
524 column => $self->{column_prev},
525 });
526
527 redo A;
528 }
529 } else {
530 die "$0: $self->{content_model} in tag open";
531 }
532 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
533 ## NOTE: The "close tag open state" in the spec is implemented as
534 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
535
536 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
537 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
538 if (defined $self->{last_stag_name}) {
539 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
540 $self->{s_kwd} = '';
541 ## Reconsume.
542 redo A;
543 } else {
544 ## No start tag token has ever been emitted
545 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
546 !!!cp (28);
547 $self->{state} = DATA_STATE;
548 $self->{s_kwd} = '';
549 ## Reconsume.
550 !!!emit ({type => CHARACTER_TOKEN, data => '</',
551 line => $l, column => $c,
552 });
553 redo A;
554 }
555 }
556
557 if (0x0041 <= $self->{nc} and
558 $self->{nc} <= 0x005A) { # A..Z
559 !!!cp (29);
560 $self->{ct}
561 = {type => END_TAG_TOKEN,
562 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
563 line => $l, column => $c};
564 $self->{state} = TAG_NAME_STATE;
565 !!!next-input-character;
566 redo A;
567 } elsif (0x0061 <= $self->{nc} and
568 $self->{nc} <= 0x007A) { # a..z
569 !!!cp (30);
570 $self->{ct} = {type => END_TAG_TOKEN,
571 tag_name => chr ($self->{nc}),
572 line => $l, column => $c};
573 $self->{state} = TAG_NAME_STATE;
574 !!!next-input-character;
575 redo A;
576 } elsif ($self->{nc} == 0x003E) { # >
577 !!!cp (31);
578 !!!parse-error (type => 'empty end tag',
579 line => $self->{line_prev}, ## "<" in "</>"
580 column => $self->{column_prev} - 1);
581 $self->{state} = DATA_STATE;
582 $self->{s_kwd} = '';
583 !!!next-input-character;
584 redo A;
585 } elsif ($self->{nc} == -1) {
586 !!!cp (32);
587 !!!parse-error (type => 'bare etago');
588 $self->{s_kwd} = '';
589 $self->{state} = DATA_STATE;
590 # reconsume
591
592 !!!emit ({type => CHARACTER_TOKEN, data => '</',
593 line => $l, column => $c,
594 });
595
596 redo A;
597 } else {
598 !!!cp (33);
599 !!!parse-error (type => 'bogus end tag');
600 $self->{state} = BOGUS_COMMENT_STATE;
601 $self->{ct} = {type => COMMENT_TOKEN, data => '',
602 line => $self->{line_prev}, # "<" of "</"
603 column => $self->{column_prev} - 1,
604 };
605 ## NOTE: $self->{nc} is intentionally left as is.
606 ## Although the "anything else" case of the spec not explicitly
607 ## states that the next input character is to be reconsumed,
608 ## it will be included to the |data| of the comment token
609 ## generated from the bogus end tag, as defined in the
610 ## "bogus comment state" entry.
611 redo A;
612 }
613 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
614 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
615 if (length $ch) {
616 my $CH = $ch;
617 $ch =~ tr/a-z/A-Z/;
618 my $nch = chr $self->{nc};
619 if ($nch eq $ch or $nch eq $CH) {
620 !!!cp (24);
621 ## Stay in the state.
622 $self->{s_kwd} .= $nch;
623 !!!next-input-character;
624 redo A;
625 } else {
626 !!!cp (25);
627 $self->{state} = DATA_STATE;
628 $self->{s_kwd} = '';
629 ## Reconsume.
630 !!!emit ({type => CHARACTER_TOKEN,
631 data => '</' . $self->{s_kwd},
632 line => $self->{line_prev},
633 column => $self->{column_prev} - 1 - length $self->{s_kwd},
634 });
635 redo A;
636 }
637 } else { # after "<{tag-name}"
638 unless ($is_space->{$self->{nc}} or
639 {
640 0x003E => 1, # >
641 0x002F => 1, # /
642 -1 => 1, # EOF
643 }->{$self->{nc}}) {
644 !!!cp (26);
645 ## Reconsume.
646 $self->{state} = DATA_STATE;
647 $self->{s_kwd} = '';
648 !!!emit ({type => CHARACTER_TOKEN,
649 data => '</' . $self->{s_kwd},
650 line => $self->{line_prev},
651 column => $self->{column_prev} - 1 - length $self->{s_kwd},
652 });
653 redo A;
654 } else {
655 !!!cp (27);
656 $self->{ct}
657 = {type => END_TAG_TOKEN,
658 tag_name => $self->{last_stag_name},
659 line => $self->{line_prev},
660 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
661 $self->{state} = TAG_NAME_STATE;
662 ## Reconsume.
663 redo A;
664 }
665 }
666 } elsif ($self->{state} == TAG_NAME_STATE) {
667 if ($is_space->{$self->{nc}}) {
668 !!!cp (34);
669 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
670 !!!next-input-character;
671 redo A;
672 } elsif ($self->{nc} == 0x003E) { # >
673 if ($self->{ct}->{type} == START_TAG_TOKEN) {
674 !!!cp (35);
675 $self->{last_stag_name} = $self->{ct}->{tag_name};
676 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
677 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
678 #if ($self->{ct}->{attributes}) {
679 # ## NOTE: This should never be reached.
680 # !!! cp (36);
681 # !!! parse-error (type => 'end tag attribute');
682 #} else {
683 !!!cp (37);
684 #}
685 } else {
686 die "$0: $self->{ct}->{type}: Unknown token type";
687 }
688 $self->{state} = DATA_STATE;
689 $self->{s_kwd} = '';
690 !!!next-input-character;
691
692 !!!emit ($self->{ct}); # start tag or end tag
693
694 redo A;
695 } elsif (0x0041 <= $self->{nc} and
696 $self->{nc} <= 0x005A) { # A..Z
697 !!!cp (38);
698 $self->{ct}->{tag_name}
699 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
700 # start tag or end tag
701 ## Stay in this state
702 !!!next-input-character;
703 redo A;
704 } elsif ($self->{nc} == -1) {
705 !!!parse-error (type => 'unclosed tag');
706 if ($self->{ct}->{type} == START_TAG_TOKEN) {
707 !!!cp (39);
708 $self->{last_stag_name} = $self->{ct}->{tag_name};
709 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
710 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
711 #if ($self->{ct}->{attributes}) {
712 # ## NOTE: This state should never be reached.
713 # !!! cp (40);
714 # !!! parse-error (type => 'end tag attribute');
715 #} else {
716 !!!cp (41);
717 #}
718 } else {
719 die "$0: $self->{ct}->{type}: Unknown token type";
720 }
721 $self->{state} = DATA_STATE;
722 $self->{s_kwd} = '';
723 # reconsume
724
725 !!!emit ($self->{ct}); # start tag or end tag
726
727 redo A;
728 } elsif ($self->{nc} == 0x002F) { # /
729 !!!cp (42);
730 $self->{state} = SELF_CLOSING_START_TAG_STATE;
731 !!!next-input-character;
732 redo A;
733 } else {
734 !!!cp (44);
735 $self->{ct}->{tag_name} .= chr $self->{nc};
736 # start tag or end tag
737 ## Stay in the state
738 !!!next-input-character;
739 redo A;
740 }
741 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
742 if ($is_space->{$self->{nc}}) {
743 !!!cp (45);
744 ## Stay in the state
745 !!!next-input-character;
746 redo A;
747 } elsif ($self->{nc} == 0x003E) { # >
748 if ($self->{ct}->{type} == START_TAG_TOKEN) {
749 !!!cp (46);
750 $self->{last_stag_name} = $self->{ct}->{tag_name};
751 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
752 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
753 if ($self->{ct}->{attributes}) {
754 !!!cp (47);
755 !!!parse-error (type => 'end tag attribute');
756 } else {
757 !!!cp (48);
758 }
759 } else {
760 die "$0: $self->{ct}->{type}: Unknown token type";
761 }
762 $self->{state} = DATA_STATE;
763 $self->{s_kwd} = '';
764 !!!next-input-character;
765
766 !!!emit ($self->{ct}); # start tag or end tag
767
768 redo A;
769 } elsif (0x0041 <= $self->{nc} and
770 $self->{nc} <= 0x005A) { # A..Z
771 !!!cp (49);
772 $self->{ca}
773 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
774 value => '',
775 line => $self->{line}, column => $self->{column}};
776 $self->{state} = ATTRIBUTE_NAME_STATE;
777 !!!next-input-character;
778 redo A;
779 } elsif ($self->{nc} == 0x002F) { # /
780 !!!cp (50);
781 $self->{state} = SELF_CLOSING_START_TAG_STATE;
782 !!!next-input-character;
783 redo A;
784 } elsif ($self->{nc} == -1) {
785 !!!parse-error (type => 'unclosed tag');
786 if ($self->{ct}->{type} == START_TAG_TOKEN) {
787 !!!cp (52);
788 $self->{last_stag_name} = $self->{ct}->{tag_name};
789 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
790 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
791 if ($self->{ct}->{attributes}) {
792 !!!cp (53);
793 !!!parse-error (type => 'end tag attribute');
794 } else {
795 !!!cp (54);
796 }
797 } else {
798 die "$0: $self->{ct}->{type}: Unknown token type";
799 }
800 $self->{state} = DATA_STATE;
801 $self->{s_kwd} = '';
802 # reconsume
803
804 !!!emit ($self->{ct}); # start tag or end tag
805
806 redo A;
807 } else {
808 if ({
809 0x0022 => 1, # "
810 0x0027 => 1, # '
811 0x003D => 1, # =
812 }->{$self->{nc}}) {
813 !!!cp (55);
814 !!!parse-error (type => 'bad attribute name');
815 } else {
816 !!!cp (56);
817 }
818 $self->{ca}
819 = {name => chr ($self->{nc}),
820 value => '',
821 line => $self->{line}, column => $self->{column}};
822 $self->{state} = ATTRIBUTE_NAME_STATE;
823 !!!next-input-character;
824 redo A;
825 }
826 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
827 my $before_leave = sub {
828 if (exists $self->{ct}->{attributes} # start tag or end tag
829 ->{$self->{ca}->{name}}) { # MUST
830 !!!cp (57);
831 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
832 ## Discard $self->{ca} # MUST
833 } else {
834 !!!cp (58);
835 $self->{ct}->{attributes}->{$self->{ca}->{name}}
836 = $self->{ca};
837 }
838 }; # $before_leave
839
840 if ($is_space->{$self->{nc}}) {
841 !!!cp (59);
842 $before_leave->();
843 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
844 !!!next-input-character;
845 redo A;
846 } elsif ($self->{nc} == 0x003D) { # =
847 !!!cp (60);
848 $before_leave->();
849 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
850 !!!next-input-character;
851 redo A;
852 } elsif ($self->{nc} == 0x003E) { # >
853 $before_leave->();
854 if ($self->{ct}->{type} == START_TAG_TOKEN) {
855 !!!cp (61);
856 $self->{last_stag_name} = $self->{ct}->{tag_name};
857 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
858 !!!cp (62);
859 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
860 if ($self->{ct}->{attributes}) {
861 !!!parse-error (type => 'end tag attribute');
862 }
863 } else {
864 die "$0: $self->{ct}->{type}: Unknown token type";
865 }
866 $self->{state} = DATA_STATE;
867 $self->{s_kwd} = '';
868 !!!next-input-character;
869
870 !!!emit ($self->{ct}); # start tag or end tag
871
872 redo A;
873 } elsif (0x0041 <= $self->{nc} and
874 $self->{nc} <= 0x005A) { # A..Z
875 !!!cp (63);
876 $self->{ca}->{name}
877 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
878 ## Stay in the state
879 !!!next-input-character;
880 redo A;
881 } elsif ($self->{nc} == 0x002F) { # /
882 !!!cp (64);
883 $before_leave->();
884 $self->{state} = SELF_CLOSING_START_TAG_STATE;
885 !!!next-input-character;
886 redo A;
887 } elsif ($self->{nc} == -1) {
888 !!!parse-error (type => 'unclosed tag');
889 $before_leave->();
890 if ($self->{ct}->{type} == START_TAG_TOKEN) {
891 !!!cp (66);
892 $self->{last_stag_name} = $self->{ct}->{tag_name};
893 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
894 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
895 if ($self->{ct}->{attributes}) {
896 !!!cp (67);
897 !!!parse-error (type => 'end tag attribute');
898 } else {
899 ## NOTE: This state should never be reached.
900 !!!cp (68);
901 }
902 } else {
903 die "$0: $self->{ct}->{type}: Unknown token type";
904 }
905 $self->{state} = DATA_STATE;
906 $self->{s_kwd} = '';
907 # reconsume
908
909 !!!emit ($self->{ct}); # start tag or end tag
910
911 redo A;
912 } else {
913 if ($self->{nc} == 0x0022 or # "
914 $self->{nc} == 0x0027) { # '
915 !!!cp (69);
916 !!!parse-error (type => 'bad attribute name');
917 } else {
918 !!!cp (70);
919 }
920 $self->{ca}->{name} .= chr ($self->{nc});
921 ## Stay in the state
922 !!!next-input-character;
923 redo A;
924 }
925 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
926 if ($is_space->{$self->{nc}}) {
927 !!!cp (71);
928 ## Stay in the state
929 !!!next-input-character;
930 redo A;
931 } elsif ($self->{nc} == 0x003D) { # =
932 !!!cp (72);
933 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
934 !!!next-input-character;
935 redo A;
936 } elsif ($self->{nc} == 0x003E) { # >
937 if ($self->{ct}->{type} == START_TAG_TOKEN) {
938 !!!cp (73);
939 $self->{last_stag_name} = $self->{ct}->{tag_name};
940 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
941 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
942 if ($self->{ct}->{attributes}) {
943 !!!cp (74);
944 !!!parse-error (type => 'end tag attribute');
945 } else {
946 ## NOTE: This state should never be reached.
947 !!!cp (75);
948 }
949 } else {
950 die "$0: $self->{ct}->{type}: Unknown token type";
951 }
952 $self->{state} = DATA_STATE;
953 $self->{s_kwd} = '';
954 !!!next-input-character;
955
956 !!!emit ($self->{ct}); # start tag or end tag
957
958 redo A;
959 } elsif (0x0041 <= $self->{nc} and
960 $self->{nc} <= 0x005A) { # A..Z
961 !!!cp (76);
962 $self->{ca}
963 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
964 value => '',
965 line => $self->{line}, column => $self->{column}};
966 $self->{state} = ATTRIBUTE_NAME_STATE;
967 !!!next-input-character;
968 redo A;
969 } elsif ($self->{nc} == 0x002F) { # /
970 !!!cp (77);
971 $self->{state} = SELF_CLOSING_START_TAG_STATE;
972 !!!next-input-character;
973 redo A;
974 } elsif ($self->{nc} == -1) {
975 !!!parse-error (type => 'unclosed tag');
976 if ($self->{ct}->{type} == START_TAG_TOKEN) {
977 !!!cp (79);
978 $self->{last_stag_name} = $self->{ct}->{tag_name};
979 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
980 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
981 if ($self->{ct}->{attributes}) {
982 !!!cp (80);
983 !!!parse-error (type => 'end tag attribute');
984 } else {
985 ## NOTE: This state should never be reached.
986 !!!cp (81);
987 }
988 } else {
989 die "$0: $self->{ct}->{type}: Unknown token type";
990 }
991 $self->{s_kwd} = '';
992 $self->{state} = DATA_STATE;
993 # reconsume
994
995 !!!emit ($self->{ct}); # start tag or end tag
996
997 redo A;
998 } else {
999 if ($self->{nc} == 0x0022 or # "
1000 $self->{nc} == 0x0027) { # '
1001 !!!cp (78);
1002 !!!parse-error (type => 'bad attribute name');
1003 } else {
1004 !!!cp (82);
1005 }
1006 $self->{ca}
1007 = {name => chr ($self->{nc}),
1008 value => '',
1009 line => $self->{line}, column => $self->{column}};
1010 $self->{state} = ATTRIBUTE_NAME_STATE;
1011 !!!next-input-character;
1012 redo A;
1013 }
1014 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1015 if ($is_space->{$self->{nc}}) {
1016 !!!cp (83);
1017 ## Stay in the state
1018 !!!next-input-character;
1019 redo A;
1020 } elsif ($self->{nc} == 0x0022) { # "
1021 !!!cp (84);
1022 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1023 !!!next-input-character;
1024 redo A;
1025 } elsif ($self->{nc} == 0x0026) { # &
1026 !!!cp (85);
1027 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1028 ## reconsume
1029 redo A;
1030 } elsif ($self->{nc} == 0x0027) { # '
1031 !!!cp (86);
1032 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1033 !!!next-input-character;
1034 redo A;
1035 } elsif ($self->{nc} == 0x003E) { # >
1036 !!!parse-error (type => 'empty unquoted attribute value');
1037 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1038 !!!cp (87);
1039 $self->{last_stag_name} = $self->{ct}->{tag_name};
1040 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1041 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1042 if ($self->{ct}->{attributes}) {
1043 !!!cp (88);
1044 !!!parse-error (type => 'end tag attribute');
1045 } else {
1046 ## NOTE: This state should never be reached.
1047 !!!cp (89);
1048 }
1049 } else {
1050 die "$0: $self->{ct}->{type}: Unknown token type";
1051 }
1052 $self->{state} = DATA_STATE;
1053 $self->{s_kwd} = '';
1054 !!!next-input-character;
1055
1056 !!!emit ($self->{ct}); # start tag or end tag
1057
1058 redo A;
1059 } elsif ($self->{nc} == -1) {
1060 !!!parse-error (type => 'unclosed tag');
1061 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1062 !!!cp (90);
1063 $self->{last_stag_name} = $self->{ct}->{tag_name};
1064 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1065 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1066 if ($self->{ct}->{attributes}) {
1067 !!!cp (91);
1068 !!!parse-error (type => 'end tag attribute');
1069 } else {
1070 ## NOTE: This state should never be reached.
1071 !!!cp (92);
1072 }
1073 } else {
1074 die "$0: $self->{ct}->{type}: Unknown token type";
1075 }
1076 $self->{state} = DATA_STATE;
1077 $self->{s_kwd} = '';
1078 ## reconsume
1079
1080 !!!emit ($self->{ct}); # start tag or end tag
1081
1082 redo A;
1083 } else {
1084 if ($self->{nc} == 0x003D) { # =
1085 !!!cp (93);
1086 !!!parse-error (type => 'bad attribute value');
1087 } else {
1088 !!!cp (94);
1089 }
1090 $self->{ca}->{value} .= chr ($self->{nc});
1091 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1092 !!!next-input-character;
1093 redo A;
1094 }
1095 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1096 if ($self->{nc} == 0x0022) { # "
1097 !!!cp (95);
1098 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1099 !!!next-input-character;
1100 redo A;
1101 } elsif ($self->{nc} == 0x0026) { # &
1102 !!!cp (96);
1103 ## NOTE: In the spec, the tokenizer is switched to the
1104 ## "entity in attribute value state". In this implementation, the
1105 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1106 ## implementation of the "consume a character reference" algorithm.
1107 $self->{prev_state} = $self->{state};
1108 $self->{entity_add} = 0x0022; # "
1109 $self->{state} = ENTITY_STATE;
1110 !!!next-input-character;
1111 redo A;
1112 } elsif ($self->{nc} == -1) {
1113 !!!parse-error (type => 'unclosed attribute value');
1114 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1115 !!!cp (97);
1116 $self->{last_stag_name} = $self->{ct}->{tag_name};
1117 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1118 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1119 if ($self->{ct}->{attributes}) {
1120 !!!cp (98);
1121 !!!parse-error (type => 'end tag attribute');
1122 } else {
1123 ## NOTE: This state should never be reached.
1124 !!!cp (99);
1125 }
1126 } else {
1127 die "$0: $self->{ct}->{type}: Unknown token type";
1128 }
1129 $self->{state} = DATA_STATE;
1130 $self->{s_kwd} = '';
1131 ## reconsume
1132
1133 !!!emit ($self->{ct}); # start tag or end tag
1134
1135 redo A;
1136 } else {
1137 !!!cp (100);
1138 $self->{ca}->{value} .= chr ($self->{nc});
1139 $self->{read_until}->($self->{ca}->{value},
1140 q["&],
1141 length $self->{ca}->{value});
1142
1143 ## Stay in the state
1144 !!!next-input-character;
1145 redo A;
1146 }
1147 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1148 if ($self->{nc} == 0x0027) { # '
1149 !!!cp (101);
1150 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1151 !!!next-input-character;
1152 redo A;
1153 } elsif ($self->{nc} == 0x0026) { # &
1154 !!!cp (102);
1155 ## NOTE: In the spec, the tokenizer is switched to the
1156 ## "entity in attribute value state". In this implementation, the
1157 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1158 ## implementation of the "consume a character reference" algorithm.
1159 $self->{entity_add} = 0x0027; # '
1160 $self->{prev_state} = $self->{state};
1161 $self->{state} = ENTITY_STATE;
1162 !!!next-input-character;
1163 redo A;
1164 } elsif ($self->{nc} == -1) {
1165 !!!parse-error (type => 'unclosed attribute value');
1166 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1167 !!!cp (103);
1168 $self->{last_stag_name} = $self->{ct}->{tag_name};
1169 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1170 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1171 if ($self->{ct}->{attributes}) {
1172 !!!cp (104);
1173 !!!parse-error (type => 'end tag attribute');
1174 } else {
1175 ## NOTE: This state should never be reached.
1176 !!!cp (105);
1177 }
1178 } else {
1179 die "$0: $self->{ct}->{type}: Unknown token type";
1180 }
1181 $self->{state} = DATA_STATE;
1182 $self->{s_kwd} = '';
1183 ## reconsume
1184
1185 !!!emit ($self->{ct}); # start tag or end tag
1186
1187 redo A;
1188 } else {
1189 !!!cp (106);
1190 $self->{ca}->{value} .= chr ($self->{nc});
1191 $self->{read_until}->($self->{ca}->{value},
1192 q['&],
1193 length $self->{ca}->{value});
1194
1195 ## Stay in the state
1196 !!!next-input-character;
1197 redo A;
1198 }
1199 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1200 if ($is_space->{$self->{nc}}) {
1201 !!!cp (107);
1202 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1203 !!!next-input-character;
1204 redo A;
1205 } elsif ($self->{nc} == 0x0026) { # &
1206 !!!cp (108);
1207 ## NOTE: In the spec, the tokenizer is switched to the
1208 ## "entity in attribute value state". In this implementation, the
1209 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1210 ## implementation of the "consume a character reference" algorithm.
1211 $self->{entity_add} = -1;
1212 $self->{prev_state} = $self->{state};
1213 $self->{state} = ENTITY_STATE;
1214 !!!next-input-character;
1215 redo A;
1216 } elsif ($self->{nc} == 0x003E) { # >
1217 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1218 !!!cp (109);
1219 $self->{last_stag_name} = $self->{ct}->{tag_name};
1220 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1221 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1222 if ($self->{ct}->{attributes}) {
1223 !!!cp (110);
1224 !!!parse-error (type => 'end tag attribute');
1225 } else {
1226 ## NOTE: This state should never be reached.
1227 !!!cp (111);
1228 }
1229 } else {
1230 die "$0: $self->{ct}->{type}: Unknown token type";
1231 }
1232 $self->{state} = DATA_STATE;
1233 $self->{s_kwd} = '';
1234 !!!next-input-character;
1235
1236 !!!emit ($self->{ct}); # start tag or end tag
1237
1238 redo A;
1239 } elsif ($self->{nc} == -1) {
1240 !!!parse-error (type => 'unclosed tag');
1241 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1242 !!!cp (112);
1243 $self->{last_stag_name} = $self->{ct}->{tag_name};
1244 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1245 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1246 if ($self->{ct}->{attributes}) {
1247 !!!cp (113);
1248 !!!parse-error (type => 'end tag attribute');
1249 } else {
1250 ## NOTE: This state should never be reached.
1251 !!!cp (114);
1252 }
1253 } else {
1254 die "$0: $self->{ct}->{type}: Unknown token type";
1255 }
1256 $self->{state} = DATA_STATE;
1257 $self->{s_kwd} = '';
1258 ## reconsume
1259
1260 !!!emit ($self->{ct}); # start tag or end tag
1261
1262 redo A;
1263 } else {
1264 if ({
1265 0x0022 => 1, # "
1266 0x0027 => 1, # '
1267 0x003D => 1, # =
1268 }->{$self->{nc}}) {
1269 !!!cp (115);
1270 !!!parse-error (type => 'bad attribute value');
1271 } else {
1272 !!!cp (116);
1273 }
1274 $self->{ca}->{value} .= chr ($self->{nc});
1275 $self->{read_until}->($self->{ca}->{value},
1276 q["'=& >],
1277 length $self->{ca}->{value});
1278
1279 ## Stay in the state
1280 !!!next-input-character;
1281 redo A;
1282 }
1283 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1284 if ($is_space->{$self->{nc}}) {
1285 !!!cp (118);
1286 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1287 !!!next-input-character;
1288 redo A;
1289 } elsif ($self->{nc} == 0x003E) { # >
1290 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1291 !!!cp (119);
1292 $self->{last_stag_name} = $self->{ct}->{tag_name};
1293 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1294 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1295 if ($self->{ct}->{attributes}) {
1296 !!!cp (120);
1297 !!!parse-error (type => 'end tag attribute');
1298 } else {
1299 ## NOTE: This state should never be reached.
1300 !!!cp (121);
1301 }
1302 } else {
1303 die "$0: $self->{ct}->{type}: Unknown token type";
1304 }
1305 $self->{state} = DATA_STATE;
1306 $self->{s_kwd} = '';
1307 !!!next-input-character;
1308
1309 !!!emit ($self->{ct}); # start tag or end tag
1310
1311 redo A;
1312 } elsif ($self->{nc} == 0x002F) { # /
1313 !!!cp (122);
1314 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1315 !!!next-input-character;
1316 redo A;
1317 } elsif ($self->{nc} == -1) {
1318 !!!parse-error (type => 'unclosed tag');
1319 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1320 !!!cp (122.3);
1321 $self->{last_stag_name} = $self->{ct}->{tag_name};
1322 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1323 if ($self->{ct}->{attributes}) {
1324 !!!cp (122.1);
1325 !!!parse-error (type => 'end tag attribute');
1326 } else {
1327 ## NOTE: This state should never be reached.
1328 !!!cp (122.2);
1329 }
1330 } else {
1331 die "$0: $self->{ct}->{type}: Unknown token type";
1332 }
1333 $self->{state} = DATA_STATE;
1334 $self->{s_kwd} = '';
1335 ## Reconsume.
1336 !!!emit ($self->{ct}); # start tag or end tag
1337 redo A;
1338 } else {
1339 !!!cp ('124.1');
1340 !!!parse-error (type => 'no space between attributes');
1341 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1342 ## reconsume
1343 redo A;
1344 }
1345 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1346 if ($self->{nc} == 0x003E) { # >
1347 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1348 !!!cp ('124.2');
1349 !!!parse-error (type => 'nestc', token => $self->{ct});
1350 ## TODO: Different type than slash in start tag
1351 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1352 if ($self->{ct}->{attributes}) {
1353 !!!cp ('124.4');
1354 !!!parse-error (type => 'end tag attribute');
1355 } else {
1356 !!!cp ('124.5');
1357 }
1358 ## TODO: Test |<title></title/>|
1359 } else {
1360 !!!cp ('124.3');
1361 $self->{self_closing} = 1;
1362 }
1363
1364 $self->{state} = DATA_STATE;
1365 $self->{s_kwd} = '';
1366 !!!next-input-character;
1367
1368 !!!emit ($self->{ct}); # start tag or end tag
1369
1370 redo A;
1371 } elsif ($self->{nc} == -1) {
1372 !!!parse-error (type => 'unclosed tag');
1373 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1374 !!!cp (124.7);
1375 $self->{last_stag_name} = $self->{ct}->{tag_name};
1376 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1377 if ($self->{ct}->{attributes}) {
1378 !!!cp (124.5);
1379 !!!parse-error (type => 'end tag attribute');
1380 } else {
1381 ## NOTE: This state should never be reached.
1382 !!!cp (124.6);
1383 }
1384 } else {
1385 die "$0: $self->{ct}->{type}: Unknown token type";
1386 }
1387 $self->{state} = DATA_STATE;
1388 $self->{s_kwd} = '';
1389 ## Reconsume.
1390 !!!emit ($self->{ct}); # start tag or end tag
1391 redo A;
1392 } else {
1393 !!!cp ('124.4');
1394 !!!parse-error (type => 'nestc');
1395 ## TODO: This error type is wrong.
1396 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1397 ## Reconsume.
1398 redo A;
1399 }
1400 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1401 ## (only happens in the PCDATA state)
1402
1403 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1404 ## consumes characters one at a time.
1405
1406 if ($self->{nc} == 0x003E) { # >
1407 !!!cp (124);
1408 $self->{state} = DATA_STATE;
1409 $self->{s_kwd} = '';
1410 !!!next-input-character;
1411
1412 !!!emit ($self->{ct}); # comment
1413 redo A;
1414 } elsif ($self->{nc} == -1) {
1415 !!!cp (125);
1416 $self->{state} = DATA_STATE;
1417 $self->{s_kwd} = '';
1418 ## reconsume
1419
1420 !!!emit ($self->{ct}); # comment
1421 redo A;
1422 } else {
1423 !!!cp (126);
1424 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1425 $self->{read_until}->($self->{ct}->{data},
1426 q[>],
1427 length $self->{ct}->{data});
1428
1429 ## Stay in the state.
1430 !!!next-input-character;
1431 redo A;
1432 }
1433 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1434 ## (only happens in the PCDATA state)
1435
1436 if ($self->{nc} == 0x002D) { # -
1437 !!!cp (133);
1438 $self->{state} = MD_HYPHEN_STATE;
1439 !!!next-input-character;
1440 redo A;
1441 } elsif ($self->{nc} == 0x0044 or # D
1442 $self->{nc} == 0x0064) { # d
1443 ## ASCII case-insensitive.
1444 !!!cp (130);
1445 $self->{state} = MD_DOCTYPE_STATE;
1446 $self->{s_kwd} = chr $self->{nc};
1447 !!!next-input-character;
1448 redo A;
1449 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1450 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1451 $self->{is_xml}) and
1452 $self->{nc} == 0x005B) { # [
1453 !!!cp (135.4);
1454 $self->{state} = MD_CDATA_STATE;
1455 $self->{s_kwd} = '[';
1456 !!!next-input-character;
1457 redo A;
1458 } else {
1459 !!!cp (136);
1460 }
1461
1462 !!!parse-error (type => 'bogus comment',
1463 line => $self->{line_prev},
1464 column => $self->{column_prev} - 1);
1465 ## Reconsume.
1466 $self->{state} = BOGUS_COMMENT_STATE;
1467 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1468 line => $self->{line_prev},
1469 column => $self->{column_prev} - 1,
1470 };
1471 redo A;
1472 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1473 if ($self->{nc} == 0x002D) { # -
1474 !!!cp (127);
1475 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1476 line => $self->{line_prev},
1477 column => $self->{column_prev} - 2,
1478 };
1479 $self->{state} = COMMENT_START_STATE;
1480 !!!next-input-character;
1481 redo A;
1482 } else {
1483 !!!cp (128);
1484 !!!parse-error (type => 'bogus comment',
1485 line => $self->{line_prev},
1486 column => $self->{column_prev} - 2);
1487 $self->{state} = BOGUS_COMMENT_STATE;
1488 ## Reconsume.
1489 $self->{ct} = {type => COMMENT_TOKEN,
1490 data => '-',
1491 line => $self->{line_prev},
1492 column => $self->{column_prev} - 2,
1493 };
1494 redo A;
1495 }
1496 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1497 ## ASCII case-insensitive.
1498 if ($self->{nc} == [
1499 undef,
1500 0x004F, # O
1501 0x0043, # C
1502 0x0054, # T
1503 0x0059, # Y
1504 0x0050, # P
1505 ]->[length $self->{s_kwd}] or
1506 $self->{nc} == [
1507 undef,
1508 0x006F, # o
1509 0x0063, # c
1510 0x0074, # t
1511 0x0079, # y
1512 0x0070, # p
1513 ]->[length $self->{s_kwd}]) {
1514 !!!cp (131);
1515 ## Stay in the state.
1516 $self->{s_kwd} .= chr $self->{nc};
1517 !!!next-input-character;
1518 redo A;
1519 } elsif ((length $self->{s_kwd}) == 6 and
1520 ($self->{nc} == 0x0045 or # E
1521 $self->{nc} == 0x0065)) { # e
1522 !!!cp (129);
1523 $self->{state} = DOCTYPE_STATE;
1524 $self->{ct} = {type => DOCTYPE_TOKEN,
1525 quirks => 1,
1526 line => $self->{line_prev},
1527 column => $self->{column_prev} - 7,
1528 };
1529 !!!next-input-character;
1530 redo A;
1531 } else {
1532 !!!cp (132);
1533 !!!parse-error (type => 'bogus comment',
1534 line => $self->{line_prev},
1535 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1536 $self->{state} = BOGUS_COMMENT_STATE;
1537 ## Reconsume.
1538 $self->{ct} = {type => COMMENT_TOKEN,
1539 data => $self->{s_kwd},
1540 line => $self->{line_prev},
1541 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1542 };
1543 redo A;
1544 }
1545 } elsif ($self->{state} == MD_CDATA_STATE) {
1546 if ($self->{nc} == {
1547 '[' => 0x0043, # C
1548 '[C' => 0x0044, # D
1549 '[CD' => 0x0041, # A
1550 '[CDA' => 0x0054, # T
1551 '[CDAT' => 0x0041, # A
1552 }->{$self->{s_kwd}}) {
1553 !!!cp (135.1);
1554 ## Stay in the state.
1555 $self->{s_kwd} .= chr $self->{nc};
1556 !!!next-input-character;
1557 redo A;
1558 } elsif ($self->{s_kwd} eq '[CDATA' and
1559 $self->{nc} == 0x005B) { # [
1560 !!!cp (135.2);
1561
1562 if ($self->{is_xml} and
1563 not $self->{tainted} and
1564 @{$self->{open_elements} or []} == 0) {
1565 !!!parse-error (type => 'cdata outside of root element',
1566 line => $self->{line_prev},
1567 column => $self->{column_prev} - 7);
1568 $self->{tainted} = 1;
1569 }
1570
1571 $self->{ct} = {type => CHARACTER_TOKEN,
1572 data => '',
1573 line => $self->{line_prev},
1574 column => $self->{column_prev} - 7};
1575 $self->{state} = CDATA_SECTION_STATE;
1576 !!!next-input-character;
1577 redo A;
1578 } else {
1579 !!!cp (135.3);
1580 !!!parse-error (type => 'bogus comment',
1581 line => $self->{line_prev},
1582 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1583 $self->{state} = BOGUS_COMMENT_STATE;
1584 ## Reconsume.
1585 $self->{ct} = {type => COMMENT_TOKEN,
1586 data => $self->{s_kwd},
1587 line => $self->{line_prev},
1588 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1589 };
1590 redo A;
1591 }
1592 } elsif ($self->{state} == COMMENT_START_STATE) {
1593 if ($self->{nc} == 0x002D) { # -
1594 !!!cp (137);
1595 $self->{state} = COMMENT_START_DASH_STATE;
1596 !!!next-input-character;
1597 redo A;
1598 } elsif ($self->{nc} == 0x003E) { # >
1599 !!!cp (138);
1600 !!!parse-error (type => 'bogus comment');
1601 $self->{state} = DATA_STATE;
1602 $self->{s_kwd} = '';
1603 !!!next-input-character;
1604
1605 !!!emit ($self->{ct}); # comment
1606
1607 redo A;
1608 } elsif ($self->{nc} == -1) {
1609 !!!cp (139);
1610 !!!parse-error (type => 'unclosed comment');
1611 $self->{state} = DATA_STATE;
1612 $self->{s_kwd} = '';
1613 ## reconsume
1614
1615 !!!emit ($self->{ct}); # comment
1616
1617 redo A;
1618 } else {
1619 !!!cp (140);
1620 $self->{ct}->{data} # comment
1621 .= chr ($self->{nc});
1622 $self->{state} = COMMENT_STATE;
1623 !!!next-input-character;
1624 redo A;
1625 }
1626 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1627 if ($self->{nc} == 0x002D) { # -
1628 !!!cp (141);
1629 $self->{state} = COMMENT_END_STATE;
1630 !!!next-input-character;
1631 redo A;
1632 } elsif ($self->{nc} == 0x003E) { # >
1633 !!!cp (142);
1634 !!!parse-error (type => 'bogus comment');
1635 $self->{state} = DATA_STATE;
1636 $self->{s_kwd} = '';
1637 !!!next-input-character;
1638
1639 !!!emit ($self->{ct}); # comment
1640
1641 redo A;
1642 } elsif ($self->{nc} == -1) {
1643 !!!cp (143);
1644 !!!parse-error (type => 'unclosed comment');
1645 $self->{state} = DATA_STATE;
1646 $self->{s_kwd} = '';
1647 ## reconsume
1648
1649 !!!emit ($self->{ct}); # comment
1650
1651 redo A;
1652 } else {
1653 !!!cp (144);
1654 $self->{ct}->{data} # comment
1655 .= '-' . chr ($self->{nc});
1656 $self->{state} = COMMENT_STATE;
1657 !!!next-input-character;
1658 redo A;
1659 }
1660 } elsif ($self->{state} == COMMENT_STATE) {
1661 if ($self->{nc} == 0x002D) { # -
1662 !!!cp (145);
1663 $self->{state} = COMMENT_END_DASH_STATE;
1664 !!!next-input-character;
1665 redo A;
1666 } elsif ($self->{nc} == -1) {
1667 !!!cp (146);
1668 !!!parse-error (type => 'unclosed comment');
1669 $self->{state} = DATA_STATE;
1670 $self->{s_kwd} = '';
1671 ## reconsume
1672
1673 !!!emit ($self->{ct}); # comment
1674
1675 redo A;
1676 } else {
1677 !!!cp (147);
1678 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1679 $self->{read_until}->($self->{ct}->{data},
1680 q[-],
1681 length $self->{ct}->{data});
1682
1683 ## Stay in the state
1684 !!!next-input-character;
1685 redo A;
1686 }
1687 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1688 if ($self->{nc} == 0x002D) { # -
1689 !!!cp (148);
1690 $self->{state} = COMMENT_END_STATE;
1691 !!!next-input-character;
1692 redo A;
1693 } elsif ($self->{nc} == -1) {
1694 !!!cp (149);
1695 !!!parse-error (type => 'unclosed comment');
1696 $self->{s_kwd} = '';
1697 $self->{state} = DATA_STATE;
1698 $self->{s_kwd} = '';
1699 ## reconsume
1700
1701 !!!emit ($self->{ct}); # comment
1702
1703 redo A;
1704 } else {
1705 !!!cp (150);
1706 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1707 $self->{state} = COMMENT_STATE;
1708 !!!next-input-character;
1709 redo A;
1710 }
1711 } elsif ($self->{state} == COMMENT_END_STATE) {
1712 if ($self->{nc} == 0x003E) { # >
1713 !!!cp (151);
1714 $self->{state} = DATA_STATE;
1715 $self->{s_kwd} = '';
1716 !!!next-input-character;
1717
1718 !!!emit ($self->{ct}); # comment
1719
1720 redo A;
1721 } elsif ($self->{nc} == 0x002D) { # -
1722 !!!cp (152);
1723 !!!parse-error (type => 'dash in comment',
1724 line => $self->{line_prev},
1725 column => $self->{column_prev});
1726 $self->{ct}->{data} .= '-'; # comment
1727 ## Stay in the state
1728 !!!next-input-character;
1729 redo A;
1730 } elsif ($self->{nc} == -1) {
1731 !!!cp (153);
1732 !!!parse-error (type => 'unclosed comment');
1733 $self->{state} = DATA_STATE;
1734 $self->{s_kwd} = '';
1735 ## reconsume
1736
1737 !!!emit ($self->{ct}); # comment
1738
1739 redo A;
1740 } else {
1741 !!!cp (154);
1742 !!!parse-error (type => 'dash in comment',
1743 line => $self->{line_prev},
1744 column => $self->{column_prev});
1745 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1746 $self->{state} = COMMENT_STATE;
1747 !!!next-input-character;
1748 redo A;
1749 }
1750 } elsif ($self->{state} == DOCTYPE_STATE) {
1751 if ($is_space->{$self->{nc}}) {
1752 !!!cp (155);
1753 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1754 !!!next-input-character;
1755 redo A;
1756 } else {
1757 !!!cp (156);
1758 !!!parse-error (type => 'no space before DOCTYPE name');
1759 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1760 ## reconsume
1761 redo A;
1762 }
1763 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1764 if ($is_space->{$self->{nc}}) {
1765 !!!cp (157);
1766 ## Stay in the state
1767 !!!next-input-character;
1768 redo A;
1769 } elsif ($self->{nc} == 0x003E) { # >
1770 !!!cp (158);
1771 !!!parse-error (type => 'no DOCTYPE name');
1772 $self->{state} = DATA_STATE;
1773 $self->{s_kwd} = '';
1774 !!!next-input-character;
1775
1776 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1777
1778 redo A;
1779 } elsif ($self->{nc} == -1) {
1780 !!!cp (159);
1781 !!!parse-error (type => 'no DOCTYPE name');
1782 $self->{state} = DATA_STATE;
1783 $self->{s_kwd} = '';
1784 ## reconsume
1785
1786 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1787
1788 redo A;
1789 } else {
1790 !!!cp (160);
1791 $self->{ct}->{name} = chr $self->{nc};
1792 delete $self->{ct}->{quirks};
1793 $self->{state} = DOCTYPE_NAME_STATE;
1794 !!!next-input-character;
1795 redo A;
1796 }
1797 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1798 ## ISSUE: Redundant "First," in the spec.
1799 if ($is_space->{$self->{nc}}) {
1800 !!!cp (161);
1801 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1802 !!!next-input-character;
1803 redo A;
1804 } elsif ($self->{nc} == 0x003E) { # >
1805 !!!cp (162);
1806 $self->{state} = DATA_STATE;
1807 $self->{s_kwd} = '';
1808 !!!next-input-character;
1809
1810 !!!emit ($self->{ct}); # DOCTYPE
1811
1812 redo A;
1813 } elsif ($self->{nc} == -1) {
1814 !!!cp (163);
1815 !!!parse-error (type => 'unclosed DOCTYPE');
1816 $self->{state} = DATA_STATE;
1817 $self->{s_kwd} = '';
1818 ## reconsume
1819
1820 $self->{ct}->{quirks} = 1;
1821 !!!emit ($self->{ct}); # DOCTYPE
1822
1823 redo A;
1824 } else {
1825 !!!cp (164);
1826 $self->{ct}->{name}
1827 .= chr ($self->{nc}); # DOCTYPE
1828 ## Stay in the state
1829 !!!next-input-character;
1830 redo A;
1831 }
1832 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1833 if ($is_space->{$self->{nc}}) {
1834 !!!cp (165);
1835 ## Stay in the state
1836 !!!next-input-character;
1837 redo A;
1838 } elsif ($self->{nc} == 0x003E) { # >
1839 !!!cp (166);
1840 $self->{state} = DATA_STATE;
1841 $self->{s_kwd} = '';
1842 !!!next-input-character;
1843
1844 !!!emit ($self->{ct}); # DOCTYPE
1845
1846 redo A;
1847 } elsif ($self->{nc} == -1) {
1848 !!!cp (167);
1849 !!!parse-error (type => 'unclosed DOCTYPE');
1850 $self->{state} = DATA_STATE;
1851 $self->{s_kwd} = '';
1852 ## reconsume
1853
1854 $self->{ct}->{quirks} = 1;
1855 !!!emit ($self->{ct}); # DOCTYPE
1856
1857 redo A;
1858 } elsif ($self->{nc} == 0x0050 or # P
1859 $self->{nc} == 0x0070) { # p
1860 $self->{state} = PUBLIC_STATE;
1861 $self->{s_kwd} = chr $self->{nc};
1862 !!!next-input-character;
1863 redo A;
1864 } elsif ($self->{nc} == 0x0053 or # S
1865 $self->{nc} == 0x0073) { # s
1866 $self->{state} = SYSTEM_STATE;
1867 $self->{s_kwd} = chr $self->{nc};
1868 !!!next-input-character;
1869 redo A;
1870 } else {
1871 !!!cp (180);
1872 !!!parse-error (type => 'string after DOCTYPE name');
1873 $self->{ct}->{quirks} = 1;
1874
1875 $self->{state} = BOGUS_DOCTYPE_STATE;
1876 !!!next-input-character;
1877 redo A;
1878 }
1879 } elsif ($self->{state} == PUBLIC_STATE) {
1880 ## ASCII case-insensitive
1881 if ($self->{nc} == [
1882 undef,
1883 0x0055, # U
1884 0x0042, # B
1885 0x004C, # L
1886 0x0049, # I
1887 ]->[length $self->{s_kwd}] or
1888 $self->{nc} == [
1889 undef,
1890 0x0075, # u
1891 0x0062, # b
1892 0x006C, # l
1893 0x0069, # i
1894 ]->[length $self->{s_kwd}]) {
1895 !!!cp (175);
1896 ## Stay in the state.
1897 $self->{s_kwd} .= chr $self->{nc};
1898 !!!next-input-character;
1899 redo A;
1900 } elsif ((length $self->{s_kwd}) == 5 and
1901 ($self->{nc} == 0x0043 or # C
1902 $self->{nc} == 0x0063)) { # c
1903 !!!cp (168);
1904 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1905 !!!next-input-character;
1906 redo A;
1907 } else {
1908 !!!cp (169);
1909 !!!parse-error (type => 'string after DOCTYPE name',
1910 line => $self->{line_prev},
1911 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1912 $self->{ct}->{quirks} = 1;
1913
1914 $self->{state} = BOGUS_DOCTYPE_STATE;
1915 ## Reconsume.
1916 redo A;
1917 }
1918 } elsif ($self->{state} == SYSTEM_STATE) {
1919 ## ASCII case-insensitive
1920 if ($self->{nc} == [
1921 undef,
1922 0x0059, # Y
1923 0x0053, # S
1924 0x0054, # T
1925 0x0045, # E
1926 ]->[length $self->{s_kwd}] or
1927 $self->{nc} == [
1928 undef,
1929 0x0079, # y
1930 0x0073, # s
1931 0x0074, # t
1932 0x0065, # e
1933 ]->[length $self->{s_kwd}]) {
1934 !!!cp (170);
1935 ## Stay in the state.
1936 $self->{s_kwd} .= chr $self->{nc};
1937 !!!next-input-character;
1938 redo A;
1939 } elsif ((length $self->{s_kwd}) == 5 and
1940 ($self->{nc} == 0x004D or # M
1941 $self->{nc} == 0x006D)) { # m
1942 !!!cp (171);
1943 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1944 !!!next-input-character;
1945 redo A;
1946 } else {
1947 !!!cp (172);
1948 !!!parse-error (type => 'string after DOCTYPE name',
1949 line => $self->{line_prev},
1950 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1951 $self->{ct}->{quirks} = 1;
1952
1953 $self->{state} = BOGUS_DOCTYPE_STATE;
1954 ## Reconsume.
1955 redo A;
1956 }
1957 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1958 if ($is_space->{$self->{nc}}) {
1959 !!!cp (181);
1960 ## Stay in the state
1961 !!!next-input-character;
1962 redo A;
1963 } elsif ($self->{nc} eq 0x0022) { # "
1964 !!!cp (182);
1965 $self->{ct}->{pubid} = ''; # DOCTYPE
1966 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1967 !!!next-input-character;
1968 redo A;
1969 } elsif ($self->{nc} eq 0x0027) { # '
1970 !!!cp (183);
1971 $self->{ct}->{pubid} = ''; # DOCTYPE
1972 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1973 !!!next-input-character;
1974 redo A;
1975 } elsif ($self->{nc} eq 0x003E) { # >
1976 !!!cp (184);
1977 !!!parse-error (type => 'no PUBLIC literal');
1978
1979 $self->{state} = DATA_STATE;
1980 $self->{s_kwd} = '';
1981 !!!next-input-character;
1982
1983 $self->{ct}->{quirks} = 1;
1984 !!!emit ($self->{ct}); # DOCTYPE
1985
1986 redo A;
1987 } elsif ($self->{nc} == -1) {
1988 !!!cp (185);
1989 !!!parse-error (type => 'unclosed DOCTYPE');
1990
1991 $self->{state} = DATA_STATE;
1992 $self->{s_kwd} = '';
1993 ## reconsume
1994
1995 $self->{ct}->{quirks} = 1;
1996 !!!emit ($self->{ct}); # DOCTYPE
1997
1998 redo A;
1999 } else {
2000 !!!cp (186);
2001 !!!parse-error (type => 'string after PUBLIC');
2002 $self->{ct}->{quirks} = 1;
2003
2004 $self->{state} = BOGUS_DOCTYPE_STATE;
2005 !!!next-input-character;
2006 redo A;
2007 }
2008 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2009 if ($self->{nc} == 0x0022) { # "
2010 !!!cp (187);
2011 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2012 !!!next-input-character;
2013 redo A;
2014 } elsif ($self->{nc} == 0x003E) { # >
2015 !!!cp (188);
2016 !!!parse-error (type => 'unclosed PUBLIC literal');
2017
2018 $self->{state} = DATA_STATE;
2019 $self->{s_kwd} = '';
2020 !!!next-input-character;
2021
2022 $self->{ct}->{quirks} = 1;
2023 !!!emit ($self->{ct}); # DOCTYPE
2024
2025 redo A;
2026 } elsif ($self->{nc} == -1) {
2027 !!!cp (189);
2028 !!!parse-error (type => 'unclosed PUBLIC literal');
2029
2030 $self->{state} = DATA_STATE;
2031 $self->{s_kwd} = '';
2032 ## reconsume
2033
2034 $self->{ct}->{quirks} = 1;
2035 !!!emit ($self->{ct}); # DOCTYPE
2036
2037 redo A;
2038 } else {
2039 !!!cp (190);
2040 $self->{ct}->{pubid} # DOCTYPE
2041 .= chr $self->{nc};
2042 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2043 length $self->{ct}->{pubid});
2044
2045 ## Stay in the state
2046 !!!next-input-character;
2047 redo A;
2048 }
2049 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2050 if ($self->{nc} == 0x0027) { # '
2051 !!!cp (191);
2052 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2053 !!!next-input-character;
2054 redo A;
2055 } elsif ($self->{nc} == 0x003E) { # >
2056 !!!cp (192);
2057 !!!parse-error (type => 'unclosed PUBLIC literal');
2058
2059 $self->{state} = DATA_STATE;
2060 $self->{s_kwd} = '';
2061 !!!next-input-character;
2062
2063 $self->{ct}->{quirks} = 1;
2064 !!!emit ($self->{ct}); # DOCTYPE
2065
2066 redo A;
2067 } elsif ($self->{nc} == -1) {
2068 !!!cp (193);
2069 !!!parse-error (type => 'unclosed PUBLIC literal');
2070
2071 $self->{state} = DATA_STATE;
2072 $self->{s_kwd} = '';
2073 ## reconsume
2074
2075 $self->{ct}->{quirks} = 1;
2076 !!!emit ($self->{ct}); # DOCTYPE
2077
2078 redo A;
2079 } else {
2080 !!!cp (194);
2081 $self->{ct}->{pubid} # DOCTYPE
2082 .= chr $self->{nc};
2083 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2084 length $self->{ct}->{pubid});
2085
2086 ## Stay in the state
2087 !!!next-input-character;
2088 redo A;
2089 }
2090 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2091 if ($is_space->{$self->{nc}}) {
2092 !!!cp (195);
2093 ## Stay in the state
2094 !!!next-input-character;
2095 redo A;
2096 } elsif ($self->{nc} == 0x0022) { # "
2097 !!!cp (196);
2098 $self->{ct}->{sysid} = ''; # DOCTYPE
2099 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2100 !!!next-input-character;
2101 redo A;
2102 } elsif ($self->{nc} == 0x0027) { # '
2103 !!!cp (197);
2104 $self->{ct}->{sysid} = ''; # DOCTYPE
2105 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2106 !!!next-input-character;
2107 redo A;
2108 } elsif ($self->{nc} == 0x003E) { # >
2109 !!!cp (198);
2110 $self->{state} = DATA_STATE;
2111 $self->{s_kwd} = '';
2112 !!!next-input-character;
2113
2114 !!!emit ($self->{ct}); # DOCTYPE
2115
2116 redo A;
2117 } elsif ($self->{nc} == -1) {
2118 !!!cp (199);
2119 !!!parse-error (type => 'unclosed DOCTYPE');
2120
2121 $self->{state} = DATA_STATE;
2122 $self->{s_kwd} = '';
2123 ## reconsume
2124
2125 $self->{ct}->{quirks} = 1;
2126 !!!emit ($self->{ct}); # DOCTYPE
2127
2128 redo A;
2129 } else {
2130 !!!cp (200);
2131 !!!parse-error (type => 'string after PUBLIC literal');
2132 $self->{ct}->{quirks} = 1;
2133
2134 $self->{state} = BOGUS_DOCTYPE_STATE;
2135 !!!next-input-character;
2136 redo A;
2137 }
2138 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2139 if ($is_space->{$self->{nc}}) {
2140 !!!cp (201);
2141 ## Stay in the state
2142 !!!next-input-character;
2143 redo A;
2144 } elsif ($self->{nc} == 0x0022) { # "
2145 !!!cp (202);
2146 $self->{ct}->{sysid} = ''; # DOCTYPE
2147 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2148 !!!next-input-character;
2149 redo A;
2150 } elsif ($self->{nc} == 0x0027) { # '
2151 !!!cp (203);
2152 $self->{ct}->{sysid} = ''; # DOCTYPE
2153 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2154 !!!next-input-character;
2155 redo A;
2156 } elsif ($self->{nc} == 0x003E) { # >
2157 !!!cp (204);
2158 !!!parse-error (type => 'no SYSTEM literal');
2159 $self->{state} = DATA_STATE;
2160 $self->{s_kwd} = '';
2161 !!!next-input-character;
2162
2163 $self->{ct}->{quirks} = 1;
2164 !!!emit ($self->{ct}); # DOCTYPE
2165
2166 redo A;
2167 } elsif ($self->{nc} == -1) {
2168 !!!cp (205);
2169 !!!parse-error (type => 'unclosed DOCTYPE');
2170
2171 $self->{state} = DATA_STATE;
2172 $self->{s_kwd} = '';
2173 ## reconsume
2174
2175 $self->{ct}->{quirks} = 1;
2176 !!!emit ($self->{ct}); # DOCTYPE
2177
2178 redo A;
2179 } else {
2180 !!!cp (206);
2181 !!!parse-error (type => 'string after SYSTEM');
2182 $self->{ct}->{quirks} = 1;
2183
2184 $self->{state} = BOGUS_DOCTYPE_STATE;
2185 !!!next-input-character;
2186 redo A;
2187 }
2188 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2189 if ($self->{nc} == 0x0022) { # "
2190 !!!cp (207);
2191 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2192 !!!next-input-character;
2193 redo A;
2194 } elsif ($self->{nc} == 0x003E) { # >
2195 !!!cp (208);
2196 !!!parse-error (type => 'unclosed SYSTEM literal');
2197
2198 $self->{state} = DATA_STATE;
2199 $self->{s_kwd} = '';
2200 !!!next-input-character;
2201
2202 $self->{ct}->{quirks} = 1;
2203 !!!emit ($self->{ct}); # DOCTYPE
2204
2205 redo A;
2206 } elsif ($self->{nc} == -1) {
2207 !!!cp (209);
2208 !!!parse-error (type => 'unclosed SYSTEM literal');
2209
2210 $self->{state} = DATA_STATE;
2211 $self->{s_kwd} = '';
2212 ## reconsume
2213
2214 $self->{ct}->{quirks} = 1;
2215 !!!emit ($self->{ct}); # DOCTYPE
2216
2217 redo A;
2218 } else {
2219 !!!cp (210);
2220 $self->{ct}->{sysid} # DOCTYPE
2221 .= chr $self->{nc};
2222 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2223 length $self->{ct}->{sysid});
2224
2225 ## Stay in the state
2226 !!!next-input-character;
2227 redo A;
2228 }
2229 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2230 if ($self->{nc} == 0x0027) { # '
2231 !!!cp (211);
2232 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2233 !!!next-input-character;
2234 redo A;
2235 } elsif ($self->{nc} == 0x003E) { # >
2236 !!!cp (212);
2237 !!!parse-error (type => 'unclosed SYSTEM literal');
2238
2239 $self->{state} = DATA_STATE;
2240 $self->{s_kwd} = '';
2241 !!!next-input-character;
2242
2243 $self->{ct}->{quirks} = 1;
2244 !!!emit ($self->{ct}); # DOCTYPE
2245
2246 redo A;
2247 } elsif ($self->{nc} == -1) {
2248 !!!cp (213);
2249 !!!parse-error (type => 'unclosed SYSTEM literal');
2250
2251 $self->{state} = DATA_STATE;
2252 $self->{s_kwd} = '';
2253 ## reconsume
2254
2255 $self->{ct}->{quirks} = 1;
2256 !!!emit ($self->{ct}); # DOCTYPE
2257
2258 redo A;
2259 } else {
2260 !!!cp (214);
2261 $self->{ct}->{sysid} # DOCTYPE
2262 .= chr $self->{nc};
2263 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2264 length $self->{ct}->{sysid});
2265
2266 ## Stay in the state
2267 !!!next-input-character;
2268 redo A;
2269 }
2270 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2271 if ($is_space->{$self->{nc}}) {
2272 !!!cp (215);
2273 ## Stay in the state
2274 !!!next-input-character;
2275 redo A;
2276 } elsif ($self->{nc} == 0x003E) { # >
2277 !!!cp (216);
2278 $self->{state} = DATA_STATE;
2279 $self->{s_kwd} = '';
2280 !!!next-input-character;
2281
2282 !!!emit ($self->{ct}); # DOCTYPE
2283
2284 redo A;
2285 } elsif ($self->{nc} == -1) {
2286 !!!cp (217);
2287 !!!parse-error (type => 'unclosed DOCTYPE');
2288 $self->{state} = DATA_STATE;
2289 $self->{s_kwd} = '';
2290 ## reconsume
2291
2292 $self->{ct}->{quirks} = 1;
2293 !!!emit ($self->{ct}); # DOCTYPE
2294
2295 redo A;
2296 } else {
2297 !!!cp (218);
2298 !!!parse-error (type => 'string after SYSTEM literal');
2299 #$self->{ct}->{quirks} = 1;
2300
2301 $self->{state} = BOGUS_DOCTYPE_STATE;
2302 !!!next-input-character;
2303 redo A;
2304 }
2305 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2306 if ($self->{nc} == 0x003E) { # >
2307 !!!cp (219);
2308 $self->{state} = DATA_STATE;
2309 $self->{s_kwd} = '';
2310 !!!next-input-character;
2311
2312 !!!emit ($self->{ct}); # DOCTYPE
2313
2314 redo A;
2315 } elsif ($self->{nc} == -1) {
2316 !!!cp (220);
2317 $self->{state} = DATA_STATE;
2318 $self->{s_kwd} = '';
2319 ## reconsume
2320
2321 !!!emit ($self->{ct}); # DOCTYPE
2322
2323 redo A;
2324 } else {
2325 !!!cp (221);
2326 my $s = '';
2327 $self->{read_until}->($s, q[>], 0);
2328
2329 ## Stay in the state
2330 !!!next-input-character;
2331 redo A;
2332 }
2333 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2334 ## NOTE: "CDATA section state" in the spec is jointly implemented
2335 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2336 ## and |CDATA_SECTION_MSE2_STATE|.
2337
2338 if ($self->{nc} == 0x005D) { # ]
2339 !!!cp (221.1);
2340 $self->{state} = CDATA_SECTION_MSE1_STATE;
2341 !!!next-input-character;
2342 redo A;
2343 } elsif ($self->{nc} == -1) {
2344 if ($self->{is_xml}) {
2345 !!!parse-error (type => 'no mse'); ## TODO: type
2346 }
2347
2348 $self->{state} = DATA_STATE;
2349 $self->{s_kwd} = '';
2350 !!!next-input-character;
2351 if (length $self->{ct}->{data}) { # character
2352 !!!cp (221.2);
2353 !!!emit ($self->{ct}); # character
2354 } else {
2355 !!!cp (221.3);
2356 ## No token to emit. $self->{ct} is discarded.
2357 }
2358 redo A;
2359 } else {
2360 !!!cp (221.4);
2361 $self->{ct}->{data} .= chr $self->{nc};
2362 $self->{read_until}->($self->{ct}->{data},
2363 q<]>,
2364 length $self->{ct}->{data});
2365
2366 ## Stay in the state.
2367 !!!next-input-character;
2368 redo A;
2369 }
2370
2371 ## ISSUE: "text tokens" in spec.
2372 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2373 if ($self->{nc} == 0x005D) { # ]
2374 !!!cp (221.5);
2375 $self->{state} = CDATA_SECTION_MSE2_STATE;
2376 !!!next-input-character;
2377 redo A;
2378 } else {
2379 !!!cp (221.6);
2380 $self->{ct}->{data} .= ']';
2381 $self->{state} = CDATA_SECTION_STATE;
2382 ## Reconsume.
2383 redo A;
2384 }
2385 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2386 if ($self->{nc} == 0x003E) { # >
2387 $self->{state} = DATA_STATE;
2388 $self->{s_kwd} = '';
2389 !!!next-input-character;
2390 if (length $self->{ct}->{data}) { # character
2391 !!!cp (221.7);
2392 !!!emit ($self->{ct}); # character
2393 } else {
2394 !!!cp (221.8);
2395 ## No token to emit. $self->{ct} is discarded.
2396 }
2397 redo A;
2398 } elsif ($self->{nc} == 0x005D) { # ]
2399 !!!cp (221.9); # character
2400 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2401 ## Stay in the state.
2402 !!!next-input-character;
2403 redo A;
2404 } else {
2405 !!!cp (221.11);
2406 $self->{ct}->{data} .= ']]'; # character
2407 $self->{state} = CDATA_SECTION_STATE;
2408 ## Reconsume.
2409 redo A;
2410 }
2411 } elsif ($self->{state} == ENTITY_STATE) {
2412 if ($is_space->{$self->{nc}} or
2413 {
2414 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2415 $self->{entity_add} => 1,
2416 }->{$self->{nc}}) {
2417 !!!cp (1001);
2418 ## Don't consume
2419 ## No error
2420 ## Return nothing.
2421 #
2422 } elsif ($self->{nc} == 0x0023) { # #
2423 !!!cp (999);
2424 $self->{state} = ENTITY_HASH_STATE;
2425 $self->{s_kwd} = '#';
2426 !!!next-input-character;
2427 redo A;
2428 } elsif ((0x0041 <= $self->{nc} and
2429 $self->{nc} <= 0x005A) or # A..Z
2430 (0x0061 <= $self->{nc} and
2431 $self->{nc} <= 0x007A)) { # a..z
2432 !!!cp (998);
2433 require Whatpm::_NamedEntityList;
2434 $self->{state} = ENTITY_NAME_STATE;
2435 $self->{s_kwd} = chr $self->{nc};
2436 $self->{entity__value} = $self->{s_kwd};
2437 $self->{entity__match} = 0;
2438 !!!next-input-character;
2439 redo A;
2440 } else {
2441 !!!cp (1027);
2442 !!!parse-error (type => 'bare ero');
2443 ## Return nothing.
2444 #
2445 }
2446
2447 ## NOTE: No character is consumed by the "consume a character
2448 ## reference" algorithm. In other words, there is an "&" character
2449 ## that does not introduce a character reference, which is to be
2450 ## appended to the parent element or the attribute value in the later
2451 ## processing of the tokenizer.
2452
2453 if ($self->{prev_state} == DATA_STATE) {
2454 !!!cp (997);
2455 $self->{state} = $self->{prev_state};
2456 $self->{s_kwd} = '';
2457 ## Reconsume.
2458 !!!emit ({type => CHARACTER_TOKEN, data => '&',
2459 line => $self->{line_prev},
2460 column => $self->{column_prev},
2461 });
2462 redo A;
2463 } else {
2464 !!!cp (996);
2465 $self->{ca}->{value} .= '&';
2466 $self->{state} = $self->{prev_state};
2467 $self->{s_kwd} = '';
2468 ## Reconsume.
2469 redo A;
2470 }
2471 } elsif ($self->{state} == ENTITY_HASH_STATE) {
2472 if ($self->{nc} == 0x0078 or # x
2473 $self->{nc} == 0x0058) { # X
2474 !!!cp (995);
2475 $self->{state} = HEXREF_X_STATE;
2476 $self->{s_kwd} .= chr $self->{nc};
2477 !!!next-input-character;
2478 redo A;
2479 } elsif (0x0030 <= $self->{nc} and
2480 $self->{nc} <= 0x0039) { # 0..9
2481 !!!cp (994);
2482 $self->{state} = NCR_NUM_STATE;
2483 $self->{s_kwd} = $self->{nc} - 0x0030;
2484 !!!next-input-character;
2485 redo A;
2486 } else {
2487 !!!parse-error (type => 'bare nero',
2488 line => $self->{line_prev},
2489 column => $self->{column_prev} - 1);
2490
2491 ## NOTE: According to the spec algorithm, nothing is returned,
2492 ## and then "&#" is appended to the parent element or the attribute
2493 ## value in the later processing.
2494
2495 if ($self->{prev_state} == DATA_STATE) {
2496 !!!cp (1019);
2497 $self->{state} = $self->{prev_state};
2498 $self->{s_kwd} = '';
2499 ## Reconsume.
2500 !!!emit ({type => CHARACTER_TOKEN,
2501 data => '&#',
2502 line => $self->{line_prev},
2503 column => $self->{column_prev} - 1,
2504 });
2505 redo A;
2506 } else {
2507 !!!cp (993);
2508 $self->{ca}->{value} .= '&#';
2509 $self->{state} = $self->{prev_state};
2510 $self->{s_kwd} = '';
2511 ## Reconsume.
2512 redo A;
2513 }
2514 }
2515 } elsif ($self->{state} == NCR_NUM_STATE) {
2516 if (0x0030 <= $self->{nc} and
2517 $self->{nc} <= 0x0039) { # 0..9
2518 !!!cp (1012);
2519 $self->{s_kwd} *= 10;
2520 $self->{s_kwd} += $self->{nc} - 0x0030;
2521
2522 ## Stay in the state.
2523 !!!next-input-character;
2524 redo A;
2525 } elsif ($self->{nc} == 0x003B) { # ;
2526 !!!cp (1013);
2527 !!!next-input-character;
2528 #
2529 } else {
2530 !!!cp (1014);
2531 !!!parse-error (type => 'no refc');
2532 ## Reconsume.
2533 #
2534 }
2535
2536 my $code = $self->{s_kwd};
2537 my $l = $self->{line_prev};
2538 my $c = $self->{column_prev};
2539 if ($charref_map->{$code}) {
2540 !!!cp (1015);
2541 !!!parse-error (type => 'invalid character reference',
2542 text => (sprintf 'U+%04X', $code),
2543 line => $l, column => $c);
2544 $code = $charref_map->{$code};
2545 } elsif ($code > 0x10FFFF) {
2546 !!!cp (1016);
2547 !!!parse-error (type => 'invalid character reference',
2548 text => (sprintf 'U-%08X', $code),
2549 line => $l, column => $c);
2550 $code = 0xFFFD;
2551 }
2552
2553 if ($self->{prev_state} == DATA_STATE) {
2554 !!!cp (992);
2555 $self->{state} = $self->{prev_state};
2556 $self->{s_kwd} = '';
2557 ## Reconsume.
2558 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2559 line => $l, column => $c,
2560 });
2561 redo A;
2562 } else {
2563 !!!cp (991);
2564 $self->{ca}->{value} .= chr $code;
2565 $self->{ca}->{has_reference} = 1;
2566 $self->{state} = $self->{prev_state};
2567 $self->{s_kwd} = '';
2568 ## Reconsume.
2569 redo A;
2570 }
2571 } elsif ($self->{state} == HEXREF_X_STATE) {
2572 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2573 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2574 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2575 # 0..9, A..F, a..f
2576 !!!cp (990);
2577 $self->{state} = HEXREF_HEX_STATE;
2578 $self->{s_kwd} = 0;
2579 ## Reconsume.
2580 redo A;
2581 } else {
2582 !!!parse-error (type => 'bare hcro',
2583 line => $self->{line_prev},
2584 column => $self->{column_prev} - 2);
2585
2586 ## NOTE: According to the spec algorithm, nothing is returned,
2587 ## and then "&#" followed by "X" or "x" is appended to the parent
2588 ## element or the attribute value in the later processing.
2589
2590 if ($self->{prev_state} == DATA_STATE) {
2591 !!!cp (1005);
2592 $self->{state} = $self->{prev_state};
2593 $self->{s_kwd} = '';
2594 ## Reconsume.
2595 !!!emit ({type => CHARACTER_TOKEN,
2596 data => '&' . $self->{s_kwd},
2597 line => $self->{line_prev},
2598 column => $self->{column_prev} - length $self->{s_kwd},
2599 });
2600 redo A;
2601 } else {
2602 !!!cp (989);
2603 $self->{ca}->{value} .= '&' . $self->{s_kwd};
2604 $self->{state} = $self->{prev_state};
2605 $self->{s_kwd} = '';
2606 ## Reconsume.
2607 redo A;
2608 }
2609 }
2610 } elsif ($self->{state} == HEXREF_HEX_STATE) {
2611 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2612 # 0..9
2613 !!!cp (1002);
2614 $self->{s_kwd} *= 0x10;
2615 $self->{s_kwd} += $self->{nc} - 0x0030;
2616 ## Stay in the state.
2617 !!!next-input-character;
2618 redo A;
2619 } elsif (0x0061 <= $self->{nc} and
2620 $self->{nc} <= 0x0066) { # a..f
2621 !!!cp (1003);
2622 $self->{s_kwd} *= 0x10;
2623 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2624 ## Stay in the state.
2625 !!!next-input-character;
2626 redo A;
2627 } elsif (0x0041 <= $self->{nc} and
2628 $self->{nc} <= 0x0046) { # A..F
2629 !!!cp (1004);
2630 $self->{s_kwd} *= 0x10;
2631 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2632 ## Stay in the state.
2633 !!!next-input-character;
2634 redo A;
2635 } elsif ($self->{nc} == 0x003B) { # ;
2636 !!!cp (1006);
2637 !!!next-input-character;
2638 #
2639 } else {
2640 !!!cp (1007);
2641 !!!parse-error (type => 'no refc',
2642 line => $self->{line},
2643 column => $self->{column});
2644 ## Reconsume.
2645 #
2646 }
2647
2648 my $code = $self->{s_kwd};
2649 my $l = $self->{line_prev};
2650 my $c = $self->{column_prev};
2651 if ($charref_map->{$code}) {
2652 !!!cp (1008);
2653 !!!parse-error (type => 'invalid character reference',
2654 text => (sprintf 'U+%04X', $code),
2655 line => $l, column => $c);
2656 $code = $charref_map->{$code};
2657 } elsif ($code > 0x10FFFF) {
2658 !!!cp (1009);
2659 !!!parse-error (type => 'invalid character reference',
2660 text => (sprintf 'U-%08X', $code),
2661 line => $l, column => $c);
2662 $code = 0xFFFD;
2663 }
2664
2665 if ($self->{prev_state} == DATA_STATE) {
2666 !!!cp (988);
2667 $self->{state} = $self->{prev_state};
2668 $self->{s_kwd} = '';
2669 ## Reconsume.
2670 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2671 line => $l, column => $c,
2672 });
2673 redo A;
2674 } else {
2675 !!!cp (987);
2676 $self->{ca}->{value} .= chr $code;
2677 $self->{ca}->{has_reference} = 1;
2678 $self->{state} = $self->{prev_state};
2679 $self->{s_kwd} = '';
2680 ## Reconsume.
2681 redo A;
2682 }
2683 } elsif ($self->{state} == ENTITY_NAME_STATE) {
2684 if (length $self->{s_kwd} < 30 and
2685 ## NOTE: Some number greater than the maximum length of entity name
2686 ((0x0041 <= $self->{nc} and # a
2687 $self->{nc} <= 0x005A) or # x
2688 (0x0061 <= $self->{nc} and # a
2689 $self->{nc} <= 0x007A) or # z
2690 (0x0030 <= $self->{nc} and # 0
2691 $self->{nc} <= 0x0039) or # 9
2692 $self->{nc} == 0x003B)) { # ;
2693 our $EntityChar;
2694 $self->{s_kwd} .= chr $self->{nc};
2695 if (defined $EntityChar->{$self->{s_kwd}}) {
2696 if ($self->{nc} == 0x003B) { # ;
2697 !!!cp (1020);
2698 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2699 $self->{entity__match} = 1;
2700 !!!next-input-character;
2701 #
2702 } else {
2703 !!!cp (1021);
2704 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2705 $self->{entity__match} = -1;
2706 ## Stay in the state.
2707 !!!next-input-character;
2708 redo A;
2709 }
2710 } else {
2711 !!!cp (1022);
2712 $self->{entity__value} .= chr $self->{nc};
2713 $self->{entity__match} *= 2;
2714 ## Stay in the state.
2715 !!!next-input-character;
2716 redo A;
2717 }
2718 }
2719
2720 my $data;
2721 my $has_ref;
2722 if ($self->{entity__match} > 0) {
2723 !!!cp (1023);
2724 $data = $self->{entity__value};
2725 $has_ref = 1;
2726 #
2727 } elsif ($self->{entity__match} < 0) {
2728 !!!parse-error (type => 'no refc');
2729 if ($self->{prev_state} != DATA_STATE and # in attribute
2730 $self->{entity__match} < -1) {
2731 !!!cp (1024);
2732 $data = '&' . $self->{s_kwd};
2733 #
2734 } else {
2735 !!!cp (1025);
2736 $data = $self->{entity__value};
2737 $has_ref = 1;
2738 #
2739 }
2740 } else {
2741 !!!cp (1026);
2742 !!!parse-error (type => 'bare ero',
2743 line => $self->{line_prev},
2744 column => $self->{column_prev} - length $self->{s_kwd});
2745 $data = '&' . $self->{s_kwd};
2746 #
2747 }
2748
2749 ## NOTE: In these cases, when a character reference is found,
2750 ## it is consumed and a character token is returned, or, otherwise,
2751 ## nothing is consumed and returned, according to the spec algorithm.
2752 ## In this implementation, anything that has been examined by the
2753 ## tokenizer is appended to the parent element or the attribute value
2754 ## as string, either literal string when no character reference or
2755 ## entity-replaced string otherwise, in this stage, since any characters
2756 ## that would not be consumed are appended in the data state or in an
2757 ## appropriate attribute value state anyway.
2758
2759 if ($self->{prev_state} == DATA_STATE) {
2760 !!!cp (986);
2761 $self->{state} = $self->{prev_state};
2762 $self->{s_kwd} = '';
2763 ## Reconsume.
2764 !!!emit ({type => CHARACTER_TOKEN,
2765 data => $data,
2766 line => $self->{line_prev},
2767 column => $self->{column_prev} + 1 - length $self->{s_kwd},
2768 });
2769 redo A;
2770 } else {
2771 !!!cp (985);
2772 $self->{ca}->{value} .= $data;
2773 $self->{ca}->{has_reference} = 1 if $has_ref;
2774 $self->{state} = $self->{prev_state};
2775 $self->{s_kwd} = '';
2776 ## Reconsume.
2777 redo A;
2778 }
2779 } else {
2780 die "$0: $self->{state}: Unknown state";
2781 }
2782 } # A
2783
2784 die "$0: _get_next_token: unexpected case";
2785 } # _get_next_token
2786
2787 1;
2788 ## $Date: 2008/10/14 14:38:59 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24