/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (show annotations) (download) (as text)
Tue Oct 14 15:25:50 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.6: +6 -2 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	14 Oct 2008 15:23:30 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/charref-1.dat" added.

++ whatpm/t/xml/ChangeLog	14 Oct 2008 15:23:49 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* charref-1.dat: New test data file.

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 15:24:42 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: Mark CHARACTER_TOKEN with character reference
	as such, for the support of XML parse error.

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 15:25:35 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Raise a parse error for white space character
	generated by a character reference outside of the root element.

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.6 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
BEGIN {
  ## Turn this package into an Exporter subclass at compile time, so that
  ## an |import| call made from a later BEGIN block can already resolve
  ## the names listed here.
  require Exporter;
  our (@ISA, @EXPORT_OK, %EXPORT_TAGS);
  push @ISA, 'Exporter';

  ## Names of the token type constants.  The same list is offered both
  ## for individual import and as the |:token| tag.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
  );

  @EXPORT_OK = @token_type_names;
  %EXPORT_TAGS = (token => [@token_type_names]);
}
33
34 ## Token types
35
## Integer codes stored in a token's |->{type}| field.  Each is an
## empty-prototype constant sub; the names are the ones listed in
## |@EXPORT_OK| and in the |:token| export tag.
36 sub DOCTYPE_TOKEN () { 1 }
37 sub COMMENT_TOKEN () { 2 }
38 sub START_TAG_TOKEN () { 3 }
39 sub END_TAG_TOKEN () { 4 }
40 sub END_OF_FILE_TOKEN () { 5 }
41 sub CHARACTER_TOKEN () { 6 }
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
45 package Whatpm::HTML;
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48
49 ## Content model flags
50
## The |CM_*| constants are single-bit flags.  The |*_CONTENT_MODEL|
## constants are the flag combinations assigned to
## |$self->{content_model}|, which the tokenizer tests with |&|.
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54
## PLAINTEXT sets no flag at all; CDATA sets only the limited-markup
## flag; RCDATA adds the entity flag to that; PCDATA combines the
## entity flag with the full-markup flag.
55 sub PLAINTEXT_CONTENT_MODEL () { 0 }
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59
60 ## Tokenizer states
61
## Each constant below is a distinct integer that is stored in
## |$self->{state}| and compared with |==| by |_get_next_token|.
## The values 1 and 12 are unused: the constants that carried them are
## commented out.
62 sub DATA_STATE () { 0 }
63 #sub ENTITY_DATA_STATE () { 1 }
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
## States 36-43 split a single state of the spec into several
## implementation states; the trailing comment names the spec state.
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## Tree constructor state constants (see Whatpm::HTML for the full
118 ## list and descriptions)
119
## NOTE(review): both literals below evaluate to the same number (only
## bit 11 set, i.e. 2048).  Presumably the two constants belong to
## separate bit fields defined in Whatpm::HTML -- confirm there.
120 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121 sub FOREIGN_EL () { 0b1_00000000000 }
122
123 ## Character reference mappings
124
## Replacement table keyed by code point: |$charref_map->{$code}| is the
## code point to use instead of |$code|.  A code point with no entry is
## not remapped.
## NOTE(review): presumably consulted while resolving numeric character
## references; the lookup site is outside this part of the file.
my $charref_map = {
  ## CARRIAGE RETURN becomes LINE FEED.
  0x0D => 0x000A,
  ## 0x80..0x9F are remapped individually (for example 0x80 becomes
  ## U+20AC); the slots mapped to 0xFFFD become U+FFFD REPLACEMENT
  ## CHARACTER.
  0x80 => 0x20AC,
  0x81 => 0xFFFD,
  0x82 => 0x201A,
  0x83 => 0x0192,
  0x84 => 0x201E,
  0x85 => 0x2026,
  0x86 => 0x2020,
  0x87 => 0x2021,
  0x88 => 0x02C6,
  0x89 => 0x2030,
  0x8A => 0x0160,
  0x8B => 0x2039,
  0x8C => 0x0152,
  0x8D => 0xFFFD,
  0x8E => 0x017D,
  0x8F => 0xFFFD,
  0x90 => 0xFFFD,
  0x91 => 0x2018,
  0x92 => 0x2019,
  0x93 => 0x201C,
  0x94 => 0x201D,
  0x95 => 0x2022,
  0x96 => 0x2013,
  0x97 => 0x2014,
  0x98 => 0x02DC,
  0x99 => 0x2122,
  0x9A => 0x0161,
  0x9B => 0x203A,
  0x9C => 0x0153,
  0x9D => 0xFFFD,
  0x9E => 0x017E,
  0x9F => 0x0178,
}; # $charref_map

## Everything listed here is replaced by U+FFFD: the C0 controls other
## than HT, LF, FF and CR, plus DELETE, the surrogates, and the
## noncharacters.
## The noncharacter block runs U+FDD0..U+FDEF (32 code points); the
## earlier upper bound of U+FDDF covered only the first half of it.
## The last two code points of each of the 17 planes (U+xxFFFE and
## U+xxFFFF) are generated rather than spelled out.
$charref_map->{$_} = 0xFFFD
    for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        0xD800..0xDFFF,
        0xFDD0..0xFDEF,
        (map {
          my $plane_base = $_ << 16;
          ($plane_base | 0xFFFE, $plane_base | 0xFFFF);
        } 0x00..0x10);
168
169 ## Implementations MUST act as if they used the state machine in the spec.
170
## Puts the tokenizer fields of the object into their starting
## condition: |DATA_STATE|, the |PCDATA| content model, no current
## token or attribute, an empty character buffer and an empty queue of
## pending tokens.
##
## Argument: the object whose fields are reset (taken via |shift|).
## There is no explicit |return|; callers should not rely on the value.
171 sub _initialize_tokenizer ($) {
172 my $self = shift;
173
174 ## NOTE: Fields set by |new| constructor:
175 #$self->{level}
176 #$self->{set_nc}
177 #$self->{parse_error}
178 #$self->{is_xml} (if XML)
179
180 $self->{state} = DATA_STATE; # MUST
181 $self->{s_kwd} = ''; # state keyword
182 #$self->{entity__value}; # initialized when used
183 #$self->{entity__match}; # initialized when used
184 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
## The following three keys are kept in the hash with an undefined
## value, whereas |self_closing| is removed from the hash altogether.
185 undef $self->{ct}; # current token
186 undef $self->{ca}; # current attribute
187 undef $self->{last_stag_name}; # last emitted start tag name
188 #$self->{prev_state}; # initialized when used
189 delete $self->{self_closing};
190 $self->{char_buffer} = '';
191 $self->{char_buffer_pos} = 0;
## -1 is the value the state machine compares |nc| against before it
## emits an |END_OF_FILE_TOKEN|.
192 $self->{nc} = -1; # next input character
193 #$self->{next_nc}
## NOTE(review): |!!!next-input-character| is not Perl syntax;
## presumably a macro expanded when this .src file is preprocessed, and
## presumably it loads the first input character into |nc| -- confirm
## against the macro definition.
194 !!!next-input-character;
## Queue of pending tokens; |_get_next_token| shifts from it before it
## runs the state machine.
195 $self->{token} = [];
196 # $self->{escape}
197 } # _initialize_tokenizer
198
199 ## A token has:
200 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
201 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
202 ## ->{name} (DOCTYPE_TOKEN)
203 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
204 ## ->{pubid} (DOCTYPE_TOKEN)
205 ## ->{sysid} (DOCTYPE_TOKEN)
206 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
207 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
208 ## ->{name}
209 ## ->{value}
210 ## ->{has_reference} == 1 or 0
211 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
212 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
213 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
214 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
215 ## while the token is pushed back to the stack.
216
217 ## Emitted token MUST immediately be handled by the tree construction state.
218
219 ## Before each step, UA MAY check to see if either one of the scripts in
220 ## "list of scripts that will execute as soon as possible" or the first
221 ## script in the "list of scripts that will execute asynchronously",
222 ## has completed loading. If one has, then it MUST be executed
223 ## and removed from the list.
224
225 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
226 ## (This requirement was dropped from HTML5 spec, unfortunately.)
227
## Membership table for the code points that the tokenizer treats as
## white space; it is indexed with |$self->{nc}|.  Every member maps to
## 1.  LINE TABULATION (U+000B) and CARRIAGE RETURN (U+000D) are
## intentionally not members.
my $is_space = {
  map { $_ => 1 }
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF)
    0x0020, # SPACE (SP)
};
236
237 sub _get_next_token ($) {
238 my $self = shift;
239
240 if ($self->{self_closing}) {
241 !!!parse-error (type => 'nestc', token => $self->{ct});
242 ## NOTE: The |self_closing| flag is only set by start tag token.
243 ## In addition, when a start tag token is emitted, it is always set to
244 ## |ct|.
245 delete $self->{self_closing};
246 }
247
248 if (@{$self->{token}}) {
249 $self->{self_closing} = $self->{token}->[0]->{self_closing};
250 return shift @{$self->{token}};
251 }
252
253 A: {
254 if ($self->{state} == PCDATA_STATE) {
255 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
256
257 if ($self->{nc} == 0x0026) { # &
258 !!!cp (0.1);
259 ## NOTE: In the spec, the tokenizer is switched to the
260 ## "entity data state". In this implementation, the tokenizer
261 ## is switched to the |ENTITY_STATE|, which is an implementation
262 ## of the "consume a character reference" algorithm.
263 $self->{entity_add} = -1;
264 $self->{prev_state} = DATA_STATE;
265 $self->{state} = ENTITY_STATE;
266 !!!next-input-character;
267 redo A;
268 } elsif ($self->{nc} == 0x003C) { # <
269 !!!cp (0.2);
270 $self->{state} = TAG_OPEN_STATE;
271 !!!next-input-character;
272 redo A;
273 } elsif ($self->{nc} == -1) {
274 !!!cp (0.3);
275 !!!emit ({type => END_OF_FILE_TOKEN,
276 line => $self->{line}, column => $self->{column}});
277 last A; ## TODO: ok?
278 } else {
279 !!!cp (0.4);
280 #
281 }
282
283 # Anything else
284 my $token = {type => CHARACTER_TOKEN,
285 data => chr $self->{nc},
286 line => $self->{line}, column => $self->{column},
287 };
288 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
289
290 ## Stay in the state.
291 !!!next-input-character;
292 !!!emit ($token);
293 redo A;
294 } elsif ($self->{state} == DATA_STATE) {
295 $self->{s_kwd} = '' unless defined $self->{s_kwd};
296 if ($self->{nc} == 0x0026) { # &
297 $self->{s_kwd} = '';
298 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
299 not $self->{escape}) {
300 !!!cp (1);
301 ## NOTE: In the spec, the tokenizer is switched to the
302 ## "entity data state". In this implementation, the tokenizer
303 ## is switched to the |ENTITY_STATE|, which is an implementation
304 ## of the "consume a character reference" algorithm.
305 $self->{entity_add} = -1;
306 $self->{prev_state} = DATA_STATE;
307 $self->{state} = ENTITY_STATE;
308 !!!next-input-character;
309 redo A;
310 } else {
311 !!!cp (2);
312 #
313 }
314 } elsif ($self->{nc} == 0x002D) { # -
315 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
316 if ($self->{s_kwd} eq '<!-') {
317 !!!cp (3);
318 $self->{escape} = 1; # unless $self->{escape};
319 $self->{s_kwd} = '--';
320 #
321 } elsif ($self->{s_kwd} eq '-') {
322 !!!cp (4);
323 $self->{s_kwd} = '--';
324 #
325 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
326 !!!cp (4.1);
327 $self->{s_kwd} .= '-';
328 #
329 } else {
330 !!!cp (5);
331 $self->{s_kwd} = '-';
332 #
333 }
334 }
335
336 #
337 } elsif ($self->{nc} == 0x0021) { # !
338 if (length $self->{s_kwd}) {
339 !!!cp (5.1);
340 $self->{s_kwd} .= '!';
341 #
342 } else {
343 !!!cp (5.2);
344 #$self->{s_kwd} = '';
345 #
346 }
347 #
348 } elsif ($self->{nc} == 0x003C) { # <
349 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
350 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
351 not $self->{escape})) {
352 !!!cp (6);
353 $self->{state} = TAG_OPEN_STATE;
354 !!!next-input-character;
355 redo A;
356 } else {
357 !!!cp (7);
358 $self->{s_kwd} = '';
359 #
360 }
361 } elsif ($self->{nc} == 0x003E) { # >
362 if ($self->{escape} and
363 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
364 if ($self->{s_kwd} eq '--') {
365 !!!cp (8);
366 delete $self->{escape};
367 #
368 } else {
369 !!!cp (9);
370 #
371 }
372 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
373 !!!cp (9.1);
374 !!!parse-error (type => 'unmatched mse', ## TODO: type
375 line => $self->{line_prev},
376 column => $self->{column_prev} - 1);
377 #
378 } else {
379 !!!cp (10);
380 #
381 }
382
383 $self->{s_kwd} = '';
384 #
385 } elsif ($self->{nc} == 0x005D) { # ]
386 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
387 !!!cp (10.1);
388 $self->{s_kwd} .= ']';
389 } elsif ($self->{s_kwd} eq ']]') {
390 !!!cp (10.2);
391 #
392 } else {
393 !!!cp (10.3);
394 $self->{s_kwd} = '';
395 }
396 #
397 } elsif ($self->{nc} == -1) {
398 !!!cp (11);
399 $self->{s_kwd} = '';
400 !!!emit ({type => END_OF_FILE_TOKEN,
401 line => $self->{line}, column => $self->{column}});
402 last A; ## TODO: ok?
403 } else {
404 !!!cp (12);
405 $self->{s_kwd} = '';
406 #
407 }
408
409 # Anything else
410 my $token = {type => CHARACTER_TOKEN,
411 data => chr $self->{nc},
412 line => $self->{line}, column => $self->{column},
413 };
414 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
415 length $token->{data})) {
416 $self->{s_kwd} = '';
417 }
418
419 ## Stay in the data state.
420 if (not $self->{is_xml} and
421 $self->{content_model} == PCDATA_CONTENT_MODEL) {
422 !!!cp (13);
423 $self->{state} = PCDATA_STATE;
424 } else {
425 !!!cp (14);
426 ## Stay in the state.
427 }
428 !!!next-input-character;
429 !!!emit ($token);
430 redo A;
431 } elsif ($self->{state} == TAG_OPEN_STATE) {
432 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
433 if ($self->{nc} == 0x002F) { # /
434 !!!cp (15);
435 !!!next-input-character;
436 $self->{state} = CLOSE_TAG_OPEN_STATE;
437 redo A;
438 } elsif ($self->{nc} == 0x0021) { # !
439 !!!cp (15.1);
440 $self->{s_kwd} = '<' unless $self->{escape};
441 #
442 } else {
443 !!!cp (16);
444 #
445 }
446
447 ## reconsume
448 $self->{state} = DATA_STATE;
449 $self->{s_kwd} = '';
450 !!!emit ({type => CHARACTER_TOKEN, data => '<',
451 line => $self->{line_prev},
452 column => $self->{column_prev},
453 });
454 redo A;
455 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
456 if ($self->{nc} == 0x0021) { # !
457 !!!cp (17);
458 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
459 !!!next-input-character;
460 redo A;
461 } elsif ($self->{nc} == 0x002F) { # /
462 !!!cp (18);
463 $self->{state} = CLOSE_TAG_OPEN_STATE;
464 !!!next-input-character;
465 redo A;
466 } elsif (0x0041 <= $self->{nc} and
467 $self->{nc} <= 0x005A) { # A..Z
468 !!!cp (19);
469 $self->{ct}
470 = {type => START_TAG_TOKEN,
471 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
472 line => $self->{line_prev},
473 column => $self->{column_prev}};
474 $self->{state} = TAG_NAME_STATE;
475 !!!next-input-character;
476 redo A;
477 } elsif (0x0061 <= $self->{nc} and
478 $self->{nc} <= 0x007A) { # a..z
479 !!!cp (20);
480 $self->{ct} = {type => START_TAG_TOKEN,
481 tag_name => chr ($self->{nc}),
482 line => $self->{line_prev},
483 column => $self->{column_prev}};
484 $self->{state} = TAG_NAME_STATE;
485 !!!next-input-character;
486 redo A;
487 } elsif ($self->{nc} == 0x003E) { # >
488 !!!cp (21);
489 !!!parse-error (type => 'empty start tag',
490 line => $self->{line_prev},
491 column => $self->{column_prev});
492 $self->{state} = DATA_STATE;
493 $self->{s_kwd} = '';
494 !!!next-input-character;
495
496 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
497 line => $self->{line_prev},
498 column => $self->{column_prev},
499 });
500
501 redo A;
502 } elsif ($self->{nc} == 0x003F) { # ?
503 !!!cp (22);
504 !!!parse-error (type => 'pio',
505 line => $self->{line_prev},
506 column => $self->{column_prev});
507 $self->{state} = BOGUS_COMMENT_STATE;
508 $self->{ct} = {type => COMMENT_TOKEN, data => '',
509 line => $self->{line_prev},
510 column => $self->{column_prev},
511 };
512 ## $self->{nc} is intentionally left as is
513 redo A;
514 } else {
515 !!!cp (23);
516 !!!parse-error (type => 'bare stago',
517 line => $self->{line_prev},
518 column => $self->{column_prev});
519 $self->{state} = DATA_STATE;
520 $self->{s_kwd} = '';
521 ## reconsume
522
523 !!!emit ({type => CHARACTER_TOKEN, data => '<',
524 line => $self->{line_prev},
525 column => $self->{column_prev},
526 });
527
528 redo A;
529 }
530 } else {
531 die "$0: $self->{content_model} in tag open";
532 }
533 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
534 ## NOTE: The "close tag open state" in the spec is implemented as
535 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
536
537 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
538 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
539 if (defined $self->{last_stag_name}) {
540 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
541 $self->{s_kwd} = '';
542 ## Reconsume.
543 redo A;
544 } else {
545 ## No start tag token has ever been emitted
546 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
547 !!!cp (28);
548 $self->{state} = DATA_STATE;
549 $self->{s_kwd} = '';
550 ## Reconsume.
551 !!!emit ({type => CHARACTER_TOKEN, data => '</',
552 line => $l, column => $c,
553 });
554 redo A;
555 }
556 }
557
558 if (0x0041 <= $self->{nc} and
559 $self->{nc} <= 0x005A) { # A..Z
560 !!!cp (29);
561 $self->{ct}
562 = {type => END_TAG_TOKEN,
563 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
564 line => $l, column => $c};
565 $self->{state} = TAG_NAME_STATE;
566 !!!next-input-character;
567 redo A;
568 } elsif (0x0061 <= $self->{nc} and
569 $self->{nc} <= 0x007A) { # a..z
570 !!!cp (30);
571 $self->{ct} = {type => END_TAG_TOKEN,
572 tag_name => chr ($self->{nc}),
573 line => $l, column => $c};
574 $self->{state} = TAG_NAME_STATE;
575 !!!next-input-character;
576 redo A;
577 } elsif ($self->{nc} == 0x003E) { # >
578 !!!cp (31);
579 !!!parse-error (type => 'empty end tag',
580 line => $self->{line_prev}, ## "<" in "</>"
581 column => $self->{column_prev} - 1);
582 $self->{state} = DATA_STATE;
583 $self->{s_kwd} = '';
584 !!!next-input-character;
585 redo A;
586 } elsif ($self->{nc} == -1) {
587 !!!cp (32);
588 !!!parse-error (type => 'bare etago');
589 $self->{s_kwd} = '';
590 $self->{state} = DATA_STATE;
591 # reconsume
592
593 !!!emit ({type => CHARACTER_TOKEN, data => '</',
594 line => $l, column => $c,
595 });
596
597 redo A;
598 } else {
599 !!!cp (33);
600 !!!parse-error (type => 'bogus end tag');
601 $self->{state} = BOGUS_COMMENT_STATE;
602 $self->{ct} = {type => COMMENT_TOKEN, data => '',
603 line => $self->{line_prev}, # "<" of "</"
604 column => $self->{column_prev} - 1,
605 };
606 ## NOTE: $self->{nc} is intentionally left as is.
607 ## Although the "anything else" case of the spec not explicitly
608 ## states that the next input character is to be reconsumed,
609 ## it will be included to the |data| of the comment token
610 ## generated from the bogus end tag, as defined in the
611 ## "bogus comment state" entry.
612 redo A;
613 }
614 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
615 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
616 if (length $ch) {
617 my $CH = $ch;
618 $ch =~ tr/a-z/A-Z/;
619 my $nch = chr $self->{nc};
620 if ($nch eq $ch or $nch eq $CH) {
621 !!!cp (24);
622 ## Stay in the state.
623 $self->{s_kwd} .= $nch;
624 !!!next-input-character;
625 redo A;
626 } else {
627 !!!cp (25);
628 $self->{state} = DATA_STATE;
629 $self->{s_kwd} = '';
630 ## Reconsume.
631 !!!emit ({type => CHARACTER_TOKEN,
632 data => '</' . $self->{s_kwd},
633 line => $self->{line_prev},
634 column => $self->{column_prev} - 1 - length $self->{s_kwd},
635 });
636 redo A;
637 }
638 } else { # after "<{tag-name}"
639 unless ($is_space->{$self->{nc}} or
640 {
641 0x003E => 1, # >
642 0x002F => 1, # /
643 -1 => 1, # EOF
644 }->{$self->{nc}}) {
645 !!!cp (26);
646 ## Reconsume.
647 $self->{state} = DATA_STATE;
648 $self->{s_kwd} = '';
649 !!!emit ({type => CHARACTER_TOKEN,
650 data => '</' . $self->{s_kwd},
651 line => $self->{line_prev},
652 column => $self->{column_prev} - 1 - length $self->{s_kwd},
653 });
654 redo A;
655 } else {
656 !!!cp (27);
657 $self->{ct}
658 = {type => END_TAG_TOKEN,
659 tag_name => $self->{last_stag_name},
660 line => $self->{line_prev},
661 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
662 $self->{state} = TAG_NAME_STATE;
663 ## Reconsume.
664 redo A;
665 }
666 }
667 } elsif ($self->{state} == TAG_NAME_STATE) {
668 if ($is_space->{$self->{nc}}) {
669 !!!cp (34);
670 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
671 !!!next-input-character;
672 redo A;
673 } elsif ($self->{nc} == 0x003E) { # >
674 if ($self->{ct}->{type} == START_TAG_TOKEN) {
675 !!!cp (35);
676 $self->{last_stag_name} = $self->{ct}->{tag_name};
677 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
678 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
679 #if ($self->{ct}->{attributes}) {
680 # ## NOTE: This should never be reached.
681 # !!! cp (36);
682 # !!! parse-error (type => 'end tag attribute');
683 #} else {
684 !!!cp (37);
685 #}
686 } else {
687 die "$0: $self->{ct}->{type}: Unknown token type";
688 }
689 $self->{state} = DATA_STATE;
690 $self->{s_kwd} = '';
691 !!!next-input-character;
692
693 !!!emit ($self->{ct}); # start tag or end tag
694
695 redo A;
696 } elsif (0x0041 <= $self->{nc} and
697 $self->{nc} <= 0x005A) { # A..Z
698 !!!cp (38);
699 $self->{ct}->{tag_name}
700 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
701 # start tag or end tag
702 ## Stay in this state
703 !!!next-input-character;
704 redo A;
705 } elsif ($self->{nc} == -1) {
706 !!!parse-error (type => 'unclosed tag');
707 if ($self->{ct}->{type} == START_TAG_TOKEN) {
708 !!!cp (39);
709 $self->{last_stag_name} = $self->{ct}->{tag_name};
710 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
711 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
712 #if ($self->{ct}->{attributes}) {
713 # ## NOTE: This state should never be reached.
714 # !!! cp (40);
715 # !!! parse-error (type => 'end tag attribute');
716 #} else {
717 !!!cp (41);
718 #}
719 } else {
720 die "$0: $self->{ct}->{type}: Unknown token type";
721 }
722 $self->{state} = DATA_STATE;
723 $self->{s_kwd} = '';
724 # reconsume
725
726 !!!emit ($self->{ct}); # start tag or end tag
727
728 redo A;
729 } elsif ($self->{nc} == 0x002F) { # /
730 !!!cp (42);
731 $self->{state} = SELF_CLOSING_START_TAG_STATE;
732 !!!next-input-character;
733 redo A;
734 } else {
735 !!!cp (44);
736 $self->{ct}->{tag_name} .= chr $self->{nc};
737 # start tag or end tag
738 ## Stay in the state
739 !!!next-input-character;
740 redo A;
741 }
742 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
743 if ($is_space->{$self->{nc}}) {
744 !!!cp (45);
745 ## Stay in the state
746 !!!next-input-character;
747 redo A;
748 } elsif ($self->{nc} == 0x003E) { # >
749 if ($self->{ct}->{type} == START_TAG_TOKEN) {
750 !!!cp (46);
751 $self->{last_stag_name} = $self->{ct}->{tag_name};
752 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
753 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
754 if ($self->{ct}->{attributes}) {
755 !!!cp (47);
756 !!!parse-error (type => 'end tag attribute');
757 } else {
758 !!!cp (48);
759 }
760 } else {
761 die "$0: $self->{ct}->{type}: Unknown token type";
762 }
763 $self->{state} = DATA_STATE;
764 $self->{s_kwd} = '';
765 !!!next-input-character;
766
767 !!!emit ($self->{ct}); # start tag or end tag
768
769 redo A;
770 } elsif (0x0041 <= $self->{nc} and
771 $self->{nc} <= 0x005A) { # A..Z
772 !!!cp (49);
773 $self->{ca}
774 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
775 value => '',
776 line => $self->{line}, column => $self->{column}};
777 $self->{state} = ATTRIBUTE_NAME_STATE;
778 !!!next-input-character;
779 redo A;
780 } elsif ($self->{nc} == 0x002F) { # /
781 !!!cp (50);
782 $self->{state} = SELF_CLOSING_START_TAG_STATE;
783 !!!next-input-character;
784 redo A;
785 } elsif ($self->{nc} == -1) {
786 !!!parse-error (type => 'unclosed tag');
787 if ($self->{ct}->{type} == START_TAG_TOKEN) {
788 !!!cp (52);
789 $self->{last_stag_name} = $self->{ct}->{tag_name};
790 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
791 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
792 if ($self->{ct}->{attributes}) {
793 !!!cp (53);
794 !!!parse-error (type => 'end tag attribute');
795 } else {
796 !!!cp (54);
797 }
798 } else {
799 die "$0: $self->{ct}->{type}: Unknown token type";
800 }
801 $self->{state} = DATA_STATE;
802 $self->{s_kwd} = '';
803 # reconsume
804
805 !!!emit ($self->{ct}); # start tag or end tag
806
807 redo A;
808 } else {
809 if ({
810 0x0022 => 1, # "
811 0x0027 => 1, # '
812 0x003D => 1, # =
813 }->{$self->{nc}}) {
814 !!!cp (55);
815 !!!parse-error (type => 'bad attribute name');
816 } else {
817 !!!cp (56);
818 }
819 $self->{ca}
820 = {name => chr ($self->{nc}),
821 value => '',
822 line => $self->{line}, column => $self->{column}};
823 $self->{state} = ATTRIBUTE_NAME_STATE;
824 !!!next-input-character;
825 redo A;
826 }
827 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
828 my $before_leave = sub {
829 if (exists $self->{ct}->{attributes} # start tag or end tag
830 ->{$self->{ca}->{name}}) { # MUST
831 !!!cp (57);
832 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
833 ## Discard $self->{ca} # MUST
834 } else {
835 !!!cp (58);
836 $self->{ct}->{attributes}->{$self->{ca}->{name}}
837 = $self->{ca};
838 }
839 }; # $before_leave
840
841 if ($is_space->{$self->{nc}}) {
842 !!!cp (59);
843 $before_leave->();
844 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
845 !!!next-input-character;
846 redo A;
847 } elsif ($self->{nc} == 0x003D) { # =
848 !!!cp (60);
849 $before_leave->();
850 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
851 !!!next-input-character;
852 redo A;
853 } elsif ($self->{nc} == 0x003E) { # >
854 $before_leave->();
855 if ($self->{ct}->{type} == START_TAG_TOKEN) {
856 !!!cp (61);
857 $self->{last_stag_name} = $self->{ct}->{tag_name};
858 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
859 !!!cp (62);
860 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
861 if ($self->{ct}->{attributes}) {
862 !!!parse-error (type => 'end tag attribute');
863 }
864 } else {
865 die "$0: $self->{ct}->{type}: Unknown token type";
866 }
867 $self->{state} = DATA_STATE;
868 $self->{s_kwd} = '';
869 !!!next-input-character;
870
871 !!!emit ($self->{ct}); # start tag or end tag
872
873 redo A;
874 } elsif (0x0041 <= $self->{nc} and
875 $self->{nc} <= 0x005A) { # A..Z
876 !!!cp (63);
877 $self->{ca}->{name}
878 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
879 ## Stay in the state
880 !!!next-input-character;
881 redo A;
882 } elsif ($self->{nc} == 0x002F) { # /
883 !!!cp (64);
884 $before_leave->();
885 $self->{state} = SELF_CLOSING_START_TAG_STATE;
886 !!!next-input-character;
887 redo A;
888 } elsif ($self->{nc} == -1) {
889 !!!parse-error (type => 'unclosed tag');
890 $before_leave->();
891 if ($self->{ct}->{type} == START_TAG_TOKEN) {
892 !!!cp (66);
893 $self->{last_stag_name} = $self->{ct}->{tag_name};
894 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
895 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
896 if ($self->{ct}->{attributes}) {
897 !!!cp (67);
898 !!!parse-error (type => 'end tag attribute');
899 } else {
900 ## NOTE: This state should never be reached.
901 !!!cp (68);
902 }
903 } else {
904 die "$0: $self->{ct}->{type}: Unknown token type";
905 }
906 $self->{state} = DATA_STATE;
907 $self->{s_kwd} = '';
908 # reconsume
909
910 !!!emit ($self->{ct}); # start tag or end tag
911
912 redo A;
913 } else {
914 if ($self->{nc} == 0x0022 or # "
915 $self->{nc} == 0x0027) { # '
916 !!!cp (69);
917 !!!parse-error (type => 'bad attribute name');
918 } else {
919 !!!cp (70);
920 }
921 $self->{ca}->{name} .= chr ($self->{nc});
922 ## Stay in the state
923 !!!next-input-character;
924 redo A;
925 }
926 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
927 if ($is_space->{$self->{nc}}) {
928 !!!cp (71);
929 ## Stay in the state
930 !!!next-input-character;
931 redo A;
932 } elsif ($self->{nc} == 0x003D) { # =
933 !!!cp (72);
934 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
935 !!!next-input-character;
936 redo A;
937 } elsif ($self->{nc} == 0x003E) { # >
938 if ($self->{ct}->{type} == START_TAG_TOKEN) {
939 !!!cp (73);
940 $self->{last_stag_name} = $self->{ct}->{tag_name};
941 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
942 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
943 if ($self->{ct}->{attributes}) {
944 !!!cp (74);
945 !!!parse-error (type => 'end tag attribute');
946 } else {
947 ## NOTE: This state should never be reached.
948 !!!cp (75);
949 }
950 } else {
951 die "$0: $self->{ct}->{type}: Unknown token type";
952 }
953 $self->{state} = DATA_STATE;
954 $self->{s_kwd} = '';
955 !!!next-input-character;
956
957 !!!emit ($self->{ct}); # start tag or end tag
958
959 redo A;
960 } elsif (0x0041 <= $self->{nc} and
961 $self->{nc} <= 0x005A) { # A..Z
962 !!!cp (76);
963 $self->{ca}
964 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
965 value => '',
966 line => $self->{line}, column => $self->{column}};
967 $self->{state} = ATTRIBUTE_NAME_STATE;
968 !!!next-input-character;
969 redo A;
970 } elsif ($self->{nc} == 0x002F) { # /
971 !!!cp (77);
972 $self->{state} = SELF_CLOSING_START_TAG_STATE;
973 !!!next-input-character;
974 redo A;
975 } elsif ($self->{nc} == -1) {
976 !!!parse-error (type => 'unclosed tag');
977 if ($self->{ct}->{type} == START_TAG_TOKEN) {
978 !!!cp (79);
979 $self->{last_stag_name} = $self->{ct}->{tag_name};
980 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
981 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
982 if ($self->{ct}->{attributes}) {
983 !!!cp (80);
984 !!!parse-error (type => 'end tag attribute');
985 } else {
986 ## NOTE: This state should never be reached.
987 !!!cp (81);
988 }
989 } else {
990 die "$0: $self->{ct}->{type}: Unknown token type";
991 }
992 $self->{s_kwd} = '';
993 $self->{state} = DATA_STATE;
994 # reconsume
995
996 !!!emit ($self->{ct}); # start tag or end tag
997
998 redo A;
999 } else {
1000 if ($self->{nc} == 0x0022 or # "
1001 $self->{nc} == 0x0027) { # '
1002 !!!cp (78);
1003 !!!parse-error (type => 'bad attribute name');
1004 } else {
1005 !!!cp (82);
1006 }
1007 $self->{ca}
1008 = {name => chr ($self->{nc}),
1009 value => '',
1010 line => $self->{line}, column => $self->{column}};
1011 $self->{state} = ATTRIBUTE_NAME_STATE;
1012 !!!next-input-character;
1013 redo A;
1014 }
1015 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1016 if ($is_space->{$self->{nc}}) {
1017 !!!cp (83);
1018 ## Stay in the state
1019 !!!next-input-character;
1020 redo A;
1021 } elsif ($self->{nc} == 0x0022) { # "
1022 !!!cp (84);
1023 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1024 !!!next-input-character;
1025 redo A;
1026 } elsif ($self->{nc} == 0x0026) { # &
1027 !!!cp (85);
1028 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1029 ## reconsume
1030 redo A;
1031 } elsif ($self->{nc} == 0x0027) { # '
1032 !!!cp (86);
1033 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1034 !!!next-input-character;
1035 redo A;
1036 } elsif ($self->{nc} == 0x003E) { # >
1037 !!!parse-error (type => 'empty unquoted attribute value');
1038 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1039 !!!cp (87);
1040 $self->{last_stag_name} = $self->{ct}->{tag_name};
1041 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1042 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1043 if ($self->{ct}->{attributes}) {
1044 !!!cp (88);
1045 !!!parse-error (type => 'end tag attribute');
1046 } else {
1047 ## NOTE: This state should never be reached.
1048 !!!cp (89);
1049 }
1050 } else {
1051 die "$0: $self->{ct}->{type}: Unknown token type";
1052 }
1053 $self->{state} = DATA_STATE;
1054 $self->{s_kwd} = '';
1055 !!!next-input-character;
1056
1057 !!!emit ($self->{ct}); # start tag or end tag
1058
1059 redo A;
1060 } elsif ($self->{nc} == -1) {
1061 !!!parse-error (type => 'unclosed tag');
1062 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1063 !!!cp (90);
1064 $self->{last_stag_name} = $self->{ct}->{tag_name};
1065 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1066 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1067 if ($self->{ct}->{attributes}) {
1068 !!!cp (91);
1069 !!!parse-error (type => 'end tag attribute');
1070 } else {
1071 ## NOTE: This state should never be reached.
1072 !!!cp (92);
1073 }
1074 } else {
1075 die "$0: $self->{ct}->{type}: Unknown token type";
1076 }
1077 $self->{state} = DATA_STATE;
1078 $self->{s_kwd} = '';
1079 ## reconsume
1080
1081 !!!emit ($self->{ct}); # start tag or end tag
1082
1083 redo A;
1084 } else {
1085 if ($self->{nc} == 0x003D) { # =
1086 !!!cp (93);
1087 !!!parse-error (type => 'bad attribute value');
1088 } else {
1089 !!!cp (94);
1090 }
1091 $self->{ca}->{value} .= chr ($self->{nc});
1092 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1093 !!!next-input-character;
1094 redo A;
1095 }
1096 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1097 if ($self->{nc} == 0x0022) { # "
1098 !!!cp (95);
1099 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1100 !!!next-input-character;
1101 redo A;
1102 } elsif ($self->{nc} == 0x0026) { # &
1103 !!!cp (96);
1104 ## NOTE: In the spec, the tokenizer is switched to the
1105 ## "entity in attribute value state". In this implementation, the
1106 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1107 ## implementation of the "consume a character reference" algorithm.
1108 $self->{prev_state} = $self->{state};
1109 $self->{entity_add} = 0x0022; # "
1110 $self->{state} = ENTITY_STATE;
1111 !!!next-input-character;
1112 redo A;
1113 } elsif ($self->{nc} == -1) {
1114 !!!parse-error (type => 'unclosed attribute value');
1115 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1116 !!!cp (97);
1117 $self->{last_stag_name} = $self->{ct}->{tag_name};
1118 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1119 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1120 if ($self->{ct}->{attributes}) {
1121 !!!cp (98);
1122 !!!parse-error (type => 'end tag attribute');
1123 } else {
1124 ## NOTE: This state should never be reached.
1125 !!!cp (99);
1126 }
1127 } else {
1128 die "$0: $self->{ct}->{type}: Unknown token type";
1129 }
1130 $self->{state} = DATA_STATE;
1131 $self->{s_kwd} = '';
1132 ## reconsume
1133
1134 !!!emit ($self->{ct}); # start tag or end tag
1135
1136 redo A;
1137 } else {
1138 !!!cp (100);
1139 $self->{ca}->{value} .= chr ($self->{nc});
1140 $self->{read_until}->($self->{ca}->{value},
1141 q["&],
1142 length $self->{ca}->{value});
1143
1144 ## Stay in the state
1145 !!!next-input-character;
1146 redo A;
1147 }
1148 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1149 if ($self->{nc} == 0x0027) { # '
1150 !!!cp (101);
1151 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1152 !!!next-input-character;
1153 redo A;
1154 } elsif ($self->{nc} == 0x0026) { # &
1155 !!!cp (102);
1156 ## NOTE: In the spec, the tokenizer is switched to the
1157 ## "entity in attribute value state". In this implementation, the
1158 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1159 ## implementation of the "consume a character reference" algorithm.
1160 $self->{entity_add} = 0x0027; # '
1161 $self->{prev_state} = $self->{state};
1162 $self->{state} = ENTITY_STATE;
1163 !!!next-input-character;
1164 redo A;
1165 } elsif ($self->{nc} == -1) {
1166 !!!parse-error (type => 'unclosed attribute value');
1167 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1168 !!!cp (103);
1169 $self->{last_stag_name} = $self->{ct}->{tag_name};
1170 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1171 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1172 if ($self->{ct}->{attributes}) {
1173 !!!cp (104);
1174 !!!parse-error (type => 'end tag attribute');
1175 } else {
1176 ## NOTE: This state should never be reached.
1177 !!!cp (105);
1178 }
1179 } else {
1180 die "$0: $self->{ct}->{type}: Unknown token type";
1181 }
1182 $self->{state} = DATA_STATE;
1183 $self->{s_kwd} = '';
1184 ## reconsume
1185
1186 !!!emit ($self->{ct}); # start tag or end tag
1187
1188 redo A;
1189 } else {
1190 !!!cp (106);
1191 $self->{ca}->{value} .= chr ($self->{nc});
1192 $self->{read_until}->($self->{ca}->{value},
1193 q['&],
1194 length $self->{ca}->{value});
1195
1196 ## Stay in the state
1197 !!!next-input-character;
1198 redo A;
1199 }
1200 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1201 if ($is_space->{$self->{nc}}) {
1202 !!!cp (107);
1203 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1204 !!!next-input-character;
1205 redo A;
1206 } elsif ($self->{nc} == 0x0026) { # &
1207 !!!cp (108);
1208 ## NOTE: In the spec, the tokenizer is switched to the
1209 ## "entity in attribute value state". In this implementation, the
1210 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1211 ## implementation of the "consume a character reference" algorithm.
1212 $self->{entity_add} = -1;
1213 $self->{prev_state} = $self->{state};
1214 $self->{state} = ENTITY_STATE;
1215 !!!next-input-character;
1216 redo A;
1217 } elsif ($self->{nc} == 0x003E) { # >
1218 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1219 !!!cp (109);
1220 $self->{last_stag_name} = $self->{ct}->{tag_name};
1221 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1222 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1223 if ($self->{ct}->{attributes}) {
1224 !!!cp (110);
1225 !!!parse-error (type => 'end tag attribute');
1226 } else {
1227 ## NOTE: This state should never be reached.
1228 !!!cp (111);
1229 }
1230 } else {
1231 die "$0: $self->{ct}->{type}: Unknown token type";
1232 }
1233 $self->{state} = DATA_STATE;
1234 $self->{s_kwd} = '';
1235 !!!next-input-character;
1236
1237 !!!emit ($self->{ct}); # start tag or end tag
1238
1239 redo A;
1240 } elsif ($self->{nc} == -1) {
1241 !!!parse-error (type => 'unclosed tag');
1242 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1243 !!!cp (112);
1244 $self->{last_stag_name} = $self->{ct}->{tag_name};
1245 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1246 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1247 if ($self->{ct}->{attributes}) {
1248 !!!cp (113);
1249 !!!parse-error (type => 'end tag attribute');
1250 } else {
1251 ## NOTE: This state should never be reached.
1252 !!!cp (114);
1253 }
1254 } else {
1255 die "$0: $self->{ct}->{type}: Unknown token type";
1256 }
1257 $self->{state} = DATA_STATE;
1258 $self->{s_kwd} = '';
1259 ## reconsume
1260
1261 !!!emit ($self->{ct}); # start tag or end tag
1262
1263 redo A;
1264 } else {
1265 if ({
1266 0x0022 => 1, # "
1267 0x0027 => 1, # '
1268 0x003D => 1, # =
1269 }->{$self->{nc}}) {
1270 !!!cp (115);
1271 !!!parse-error (type => 'bad attribute value');
1272 } else {
1273 !!!cp (116);
1274 }
1275 $self->{ca}->{value} .= chr ($self->{nc});
1276 $self->{read_until}->($self->{ca}->{value},
1277 q["'=& >],
1278 length $self->{ca}->{value});
1279
1280 ## Stay in the state
1281 !!!next-input-character;
1282 redo A;
1283 }
1284 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
## Handles the character that follows the closing quote of a quoted
## attribute value:
##   - white space: go on to the next attribute name;
##   - |>|: emit the current start/end tag and return to the data state;
##   - |/|: go to the self-closing start tag state;
##   - EOF (-1): parse error; emit the unfinished tag and reprocess EOF
##     in the data state;
##   - anything else: parse error; reprocess the character as the
##     beginning of an attribute name.
1285 if ($is_space->{$self->{nc}}) {
1286 !!!cp (118);
1287 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1288 !!!next-input-character;
1289 redo A;
1290 } elsif ($self->{nc} == 0x003E) { # >
1291 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1292 !!!cp (119);
1293 $self->{last_stag_name} = $self->{ct}->{tag_name};
1294 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1295 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1296 if ($self->{ct}->{attributes}) {
1297 !!!cp (120);
1298 !!!parse-error (type => 'end tag attribute');
1299 } else {
1300 ## NOTE: This state should never be reached.
1301 !!!cp (121);
1302 }
1303 } else {
1304 die "$0: $self->{ct}->{type}: Unknown token type";
1305 }
1306 $self->{state} = DATA_STATE;
1307 $self->{s_kwd} = '';
1308 !!!next-input-character;
1309
1310 !!!emit ($self->{ct}); # start tag or end tag
1311
1312 redo A;
1313 } elsif ($self->{nc} == 0x002F) { # /
1314 !!!cp (122);
1315 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1316 !!!next-input-character;
1317 redo A;
1318 } elsif ($self->{nc} == -1) {
1319 !!!parse-error (type => 'unclosed tag');
1320 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1321 !!!cp (122.3);
1322 $self->{last_stag_name} = $self->{ct}->{tag_name};
1323 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
## Reset the content model before emitting the end tag, exactly as
## the |>| branch of this state does for an end tag.
$self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1324 if ($self->{ct}->{attributes}) {
1325 !!!cp (122.1);
1326 !!!parse-error (type => 'end tag attribute');
1327 } else {
1328 ## NOTE: This state should never be reached.
1329 !!!cp (122.2);
1330 }
1331 } else {
1332 die "$0: $self->{ct}->{type}: Unknown token type";
1333 }
1334 $self->{state} = DATA_STATE;
1335 $self->{s_kwd} = '';
1336 ## Reconsume.
1337 !!!emit ($self->{ct}); # start tag or end tag
1338 redo A;
1339 } else {
1340 !!!cp ('124.1');
1341 !!!parse-error (type => 'no space between attributes');
1342 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1343 ## reconsume
1344 redo A;
1345 }
1346 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
## Handles the character that follows a |/| inside a tag:
##   - |>|: an end tag gets a 'nestc' parse error (plus 'end tag
##     attribute' if it carries attributes); any other token is flagged
##     through $self->{self_closing}.  The tag is then emitted and the
##     tokenizer returns to the data state;
##   - EOF (-1): parse error; emit the unfinished tag and reprocess EOF
##     in the data state;
##   - anything else: parse error; reprocess the character as the
##     beginning of an attribute name.
## NOTE(review): checkpoint ids '124.4' and 124.5 are each used twice
## in this state; confirm against the coverage list before renumbering.
1347 if ($self->{nc} == 0x003E) { # >
1348 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1349 !!!cp ('124.2');
1350 !!!parse-error (type => 'nestc', token => $self->{ct});
1351 ## TODO: Different type than slash in start tag
1352 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1353 if ($self->{ct}->{attributes}) {
1354 !!!cp ('124.4');
1355 !!!parse-error (type => 'end tag attribute');
1356 } else {
1357 !!!cp ('124.5');
1358 }
1359 ## TODO: Test |<title></title/>|
1360 } else {
1361 !!!cp ('124.3');
1362 $self->{self_closing} = 1;
1363 }
1364
1365 $self->{state} = DATA_STATE;
1366 $self->{s_kwd} = '';
1367 !!!next-input-character;
1368
1369 !!!emit ($self->{ct}); # start tag or end tag
1370
1371 redo A;
1372 } elsif ($self->{nc} == -1) {
1373 !!!parse-error (type => 'unclosed tag');
1374 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1375 !!!cp (124.7);
1376 $self->{last_stag_name} = $self->{ct}->{tag_name};
1377 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
## Reset the content model before emitting the end tag, exactly as
## the |>| branch of this state does for an end tag.
$self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1378 if ($self->{ct}->{attributes}) {
1379 !!!cp (124.5);
1380 !!!parse-error (type => 'end tag attribute');
1381 } else {
1382 ## NOTE: This state should never be reached.
1383 !!!cp (124.6);
1384 }
1385 } else {
1386 die "$0: $self->{ct}->{type}: Unknown token type";
1387 }
1388 $self->{state} = DATA_STATE;
1389 $self->{s_kwd} = '';
1390 ## Reconsume.
1391 !!!emit ($self->{ct}); # start tag or end tag
1392 redo A;
1393 } else {
1394 !!!cp ('124.4');
1395 !!!parse-error (type => 'nestc');
1396 ## TODO: This error type is wrong.
1397 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1398 ## Reconsume.
1399 redo A;
1400 }
1401 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1402 ## (only happen if PCDATA state)
1403
1404 ## NOTE: Unlike spec's "bogus comment state", this implementation
1405 ## consumes characters one-by-one basis.
1406
1407 if ($self->{nc} == 0x003E) { # >
1408 !!!cp (124);
1409 $self->{state} = DATA_STATE;
1410 $self->{s_kwd} = '';
1411 !!!next-input-character;
1412
1413 !!!emit ($self->{ct}); # comment
1414 redo A;
1415 } elsif ($self->{nc} == -1) {
1416 !!!cp (125);
1417 $self->{state} = DATA_STATE;
1418 $self->{s_kwd} = '';
1419 ## reconsume
1420
1421 !!!emit ($self->{ct}); # comment
1422 redo A;
1423 } else {
1424 !!!cp (126);
1425 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1426 $self->{read_until}->($self->{ct}->{data},
1427 q[>],
1428 length $self->{ct}->{data});
1429
1430 ## Stay in the state.
1431 !!!next-input-character;
1432 redo A;
1433 }
1434 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1435 ## (only happen if PCDATA state)
1436
1437 if ($self->{nc} == 0x002D) { # -
1438 !!!cp (133);
1439 $self->{state} = MD_HYPHEN_STATE;
1440 !!!next-input-character;
1441 redo A;
1442 } elsif ($self->{nc} == 0x0044 or # D
1443 $self->{nc} == 0x0064) { # d
1444 ## ASCII case-insensitive.
1445 !!!cp (130);
1446 $self->{state} = MD_DOCTYPE_STATE;
1447 $self->{s_kwd} = chr $self->{nc};
1448 !!!next-input-character;
1449 redo A;
1450 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1451 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1452 $self->{is_xml}) and
1453 $self->{nc} == 0x005B) { # [
1454 !!!cp (135.4);
1455 $self->{state} = MD_CDATA_STATE;
1456 $self->{s_kwd} = '[';
1457 !!!next-input-character;
1458 redo A;
1459 } else {
1460 !!!cp (136);
1461 }
1462
1463 !!!parse-error (type => 'bogus comment',
1464 line => $self->{line_prev},
1465 column => $self->{column_prev} - 1);
1466 ## Reconsume.
1467 $self->{state} = BOGUS_COMMENT_STATE;
1468 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1469 line => $self->{line_prev},
1470 column => $self->{column_prev} - 1,
1471 };
1472 redo A;
1473 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1474 if ($self->{nc} == 0x002D) { # -
1475 !!!cp (127);
1476 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1477 line => $self->{line_prev},
1478 column => $self->{column_prev} - 2,
1479 };
1480 $self->{state} = COMMENT_START_STATE;
1481 !!!next-input-character;
1482 redo A;
1483 } else {
1484 !!!cp (128);
1485 !!!parse-error (type => 'bogus comment',
1486 line => $self->{line_prev},
1487 column => $self->{column_prev} - 2);
1488 $self->{state} = BOGUS_COMMENT_STATE;
1489 ## Reconsume.
1490 $self->{ct} = {type => COMMENT_TOKEN,
1491 data => '-',
1492 line => $self->{line_prev},
1493 column => $self->{column_prev} - 2,
1494 };
1495 redo A;
1496 }
1497 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1498 ## ASCII case-insensitive.
1499 if ($self->{nc} == [
1500 undef,
1501 0x004F, # O
1502 0x0043, # C
1503 0x0054, # T
1504 0x0059, # Y
1505 0x0050, # P
1506 ]->[length $self->{s_kwd}] or
1507 $self->{nc} == [
1508 undef,
1509 0x006F, # o
1510 0x0063, # c
1511 0x0074, # t
1512 0x0079, # y
1513 0x0070, # p
1514 ]->[length $self->{s_kwd}]) {
1515 !!!cp (131);
1516 ## Stay in the state.
1517 $self->{s_kwd} .= chr $self->{nc};
1518 !!!next-input-character;
1519 redo A;
1520 } elsif ((length $self->{s_kwd}) == 6 and
1521 ($self->{nc} == 0x0045 or # E
1522 $self->{nc} == 0x0065)) { # e
1523 !!!cp (129);
1524 $self->{state} = DOCTYPE_STATE;
1525 $self->{ct} = {type => DOCTYPE_TOKEN,
1526 quirks => 1,
1527 line => $self->{line_prev},
1528 column => $self->{column_prev} - 7,
1529 };
1530 !!!next-input-character;
1531 redo A;
1532 } else {
1533 !!!cp (132);
1534 !!!parse-error (type => 'bogus comment',
1535 line => $self->{line_prev},
1536 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1537 $self->{state} = BOGUS_COMMENT_STATE;
1538 ## Reconsume.
1539 $self->{ct} = {type => COMMENT_TOKEN,
1540 data => $self->{s_kwd},
1541 line => $self->{line_prev},
1542 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1543 };
1544 redo A;
1545 }
1546 } elsif ($self->{state} == MD_CDATA_STATE) {
1547 if ($self->{nc} == {
1548 '[' => 0x0043, # C
1549 '[C' => 0x0044, # D
1550 '[CD' => 0x0041, # A
1551 '[CDA' => 0x0054, # T
1552 '[CDAT' => 0x0041, # A
1553 }->{$self->{s_kwd}}) {
1554 !!!cp (135.1);
1555 ## Stay in the state.
1556 $self->{s_kwd} .= chr $self->{nc};
1557 !!!next-input-character;
1558 redo A;
1559 } elsif ($self->{s_kwd} eq '[CDATA' and
1560 $self->{nc} == 0x005B) { # [
1561 !!!cp (135.2);
1562
1563 if ($self->{is_xml} and
1564 not $self->{tainted} and
1565 @{$self->{open_elements} or []} == 0) {
1566 !!!parse-error (type => 'cdata outside of root element',
1567 line => $self->{line_prev},
1568 column => $self->{column_prev} - 7);
1569 $self->{tainted} = 1;
1570 }
1571
1572 $self->{ct} = {type => CHARACTER_TOKEN,
1573 data => '',
1574 line => $self->{line_prev},
1575 column => $self->{column_prev} - 7};
1576 $self->{state} = CDATA_SECTION_STATE;
1577 !!!next-input-character;
1578 redo A;
1579 } else {
1580 !!!cp (135.3);
1581 !!!parse-error (type => 'bogus comment',
1582 line => $self->{line_prev},
1583 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1584 $self->{state} = BOGUS_COMMENT_STATE;
1585 ## Reconsume.
1586 $self->{ct} = {type => COMMENT_TOKEN,
1587 data => $self->{s_kwd},
1588 line => $self->{line_prev},
1589 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1590 };
1591 redo A;
1592 }
1593 } elsif ($self->{state} == COMMENT_START_STATE) {
1594 if ($self->{nc} == 0x002D) { # -
1595 !!!cp (137);
1596 $self->{state} = COMMENT_START_DASH_STATE;
1597 !!!next-input-character;
1598 redo A;
1599 } elsif ($self->{nc} == 0x003E) { # >
1600 !!!cp (138);
1601 !!!parse-error (type => 'bogus comment');
1602 $self->{state} = DATA_STATE;
1603 $self->{s_kwd} = '';
1604 !!!next-input-character;
1605
1606 !!!emit ($self->{ct}); # comment
1607
1608 redo A;
1609 } elsif ($self->{nc} == -1) {
1610 !!!cp (139);
1611 !!!parse-error (type => 'unclosed comment');
1612 $self->{state} = DATA_STATE;
1613 $self->{s_kwd} = '';
1614 ## reconsume
1615
1616 !!!emit ($self->{ct}); # comment
1617
1618 redo A;
1619 } else {
1620 !!!cp (140);
1621 $self->{ct}->{data} # comment
1622 .= chr ($self->{nc});
1623 $self->{state} = COMMENT_STATE;
1624 !!!next-input-character;
1625 redo A;
1626 }
1627 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1628 if ($self->{nc} == 0x002D) { # -
1629 !!!cp (141);
1630 $self->{state} = COMMENT_END_STATE;
1631 !!!next-input-character;
1632 redo A;
1633 } elsif ($self->{nc} == 0x003E) { # >
1634 !!!cp (142);
1635 !!!parse-error (type => 'bogus comment');
1636 $self->{state} = DATA_STATE;
1637 $self->{s_kwd} = '';
1638 !!!next-input-character;
1639
1640 !!!emit ($self->{ct}); # comment
1641
1642 redo A;
1643 } elsif ($self->{nc} == -1) {
1644 !!!cp (143);
1645 !!!parse-error (type => 'unclosed comment');
1646 $self->{state} = DATA_STATE;
1647 $self->{s_kwd} = '';
1648 ## reconsume
1649
1650 !!!emit ($self->{ct}); # comment
1651
1652 redo A;
1653 } else {
1654 !!!cp (144);
1655 $self->{ct}->{data} # comment
1656 .= '-' . chr ($self->{nc});
1657 $self->{state} = COMMENT_STATE;
1658 !!!next-input-character;
1659 redo A;
1660 }
1661 } elsif ($self->{state} == COMMENT_STATE) {
1662 if ($self->{nc} == 0x002D) { # -
1663 !!!cp (145);
1664 $self->{state} = COMMENT_END_DASH_STATE;
1665 !!!next-input-character;
1666 redo A;
1667 } elsif ($self->{nc} == -1) {
1668 !!!cp (146);
1669 !!!parse-error (type => 'unclosed comment');
1670 $self->{state} = DATA_STATE;
1671 $self->{s_kwd} = '';
1672 ## reconsume
1673
1674 !!!emit ($self->{ct}); # comment
1675
1676 redo A;
1677 } else {
1678 !!!cp (147);
1679 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1680 $self->{read_until}->($self->{ct}->{data},
1681 q[-],
1682 length $self->{ct}->{data});
1683
1684 ## Stay in the state
1685 !!!next-input-character;
1686 redo A;
1687 }
1688 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
## A single |-| has been seen inside a comment:
##   - a second |-|: go to the comment end state;
##   - EOF (-1): parse error; emit the unfinished comment and reprocess
##     EOF in the data state;
##   - anything else: the |-| was ordinary data, so append it together
##     with the current character and return to the comment state.
1689 if ($self->{nc} == 0x002D) { # -
1690 !!!cp (148);
1691 $self->{state} = COMMENT_END_STATE;
1692 !!!next-input-character;
1693 redo A;
1694 } elsif ($self->{nc} == -1) {
1695 !!!cp (149);
1696 !!!parse-error (type => 'unclosed comment');
1698 $self->{state} = DATA_STATE;
1699 $self->{s_kwd} = '';
1700 ## reconsume
1701
1702 !!!emit ($self->{ct}); # comment
1703
1704 redo A;
1705 } else {
1706 !!!cp (150);
1707 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1708 $self->{state} = COMMENT_STATE;
1709 !!!next-input-character;
1710 redo A;
1711 }
1712 } elsif ($self->{state} == COMMENT_END_STATE) {
1713 if ($self->{nc} == 0x003E) { # >
1714 !!!cp (151);
1715 $self->{state} = DATA_STATE;
1716 $self->{s_kwd} = '';
1717 !!!next-input-character;
1718
1719 !!!emit ($self->{ct}); # comment
1720
1721 redo A;
1722 } elsif ($self->{nc} == 0x002D) { # -
1723 !!!cp (152);
1724 !!!parse-error (type => 'dash in comment',
1725 line => $self->{line_prev},
1726 column => $self->{column_prev});
1727 $self->{ct}->{data} .= '-'; # comment
1728 ## Stay in the state
1729 !!!next-input-character;
1730 redo A;
1731 } elsif ($self->{nc} == -1) {
1732 !!!cp (153);
1733 !!!parse-error (type => 'unclosed comment');
1734 $self->{state} = DATA_STATE;
1735 $self->{s_kwd} = '';
1736 ## reconsume
1737
1738 !!!emit ($self->{ct}); # comment
1739
1740 redo A;
1741 } else {
1742 !!!cp (154);
1743 !!!parse-error (type => 'dash in comment',
1744 line => $self->{line_prev},
1745 column => $self->{column_prev});
1746 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1747 $self->{state} = COMMENT_STATE;
1748 !!!next-input-character;
1749 redo A;
1750 }
1751 } elsif ($self->{state} == DOCTYPE_STATE) {
1752 if ($is_space->{$self->{nc}}) {
1753 !!!cp (155);
1754 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1755 !!!next-input-character;
1756 redo A;
1757 } else {
1758 !!!cp (156);
1759 !!!parse-error (type => 'no space before DOCTYPE name');
1760 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1761 ## reconsume
1762 redo A;
1763 }
1764 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1765 if ($is_space->{$self->{nc}}) {
1766 !!!cp (157);
1767 ## Stay in the state
1768 !!!next-input-character;
1769 redo A;
1770 } elsif ($self->{nc} == 0x003E) { # >
1771 !!!cp (158);
1772 !!!parse-error (type => 'no DOCTYPE name');
1773 $self->{state} = DATA_STATE;
1774 $self->{s_kwd} = '';
1775 !!!next-input-character;
1776
1777 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1778
1779 redo A;
1780 } elsif ($self->{nc} == -1) {
1781 !!!cp (159);
1782 !!!parse-error (type => 'no DOCTYPE name');
1783 $self->{state} = DATA_STATE;
1784 $self->{s_kwd} = '';
1785 ## reconsume
1786
1787 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1788
1789 redo A;
1790 } else {
1791 !!!cp (160);
1792 $self->{ct}->{name} = chr $self->{nc};
1793 delete $self->{ct}->{quirks};
1794 $self->{state} = DOCTYPE_NAME_STATE;
1795 !!!next-input-character;
1796 redo A;
1797 }
1798 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1799 ## ISSUE: Redundant "First," in the spec.
1800 if ($is_space->{$self->{nc}}) {
1801 !!!cp (161);
1802 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1803 !!!next-input-character;
1804 redo A;
1805 } elsif ($self->{nc} == 0x003E) { # >
1806 !!!cp (162);
1807 $self->{state} = DATA_STATE;
1808 $self->{s_kwd} = '';
1809 !!!next-input-character;
1810
1811 !!!emit ($self->{ct}); # DOCTYPE
1812
1813 redo A;
1814 } elsif ($self->{nc} == -1) {
1815 !!!cp (163);
1816 !!!parse-error (type => 'unclosed DOCTYPE');
1817 $self->{state} = DATA_STATE;
1818 $self->{s_kwd} = '';
1819 ## reconsume
1820
1821 $self->{ct}->{quirks} = 1;
1822 !!!emit ($self->{ct}); # DOCTYPE
1823
1824 redo A;
1825 } else {
1826 !!!cp (164);
1827 $self->{ct}->{name}
1828 .= chr ($self->{nc}); # DOCTYPE
1829 ## Stay in the state
1830 !!!next-input-character;
1831 redo A;
1832 }
1833 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1834 if ($is_space->{$self->{nc}}) {
1835 !!!cp (165);
1836 ## Stay in the state
1837 !!!next-input-character;
1838 redo A;
1839 } elsif ($self->{nc} == 0x003E) { # >
1840 !!!cp (166);
1841 $self->{state} = DATA_STATE;
1842 $self->{s_kwd} = '';
1843 !!!next-input-character;
1844
1845 !!!emit ($self->{ct}); # DOCTYPE
1846
1847 redo A;
1848 } elsif ($self->{nc} == -1) {
1849 !!!cp (167);
1850 !!!parse-error (type => 'unclosed DOCTYPE');
1851 $self->{state} = DATA_STATE;
1852 $self->{s_kwd} = '';
1853 ## reconsume
1854
1855 $self->{ct}->{quirks} = 1;
1856 !!!emit ($self->{ct}); # DOCTYPE
1857
1858 redo A;
1859 } elsif ($self->{nc} == 0x0050 or # P
1860 $self->{nc} == 0x0070) { # p
1861 $self->{state} = PUBLIC_STATE;
1862 $self->{s_kwd} = chr $self->{nc};
1863 !!!next-input-character;
1864 redo A;
1865 } elsif ($self->{nc} == 0x0053 or # S
1866 $self->{nc} == 0x0073) { # s
1867 $self->{state} = SYSTEM_STATE;
1868 $self->{s_kwd} = chr $self->{nc};
1869 !!!next-input-character;
1870 redo A;
1871 } else {
1872 !!!cp (180);
1873 !!!parse-error (type => 'string after DOCTYPE name');
1874 $self->{ct}->{quirks} = 1;
1875
1876 $self->{state} = BOGUS_DOCTYPE_STATE;
1877 !!!next-input-character;
1878 redo A;
1879 }
1880 } elsif ($self->{state} == PUBLIC_STATE) {
1881 ## ASCII case-insensitive
1882 if ($self->{nc} == [
1883 undef,
1884 0x0055, # U
1885 0x0042, # B
1886 0x004C, # L
1887 0x0049, # I
1888 ]->[length $self->{s_kwd}] or
1889 $self->{nc} == [
1890 undef,
1891 0x0075, # u
1892 0x0062, # b
1893 0x006C, # l
1894 0x0069, # i
1895 ]->[length $self->{s_kwd}]) {
1896 !!!cp (175);
1897 ## Stay in the state.
1898 $self->{s_kwd} .= chr $self->{nc};
1899 !!!next-input-character;
1900 redo A;
1901 } elsif ((length $self->{s_kwd}) == 5 and
1902 ($self->{nc} == 0x0043 or # C
1903 $self->{nc} == 0x0063)) { # c
1904 !!!cp (168);
1905 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1906 !!!next-input-character;
1907 redo A;
1908 } else {
1909 !!!cp (169);
1910 !!!parse-error (type => 'string after DOCTYPE name',
1911 line => $self->{line_prev},
1912 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1913 $self->{ct}->{quirks} = 1;
1914
1915 $self->{state} = BOGUS_DOCTYPE_STATE;
1916 ## Reconsume.
1917 redo A;
1918 }
1919 } elsif ($self->{state} == SYSTEM_STATE) {
1920 ## ASCII case-insensitive
1921 if ($self->{nc} == [
1922 undef,
1923 0x0059, # Y
1924 0x0053, # S
1925 0x0054, # T
1926 0x0045, # E
1927 ]->[length $self->{s_kwd}] or
1928 $self->{nc} == [
1929 undef,
1930 0x0079, # y
1931 0x0073, # s
1932 0x0074, # t
1933 0x0065, # e
1934 ]->[length $self->{s_kwd}]) {
1935 !!!cp (170);
1936 ## Stay in the state.
1937 $self->{s_kwd} .= chr $self->{nc};
1938 !!!next-input-character;
1939 redo A;
1940 } elsif ((length $self->{s_kwd}) == 5 and
1941 ($self->{nc} == 0x004D or # M
1942 $self->{nc} == 0x006D)) { # m
1943 !!!cp (171);
1944 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1945 !!!next-input-character;
1946 redo A;
1947 } else {
1948 !!!cp (172);
1949 !!!parse-error (type => 'string after DOCTYPE name',
1950 line => $self->{line_prev},
1951 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1952 $self->{ct}->{quirks} = 1;
1953
1954 $self->{state} = BOGUS_DOCTYPE_STATE;
1955 ## Reconsume.
1956 redo A;
1957 }
1958 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
## Entered once the keyword |PUBLIC| has been matched in a DOCTYPE:
##   - white space: skipped;
##   - |"| or |'|: start an empty public identifier and read it in the
##     matching quoted state;
##   - |>|: parse error; emit the DOCTYPE with the quirks flag set;
##   - EOF (-1): parse error; emit the DOCTYPE with the quirks flag set
##     and reprocess EOF in the data state;
##   - anything else: parse error; set the quirks flag and skip the
##     rest in the bogus DOCTYPE state.
## $self->{nc} is a code point number, so it is compared with |==|
## like everywhere else in the tokenizer.
1959 if ($is_space->{$self->{nc}}) {
1960 !!!cp (181);
1961 ## Stay in the state
1962 !!!next-input-character;
1963 redo A;
1964 } elsif ($self->{nc} == 0x0022) { # "
1965 !!!cp (182);
1966 $self->{ct}->{pubid} = ''; # DOCTYPE
1967 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1968 !!!next-input-character;
1969 redo A;
1970 } elsif ($self->{nc} == 0x0027) { # '
1971 !!!cp (183);
1972 $self->{ct}->{pubid} = ''; # DOCTYPE
1973 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1974 !!!next-input-character;
1975 redo A;
1976 } elsif ($self->{nc} == 0x003E) { # >
1977 !!!cp (184);
1978 !!!parse-error (type => 'no PUBLIC literal');
1979
1980 $self->{state} = DATA_STATE;
1981 $self->{s_kwd} = '';
1982 !!!next-input-character;
1983
1984 $self->{ct}->{quirks} = 1;
1985 !!!emit ($self->{ct}); # DOCTYPE
1986
1987 redo A;
1988 } elsif ($self->{nc} == -1) {
1989 !!!cp (185);
1990 !!!parse-error (type => 'unclosed DOCTYPE');
1991
1992 $self->{state} = DATA_STATE;
1993 $self->{s_kwd} = '';
1994 ## reconsume
1995
1996 $self->{ct}->{quirks} = 1;
1997 !!!emit ($self->{ct}); # DOCTYPE
1998
1999 redo A;
2000 } else {
2001 !!!cp (186);
2002 !!!parse-error (type => 'string after PUBLIC');
2003 $self->{ct}->{quirks} = 1;
2004
2005 $self->{state} = BOGUS_DOCTYPE_STATE;
2006 !!!next-input-character;
2007 redo A;
2008 }
2009 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2010 if ($self->{nc} == 0x0022) { # "
2011 !!!cp (187);
2012 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2013 !!!next-input-character;
2014 redo A;
2015 } elsif ($self->{nc} == 0x003E) { # >
2016 !!!cp (188);
2017 !!!parse-error (type => 'unclosed PUBLIC literal');
2018
2019 $self->{state} = DATA_STATE;
2020 $self->{s_kwd} = '';
2021 !!!next-input-character;
2022
2023 $self->{ct}->{quirks} = 1;
2024 !!!emit ($self->{ct}); # DOCTYPE
2025
2026 redo A;
2027 } elsif ($self->{nc} == -1) {
2028 !!!cp (189);
2029 !!!parse-error (type => 'unclosed PUBLIC literal');
2030
2031 $self->{state} = DATA_STATE;
2032 $self->{s_kwd} = '';
2033 ## reconsume
2034
2035 $self->{ct}->{quirks} = 1;
2036 !!!emit ($self->{ct}); # DOCTYPE
2037
2038 redo A;
2039 } else {
2040 !!!cp (190);
2041 $self->{ct}->{pubid} # DOCTYPE
2042 .= chr $self->{nc};
2043 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2044 length $self->{ct}->{pubid});
2045
2046 ## Stay in the state
2047 !!!next-input-character;
2048 redo A;
2049 }
2050 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2051 if ($self->{nc} == 0x0027) { # '
2052 !!!cp (191);
2053 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2054 !!!next-input-character;
2055 redo A;
2056 } elsif ($self->{nc} == 0x003E) { # >
2057 !!!cp (192);
2058 !!!parse-error (type => 'unclosed PUBLIC literal');
2059
2060 $self->{state} = DATA_STATE;
2061 $self->{s_kwd} = '';
2062 !!!next-input-character;
2063
2064 $self->{ct}->{quirks} = 1;
2065 !!!emit ($self->{ct}); # DOCTYPE
2066
2067 redo A;
2068 } elsif ($self->{nc} == -1) {
2069 !!!cp (193);
2070 !!!parse-error (type => 'unclosed PUBLIC literal');
2071
2072 $self->{state} = DATA_STATE;
2073 $self->{s_kwd} = '';
2074 ## reconsume
2075
2076 $self->{ct}->{quirks} = 1;
2077 !!!emit ($self->{ct}); # DOCTYPE
2078
2079 redo A;
2080 } else {
2081 !!!cp (194);
2082 $self->{ct}->{pubid} # DOCTYPE
2083 .= chr $self->{nc};
2084 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2085 length $self->{ct}->{pubid});
2086
2087 ## Stay in the state
2088 !!!next-input-character;
2089 redo A;
2090 }
2091 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2092 if ($is_space->{$self->{nc}}) {
2093 !!!cp (195);
2094 ## Stay in the state
2095 !!!next-input-character;
2096 redo A;
2097 } elsif ($self->{nc} == 0x0022) { # "
2098 !!!cp (196);
2099 $self->{ct}->{sysid} = ''; # DOCTYPE
2100 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2101 !!!next-input-character;
2102 redo A;
2103 } elsif ($self->{nc} == 0x0027) { # '
2104 !!!cp (197);
2105 $self->{ct}->{sysid} = ''; # DOCTYPE
2106 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2107 !!!next-input-character;
2108 redo A;
2109 } elsif ($self->{nc} == 0x003E) { # >
2110 !!!cp (198);
2111 $self->{state} = DATA_STATE;
2112 $self->{s_kwd} = '';
2113 !!!next-input-character;
2114
2115 !!!emit ($self->{ct}); # DOCTYPE
2116
2117 redo A;
2118 } elsif ($self->{nc} == -1) {
2119 !!!cp (199);
2120 !!!parse-error (type => 'unclosed DOCTYPE');
2121
2122 $self->{state} = DATA_STATE;
2123 $self->{s_kwd} = '';
2124 ## reconsume
2125
2126 $self->{ct}->{quirks} = 1;
2127 !!!emit ($self->{ct}); # DOCTYPE
2128
2129 redo A;
2130 } else {
2131 !!!cp (200);
2132 !!!parse-error (type => 'string after PUBLIC literal');
2133 $self->{ct}->{quirks} = 1;
2134
2135 $self->{state} = BOGUS_DOCTYPE_STATE;
2136 !!!next-input-character;
2137 redo A;
2138 }
2139 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2140 if ($is_space->{$self->{nc}}) {
2141 !!!cp (201);
2142 ## Stay in the state
2143 !!!next-input-character;
2144 redo A;
2145 } elsif ($self->{nc} == 0x0022) { # "
2146 !!!cp (202);
2147 $self->{ct}->{sysid} = ''; # DOCTYPE
2148 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2149 !!!next-input-character;
2150 redo A;
2151 } elsif ($self->{nc} == 0x0027) { # '
2152 !!!cp (203);
2153 $self->{ct}->{sysid} = ''; # DOCTYPE
2154 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2155 !!!next-input-character;
2156 redo A;
2157 } elsif ($self->{nc} == 0x003E) { # >
2158 !!!cp (204);
2159 !!!parse-error (type => 'no SYSTEM literal');
2160 $self->{state} = DATA_STATE;
2161 $self->{s_kwd} = '';
2162 !!!next-input-character;
2163
2164 $self->{ct}->{quirks} = 1;
2165 !!!emit ($self->{ct}); # DOCTYPE
2166
2167 redo A;
2168 } elsif ($self->{nc} == -1) {
2169 !!!cp (205);
2170 !!!parse-error (type => 'unclosed DOCTYPE');
2171
2172 $self->{state} = DATA_STATE;
2173 $self->{s_kwd} = '';
2174 ## reconsume
2175
2176 $self->{ct}->{quirks} = 1;
2177 !!!emit ($self->{ct}); # DOCTYPE
2178
2179 redo A;
2180 } else {
2181 !!!cp (206);
2182 !!!parse-error (type => 'string after SYSTEM');
2183 $self->{ct}->{quirks} = 1;
2184
2185 $self->{state} = BOGUS_DOCTYPE_STATE;
2186 !!!next-input-character;
2187 redo A;
2188 }
2189 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2190 if ($self->{nc} == 0x0022) { # "
2191 !!!cp (207);
2192 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2193 !!!next-input-character;
2194 redo A;
2195 } elsif ($self->{nc} == 0x003E) { # >
2196 !!!cp (208);
2197 !!!parse-error (type => 'unclosed SYSTEM literal');
2198
2199 $self->{state} = DATA_STATE;
2200 $self->{s_kwd} = '';
2201 !!!next-input-character;
2202
2203 $self->{ct}->{quirks} = 1;
2204 !!!emit ($self->{ct}); # DOCTYPE
2205
2206 redo A;
2207 } elsif ($self->{nc} == -1) {
2208 !!!cp (209);
2209 !!!parse-error (type => 'unclosed SYSTEM literal');
2210
2211 $self->{state} = DATA_STATE;
2212 $self->{s_kwd} = '';
2213 ## reconsume
2214
2215 $self->{ct}->{quirks} = 1;
2216 !!!emit ($self->{ct}); # DOCTYPE
2217
2218 redo A;
2219 } else {
2220 !!!cp (210);
2221 $self->{ct}->{sysid} # DOCTYPE
2222 .= chr $self->{nc};
2223 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2224 length $self->{ct}->{sysid});
2225
2226 ## Stay in the state
2227 !!!next-input-character;
2228 redo A;
2229 }
2230 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2231 if ($self->{nc} == 0x0027) { # '
2232 !!!cp (211);
2233 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2234 !!!next-input-character;
2235 redo A;
2236 } elsif ($self->{nc} == 0x003E) { # >
2237 !!!cp (212);
2238 !!!parse-error (type => 'unclosed SYSTEM literal');
2239
2240 $self->{state} = DATA_STATE;
2241 $self->{s_kwd} = '';
2242 !!!next-input-character;
2243
2244 $self->{ct}->{quirks} = 1;
2245 !!!emit ($self->{ct}); # DOCTYPE
2246
2247 redo A;
2248 } elsif ($self->{nc} == -1) {
2249 !!!cp (213);
2250 !!!parse-error (type => 'unclosed SYSTEM literal');
2251
2252 $self->{state} = DATA_STATE;
2253 $self->{s_kwd} = '';
2254 ## reconsume
2255
2256 $self->{ct}->{quirks} = 1;
2257 !!!emit ($self->{ct}); # DOCTYPE
2258
2259 redo A;
2260 } else {
2261 !!!cp (214);
2262 $self->{ct}->{sysid} # DOCTYPE
2263 .= chr $self->{nc};
2264 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2265 length $self->{ct}->{sysid});
2266
2267 ## Stay in the state
2268 !!!next-input-character;
2269 redo A;
2270 }
2271 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2272 if ($is_space->{$self->{nc}}) {
2273 !!!cp (215);
2274 ## Stay in the state
2275 !!!next-input-character;
2276 redo A;
2277 } elsif ($self->{nc} == 0x003E) { # >
2278 !!!cp (216);
2279 $self->{state} = DATA_STATE;
2280 $self->{s_kwd} = '';
2281 !!!next-input-character;
2282
2283 !!!emit ($self->{ct}); # DOCTYPE
2284
2285 redo A;
2286 } elsif ($self->{nc} == -1) {
2287 !!!cp (217);
2288 !!!parse-error (type => 'unclosed DOCTYPE');
2289 $self->{state} = DATA_STATE;
2290 $self->{s_kwd} = '';
2291 ## reconsume
2292
2293 $self->{ct}->{quirks} = 1;
2294 !!!emit ($self->{ct}); # DOCTYPE
2295
2296 redo A;
2297 } else {
2298 !!!cp (218);
2299 !!!parse-error (type => 'string after SYSTEM literal');
2300 #$self->{ct}->{quirks} = 1;
2301
2302 $self->{state} = BOGUS_DOCTYPE_STATE;
2303 !!!next-input-character;
2304 redo A;
2305 }
2306 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2307 if ($self->{nc} == 0x003E) { # >
2308 !!!cp (219);
2309 $self->{state} = DATA_STATE;
2310 $self->{s_kwd} = '';
2311 !!!next-input-character;
2312
2313 !!!emit ($self->{ct}); # DOCTYPE
2314
2315 redo A;
2316 } elsif ($self->{nc} == -1) {
2317 !!!cp (220);
2318 $self->{state} = DATA_STATE;
2319 $self->{s_kwd} = '';
2320 ## reconsume
2321
2322 !!!emit ($self->{ct}); # DOCTYPE
2323
2324 redo A;
2325 } else {
2326 !!!cp (221);
2327 my $s = '';
2328 $self->{read_until}->($s, q[>], 0);
2329
2330 ## Stay in the state
2331 !!!next-input-character;
2332 redo A;
2333 }
2334 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2335 ## NOTE: "CDATA section state" in the spec is jointly implemented
2336 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2337 ## and |CDATA_SECTION_MSE2_STATE|.
2338
2339 if ($self->{nc} == 0x005D) { # ]
2340 !!!cp (221.1);
2341 $self->{state} = CDATA_SECTION_MSE1_STATE;
2342 !!!next-input-character;
2343 redo A;
2344 } elsif ($self->{nc} == -1) {
2345 if ($self->{is_xml}) {
2346 !!!parse-error (type => 'no mse'); ## TODO: type
2347 }
2348
2349 $self->{state} = DATA_STATE;
2350 $self->{s_kwd} = '';
2351 !!!next-input-character;
2352 if (length $self->{ct}->{data}) { # character
2353 !!!cp (221.2);
2354 !!!emit ($self->{ct}); # character
2355 } else {
2356 !!!cp (221.3);
2357 ## No token to emit. $self->{ct} is discarded.
2358 }
2359 redo A;
2360 } else {
2361 !!!cp (221.4);
2362 $self->{ct}->{data} .= chr $self->{nc};
2363 $self->{read_until}->($self->{ct}->{data},
2364 q<]>,
2365 length $self->{ct}->{data});
2366
2367 ## Stay in the state.
2368 !!!next-input-character;
2369 redo A;
2370 }
2371
2372 ## ISSUE: "text tokens" in spec.
2373 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2374 if ($self->{nc} == 0x005D) { # ]
2375 !!!cp (221.5);
2376 $self->{state} = CDATA_SECTION_MSE2_STATE;
2377 !!!next-input-character;
2378 redo A;
2379 } else {
2380 !!!cp (221.6);
2381 $self->{ct}->{data} .= ']';
2382 $self->{state} = CDATA_SECTION_STATE;
2383 ## Reconsume.
2384 redo A;
2385 }
2386 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2387 if ($self->{nc} == 0x003E) { # >
2388 $self->{state} = DATA_STATE;
2389 $self->{s_kwd} = '';
2390 !!!next-input-character;
2391 if (length $self->{ct}->{data}) { # character
2392 !!!cp (221.7);
2393 !!!emit ($self->{ct}); # character
2394 } else {
2395 !!!cp (221.8);
2396 ## No token to emit. $self->{ct} is discarded.
2397 }
2398 redo A;
2399 } elsif ($self->{nc} == 0x005D) { # ]
2400 !!!cp (221.9); # character
2401 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2402 ## Stay in the state.
2403 !!!next-input-character;
2404 redo A;
2405 } else {
2406 !!!cp (221.11);
2407 $self->{ct}->{data} .= ']]'; # character
2408 $self->{state} = CDATA_SECTION_STATE;
2409 ## Reconsume.
2410 redo A;
2411 }
2412 } elsif ($self->{state} == ENTITY_STATE) {
2413 if ($is_space->{$self->{nc}} or
2414 {
2415 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2416 $self->{entity_add} => 1,
2417 }->{$self->{nc}}) {
2418 !!!cp (1001);
2419 ## Don't consume
2420 ## No error
2421 ## Return nothing.
2422 #
2423 } elsif ($self->{nc} == 0x0023) { # #
2424 !!!cp (999);
2425 $self->{state} = ENTITY_HASH_STATE;
2426 $self->{s_kwd} = '#';
2427 !!!next-input-character;
2428 redo A;
2429 } elsif ((0x0041 <= $self->{nc} and
2430 $self->{nc} <= 0x005A) or # A..Z
2431 (0x0061 <= $self->{nc} and
2432 $self->{nc} <= 0x007A)) { # a..z
2433 !!!cp (998);
2434 require Whatpm::_NamedEntityList;
2435 $self->{state} = ENTITY_NAME_STATE;
2436 $self->{s_kwd} = chr $self->{nc};
2437 $self->{entity__value} = $self->{s_kwd};
2438 $self->{entity__match} = 0;
2439 !!!next-input-character;
2440 redo A;
2441 } else {
2442 !!!cp (1027);
2443 !!!parse-error (type => 'bare ero');
2444 ## Return nothing.
2445 #
2446 }
2447
2448 ## NOTE: No character is consumed by the "consume a character
2449 ## reference" algorithm. In other words, there is an "&" character
2450 ## that does not introduce a character reference, which would be
2451 ## appended to the parent element or the attribute value in later
2452 ## process of the tokenizer.
2453
2454 if ($self->{prev_state} == DATA_STATE) {
2455 !!!cp (997);
2456 $self->{state} = $self->{prev_state};
2457 $self->{s_kwd} = '';
2458 ## Reconsume.
2459 !!!emit ({type => CHARACTER_TOKEN, data => '&',
2460 line => $self->{line_prev},
2461 column => $self->{column_prev},
2462 });
2463 redo A;
2464 } else {
2465 !!!cp (996);
2466 $self->{ca}->{value} .= '&';
2467 $self->{state} = $self->{prev_state};
2468 $self->{s_kwd} = '';
2469 ## Reconsume.
2470 redo A;
2471 }
2472 } elsif ($self->{state} == ENTITY_HASH_STATE) {
2473 if ($self->{nc} == 0x0078 or # x
2474 $self->{nc} == 0x0058) { # X
2475 !!!cp (995);
2476 $self->{state} = HEXREF_X_STATE;
2477 $self->{s_kwd} .= chr $self->{nc};
2478 !!!next-input-character;
2479 redo A;
2480 } elsif (0x0030 <= $self->{nc} and
2481 $self->{nc} <= 0x0039) { # 0..9
2482 !!!cp (994);
2483 $self->{state} = NCR_NUM_STATE;
2484 $self->{s_kwd} = $self->{nc} - 0x0030;
2485 !!!next-input-character;
2486 redo A;
2487 } else {
2488 !!!parse-error (type => 'bare nero',
2489 line => $self->{line_prev},
2490 column => $self->{column_prev} - 1);
2491
2492 ## NOTE: According to the spec algorithm, nothing is returned,
2493 ## and then "&#" is appended to the parent element or the attribute
2494 ## value in the later processing.
2495
2496 if ($self->{prev_state} == DATA_STATE) {
2497 !!!cp (1019);
2498 $self->{state} = $self->{prev_state};
2499 $self->{s_kwd} = '';
2500 ## Reconsume.
2501 !!!emit ({type => CHARACTER_TOKEN,
2502 data => '&#',
2503 line => $self->{line_prev},
2504 column => $self->{column_prev} - 1,
2505 });
2506 redo A;
2507 } else {
2508 !!!cp (993);
2509 $self->{ca}->{value} .= '&#';
2510 $self->{state} = $self->{prev_state};
2511 $self->{s_kwd} = '';
2512 ## Reconsume.
2513 redo A;
2514 }
2515 }
2516 } elsif ($self->{state} == NCR_NUM_STATE) {
2517 if (0x0030 <= $self->{nc} and
2518 $self->{nc} <= 0x0039) { # 0..9
2519 !!!cp (1012);
2520 $self->{s_kwd} *= 10;
2521 $self->{s_kwd} += $self->{nc} - 0x0030;
2522
2523 ## Stay in the state.
2524 !!!next-input-character;
2525 redo A;
2526 } elsif ($self->{nc} == 0x003B) { # ;
2527 !!!cp (1013);
2528 !!!next-input-character;
2529 #
2530 } else {
2531 !!!cp (1014);
2532 !!!parse-error (type => 'no refc');
2533 ## Reconsume.
2534 #
2535 }
2536
2537 my $code = $self->{s_kwd};
2538 my $l = $self->{line_prev};
2539 my $c = $self->{column_prev};
2540 if ($charref_map->{$code}) {
2541 !!!cp (1015);
2542 !!!parse-error (type => 'invalid character reference',
2543 text => (sprintf 'U+%04X', $code),
2544 line => $l, column => $c);
2545 $code = $charref_map->{$code};
2546 } elsif ($code > 0x10FFFF) {
2547 !!!cp (1016);
2548 !!!parse-error (type => 'invalid character reference',
2549 text => (sprintf 'U-%08X', $code),
2550 line => $l, column => $c);
2551 $code = 0xFFFD;
2552 }
2553
2554 if ($self->{prev_state} == DATA_STATE) {
2555 !!!cp (992);
2556 $self->{state} = $self->{prev_state};
2557 $self->{s_kwd} = '';
2558 ## Reconsume.
2559 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2560 has_reference => 1,
2561 line => $l, column => $c,
2562 });
2563 redo A;
2564 } else {
2565 !!!cp (991);
2566 $self->{ca}->{value} .= chr $code;
2567 $self->{ca}->{has_reference} = 1;
2568 $self->{state} = $self->{prev_state};
2569 $self->{s_kwd} = '';
2570 ## Reconsume.
2571 redo A;
2572 }
2573 } elsif ($self->{state} == HEXREF_X_STATE) {
2574 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2575 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2576 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2577 # 0..9, A..F, a..f
2578 !!!cp (990);
2579 $self->{state} = HEXREF_HEX_STATE;
2580 $self->{s_kwd} = 0;
2581 ## Reconsume.
2582 redo A;
2583 } else {
2584 !!!parse-error (type => 'bare hcro',
2585 line => $self->{line_prev},
2586 column => $self->{column_prev} - 2);
2587
2588 ## NOTE: According to the spec algorithm, nothing is returned,
2589 ## and then "&#" followed by "X" or "x" is appended to the parent
2590 ## element or the attribute value in the later processing.
2591
2592 if ($self->{prev_state} == DATA_STATE) {
2593 !!!cp (1005);
2594 $self->{state} = $self->{prev_state};
2595 $self->{s_kwd} = '';
2596 ## Reconsume.
2597 !!!emit ({type => CHARACTER_TOKEN,
2598 data => '&' . $self->{s_kwd},
2599 line => $self->{line_prev},
2600 column => $self->{column_prev} - length $self->{s_kwd},
2601 });
2602 redo A;
2603 } else {
2604 !!!cp (989);
2605 $self->{ca}->{value} .= '&' . $self->{s_kwd};
2606 $self->{state} = $self->{prev_state};
2607 $self->{s_kwd} = '';
2608 ## Reconsume.
2609 redo A;
2610 }
2611 }
2612 } elsif ($self->{state} == HEXREF_HEX_STATE) {
2613 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2614 # 0..9
2615 !!!cp (1002);
2616 $self->{s_kwd} *= 0x10;
2617 $self->{s_kwd} += $self->{nc} - 0x0030;
2618 ## Stay in the state.
2619 !!!next-input-character;
2620 redo A;
2621 } elsif (0x0061 <= $self->{nc} and
2622 $self->{nc} <= 0x0066) { # a..f
2623 !!!cp (1003);
2624 $self->{s_kwd} *= 0x10;
2625 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2626 ## Stay in the state.
2627 !!!next-input-character;
2628 redo A;
2629 } elsif (0x0041 <= $self->{nc} and
2630 $self->{nc} <= 0x0046) { # A..F
2631 !!!cp (1004);
2632 $self->{s_kwd} *= 0x10;
2633 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2634 ## Stay in the state.
2635 !!!next-input-character;
2636 redo A;
2637 } elsif ($self->{nc} == 0x003B) { # ;
2638 !!!cp (1006);
2639 !!!next-input-character;
2640 #
2641 } else {
2642 !!!cp (1007);
2643 !!!parse-error (type => 'no refc',
2644 line => $self->{line},
2645 column => $self->{column});
2646 ## Reconsume.
2647 #
2648 }
2649
2650 my $code = $self->{s_kwd};
2651 my $l = $self->{line_prev};
2652 my $c = $self->{column_prev};
2653 if ($charref_map->{$code}) {
2654 !!!cp (1008);
2655 !!!parse-error (type => 'invalid character reference',
2656 text => (sprintf 'U+%04X', $code),
2657 line => $l, column => $c);
2658 $code = $charref_map->{$code};
2659 } elsif ($code > 0x10FFFF) {
2660 !!!cp (1009);
2661 !!!parse-error (type => 'invalid character reference',
2662 text => (sprintf 'U-%08X', $code),
2663 line => $l, column => $c);
2664 $code = 0xFFFD;
2665 }
2666
2667 if ($self->{prev_state} == DATA_STATE) {
2668 !!!cp (988);
2669 $self->{state} = $self->{prev_state};
2670 $self->{s_kwd} = '';
2671 ## Reconsume.
2672 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2673 has_reference => 1,
2674 line => $l, column => $c,
2675 });
2676 redo A;
2677 } else {
2678 !!!cp (987);
2679 $self->{ca}->{value} .= chr $code;
2680 $self->{ca}->{has_reference} = 1;
2681 $self->{state} = $self->{prev_state};
2682 $self->{s_kwd} = '';
2683 ## Reconsume.
2684 redo A;
2685 }
2686 } elsif ($self->{state} == ENTITY_NAME_STATE) {
2687 if (length $self->{s_kwd} < 30 and
2688 ## NOTE: Some number greater than the maximum length of entity name
2689 ((0x0041 <= $self->{nc} and # a
2690 $self->{nc} <= 0x005A) or # x
2691 (0x0061 <= $self->{nc} and # a
2692 $self->{nc} <= 0x007A) or # z
2693 (0x0030 <= $self->{nc} and # 0
2694 $self->{nc} <= 0x0039) or # 9
2695 $self->{nc} == 0x003B)) { # ;
2696 our $EntityChar;
2697 $self->{s_kwd} .= chr $self->{nc};
2698 if (defined $EntityChar->{$self->{s_kwd}}) {
2699 if ($self->{nc} == 0x003B) { # ;
2700 !!!cp (1020);
2701 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2702 $self->{entity__match} = 1;
2703 !!!next-input-character;
2704 #
2705 } else {
2706 !!!cp (1021);
2707 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2708 $self->{entity__match} = -1;
2709 ## Stay in the state.
2710 !!!next-input-character;
2711 redo A;
2712 }
2713 } else {
2714 !!!cp (1022);
2715 $self->{entity__value} .= chr $self->{nc};
2716 $self->{entity__match} *= 2;
2717 ## Stay in the state.
2718 !!!next-input-character;
2719 redo A;
2720 }
2721 }
2722
2723 my $data;
2724 my $has_ref;
2725 if ($self->{entity__match} > 0) {
2726 !!!cp (1023);
2727 $data = $self->{entity__value};
2728 $has_ref = 1;
2729 #
2730 } elsif ($self->{entity__match} < 0) {
2731 !!!parse-error (type => 'no refc');
2732 if ($self->{prev_state} != DATA_STATE and # in attribute
2733 $self->{entity__match} < -1) {
2734 !!!cp (1024);
2735 $data = '&' . $self->{s_kwd};
2736 #
2737 } else {
2738 !!!cp (1025);
2739 $data = $self->{entity__value};
2740 $has_ref = 1;
2741 #
2742 }
2743 } else {
2744 !!!cp (1026);
2745 !!!parse-error (type => 'bare ero',
2746 line => $self->{line_prev},
2747 column => $self->{column_prev} - length $self->{s_kwd});
2748 $data = '&' . $self->{s_kwd};
2749 #
2750 }
2751
2752 ## NOTE: In these cases, when a character reference is found,
2753 ## it is consumed and a character token is returned, or, otherwise,
2754 ## nothing is consumed and returned, according to the spec algorithm.
2755 ## In this implementation, anything that has been examined by the
2756 ## tokenizer is appended to the parent element or the attribute value
2757 ## as string, either literal string when no character reference or
2758 ## entity-replaced string otherwise, in this stage, since any characters
2759 ## that would not be consumed are appended in the data state or in an
2760 ## appropriate attribute value state anyway.
2761
2762 if ($self->{prev_state} == DATA_STATE) {
2763 !!!cp (986);
2764 $self->{state} = $self->{prev_state};
2765 $self->{s_kwd} = '';
2766 ## Reconsume.
2767 !!!emit ({type => CHARACTER_TOKEN,
2768 data => $data,
2769 has_reference => $has_ref,
2770 line => $self->{line_prev},
2771 column => $self->{column_prev} + 1 - length $self->{s_kwd},
2772 });
2773 redo A;
2774 } else {
2775 !!!cp (985);
2776 $self->{ca}->{value} .= $data;
2777 $self->{ca}->{has_reference} = 1 if $has_ref;
2778 $self->{state} = $self->{prev_state};
2779 $self->{s_kwd} = '';
2780 ## Reconsume.
2781 redo A;
2782 }
2783 } else {
2784 die "$0: $self->{state}: Unknown state";
2785 }
2786 } # A
2787
2788 die "$0: _get_next_token: unexpected case";
2789 } # _get_next_token
2790
2791 1;
2792 ## $Date: 2008/10/14 14:57:52 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24