/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (show annotations) (download) (as text)
Tue Oct 14 04:32:49 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.1: +44 -11 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 04:28:43 -0000
	* Tokenizer.pm.src: Make *_TOKEN (token type constants)
	exportable.  New token types, PI_TOKEN for XML and ABORT_TOKEN for
	document.write() or incremental parsing, are added for future
	extensions.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 04:27:29 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Makefile, Parser.pm.src: New files.

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.1 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN { # Compile-time setup, so the export lists are ready before a later BEGIN block imports from this package.
6 require Exporter;
7 push our @ISA, 'Exporter'; # inherit the import method from Exporter
8 # Names that callers may import individually on request.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 );
19 # The ':token' tag imports every token type constant at once.
20 our %EXPORT_TAGS = (
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 } # BEGIN
33
34 ## Token types
35
36 sub DOCTYPE_TOKEN () { 1 } # Values stored in a token's ->{type}; the empty prototype lets perl inline these constant subs.
37 sub COMMENT_TOKEN () { 2 }
38 sub START_TAG_TOKEN () { 3 }
39 sub END_TAG_TOKEN () { 4 }
40 sub END_OF_FILE_TOKEN () { 5 }
41 sub CHARACTER_TOKEN () { 6 }
42 sub PI_TOKEN () { 7 } # XML5 (processing instruction; added for future extensions)
43 sub ABORT_TOKEN () { 8 } # Not a token actually (added for document.write() or incremental parsing)
44
45 package Whatpm::HTML; # From here on, definitions go into the Whatpm::HTML package.
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') } # Import the *_TOKEN constants at compile time.
48
49 ## Content model flags
50
51 sub CM_ENTITY () { 0b001 } # & markup in data (bit flag; combined with | below)
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54
55 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set: neither & nor < is treated as markup
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup and limited < markup
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup and any < markup
59
60 ## Tokenizer states
61
62 sub DATA_STATE () { 0 } # State numbers are compared with $self->{state} using ==; also the initial state.
63 #sub ENTITY_DATA_STATE () { 1 } # disabled: see the NOTE before ENTITY_STATE below
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } # disabled: see the NOTE before ENTITY_STATE below
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec; used only for the PCDATA content model
116
117 ## Tree constructor state constants (see Whatpm::HTML for the full
118 ## list and descriptions)
119
120 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # bit 11 (2048)
121 sub FOREIGN_EL () { 0b1_00000000000 } # also bit 11 (2048); NOTE(review): presumably belongs to a different bit field than the constant above - confirm against Whatpm::HTML
122
123 ## Character reference mappings
124
125 my $charref_map = { # Replacement table: code point => code point to use instead. NOTE(review): presumably consulted when numeric character references are resolved; the lookup is not in this part of the file.
126 0x0D => 0x000A, # CR is mapped to LF
127 0x80 => 0x20AC, # 0x80..0x9F follow the Windows-1252 layout; bytes unassigned there map to U+FFFD
128 0x81 => 0xFFFD,
129 0x82 => 0x201A,
130 0x83 => 0x0192,
131 0x84 => 0x201E,
132 0x85 => 0x2026,
133 0x86 => 0x2020,
134 0x87 => 0x2021,
135 0x88 => 0x02C6,
136 0x89 => 0x2030,
137 0x8A => 0x0160,
138 0x8B => 0x2039,
139 0x8C => 0x0152,
140 0x8D => 0xFFFD,
141 0x8E => 0x017D,
142 0x8F => 0xFFFD,
143 0x90 => 0xFFFD,
144 0x91 => 0x2018,
145 0x92 => 0x2019,
146 0x93 => 0x201C,
147 0x94 => 0x201D,
148 0x95 => 0x2022,
149 0x96 => 0x2013,
150 0x97 => 0x2014,
151 0x98 => 0x02DC,
152 0x99 => 0x2122,
153 0x9A => 0x0161,
154 0x9B => 0x203A,
155 0x9C => 0x0153,
156 0x9D => 0xFFFD,
157 0x9E => 0x017E,
158 0x9F => 0x0178,
159 }; # $charref_map
160 $charref_map->{$_} = 0xFFFD # every code point listed below maps to U+FFFD REPLACEMENT CHARACTER
161 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F, # C0 controls other than HT, LF, FF and CR, plus DEL
162 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (surrogates, then noncharacters; Unicode's noncharacter range runs through U+FDEF, this range stops at U+FDDF)
163 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, # the last two code points of each plane
164 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168
169 ## Implementations MUST act as if they used the state machine described in the spec.
170
171 sub _initialize_tokenizer ($) { # Resets the tokenizer-related fields of $self (the only argument) to their initial values.
172 my $self = shift;
173
174 ## NOTE: Fields set by |new| constructor:
175 #$self->{level}
176 #$self->{set_nc}
177 #$self->{parse_error}
178
179 $self->{state} = DATA_STATE; # MUST
180 #$self->{s_kwd}; # state keyword - initialized when used
181 #$self->{entity__value}; # initialized when used
182 #$self->{entity__match}; # initialized when used
183 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
184 undef $self->{ct}; # current token
185 undef $self->{ca}; # current attribute
186 undef $self->{last_stag_name}; # last emitted start tag name
187 #$self->{prev_state}; # initialized when used
188 delete $self->{self_closing}; # clear the "self-closing flag" (see the NOTE in the token description below)
189 $self->{char_buffer} = ''; # NOTE(review): presumably the buffered input text - its use is not visible in this part of the file
190 $self->{char_buffer_pos} = 0; # NOTE(review): presumably the read position within char_buffer - confirm
191 $self->{nc} = -1; # next input character; -1 is the value _get_next_token tests for end of file
192 #$self->{next_nc}
193 !!!next-input-character;
194 $self->{token} = []; # queue of pending tokens; _get_next_token returns from it first when non-empty
195 # $self->{escape}
196 } # _initialize_tokenizer
197
198 ## A token has:
199 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
200 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
201 ## ->{name} (DOCTYPE_TOKEN)
202 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
203 ## ->{pubid} (DOCTYPE_TOKEN)
204 ## ->{sysid} (DOCTYPE_TOKEN)
205 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
206 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
207 ## ->{name}
208 ## ->{value}
209 ## ->{has_reference} == 1 or 0
210 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
211 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
212 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
213 ## while the token is pushed back to the stack.
214
215 ## Each emitted token MUST be handled immediately by the tree construction stage.
216
217 ## Before each step, UA MAY check to see if either one of the scripts in
218 ## "list of scripts that will execute as soon as possible" or the first
219 ## script in the "list of scripts that will execute asynchronously",
220 ## has completed loading. If one has, then it MUST be executed
221 ## and removed from the list.
222
223 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
224 ## (This requirement was dropped from HTML5 spec, unfortunately.)
225
226 my $is_space = { # Lookup table keyed by code point; a true value means the tokenizer treats the character as white space.
227 0x0009 => 1, # CHARACTER TABULATION (HT)
228 0x000A => 1, # LINE FEED (LF)
229 #0x000B => 0, # LINE TABULATION (VT) - listed for reference only, not white space here
230 0x000C => 1, # FORM FEED (FF)
231 #0x000D => 1, # CARRIAGE RETURN (CR) - disabled; NOTE(review): presumably CR never reaches the tokenizer - confirm
232 0x0020 => 1, # SPACE (SP)
233 };
234
235 sub _get_next_token ($) {
236 my $self = shift;
237
238 if ($self->{self_closing}) {
239 !!!parse-error (type => 'nestc', token => $self->{ct});
240 ## NOTE: The |self_closing| flag is only set by start tag token.
241 ## In addition, when a start tag token is emitted, it is always set to
242 ## |ct|.
243 delete $self->{self_closing};
244 }
245
246 if (@{$self->{token}}) {
247 $self->{self_closing} = $self->{token}->[0]->{self_closing};
248 return shift @{$self->{token}};
249 }
250
251 A: {
252 if ($self->{state} == PCDATA_STATE) {
253 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
254
255 if ($self->{nc} == 0x0026) { # &
256 !!!cp (0.1);
257 ## NOTE: In the spec, the tokenizer is switched to the
258 ## "entity data state". In this implementation, the tokenizer
259 ## is switched to the |ENTITY_STATE|, which is an implementation
260 ## of the "consume a character reference" algorithm.
261 $self->{entity_add} = -1;
262 $self->{prev_state} = DATA_STATE;
263 $self->{state} = ENTITY_STATE;
264 !!!next-input-character;
265 redo A;
266 } elsif ($self->{nc} == 0x003C) { # <
267 !!!cp (0.2);
268 $self->{state} = TAG_OPEN_STATE;
269 !!!next-input-character;
270 redo A;
271 } elsif ($self->{nc} == -1) {
272 !!!cp (0.3);
273 !!!emit ({type => END_OF_FILE_TOKEN,
274 line => $self->{line}, column => $self->{column}});
275 last A; ## TODO: ok?
276 } else {
277 !!!cp (0.4);
278 #
279 }
280
281 # Anything else
282 my $token = {type => CHARACTER_TOKEN,
283 data => chr $self->{nc},
284 line => $self->{line}, column => $self->{column},
285 };
286 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
287
288 ## Stay in the state.
289 !!!next-input-character;
290 !!!emit ($token);
291 redo A;
292 } elsif ($self->{state} == DATA_STATE) {
293 $self->{s_kwd} = '' unless defined $self->{s_kwd};
294 if ($self->{nc} == 0x0026) { # &
295 $self->{s_kwd} = '';
296 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
297 not $self->{escape}) {
298 !!!cp (1);
299 ## NOTE: In the spec, the tokenizer is switched to the
300 ## "entity data state". In this implementation, the tokenizer
301 ## is switched to the |ENTITY_STATE|, which is an implementation
302 ## of the "consume a character reference" algorithm.
303 $self->{entity_add} = -1;
304 $self->{prev_state} = DATA_STATE;
305 $self->{state} = ENTITY_STATE;
306 !!!next-input-character;
307 redo A;
308 } else {
309 !!!cp (2);
310 #
311 }
312 } elsif ($self->{nc} == 0x002D) { # -
313 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
314 $self->{s_kwd} .= '-';
315
316 if ($self->{s_kwd} eq '<!--') {
317 !!!cp (3);
318 $self->{escape} = 1; # unless $self->{escape};
319 $self->{s_kwd} = '--';
320 #
321 } elsif ($self->{s_kwd} eq '---') {
322 !!!cp (4);
323 $self->{s_kwd} = '--';
324 #
325 } else {
326 !!!cp (5);
327 #
328 }
329 }
330
331 #
332 } elsif ($self->{nc} == 0x0021) { # !
333 if (length $self->{s_kwd}) {
334 !!!cp (5.1);
335 $self->{s_kwd} .= '!';
336 #
337 } else {
338 !!!cp (5.2);
339 #$self->{s_kwd} = '';
340 #
341 }
342 #
343 } elsif ($self->{nc} == 0x003C) { # <
344 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
345 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
346 not $self->{escape})) {
347 !!!cp (6);
348 $self->{state} = TAG_OPEN_STATE;
349 !!!next-input-character;
350 redo A;
351 } else {
352 !!!cp (7);
353 $self->{s_kwd} = '';
354 #
355 }
356 } elsif ($self->{nc} == 0x003E) { # >
357 if ($self->{escape} and
358 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
359 if ($self->{s_kwd} eq '--') {
360 !!!cp (8);
361 delete $self->{escape};
362 } else {
363 !!!cp (9);
364 }
365 } else {
366 !!!cp (10);
367 }
368
369 $self->{s_kwd} = '';
370 #
371 } elsif ($self->{nc} == -1) {
372 !!!cp (11);
373 $self->{s_kwd} = '';
374 !!!emit ({type => END_OF_FILE_TOKEN,
375 line => $self->{line}, column => $self->{column}});
376 last A; ## TODO: ok?
377 } else {
378 !!!cp (12);
379 $self->{s_kwd} = '';
380 #
381 }
382
383 # Anything else
384 my $token = {type => CHARACTER_TOKEN,
385 data => chr $self->{nc},
386 line => $self->{line}, column => $self->{column},
387 };
388 if ($self->{read_until}->($token->{data}, q[-!<>&],
389 length $token->{data})) {
390 $self->{s_kwd} = '';
391 }
392
393 ## Stay in the data state.
394 if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
395 !!!cp (13);
396 $self->{state} = PCDATA_STATE;
397 } else {
398 !!!cp (14);
399 ## Stay in the state.
400 }
401 !!!next-input-character;
402 !!!emit ($token);
403 redo A;
404 } elsif ($self->{state} == TAG_OPEN_STATE) {
405 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
406 if ($self->{nc} == 0x002F) { # /
407 !!!cp (15);
408 !!!next-input-character;
409 $self->{state} = CLOSE_TAG_OPEN_STATE;
410 redo A;
411 } elsif ($self->{nc} == 0x0021) { # !
412 !!!cp (15.1);
413 $self->{s_kwd} = '<' unless $self->{escape};
414 #
415 } else {
416 !!!cp (16);
417 #
418 }
419
420 ## reconsume
421 $self->{state} = DATA_STATE;
422 !!!emit ({type => CHARACTER_TOKEN, data => '<',
423 line => $self->{line_prev},
424 column => $self->{column_prev},
425 });
426 redo A;
427 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
428 if ($self->{nc} == 0x0021) { # !
429 !!!cp (17);
430 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
431 !!!next-input-character;
432 redo A;
433 } elsif ($self->{nc} == 0x002F) { # /
434 !!!cp (18);
435 $self->{state} = CLOSE_TAG_OPEN_STATE;
436 !!!next-input-character;
437 redo A;
438 } elsif (0x0041 <= $self->{nc} and
439 $self->{nc} <= 0x005A) { # A..Z
440 !!!cp (19);
441 $self->{ct}
442 = {type => START_TAG_TOKEN,
443 tag_name => chr ($self->{nc} + 0x0020),
444 line => $self->{line_prev},
445 column => $self->{column_prev}};
446 $self->{state} = TAG_NAME_STATE;
447 !!!next-input-character;
448 redo A;
449 } elsif (0x0061 <= $self->{nc} and
450 $self->{nc} <= 0x007A) { # a..z
451 !!!cp (20);
452 $self->{ct} = {type => START_TAG_TOKEN,
453 tag_name => chr ($self->{nc}),
454 line => $self->{line_prev},
455 column => $self->{column_prev}};
456 $self->{state} = TAG_NAME_STATE;
457 !!!next-input-character;
458 redo A;
459 } elsif ($self->{nc} == 0x003E) { # >
460 !!!cp (21);
461 !!!parse-error (type => 'empty start tag',
462 line => $self->{line_prev},
463 column => $self->{column_prev});
464 $self->{state} = DATA_STATE;
465 !!!next-input-character;
466
467 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
468 line => $self->{line_prev},
469 column => $self->{column_prev},
470 });
471
472 redo A;
473 } elsif ($self->{nc} == 0x003F) { # ?
474 !!!cp (22);
475 !!!parse-error (type => 'pio',
476 line => $self->{line_prev},
477 column => $self->{column_prev});
478 $self->{state} = BOGUS_COMMENT_STATE;
479 $self->{ct} = {type => COMMENT_TOKEN, data => '',
480 line => $self->{line_prev},
481 column => $self->{column_prev},
482 };
483 ## $self->{nc} is intentionally left as is
484 redo A;
485 } else {
486 !!!cp (23);
487 !!!parse-error (type => 'bare stago',
488 line => $self->{line_prev},
489 column => $self->{column_prev});
490 $self->{state} = DATA_STATE;
491 ## reconsume
492
493 !!!emit ({type => CHARACTER_TOKEN, data => '<',
494 line => $self->{line_prev},
495 column => $self->{column_prev},
496 });
497
498 redo A;
499 }
500 } else {
501 die "$0: $self->{content_model} in tag open";
502 }
503 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
504 ## NOTE: The "close tag open state" in the spec is implemented as
505 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
506
507 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
508 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
509 if (defined $self->{last_stag_name}) {
510 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
511 $self->{s_kwd} = '';
512 ## Reconsume.
513 redo A;
514 } else {
515 ## No start tag token has ever been emitted
516 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
517 !!!cp (28);
518 $self->{state} = DATA_STATE;
519 ## Reconsume.
520 !!!emit ({type => CHARACTER_TOKEN, data => '</',
521 line => $l, column => $c,
522 });
523 redo A;
524 }
525 }
526
527 if (0x0041 <= $self->{nc} and
528 $self->{nc} <= 0x005A) { # A..Z
529 !!!cp (29);
530 $self->{ct}
531 = {type => END_TAG_TOKEN,
532 tag_name => chr ($self->{nc} + 0x0020),
533 line => $l, column => $c};
534 $self->{state} = TAG_NAME_STATE;
535 !!!next-input-character;
536 redo A;
537 } elsif (0x0061 <= $self->{nc} and
538 $self->{nc} <= 0x007A) { # a..z
539 !!!cp (30);
540 $self->{ct} = {type => END_TAG_TOKEN,
541 tag_name => chr ($self->{nc}),
542 line => $l, column => $c};
543 $self->{state} = TAG_NAME_STATE;
544 !!!next-input-character;
545 redo A;
546 } elsif ($self->{nc} == 0x003E) { # >
547 !!!cp (31);
548 !!!parse-error (type => 'empty end tag',
549 line => $self->{line_prev}, ## "<" in "</>"
550 column => $self->{column_prev} - 1);
551 $self->{state} = DATA_STATE;
552 !!!next-input-character;
553 redo A;
554 } elsif ($self->{nc} == -1) {
555 !!!cp (32);
556 !!!parse-error (type => 'bare etago');
557 $self->{state} = DATA_STATE;
558 # reconsume
559
560 !!!emit ({type => CHARACTER_TOKEN, data => '</',
561 line => $l, column => $c,
562 });
563
564 redo A;
565 } else {
566 !!!cp (33);
567 !!!parse-error (type => 'bogus end tag');
568 $self->{state} = BOGUS_COMMENT_STATE;
569 $self->{ct} = {type => COMMENT_TOKEN, data => '',
570 line => $self->{line_prev}, # "<" of "</"
571 column => $self->{column_prev} - 1,
572 };
573 ## NOTE: $self->{nc} is intentionally left as is.
574 ## Although the "anything else" case of the spec not explicitly
575 ## states that the next input character is to be reconsumed,
576 ## it will be included to the |data| of the comment token
577 ## generated from the bogus end tag, as defined in the
578 ## "bogus comment state" entry.
579 redo A;
580 }
581 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
582 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
583 if (length $ch) {
584 my $CH = $ch;
585 $ch =~ tr/a-z/A-Z/;
586 my $nch = chr $self->{nc};
587 if ($nch eq $ch or $nch eq $CH) {
588 !!!cp (24);
589 ## Stay in the state.
590 $self->{s_kwd} .= $nch;
591 !!!next-input-character;
592 redo A;
593 } else {
594 !!!cp (25);
595 $self->{state} = DATA_STATE;
596 ## Reconsume.
597 !!!emit ({type => CHARACTER_TOKEN,
598 data => '</' . $self->{s_kwd},
599 line => $self->{line_prev},
600 column => $self->{column_prev} - 1 - length $self->{s_kwd},
601 });
602 redo A;
603 }
604 } else { # after "<{tag-name}"
605 unless ($is_space->{$self->{nc}} or
606 {
607 0x003E => 1, # >
608 0x002F => 1, # /
609 -1 => 1, # EOF
610 }->{$self->{nc}}) {
611 !!!cp (26);
612 ## Reconsume.
613 $self->{state} = DATA_STATE;
614 !!!emit ({type => CHARACTER_TOKEN,
615 data => '</' . $self->{s_kwd},
616 line => $self->{line_prev},
617 column => $self->{column_prev} - 1 - length $self->{s_kwd},
618 });
619 redo A;
620 } else {
621 !!!cp (27);
622 $self->{ct}
623 = {type => END_TAG_TOKEN,
624 tag_name => $self->{last_stag_name},
625 line => $self->{line_prev},
626 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
627 $self->{state} = TAG_NAME_STATE;
628 ## Reconsume.
629 redo A;
630 }
631 }
632 } elsif ($self->{state} == TAG_NAME_STATE) {
633 if ($is_space->{$self->{nc}}) {
634 !!!cp (34);
635 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
636 !!!next-input-character;
637 redo A;
638 } elsif ($self->{nc} == 0x003E) { # >
639 if ($self->{ct}->{type} == START_TAG_TOKEN) {
640 !!!cp (35);
641 $self->{last_stag_name} = $self->{ct}->{tag_name};
642 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
643 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
644 #if ($self->{ct}->{attributes}) {
645 # ## NOTE: This should never be reached.
646 # !!! cp (36);
647 # !!! parse-error (type => 'end tag attribute');
648 #} else {
649 !!!cp (37);
650 #}
651 } else {
652 die "$0: $self->{ct}->{type}: Unknown token type";
653 }
654 $self->{state} = DATA_STATE;
655 !!!next-input-character;
656
657 !!!emit ($self->{ct}); # start tag or end tag
658
659 redo A;
660 } elsif (0x0041 <= $self->{nc} and
661 $self->{nc} <= 0x005A) { # A..Z
662 !!!cp (38);
663 $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
664 # start tag or end tag
665 ## Stay in this state
666 !!!next-input-character;
667 redo A;
668 } elsif ($self->{nc} == -1) {
669 !!!parse-error (type => 'unclosed tag');
670 if ($self->{ct}->{type} == START_TAG_TOKEN) {
671 !!!cp (39);
672 $self->{last_stag_name} = $self->{ct}->{tag_name};
673 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
674 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
675 #if ($self->{ct}->{attributes}) {
676 # ## NOTE: This state should never be reached.
677 # !!! cp (40);
678 # !!! parse-error (type => 'end tag attribute');
679 #} else {
680 !!!cp (41);
681 #}
682 } else {
683 die "$0: $self->{ct}->{type}: Unknown token type";
684 }
685 $self->{state} = DATA_STATE;
686 # reconsume
687
688 !!!emit ($self->{ct}); # start tag or end tag
689
690 redo A;
691 } elsif ($self->{nc} == 0x002F) { # /
692 !!!cp (42);
693 $self->{state} = SELF_CLOSING_START_TAG_STATE;
694 !!!next-input-character;
695 redo A;
696 } else {
697 !!!cp (44);
698 $self->{ct}->{tag_name} .= chr $self->{nc};
699 # start tag or end tag
700 ## Stay in the state
701 !!!next-input-character;
702 redo A;
703 }
704 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
705 if ($is_space->{$self->{nc}}) {
706 !!!cp (45);
707 ## Stay in the state
708 !!!next-input-character;
709 redo A;
710 } elsif ($self->{nc} == 0x003E) { # >
711 if ($self->{ct}->{type} == START_TAG_TOKEN) {
712 !!!cp (46);
713 $self->{last_stag_name} = $self->{ct}->{tag_name};
714 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
715 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
716 if ($self->{ct}->{attributes}) {
717 !!!cp (47);
718 !!!parse-error (type => 'end tag attribute');
719 } else {
720 !!!cp (48);
721 }
722 } else {
723 die "$0: $self->{ct}->{type}: Unknown token type";
724 }
725 $self->{state} = DATA_STATE;
726 !!!next-input-character;
727
728 !!!emit ($self->{ct}); # start tag or end tag
729
730 redo A;
731 } elsif (0x0041 <= $self->{nc} and
732 $self->{nc} <= 0x005A) { # A..Z
733 !!!cp (49);
734 $self->{ca}
735 = {name => chr ($self->{nc} + 0x0020),
736 value => '',
737 line => $self->{line}, column => $self->{column}};
738 $self->{state} = ATTRIBUTE_NAME_STATE;
739 !!!next-input-character;
740 redo A;
741 } elsif ($self->{nc} == 0x002F) { # /
742 !!!cp (50);
743 $self->{state} = SELF_CLOSING_START_TAG_STATE;
744 !!!next-input-character;
745 redo A;
746 } elsif ($self->{nc} == -1) {
747 !!!parse-error (type => 'unclosed tag');
748 if ($self->{ct}->{type} == START_TAG_TOKEN) {
749 !!!cp (52);
750 $self->{last_stag_name} = $self->{ct}->{tag_name};
751 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
752 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
753 if ($self->{ct}->{attributes}) {
754 !!!cp (53);
755 !!!parse-error (type => 'end tag attribute');
756 } else {
757 !!!cp (54);
758 }
759 } else {
760 die "$0: $self->{ct}->{type}: Unknown token type";
761 }
762 $self->{state} = DATA_STATE;
763 # reconsume
764
765 !!!emit ($self->{ct}); # start tag or end tag
766
767 redo A;
768 } else {
769 if ({
770 0x0022 => 1, # "
771 0x0027 => 1, # '
772 0x003D => 1, # =
773 }->{$self->{nc}}) {
774 !!!cp (55);
775 !!!parse-error (type => 'bad attribute name');
776 } else {
777 !!!cp (56);
778 }
779 $self->{ca}
780 = {name => chr ($self->{nc}),
781 value => '',
782 line => $self->{line}, column => $self->{column}};
783 $self->{state} = ATTRIBUTE_NAME_STATE;
784 !!!next-input-character;
785 redo A;
786 }
787 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
788 my $before_leave = sub {
789 if (exists $self->{ct}->{attributes} # start tag or end tag
790 ->{$self->{ca}->{name}}) { # MUST
791 !!!cp (57);
792 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
793 ## Discard $self->{ca} # MUST
794 } else {
795 !!!cp (58);
796 $self->{ct}->{attributes}->{$self->{ca}->{name}}
797 = $self->{ca};
798 }
799 }; # $before_leave
800
801 if ($is_space->{$self->{nc}}) {
802 !!!cp (59);
803 $before_leave->();
804 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
805 !!!next-input-character;
806 redo A;
807 } elsif ($self->{nc} == 0x003D) { # =
808 !!!cp (60);
809 $before_leave->();
810 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
811 !!!next-input-character;
812 redo A;
813 } elsif ($self->{nc} == 0x003E) { # >
814 $before_leave->();
815 if ($self->{ct}->{type} == START_TAG_TOKEN) {
816 !!!cp (61);
817 $self->{last_stag_name} = $self->{ct}->{tag_name};
818 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
819 !!!cp (62);
820 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
821 if ($self->{ct}->{attributes}) {
822 !!!parse-error (type => 'end tag attribute');
823 }
824 } else {
825 die "$0: $self->{ct}->{type}: Unknown token type";
826 }
827 $self->{state} = DATA_STATE;
828 !!!next-input-character;
829
830 !!!emit ($self->{ct}); # start tag or end tag
831
832 redo A;
833 } elsif (0x0041 <= $self->{nc} and
834 $self->{nc} <= 0x005A) { # A..Z
835 !!!cp (63);
836 $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
837 ## Stay in the state
838 !!!next-input-character;
839 redo A;
840 } elsif ($self->{nc} == 0x002F) { # /
841 !!!cp (64);
842 $before_leave->();
843 $self->{state} = SELF_CLOSING_START_TAG_STATE;
844 !!!next-input-character;
845 redo A;
846 } elsif ($self->{nc} == -1) {
847 !!!parse-error (type => 'unclosed tag');
848 $before_leave->();
849 if ($self->{ct}->{type} == START_TAG_TOKEN) {
850 !!!cp (66);
851 $self->{last_stag_name} = $self->{ct}->{tag_name};
852 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
853 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
854 if ($self->{ct}->{attributes}) {
855 !!!cp (67);
856 !!!parse-error (type => 'end tag attribute');
857 } else {
858 ## NOTE: This state should never be reached.
859 !!!cp (68);
860 }
861 } else {
862 die "$0: $self->{ct}->{type}: Unknown token type";
863 }
864 $self->{state} = DATA_STATE;
865 # reconsume
866
867 !!!emit ($self->{ct}); # start tag or end tag
868
869 redo A;
870 } else {
871 if ($self->{nc} == 0x0022 or # "
872 $self->{nc} == 0x0027) { # '
873 !!!cp (69);
874 !!!parse-error (type => 'bad attribute name');
875 } else {
876 !!!cp (70);
877 }
878 $self->{ca}->{name} .= chr ($self->{nc});
879 ## Stay in the state
880 !!!next-input-character;
881 redo A;
882 }
883 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
884 if ($is_space->{$self->{nc}}) {
885 !!!cp (71);
886 ## Stay in the state
887 !!!next-input-character;
888 redo A;
889 } elsif ($self->{nc} == 0x003D) { # =
890 !!!cp (72);
891 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
892 !!!next-input-character;
893 redo A;
894 } elsif ($self->{nc} == 0x003E) { # >
895 if ($self->{ct}->{type} == START_TAG_TOKEN) {
896 !!!cp (73);
897 $self->{last_stag_name} = $self->{ct}->{tag_name};
898 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
899 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
900 if ($self->{ct}->{attributes}) {
901 !!!cp (74);
902 !!!parse-error (type => 'end tag attribute');
903 } else {
904 ## NOTE: This state should never be reached.
905 !!!cp (75);
906 }
907 } else {
908 die "$0: $self->{ct}->{type}: Unknown token type";
909 }
910 $self->{state} = DATA_STATE;
911 !!!next-input-character;
912
913 !!!emit ($self->{ct}); # start tag or end tag
914
915 redo A;
916 } elsif (0x0041 <= $self->{nc} and
917 $self->{nc} <= 0x005A) { # A..Z
918 !!!cp (76);
919 $self->{ca}
920 = {name => chr ($self->{nc} + 0x0020),
921 value => '',
922 line => $self->{line}, column => $self->{column}};
923 $self->{state} = ATTRIBUTE_NAME_STATE;
924 !!!next-input-character;
925 redo A;
926 } elsif ($self->{nc} == 0x002F) { # /
927 !!!cp (77);
928 $self->{state} = SELF_CLOSING_START_TAG_STATE;
929 !!!next-input-character;
930 redo A;
931 } elsif ($self->{nc} == -1) {
932 !!!parse-error (type => 'unclosed tag');
933 if ($self->{ct}->{type} == START_TAG_TOKEN) {
934 !!!cp (79);
935 $self->{last_stag_name} = $self->{ct}->{tag_name};
936 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
937 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
938 if ($self->{ct}->{attributes}) {
939 !!!cp (80);
940 !!!parse-error (type => 'end tag attribute');
941 } else {
942 ## NOTE: This state should never be reached.
943 !!!cp (81);
944 }
945 } else {
946 die "$0: $self->{ct}->{type}: Unknown token type";
947 }
948 $self->{state} = DATA_STATE;
949 # reconsume
950
951 !!!emit ($self->{ct}); # start tag or end tag
952
953 redo A;
954 } else {
955 if ($self->{nc} == 0x0022 or # "
956 $self->{nc} == 0x0027) { # '
957 !!!cp (78);
958 !!!parse-error (type => 'bad attribute name');
959 } else {
960 !!!cp (82);
961 }
962 $self->{ca}
963 = {name => chr ($self->{nc}),
964 value => '',
965 line => $self->{line}, column => $self->{column}};
966 $self->{state} = ATTRIBUTE_NAME_STATE;
967 !!!next-input-character;
968 redo A;
969 }
970 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
971 if ($is_space->{$self->{nc}}) {
972 !!!cp (83);
973 ## Stay in the state
974 !!!next-input-character;
975 redo A;
976 } elsif ($self->{nc} == 0x0022) { # "
977 !!!cp (84);
978 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
979 !!!next-input-character;
980 redo A;
981 } elsif ($self->{nc} == 0x0026) { # &
982 !!!cp (85);
983 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
984 ## reconsume
985 redo A;
986 } elsif ($self->{nc} == 0x0027) { # '
987 !!!cp (86);
988 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
989 !!!next-input-character;
990 redo A;
991 } elsif ($self->{nc} == 0x003E) { # >
992 !!!parse-error (type => 'empty unquoted attribute value');
993 if ($self->{ct}->{type} == START_TAG_TOKEN) {
994 !!!cp (87);
995 $self->{last_stag_name} = $self->{ct}->{tag_name};
996 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
997 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
998 if ($self->{ct}->{attributes}) {
999 !!!cp (88);
1000 !!!parse-error (type => 'end tag attribute');
1001 } else {
1002 ## NOTE: This state should never be reached.
1003 !!!cp (89);
1004 }
1005 } else {
1006 die "$0: $self->{ct}->{type}: Unknown token type";
1007 }
1008 $self->{state} = DATA_STATE;
1009 !!!next-input-character;
1010
1011 !!!emit ($self->{ct}); # start tag or end tag
1012
1013 redo A;
1014 } elsif ($self->{nc} == -1) {
1015 !!!parse-error (type => 'unclosed tag');
1016 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1017 !!!cp (90);
1018 $self->{last_stag_name} = $self->{ct}->{tag_name};
1019 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1020 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1021 if ($self->{ct}->{attributes}) {
1022 !!!cp (91);
1023 !!!parse-error (type => 'end tag attribute');
1024 } else {
1025 ## NOTE: This state should never be reached.
1026 !!!cp (92);
1027 }
1028 } else {
1029 die "$0: $self->{ct}->{type}: Unknown token type";
1030 }
1031 $self->{state} = DATA_STATE;
1032 ## reconsume
1033
1034 !!!emit ($self->{ct}); # start tag or end tag
1035
1036 redo A;
1037 } else {
1038 if ($self->{nc} == 0x003D) { # =
1039 !!!cp (93);
1040 !!!parse-error (type => 'bad attribute value');
1041 } else {
1042 !!!cp (94);
1043 }
1044 $self->{ca}->{value} .= chr ($self->{nc});
1045 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1046 !!!next-input-character;
1047 redo A;
1048 }
1049 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1050 if ($self->{nc} == 0x0022) { # "
1051 !!!cp (95);
1052 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1053 !!!next-input-character;
1054 redo A;
1055 } elsif ($self->{nc} == 0x0026) { # &
1056 !!!cp (96);
1057 ## NOTE: In the spec, the tokenizer is switched to the
1058 ## "entity in attribute value state". In this implementation, the
1059 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1060 ## implementation of the "consume a character reference" algorithm.
1061 $self->{prev_state} = $self->{state};
1062 $self->{entity_add} = 0x0022; # "
1063 $self->{state} = ENTITY_STATE;
1064 !!!next-input-character;
1065 redo A;
1066 } elsif ($self->{nc} == -1) {
1067 !!!parse-error (type => 'unclosed attribute value');
1068 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1069 !!!cp (97);
1070 $self->{last_stag_name} = $self->{ct}->{tag_name};
1071 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1072 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1073 if ($self->{ct}->{attributes}) {
1074 !!!cp (98);
1075 !!!parse-error (type => 'end tag attribute');
1076 } else {
1077 ## NOTE: This state should never be reached.
1078 !!!cp (99);
1079 }
1080 } else {
1081 die "$0: $self->{ct}->{type}: Unknown token type";
1082 }
1083 $self->{state} = DATA_STATE;
1084 ## reconsume
1085
1086 !!!emit ($self->{ct}); # start tag or end tag
1087
1088 redo A;
1089 } else {
1090 !!!cp (100);
1091 $self->{ca}->{value} .= chr ($self->{nc});
1092 $self->{read_until}->($self->{ca}->{value},
1093 q["&],
1094 length $self->{ca}->{value});
1095
1096 ## Stay in the state
1097 !!!next-input-character;
1098 redo A;
1099 }
1100 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1101 if ($self->{nc} == 0x0027) { # '
1102 !!!cp (101);
1103 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1104 !!!next-input-character;
1105 redo A;
1106 } elsif ($self->{nc} == 0x0026) { # &
1107 !!!cp (102);
1108 ## NOTE: In the spec, the tokenizer is switched to the
1109 ## "entity in attribute value state". In this implementation, the
1110 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1111 ## implementation of the "consume a character reference" algorithm.
1112 $self->{entity_add} = 0x0027; # '
1113 $self->{prev_state} = $self->{state};
1114 $self->{state} = ENTITY_STATE;
1115 !!!next-input-character;
1116 redo A;
1117 } elsif ($self->{nc} == -1) {
1118 !!!parse-error (type => 'unclosed attribute value');
1119 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1120 !!!cp (103);
1121 $self->{last_stag_name} = $self->{ct}->{tag_name};
1122 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1123 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1124 if ($self->{ct}->{attributes}) {
1125 !!!cp (104);
1126 !!!parse-error (type => 'end tag attribute');
1127 } else {
1128 ## NOTE: This state should never be reached.
1129 !!!cp (105);
1130 }
1131 } else {
1132 die "$0: $self->{ct}->{type}: Unknown token type";
1133 }
1134 $self->{state} = DATA_STATE;
1135 ## reconsume
1136
1137 !!!emit ($self->{ct}); # start tag or end tag
1138
1139 redo A;
1140 } else {
1141 !!!cp (106);
1142 $self->{ca}->{value} .= chr ($self->{nc});
1143 $self->{read_until}->($self->{ca}->{value},
1144 q['&],
1145 length $self->{ca}->{value});
1146
1147 ## Stay in the state
1148 !!!next-input-character;
1149 redo A;
1150 }
1151 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1152 if ($is_space->{$self->{nc}}) {
1153 !!!cp (107);
1154 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1155 !!!next-input-character;
1156 redo A;
1157 } elsif ($self->{nc} == 0x0026) { # &
1158 !!!cp (108);
1159 ## NOTE: In the spec, the tokenizer is switched to the
1160 ## "entity in attribute value state". In this implementation, the
1161 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1162 ## implementation of the "consume a character reference" algorithm.
1163 $self->{entity_add} = -1;
1164 $self->{prev_state} = $self->{state};
1165 $self->{state} = ENTITY_STATE;
1166 !!!next-input-character;
1167 redo A;
1168 } elsif ($self->{nc} == 0x003E) { # >
1169 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1170 !!!cp (109);
1171 $self->{last_stag_name} = $self->{ct}->{tag_name};
1172 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1173 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1174 if ($self->{ct}->{attributes}) {
1175 !!!cp (110);
1176 !!!parse-error (type => 'end tag attribute');
1177 } else {
1178 ## NOTE: This state should never be reached.
1179 !!!cp (111);
1180 }
1181 } else {
1182 die "$0: $self->{ct}->{type}: Unknown token type";
1183 }
1184 $self->{state} = DATA_STATE;
1185 !!!next-input-character;
1186
1187 !!!emit ($self->{ct}); # start tag or end tag
1188
1189 redo A;
1190 } elsif ($self->{nc} == -1) {
1191 !!!parse-error (type => 'unclosed tag');
1192 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1193 !!!cp (112);
1194 $self->{last_stag_name} = $self->{ct}->{tag_name};
1195 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1196 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1197 if ($self->{ct}->{attributes}) {
1198 !!!cp (113);
1199 !!!parse-error (type => 'end tag attribute');
1200 } else {
1201 ## NOTE: This state should never be reached.
1202 !!!cp (114);
1203 }
1204 } else {
1205 die "$0: $self->{ct}->{type}: Unknown token type";
1206 }
1207 $self->{state} = DATA_STATE;
1208 ## reconsume
1209
1210 !!!emit ($self->{ct}); # start tag or end tag
1211
1212 redo A;
1213 } else {
1214 if ({
1215 0x0022 => 1, # "
1216 0x0027 => 1, # '
1217 0x003D => 1, # =
1218 }->{$self->{nc}}) {
1219 !!!cp (115);
1220 !!!parse-error (type => 'bad attribute value');
1221 } else {
1222 !!!cp (116);
1223 }
1224 $self->{ca}->{value} .= chr ($self->{nc});
1225 $self->{read_until}->($self->{ca}->{value},
1226 q["'=& >],
1227 length $self->{ca}->{value});
1228
1229 ## Stay in the state
1230 !!!next-input-character;
1231 redo A;
1232 }
1233 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1234 if ($is_space->{$self->{nc}}) {
1235 !!!cp (118);
1236 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1237 !!!next-input-character;
1238 redo A;
1239 } elsif ($self->{nc} == 0x003E) { # >
1240 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1241 !!!cp (119);
1242 $self->{last_stag_name} = $self->{ct}->{tag_name};
1243 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1244 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1245 if ($self->{ct}->{attributes}) {
1246 !!!cp (120);
1247 !!!parse-error (type => 'end tag attribute');
1248 } else {
1249 ## NOTE: This state should never be reached.
1250 !!!cp (121);
1251 }
1252 } else {
1253 die "$0: $self->{ct}->{type}: Unknown token type";
1254 }
1255 $self->{state} = DATA_STATE;
1256 !!!next-input-character;
1257
1258 !!!emit ($self->{ct}); # start tag or end tag
1259
1260 redo A;
1261 } elsif ($self->{nc} == 0x002F) { # /
1262 !!!cp (122);
1263 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1264 !!!next-input-character;
1265 redo A;
1266 } elsif ($self->{nc} == -1) {
1267 !!!parse-error (type => 'unclosed tag');
1268 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1269 !!!cp (122.3);
1270 $self->{last_stag_name} = $self->{ct}->{tag_name};
1271 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1272 if ($self->{ct}->{attributes}) {
1273 !!!cp (122.1);
1274 !!!parse-error (type => 'end tag attribute');
1275 } else {
1276 ## NOTE: This state should never be reached.
1277 !!!cp (122.2);
1278 }
1279 } else {
1280 die "$0: $self->{ct}->{type}: Unknown token type";
1281 }
1282 $self->{state} = DATA_STATE;
1283 ## Reconsume.
1284 !!!emit ($self->{ct}); # start tag or end tag
1285 redo A;
1286 } else {
1287 !!!cp ('124.1');
1288 !!!parse-error (type => 'no space between attributes');
1289 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1290 ## reconsume
1291 redo A;
1292 }
1293 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1294 if ($self->{nc} == 0x003E) { # >
1295 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1296 !!!cp ('124.2');
1297 !!!parse-error (type => 'nestc', token => $self->{ct});
1298 ## TODO: Different type than slash in start tag
1299 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1300 if ($self->{ct}->{attributes}) {
1301 !!!cp ('124.4');
1302 !!!parse-error (type => 'end tag attribute');
1303 } else {
1304 !!!cp ('124.5');
1305 }
1306 ## TODO: Test |<title></title/>|
1307 } else {
1308 !!!cp ('124.3');
1309 $self->{self_closing} = 1;
1310 }
1311
1312 $self->{state} = DATA_STATE;
1313 !!!next-input-character;
1314
1315 !!!emit ($self->{ct}); # start tag or end tag
1316
1317 redo A;
1318 } elsif ($self->{nc} == -1) {
1319 !!!parse-error (type => 'unclosed tag');
1320 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1321 !!!cp (124.7);
1322 $self->{last_stag_name} = $self->{ct}->{tag_name};
1323 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1324 if ($self->{ct}->{attributes}) {
1325 !!!cp (124.5);
1326 !!!parse-error (type => 'end tag attribute');
1327 } else {
1328 ## NOTE: This state should never be reached.
1329 !!!cp (124.6);
1330 }
1331 } else {
1332 die "$0: $self->{ct}->{type}: Unknown token type";
1333 }
1334 $self->{state} = DATA_STATE;
1335 ## Reconsume.
1336 !!!emit ($self->{ct}); # start tag or end tag
1337 redo A;
1338 } else {
1339 !!!cp ('124.4');
1340 !!!parse-error (type => 'nestc');
1341 ## TODO: This error type is wrong.
1342 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1343 ## Reconsume.
1344 redo A;
1345 }
1346 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1347 ## (only happens in the PCDATA state)
1348
1349 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1350 ## consumes characters on a one-by-one basis.
1351
1352 if ($self->{nc} == 0x003E) { # >
1353 !!!cp (124);
1354 $self->{state} = DATA_STATE;
1355 !!!next-input-character;
1356
1357 !!!emit ($self->{ct}); # comment
1358 redo A;
1359 } elsif ($self->{nc} == -1) {
1360 !!!cp (125);
1361 $self->{state} = DATA_STATE;
1362 ## reconsume
1363
1364 !!!emit ($self->{ct}); # comment
1365 redo A;
1366 } else {
1367 !!!cp (126);
1368 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1369 $self->{read_until}->($self->{ct}->{data},
1370 q[>],
1371 length $self->{ct}->{data});
1372
1373 ## Stay in the state.
1374 !!!next-input-character;
1375 redo A;
1376 }
1377 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1378 ## (only happens in the PCDATA state)
1379
1380 if ($self->{nc} == 0x002D) { # -
1381 !!!cp (133);
1382 $self->{state} = MD_HYPHEN_STATE;
1383 !!!next-input-character;
1384 redo A;
1385 } elsif ($self->{nc} == 0x0044 or # D
1386 $self->{nc} == 0x0064) { # d
1387 ## ASCII case-insensitive.
1388 !!!cp (130);
1389 $self->{state} = MD_DOCTYPE_STATE;
1390 $self->{s_kwd} = chr $self->{nc};
1391 !!!next-input-character;
1392 redo A;
1393 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1394 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
1395 $self->{nc} == 0x005B) { # [
1396 !!!cp (135.4);
1397 $self->{state} = MD_CDATA_STATE;
1398 $self->{s_kwd} = '[';
1399 !!!next-input-character;
1400 redo A;
1401 } else {
1402 !!!cp (136);
1403 }
1404
1405 !!!parse-error (type => 'bogus comment',
1406 line => $self->{line_prev},
1407 column => $self->{column_prev} - 1);
1408 ## Reconsume.
1409 $self->{state} = BOGUS_COMMENT_STATE;
1410 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1411 line => $self->{line_prev},
1412 column => $self->{column_prev} - 1,
1413 };
1414 redo A;
1415 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1416 if ($self->{nc} == 0x002D) { # -
1417 !!!cp (127);
1418 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1419 line => $self->{line_prev},
1420 column => $self->{column_prev} - 2,
1421 };
1422 $self->{state} = COMMENT_START_STATE;
1423 !!!next-input-character;
1424 redo A;
1425 } else {
1426 !!!cp (128);
1427 !!!parse-error (type => 'bogus comment',
1428 line => $self->{line_prev},
1429 column => $self->{column_prev} - 2);
1430 $self->{state} = BOGUS_COMMENT_STATE;
1431 ## Reconsume.
1432 $self->{ct} = {type => COMMENT_TOKEN,
1433 data => '-',
1434 line => $self->{line_prev},
1435 column => $self->{column_prev} - 2,
1436 };
1437 redo A;
1438 }
1439 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1440 ## ASCII case-insensitive.
1441 if ($self->{nc} == [
1442 undef,
1443 0x004F, # O
1444 0x0043, # C
1445 0x0054, # T
1446 0x0059, # Y
1447 0x0050, # P
1448 ]->[length $self->{s_kwd}] or
1449 $self->{nc} == [
1450 undef,
1451 0x006F, # o
1452 0x0063, # c
1453 0x0074, # t
1454 0x0079, # y
1455 0x0070, # p
1456 ]->[length $self->{s_kwd}]) {
1457 !!!cp (131);
1458 ## Stay in the state.
1459 $self->{s_kwd} .= chr $self->{nc};
1460 !!!next-input-character;
1461 redo A;
1462 } elsif ((length $self->{s_kwd}) == 6 and
1463 ($self->{nc} == 0x0045 or # E
1464 $self->{nc} == 0x0065)) { # e
1465 !!!cp (129);
1466 $self->{state} = DOCTYPE_STATE;
1467 $self->{ct} = {type => DOCTYPE_TOKEN,
1468 quirks => 1,
1469 line => $self->{line_prev},
1470 column => $self->{column_prev} - 7,
1471 };
1472 !!!next-input-character;
1473 redo A;
1474 } else {
1475 !!!cp (132);
1476 !!!parse-error (type => 'bogus comment',
1477 line => $self->{line_prev},
1478 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1479 $self->{state} = BOGUS_COMMENT_STATE;
1480 ## Reconsume.
1481 $self->{ct} = {type => COMMENT_TOKEN,
1482 data => $self->{s_kwd},
1483 line => $self->{line_prev},
1484 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1485 };
1486 redo A;
1487 }
1488 } elsif ($self->{state} == MD_CDATA_STATE) {
1489 if ($self->{nc} == {
1490 '[' => 0x0043, # C
1491 '[C' => 0x0044, # D
1492 '[CD' => 0x0041, # A
1493 '[CDA' => 0x0054, # T
1494 '[CDAT' => 0x0041, # A
1495 }->{$self->{s_kwd}}) {
1496 !!!cp (135.1);
1497 ## Stay in the state.
1498 $self->{s_kwd} .= chr $self->{nc};
1499 !!!next-input-character;
1500 redo A;
1501 } elsif ($self->{s_kwd} eq '[CDATA' and
1502 $self->{nc} == 0x005B) { # [
1503 !!!cp (135.2);
1504 $self->{ct} = {type => CHARACTER_TOKEN,
1505 data => '',
1506 line => $self->{line_prev},
1507 column => $self->{column_prev} - 7};
1508 $self->{state} = CDATA_SECTION_STATE;
1509 !!!next-input-character;
1510 redo A;
1511 } else {
1512 !!!cp (135.3);
1513 !!!parse-error (type => 'bogus comment',
1514 line => $self->{line_prev},
1515 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1516 $self->{state} = BOGUS_COMMENT_STATE;
1517 ## Reconsume.
1518 $self->{ct} = {type => COMMENT_TOKEN,
1519 data => $self->{s_kwd},
1520 line => $self->{line_prev},
1521 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1522 };
1523 redo A;
1524 }
1525 } elsif ($self->{state} == COMMENT_START_STATE) {
1526 if ($self->{nc} == 0x002D) { # -
1527 !!!cp (137);
1528 $self->{state} = COMMENT_START_DASH_STATE;
1529 !!!next-input-character;
1530 redo A;
1531 } elsif ($self->{nc} == 0x003E) { # >
1532 !!!cp (138);
1533 !!!parse-error (type => 'bogus comment');
1534 $self->{state} = DATA_STATE;
1535 !!!next-input-character;
1536
1537 !!!emit ($self->{ct}); # comment
1538
1539 redo A;
1540 } elsif ($self->{nc} == -1) {
1541 !!!cp (139);
1542 !!!parse-error (type => 'unclosed comment');
1543 $self->{state} = DATA_STATE;
1544 ## reconsume
1545
1546 !!!emit ($self->{ct}); # comment
1547
1548 redo A;
1549 } else {
1550 !!!cp (140);
1551 $self->{ct}->{data} # comment
1552 .= chr ($self->{nc});
1553 $self->{state} = COMMENT_STATE;
1554 !!!next-input-character;
1555 redo A;
1556 }
1557 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1558 if ($self->{nc} == 0x002D) { # -
1559 !!!cp (141);
1560 $self->{state} = COMMENT_END_STATE;
1561 !!!next-input-character;
1562 redo A;
1563 } elsif ($self->{nc} == 0x003E) { # >
1564 !!!cp (142);
1565 !!!parse-error (type => 'bogus comment');
1566 $self->{state} = DATA_STATE;
1567 !!!next-input-character;
1568
1569 !!!emit ($self->{ct}); # comment
1570
1571 redo A;
1572 } elsif ($self->{nc} == -1) {
1573 !!!cp (143);
1574 !!!parse-error (type => 'unclosed comment');
1575 $self->{state} = DATA_STATE;
1576 ## reconsume
1577
1578 !!!emit ($self->{ct}); # comment
1579
1580 redo A;
1581 } else {
1582 !!!cp (144);
1583 $self->{ct}->{data} # comment
1584 .= '-' . chr ($self->{nc});
1585 $self->{state} = COMMENT_STATE;
1586 !!!next-input-character;
1587 redo A;
1588 }
1589 } elsif ($self->{state} == COMMENT_STATE) {
1590 if ($self->{nc} == 0x002D) { # -
1591 !!!cp (145);
1592 $self->{state} = COMMENT_END_DASH_STATE;
1593 !!!next-input-character;
1594 redo A;
1595 } elsif ($self->{nc} == -1) {
1596 !!!cp (146);
1597 !!!parse-error (type => 'unclosed comment');
1598 $self->{state} = DATA_STATE;
1599 ## reconsume
1600
1601 !!!emit ($self->{ct}); # comment
1602
1603 redo A;
1604 } else {
1605 !!!cp (147);
1606 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1607 $self->{read_until}->($self->{ct}->{data},
1608 q[-],
1609 length $self->{ct}->{data});
1610
1611 ## Stay in the state
1612 !!!next-input-character;
1613 redo A;
1614 }
1615 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1616 if ($self->{nc} == 0x002D) { # -
1617 !!!cp (148);
1618 $self->{state} = COMMENT_END_STATE;
1619 !!!next-input-character;
1620 redo A;
1621 } elsif ($self->{nc} == -1) {
1622 !!!cp (149);
1623 !!!parse-error (type => 'unclosed comment');
1624 $self->{state} = DATA_STATE;
1625 ## reconsume
1626
1627 !!!emit ($self->{ct}); # comment
1628
1629 redo A;
1630 } else {
1631 !!!cp (150);
1632 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1633 $self->{state} = COMMENT_STATE;
1634 !!!next-input-character;
1635 redo A;
1636 }
1637 } elsif ($self->{state} == COMMENT_END_STATE) {
1638 if ($self->{nc} == 0x003E) { # >
1639 !!!cp (151);
1640 $self->{state} = DATA_STATE;
1641 !!!next-input-character;
1642
1643 !!!emit ($self->{ct}); # comment
1644
1645 redo A;
1646 } elsif ($self->{nc} == 0x002D) { # -
1647 !!!cp (152);
1648 !!!parse-error (type => 'dash in comment',
1649 line => $self->{line_prev},
1650 column => $self->{column_prev});
1651 $self->{ct}->{data} .= '-'; # comment
1652 ## Stay in the state
1653 !!!next-input-character;
1654 redo A;
1655 } elsif ($self->{nc} == -1) {
1656 !!!cp (153);
1657 !!!parse-error (type => 'unclosed comment');
1658 $self->{state} = DATA_STATE;
1659 ## reconsume
1660
1661 !!!emit ($self->{ct}); # comment
1662
1663 redo A;
1664 } else {
1665 !!!cp (154);
1666 !!!parse-error (type => 'dash in comment',
1667 line => $self->{line_prev},
1668 column => $self->{column_prev});
1669 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1670 $self->{state} = COMMENT_STATE;
1671 !!!next-input-character;
1672 redo A;
1673 }
1674 } elsif ($self->{state} == DOCTYPE_STATE) {
1675 if ($is_space->{$self->{nc}}) {
1676 !!!cp (155);
1677 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1678 !!!next-input-character;
1679 redo A;
1680 } else {
1681 !!!cp (156);
1682 !!!parse-error (type => 'no space before DOCTYPE name');
1683 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1684 ## reconsume
1685 redo A;
1686 }
1687 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1688 if ($is_space->{$self->{nc}}) {
1689 !!!cp (157);
1690 ## Stay in the state
1691 !!!next-input-character;
1692 redo A;
1693 } elsif ($self->{nc} == 0x003E) { # >
1694 !!!cp (158);
1695 !!!parse-error (type => 'no DOCTYPE name');
1696 $self->{state} = DATA_STATE;
1697 !!!next-input-character;
1698
1699 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1700
1701 redo A;
1702 } elsif ($self->{nc} == -1) {
1703 !!!cp (159);
1704 !!!parse-error (type => 'no DOCTYPE name');
1705 $self->{state} = DATA_STATE;
1706 ## reconsume
1707
1708 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1709
1710 redo A;
1711 } else {
1712 !!!cp (160);
1713 $self->{ct}->{name} = chr $self->{nc};
1714 delete $self->{ct}->{quirks};
1715 $self->{state} = DOCTYPE_NAME_STATE;
1716 !!!next-input-character;
1717 redo A;
1718 }
1719 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1720 ## ISSUE: Redundant "First," in the spec.
1721 if ($is_space->{$self->{nc}}) {
1722 !!!cp (161);
1723 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1724 !!!next-input-character;
1725 redo A;
1726 } elsif ($self->{nc} == 0x003E) { # >
1727 !!!cp (162);
1728 $self->{state} = DATA_STATE;
1729 !!!next-input-character;
1730
1731 !!!emit ($self->{ct}); # DOCTYPE
1732
1733 redo A;
1734 } elsif ($self->{nc} == -1) {
1735 !!!cp (163);
1736 !!!parse-error (type => 'unclosed DOCTYPE');
1737 $self->{state} = DATA_STATE;
1738 ## reconsume
1739
1740 $self->{ct}->{quirks} = 1;
1741 !!!emit ($self->{ct}); # DOCTYPE
1742
1743 redo A;
1744 } else {
1745 !!!cp (164);
1746 $self->{ct}->{name}
1747 .= chr ($self->{nc}); # DOCTYPE
1748 ## Stay in the state
1749 !!!next-input-character;
1750 redo A;
1751 }
1752 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1753 if ($is_space->{$self->{nc}}) {
1754 !!!cp (165);
1755 ## Stay in the state
1756 !!!next-input-character;
1757 redo A;
1758 } elsif ($self->{nc} == 0x003E) { # >
1759 !!!cp (166);
1760 $self->{state} = DATA_STATE;
1761 !!!next-input-character;
1762
1763 !!!emit ($self->{ct}); # DOCTYPE
1764
1765 redo A;
1766 } elsif ($self->{nc} == -1) {
1767 !!!cp (167);
1768 !!!parse-error (type => 'unclosed DOCTYPE');
1769 $self->{state} = DATA_STATE;
1770 ## reconsume
1771
1772 $self->{ct}->{quirks} = 1;
1773 !!!emit ($self->{ct}); # DOCTYPE
1774
1775 redo A;
1776 } elsif ($self->{nc} == 0x0050 or # P
1777 $self->{nc} == 0x0070) { # p
1778 $self->{state} = PUBLIC_STATE;
1779 $self->{s_kwd} = chr $self->{nc};
1780 !!!next-input-character;
1781 redo A;
1782 } elsif ($self->{nc} == 0x0053 or # S
1783 $self->{nc} == 0x0073) { # s
1784 $self->{state} = SYSTEM_STATE;
1785 $self->{s_kwd} = chr $self->{nc};
1786 !!!next-input-character;
1787 redo A;
1788 } else {
1789 !!!cp (180);
1790 !!!parse-error (type => 'string after DOCTYPE name');
1791 $self->{ct}->{quirks} = 1;
1792
1793 $self->{state} = BOGUS_DOCTYPE_STATE;
1794 !!!next-input-character;
1795 redo A;
1796 }
1797 } elsif ($self->{state} == PUBLIC_STATE) {
1798 ## ASCII case-insensitive
1799 if ($self->{nc} == [
1800 undef,
1801 0x0055, # U
1802 0x0042, # B
1803 0x004C, # L
1804 0x0049, # I
1805 ]->[length $self->{s_kwd}] or
1806 $self->{nc} == [
1807 undef,
1808 0x0075, # u
1809 0x0062, # b
1810 0x006C, # l
1811 0x0069, # i
1812 ]->[length $self->{s_kwd}]) {
1813 !!!cp (175);
1814 ## Stay in the state.
1815 $self->{s_kwd} .= chr $self->{nc};
1816 !!!next-input-character;
1817 redo A;
1818 } elsif ((length $self->{s_kwd}) == 5 and
1819 ($self->{nc} == 0x0043 or # C
1820 $self->{nc} == 0x0063)) { # c
1821 !!!cp (168);
1822 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1823 !!!next-input-character;
1824 redo A;
1825 } else {
1826 !!!cp (169);
1827 !!!parse-error (type => 'string after DOCTYPE name',
1828 line => $self->{line_prev},
1829 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1830 $self->{ct}->{quirks} = 1;
1831
1832 $self->{state} = BOGUS_DOCTYPE_STATE;
1833 ## Reconsume.
1834 redo A;
1835 }
1836 } elsif ($self->{state} == SYSTEM_STATE) {
1837 ## ASCII case-insensitive
1838 if ($self->{nc} == [
1839 undef,
1840 0x0059, # Y
1841 0x0053, # S
1842 0x0054, # T
1843 0x0045, # E
1844 ]->[length $self->{s_kwd}] or
1845 $self->{nc} == [
1846 undef,
1847 0x0079, # y
1848 0x0073, # s
1849 0x0074, # t
1850 0x0065, # e
1851 ]->[length $self->{s_kwd}]) {
1852 !!!cp (170);
1853 ## Stay in the state.
1854 $self->{s_kwd} .= chr $self->{nc};
1855 !!!next-input-character;
1856 redo A;
1857 } elsif ((length $self->{s_kwd}) == 5 and
1858 ($self->{nc} == 0x004D or # M
1859 $self->{nc} == 0x006D)) { # m
1860 !!!cp (171);
1861 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1862 !!!next-input-character;
1863 redo A;
1864 } else {
1865 !!!cp (172);
1866 !!!parse-error (type => 'string after DOCTYPE name',
1867 line => $self->{line_prev},
1868 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1869 $self->{ct}->{quirks} = 1;
1870
1871 $self->{state} = BOGUS_DOCTYPE_STATE;
1872 ## Reconsume.
1873 redo A;
1874 }
1875 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) { ## Entered from |PUBLIC_STATE| once the keyword has been matched.
1876 if ($is_space->{$self->{nc}}) {
1877 !!!cp (181);
1878 ## Stay in the state (white space before the literal is skipped)
1879 !!!next-input-character;
1880 redo A;
1881 } elsif ($self->{nc} == 0x0022) { # " (|nc| is a number, so compare with |==|, not |eq|)
1882 !!!cp (182);
1883 $self->{ct}->{pubid} = ''; # DOCTYPE: start an empty public identifier
1884 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1885 !!!next-input-character;
1886 redo A;
1887 } elsif ($self->{nc} == 0x0027) { # '
1888 !!!cp (183);
1889 $self->{ct}->{pubid} = ''; # DOCTYPE: start an empty public identifier
1890 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1891 !!!next-input-character;
1892 redo A;
1893 } elsif ($self->{nc} == 0x003E) { # > (DOCTYPE closed before any public identifier literal)
1894 !!!cp (184);
1895 !!!parse-error (type => 'no PUBLIC literal');
1896
1897 $self->{state} = DATA_STATE;
1898 !!!next-input-character;
1899
1900 $self->{ct}->{quirks} = 1; # set the |quirks| flag before emitting
1901 !!!emit ($self->{ct}); # DOCTYPE
1902
1903 redo A;
1904 } elsif ($self->{nc} == -1) { # -1: presumably the end-of-input marker -- confirm against the input reader
1905 !!!cp (185);
1906 !!!parse-error (type => 'unclosed DOCTYPE');
1907
1908 $self->{state} = DATA_STATE;
1909 ## reconsume (no |!!!next-input-character| here; |DATA_STATE| sees the same |nc|)
1910
1911 $self->{ct}->{quirks} = 1; # set the |quirks| flag before emitting
1912 !!!emit ($self->{ct}); # DOCTYPE
1913
1914 redo A;
1915 } else {
1916 !!!cp (186);
1917 !!!parse-error (type => 'string after PUBLIC');
1918 $self->{ct}->{quirks} = 1; # anything other than a quoted literal: flag and fall into the bogus state
1919
1920 $self->{state} = BOGUS_DOCTYPE_STATE;
1921 !!!next-input-character;
1922 redo A;
1923 }
1924 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1925 if ($self->{nc} == 0x0022) { # "
1926 !!!cp (187);
1927 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1928 !!!next-input-character;
1929 redo A;
1930 } elsif ($self->{nc} == 0x003E) { # >
1931 !!!cp (188);
1932 !!!parse-error (type => 'unclosed PUBLIC literal');
1933
1934 $self->{state} = DATA_STATE;
1935 !!!next-input-character;
1936
1937 $self->{ct}->{quirks} = 1;
1938 !!!emit ($self->{ct}); # DOCTYPE
1939
1940 redo A;
1941 } elsif ($self->{nc} == -1) {
1942 !!!cp (189);
1943 !!!parse-error (type => 'unclosed PUBLIC literal');
1944
1945 $self->{state} = DATA_STATE;
1946 ## reconsume
1947
1948 $self->{ct}->{quirks} = 1;
1949 !!!emit ($self->{ct}); # DOCTYPE
1950
1951 redo A;
1952 } else {
1953 !!!cp (190);
1954 $self->{ct}->{pubid} # DOCTYPE
1955 .= chr $self->{nc};
1956 $self->{read_until}->($self->{ct}->{pubid}, q[">],
1957 length $self->{ct}->{pubid});
1958
1959 ## Stay in the state
1960 !!!next-input-character;
1961 redo A;
1962 }
1963 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1964 if ($self->{nc} == 0x0027) { # '
1965 !!!cp (191);
1966 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1967 !!!next-input-character;
1968 redo A;
1969 } elsif ($self->{nc} == 0x003E) { # >
1970 !!!cp (192);
1971 !!!parse-error (type => 'unclosed PUBLIC literal');
1972
1973 $self->{state} = DATA_STATE;
1974 !!!next-input-character;
1975
1976 $self->{ct}->{quirks} = 1;
1977 !!!emit ($self->{ct}); # DOCTYPE
1978
1979 redo A;
1980 } elsif ($self->{nc} == -1) {
1981 !!!cp (193);
1982 !!!parse-error (type => 'unclosed PUBLIC literal');
1983
1984 $self->{state} = DATA_STATE;
1985 ## reconsume
1986
1987 $self->{ct}->{quirks} = 1;
1988 !!!emit ($self->{ct}); # DOCTYPE
1989
1990 redo A;
1991 } else {
1992 !!!cp (194);
1993 $self->{ct}->{pubid} # DOCTYPE
1994 .= chr $self->{nc};
1995 $self->{read_until}->($self->{ct}->{pubid}, q['>],
1996 length $self->{ct}->{pubid});
1997
1998 ## Stay in the state
1999 !!!next-input-character;
2000 redo A;
2001 }
2002 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2003 if ($is_space->{$self->{nc}}) {
2004 !!!cp (195);
2005 ## Stay in the state
2006 !!!next-input-character;
2007 redo A;
2008 } elsif ($self->{nc} == 0x0022) { # "
2009 !!!cp (196);
2010 $self->{ct}->{sysid} = ''; # DOCTYPE
2011 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2012 !!!next-input-character;
2013 redo A;
2014 } elsif ($self->{nc} == 0x0027) { # '
2015 !!!cp (197);
2016 $self->{ct}->{sysid} = ''; # DOCTYPE
2017 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2018 !!!next-input-character;
2019 redo A;
2020 } elsif ($self->{nc} == 0x003E) { # >
2021 !!!cp (198);
2022 $self->{state} = DATA_STATE;
2023 !!!next-input-character;
2024
2025 !!!emit ($self->{ct}); # DOCTYPE
2026
2027 redo A;
2028 } elsif ($self->{nc} == -1) {
2029 !!!cp (199);
2030 !!!parse-error (type => 'unclosed DOCTYPE');
2031
2032 $self->{state} = DATA_STATE;
2033 ## reconsume
2034
2035 $self->{ct}->{quirks} = 1;
2036 !!!emit ($self->{ct}); # DOCTYPE
2037
2038 redo A;
2039 } else {
2040 !!!cp (200);
2041 !!!parse-error (type => 'string after PUBLIC literal');
2042 $self->{ct}->{quirks} = 1;
2043
2044 $self->{state} = BOGUS_DOCTYPE_STATE;
2045 !!!next-input-character;
2046 redo A;
2047 }
2048 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2049 if ($is_space->{$self->{nc}}) {
2050 !!!cp (201);
2051 ## Stay in the state
2052 !!!next-input-character;
2053 redo A;
2054 } elsif ($self->{nc} == 0x0022) { # "
2055 !!!cp (202);
2056 $self->{ct}->{sysid} = ''; # DOCTYPE
2057 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2058 !!!next-input-character;
2059 redo A;
2060 } elsif ($self->{nc} == 0x0027) { # '
2061 !!!cp (203);
2062 $self->{ct}->{sysid} = ''; # DOCTYPE
2063 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2064 !!!next-input-character;
2065 redo A;
2066 } elsif ($self->{nc} == 0x003E) { # >
2067 !!!cp (204);
2068 !!!parse-error (type => 'no SYSTEM literal');
2069 $self->{state} = DATA_STATE;
2070 !!!next-input-character;
2071
2072 $self->{ct}->{quirks} = 1;
2073 !!!emit ($self->{ct}); # DOCTYPE
2074
2075 redo A;
2076 } elsif ($self->{nc} == -1) {
2077 !!!cp (205);
2078 !!!parse-error (type => 'unclosed DOCTYPE');
2079
2080 $self->{state} = DATA_STATE;
2081 ## reconsume
2082
2083 $self->{ct}->{quirks} = 1;
2084 !!!emit ($self->{ct}); # DOCTYPE
2085
2086 redo A;
2087 } else {
2088 !!!cp (206);
2089 !!!parse-error (type => 'string after SYSTEM');
2090 $self->{ct}->{quirks} = 1;
2091
2092 $self->{state} = BOGUS_DOCTYPE_STATE;
2093 !!!next-input-character;
2094 redo A;
2095 }
2096 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2097 if ($self->{nc} == 0x0022) { # "
2098 !!!cp (207);
2099 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2100 !!!next-input-character;
2101 redo A;
2102 } elsif ($self->{nc} == 0x003E) { # >
2103 !!!cp (208);
2104 !!!parse-error (type => 'unclosed SYSTEM literal');
2105
2106 $self->{state} = DATA_STATE;
2107 !!!next-input-character;
2108
2109 $self->{ct}->{quirks} = 1;
2110 !!!emit ($self->{ct}); # DOCTYPE
2111
2112 redo A;
2113 } elsif ($self->{nc} == -1) {
2114 !!!cp (209);
2115 !!!parse-error (type => 'unclosed SYSTEM literal');
2116
2117 $self->{state} = DATA_STATE;
2118 ## reconsume
2119
2120 $self->{ct}->{quirks} = 1;
2121 !!!emit ($self->{ct}); # DOCTYPE
2122
2123 redo A;
2124 } else {
2125 !!!cp (210);
2126 $self->{ct}->{sysid} # DOCTYPE
2127 .= chr $self->{nc};
2128 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2129 length $self->{ct}->{sysid});
2130
2131 ## Stay in the state
2132 !!!next-input-character;
2133 redo A;
2134 }
2135 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2136 if ($self->{nc} == 0x0027) { # '
2137 !!!cp (211);
2138 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2139 !!!next-input-character;
2140 redo A;
2141 } elsif ($self->{nc} == 0x003E) { # >
2142 !!!cp (212);
2143 !!!parse-error (type => 'unclosed SYSTEM literal');
2144
2145 $self->{state} = DATA_STATE;
2146 !!!next-input-character;
2147
2148 $self->{ct}->{quirks} = 1;
2149 !!!emit ($self->{ct}); # DOCTYPE
2150
2151 redo A;
2152 } elsif ($self->{nc} == -1) {
2153 !!!cp (213);
2154 !!!parse-error (type => 'unclosed SYSTEM literal');
2155
2156 $self->{state} = DATA_STATE;
2157 ## reconsume
2158
2159 $self->{ct}->{quirks} = 1;
2160 !!!emit ($self->{ct}); # DOCTYPE
2161
2162 redo A;
2163 } else {
2164 !!!cp (214);
2165 $self->{ct}->{sysid} # DOCTYPE
2166 .= chr $self->{nc};
2167 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2168 length $self->{ct}->{sysid});
2169
2170 ## Stay in the state
2171 !!!next-input-character;
2172 redo A;
2173 }
2174 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2175 if ($is_space->{$self->{nc}}) {
2176 !!!cp (215);
2177 ## Stay in the state
2178 !!!next-input-character;
2179 redo A;
2180 } elsif ($self->{nc} == 0x003E) { # >
2181 !!!cp (216);
2182 $self->{state} = DATA_STATE;
2183 !!!next-input-character;
2184
2185 !!!emit ($self->{ct}); # DOCTYPE
2186
2187 redo A;
2188 } elsif ($self->{nc} == -1) {
2189 !!!cp (217);
2190 !!!parse-error (type => 'unclosed DOCTYPE');
2191 $self->{state} = DATA_STATE;
2192 ## reconsume
2193
2194 $self->{ct}->{quirks} = 1;
2195 !!!emit ($self->{ct}); # DOCTYPE
2196
2197 redo A;
2198 } else {
2199 !!!cp (218);
2200 !!!parse-error (type => 'string after SYSTEM literal');
2201 #$self->{ct}->{quirks} = 1;
2202
2203 $self->{state} = BOGUS_DOCTYPE_STATE;
2204 !!!next-input-character;
2205 redo A;
2206 }
2207 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2208 if ($self->{nc} == 0x003E) { # >
2209 !!!cp (219);
2210 $self->{state} = DATA_STATE;
2211 !!!next-input-character;
2212
2213 !!!emit ($self->{ct}); # DOCTYPE
2214
2215 redo A;
2216 } elsif ($self->{nc} == -1) {
2217 !!!cp (220);
2218 $self->{state} = DATA_STATE;
2219 ## reconsume
2220
2221 !!!emit ($self->{ct}); # DOCTYPE
2222
2223 redo A;
2224 } else {
2225 !!!cp (221);
2226 my $s = '';
2227 $self->{read_until}->($s, q[>], 0);
2228
2229 ## Stay in the state
2230 !!!next-input-character;
2231 redo A;
2232 }
2233 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2234 ## NOTE: "CDATA section state" in the spec is jointly implemented
2235 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2236 ## and |CDATA_SECTION_MSE2_STATE|.
2237
2238 if ($self->{nc} == 0x005D) { # ]
2239 !!!cp (221.1);
2240 $self->{state} = CDATA_SECTION_MSE1_STATE;
2241 !!!next-input-character;
2242 redo A;
2243 } elsif ($self->{nc} == -1) {
2244 $self->{state} = DATA_STATE;
2245 !!!next-input-character;
2246 if (length $self->{ct}->{data}) { # character
2247 !!!cp (221.2);
2248 !!!emit ($self->{ct}); # character
2249 } else {
2250 !!!cp (221.3);
2251 ## No token to emit. $self->{ct} is discarded.
2252 }
2253 redo A;
2254 } else {
2255 !!!cp (221.4);
2256 $self->{ct}->{data} .= chr $self->{nc};
2257 $self->{read_until}->($self->{ct}->{data},
2258 q<]>,
2259 length $self->{ct}->{data});
2260
2261 ## Stay in the state.
2262 !!!next-input-character;
2263 redo A;
2264 }
2265
2266 ## ISSUE: "text tokens" in spec.
2267 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2268 if ($self->{nc} == 0x005D) { # ]
2269 !!!cp (221.5);
2270 $self->{state} = CDATA_SECTION_MSE2_STATE;
2271 !!!next-input-character;
2272 redo A;
2273 } else {
2274 !!!cp (221.6);
2275 $self->{ct}->{data} .= ']';
2276 $self->{state} = CDATA_SECTION_STATE;
2277 ## Reconsume.
2278 redo A;
2279 }
2280 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2281 if ($self->{nc} == 0x003E) { # >
2282 $self->{state} = DATA_STATE;
2283 !!!next-input-character;
2284 if (length $self->{ct}->{data}) { # character
2285 !!!cp (221.7);
2286 !!!emit ($self->{ct}); # character
2287 } else {
2288 !!!cp (221.8);
2289 ## No token to emit. $self->{ct} is discarded.
2290 }
2291 redo A;
2292 } elsif ($self->{nc} == 0x005D) { # ]
2293 !!!cp (221.9); # character
2294 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2295 ## Stay in the state.
2296 !!!next-input-character;
2297 redo A;
2298 } else {
2299 !!!cp (221.11);
2300 $self->{ct}->{data} .= ']]'; # character
2301 $self->{state} = CDATA_SECTION_STATE;
2302 ## Reconsume.
2303 redo A;
2304 }
2305 } elsif ($self->{state} == ENTITY_STATE) {
2306 if ($is_space->{$self->{nc}} or
2307 {
2308 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2309 $self->{entity_add} => 1,
2310 }->{$self->{nc}}) {
2311 !!!cp (1001);
2312 ## Don't consume
2313 ## No error
2314 ## Return nothing.
2315 #
2316 } elsif ($self->{nc} == 0x0023) { # #
2317 !!!cp (999);
2318 $self->{state} = ENTITY_HASH_STATE;
2319 $self->{s_kwd} = '#';
2320 !!!next-input-character;
2321 redo A;
2322 } elsif ((0x0041 <= $self->{nc} and
2323 $self->{nc} <= 0x005A) or # A..Z
2324 (0x0061 <= $self->{nc} and
2325 $self->{nc} <= 0x007A)) { # a..z
2326 !!!cp (998);
2327 require Whatpm::_NamedEntityList;
2328 $self->{state} = ENTITY_NAME_STATE;
2329 $self->{s_kwd} = chr $self->{nc};
2330 $self->{entity__value} = $self->{s_kwd};
2331 $self->{entity__match} = 0;
2332 !!!next-input-character;
2333 redo A;
2334 } else {
2335 !!!cp (1027);
2336 !!!parse-error (type => 'bare ero');
2337 ## Return nothing.
2338 #
2339 }
2340
2341 ## NOTE: No character is consumed by the "consume a character
2342 ## reference" algorithm. In other words, there is an "&" character
2343 ## that does not introduce a character reference, which would be
2344 ## appended to the parent element or the attribute value in later
2345 ## processing by the tokenizer.
2346
2347 if ($self->{prev_state} == DATA_STATE) {
2348 !!!cp (997);
2349 $self->{state} = $self->{prev_state};
2350 ## Reconsume.
2351 !!!emit ({type => CHARACTER_TOKEN, data => '&',
2352 line => $self->{line_prev},
2353 column => $self->{column_prev},
2354 });
2355 redo A;
2356 } else {
2357 !!!cp (996);
2358 $self->{ca}->{value} .= '&';
2359 $self->{state} = $self->{prev_state};
2360 ## Reconsume.
2361 redo A;
2362 }
2363 } elsif ($self->{state} == ENTITY_HASH_STATE) {
2364 if ($self->{nc} == 0x0078 or # x
2365 $self->{nc} == 0x0058) { # X
2366 !!!cp (995);
2367 $self->{state} = HEXREF_X_STATE;
2368 $self->{s_kwd} .= chr $self->{nc};
2369 !!!next-input-character;
2370 redo A;
2371 } elsif (0x0030 <= $self->{nc} and
2372 $self->{nc} <= 0x0039) { # 0..9
2373 !!!cp (994);
2374 $self->{state} = NCR_NUM_STATE;
2375 $self->{s_kwd} = $self->{nc} - 0x0030;
2376 !!!next-input-character;
2377 redo A;
2378 } else {
2379 !!!parse-error (type => 'bare nero',
2380 line => $self->{line_prev},
2381 column => $self->{column_prev} - 1);
2382
2383 ## NOTE: According to the spec algorithm, nothing is returned,
2384 ## and then "&#" is appended to the parent element or the attribute
2385 ## value in the later processing.
2386
2387 if ($self->{prev_state} == DATA_STATE) {
2388 !!!cp (1019);
2389 $self->{state} = $self->{prev_state};
2390 ## Reconsume.
2391 !!!emit ({type => CHARACTER_TOKEN,
2392 data => '&#',
2393 line => $self->{line_prev},
2394 column => $self->{column_prev} - 1,
2395 });
2396 redo A;
2397 } else {
2398 !!!cp (993);
2399 $self->{ca}->{value} .= '&#';
2400 $self->{state} = $self->{prev_state};
2401 ## Reconsume.
2402 redo A;
2403 }
2404 }
2405 } elsif ($self->{state} == NCR_NUM_STATE) {
2406 if (0x0030 <= $self->{nc} and
2407 $self->{nc} <= 0x0039) { # 0..9
2408 !!!cp (1012);
2409 $self->{s_kwd} *= 10;
2410 $self->{s_kwd} += $self->{nc} - 0x0030;
2411
2412 ## Stay in the state.
2413 !!!next-input-character;
2414 redo A;
2415 } elsif ($self->{nc} == 0x003B) { # ;
2416 !!!cp (1013);
2417 !!!next-input-character;
2418 #
2419 } else {
2420 !!!cp (1014);
2421 !!!parse-error (type => 'no refc');
2422 ## Reconsume.
2423 #
2424 }
2425
2426 my $code = $self->{s_kwd};
2427 my $l = $self->{line_prev};
2428 my $c = $self->{column_prev};
2429 if ($charref_map->{$code}) {
2430 !!!cp (1015);
2431 !!!parse-error (type => 'invalid character reference',
2432 text => (sprintf 'U+%04X', $code),
2433 line => $l, column => $c);
2434 $code = $charref_map->{$code};
2435 } elsif ($code > 0x10FFFF) {
2436 !!!cp (1016);
2437 !!!parse-error (type => 'invalid character reference',
2438 text => (sprintf 'U-%08X', $code),
2439 line => $l, column => $c);
2440 $code = 0xFFFD;
2441 }
2442
2443 if ($self->{prev_state} == DATA_STATE) {
2444 !!!cp (992);
2445 $self->{state} = $self->{prev_state};
2446 ## Reconsume.
2447 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2448 line => $l, column => $c,
2449 });
2450 redo A;
2451 } else {
2452 !!!cp (991);
2453 $self->{ca}->{value} .= chr $code;
2454 $self->{ca}->{has_reference} = 1;
2455 $self->{state} = $self->{prev_state};
2456 ## Reconsume.
2457 redo A;
2458 }
2459 } elsif ($self->{state} == HEXREF_X_STATE) {
2460 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2461 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2462 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2463 # 0..9, A..F, a..f
2464 !!!cp (990);
2465 $self->{state} = HEXREF_HEX_STATE;
2466 $self->{s_kwd} = 0;
2467 ## Reconsume.
2468 redo A;
2469 } else {
2470 !!!parse-error (type => 'bare hcro',
2471 line => $self->{line_prev},
2472 column => $self->{column_prev} - 2);
2473
2474 ## NOTE: According to the spec algorithm, nothing is returned,
2475 ## and then "&#" followed by "X" or "x" is appended to the parent
2476 ## element or the attribute value in the later processing.
2477
2478 if ($self->{prev_state} == DATA_STATE) {
2479 !!!cp (1005);
2480 $self->{state} = $self->{prev_state};
2481 ## Reconsume.
2482 !!!emit ({type => CHARACTER_TOKEN,
2483 data => '&' . $self->{s_kwd},
2484 line => $self->{line_prev},
2485 column => $self->{column_prev} - length $self->{s_kwd},
2486 });
2487 redo A;
2488 } else {
2489 !!!cp (989);
2490 $self->{ca}->{value} .= '&' . $self->{s_kwd};
2491 $self->{state} = $self->{prev_state};
2492 ## Reconsume.
2493 redo A;
2494 }
2495 }
2496 } elsif ($self->{state} == HEXREF_HEX_STATE) {
2497 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2498 # 0..9
2499 !!!cp (1002);
2500 $self->{s_kwd} *= 0x10;
2501 $self->{s_kwd} += $self->{nc} - 0x0030;
2502 ## Stay in the state.
2503 !!!next-input-character;
2504 redo A;
2505 } elsif (0x0061 <= $self->{nc} and
2506 $self->{nc} <= 0x0066) { # a..f
2507 !!!cp (1003);
2508 $self->{s_kwd} *= 0x10;
2509 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2510 ## Stay in the state.
2511 !!!next-input-character;
2512 redo A;
2513 } elsif (0x0041 <= $self->{nc} and
2514 $self->{nc} <= 0x0046) { # A..F
2515 !!!cp (1004);
2516 $self->{s_kwd} *= 0x10;
2517 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2518 ## Stay in the state.
2519 !!!next-input-character;
2520 redo A;
2521 } elsif ($self->{nc} == 0x003B) { # ;
2522 !!!cp (1006);
2523 !!!next-input-character;
2524 #
2525 } else {
2526 !!!cp (1007);
2527 !!!parse-error (type => 'no refc',
2528 line => $self->{line},
2529 column => $self->{column});
2530 ## Reconsume.
2531 #
2532 }
2533
2534 my $code = $self->{s_kwd};
2535 my $l = $self->{line_prev};
2536 my $c = $self->{column_prev};
2537 if ($charref_map->{$code}) {
2538 !!!cp (1008);
2539 !!!parse-error (type => 'invalid character reference',
2540 text => (sprintf 'U+%04X', $code),
2541 line => $l, column => $c);
2542 $code = $charref_map->{$code};
2543 } elsif ($code > 0x10FFFF) {
2544 !!!cp (1009);
2545 !!!parse-error (type => 'invalid character reference',
2546 text => (sprintf 'U-%08X', $code),
2547 line => $l, column => $c);
2548 $code = 0xFFFD;
2549 }
2550
2551 if ($self->{prev_state} == DATA_STATE) {
2552 !!!cp (988);
2553 $self->{state} = $self->{prev_state};
2554 ## Reconsume.
2555 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2556 line => $l, column => $c,
2557 });
2558 redo A;
2559 } else {
2560 !!!cp (987);
2561 $self->{ca}->{value} .= chr $code;
2562 $self->{ca}->{has_reference} = 1;
2563 $self->{state} = $self->{prev_state};
2564 ## Reconsume.
2565 redo A;
2566 }
2567 } elsif ($self->{state} == ENTITY_NAME_STATE) {
2568 if (length $self->{s_kwd} < 30 and
2569 ## NOTE: Some number greater than the maximum length of entity name
2570 ((0x0041 <= $self->{nc} and # a
2571 $self->{nc} <= 0x005A) or # x
2572 (0x0061 <= $self->{nc} and # a
2573 $self->{nc} <= 0x007A) or # z
2574 (0x0030 <= $self->{nc} and # 0
2575 $self->{nc} <= 0x0039) or # 9
2576 $self->{nc} == 0x003B)) { # ;
2577 our $EntityChar;
2578 $self->{s_kwd} .= chr $self->{nc};
2579 if (defined $EntityChar->{$self->{s_kwd}}) {
2580 if ($self->{nc} == 0x003B) { # ;
2581 !!!cp (1020);
2582 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2583 $self->{entity__match} = 1;
2584 !!!next-input-character;
2585 #
2586 } else {
2587 !!!cp (1021);
2588 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2589 $self->{entity__match} = -1;
2590 ## Stay in the state.
2591 !!!next-input-character;
2592 redo A;
2593 }
2594 } else {
2595 !!!cp (1022);
2596 $self->{entity__value} .= chr $self->{nc};
2597 $self->{entity__match} *= 2;
2598 ## Stay in the state.
2599 !!!next-input-character;
2600 redo A;
2601 }
2602 }
2603
2604 my $data;
2605 my $has_ref;
2606 if ($self->{entity__match} > 0) {
2607 !!!cp (1023);
2608 $data = $self->{entity__value};
2609 $has_ref = 1;
2610 #
2611 } elsif ($self->{entity__match} < 0) {
2612 !!!parse-error (type => 'no refc');
2613 if ($self->{prev_state} != DATA_STATE and # in attribute
2614 $self->{entity__match} < -1) {
2615 !!!cp (1024);
2616 $data = '&' . $self->{s_kwd};
2617 #
2618 } else {
2619 !!!cp (1025);
2620 $data = $self->{entity__value};
2621 $has_ref = 1;
2622 #
2623 }
2624 } else {
2625 !!!cp (1026);
2626 !!!parse-error (type => 'bare ero',
2627 line => $self->{line_prev},
2628 column => $self->{column_prev} - length $self->{s_kwd});
2629 $data = '&' . $self->{s_kwd};
2630 #
2631 }
2632
2633 ## NOTE: In these cases, when a character reference is found,
2634 ## it is consumed and a character token is returned, or, otherwise,
2635 ## nothing is consumed and returned, according to the spec algorithm.
2636 ## In this implementation, anything that has been examined by the
2637 ## tokenizer is appended to the parent element or the attribute value
2638 ## as a string at this stage (the literal string when there is no
2639 ## character reference, or the entity-replaced string otherwise), since
2640 ## any characters that would not be consumed are appended in the data
2641 ## state or in an appropriate attribute value state anyway.
2642
2643 if ($self->{prev_state} == DATA_STATE) {
2644 !!!cp (986);
2645 $self->{state} = $self->{prev_state};
2646 ## Reconsume.
2647 !!!emit ({type => CHARACTER_TOKEN,
2648 data => $data,
2649 line => $self->{line_prev},
2650 column => $self->{column_prev} + 1 - length $self->{s_kwd},
2651 });
2652 redo A;
2653 } else {
2654 !!!cp (985);
2655 $self->{ca}->{value} .= $data;
2656 $self->{ca}->{has_reference} = 1 if $has_ref;
2657 $self->{state} = $self->{prev_state};
2658 ## Reconsume.
2659 redo A;
2660 }
2661 } else {
2662 die "$0: $self->{state}: Unknown state";
2663 }
2664 } # A
2665
2666 die "$0: _get_next_token: unexpected case";
2667 } # _get_next_token
2668
2669 1;
2670 ## $Date: 2008/10/14 02:27:58 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24