/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (show annotations) (download) (as text)
Tue Oct 14 02:27:58 2008 UTC (16 years ago) by wakaba
Branch: MAIN
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	14 Oct 2008 02:26:16 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Makefile: New rule to make HTML/Tokenizer.pm is added.

	* HTML.pm.src: Tokenizer part moved to another file.

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 02:25:46 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New file.

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.207 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 ## NOTE: |$VERSION| above is built from the digit groups of the CVS |Revision| keyword.
5 package Whatpm::HTML; # the definitions that follow are placed in |Whatpm::HTML|, not in |Whatpm::HTML::Tokenizer|
6
7 ## Content model flags (bit flags tested against |$self->{content_model}|)
8
9 sub CM_ENTITY () { 0b001 } # & markup in data
10 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
11 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
12
13 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
14 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # 0b010
15 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # 0b011
16 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # 0b101
17
18 ## Tokenizer states (values stored in |$self->{state}| and |$self->{prev_state}|)
19
20 sub DATA_STATE () { 0 }
21 #sub ENTITY_DATA_STATE () { 1 } # not defined; see the NOTE before |ENTITY_STATE| below
22 sub TAG_OPEN_STATE () { 2 }
23 sub CLOSE_TAG_OPEN_STATE () { 3 }
24 sub TAG_NAME_STATE () { 4 }
25 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
26 sub ATTRIBUTE_NAME_STATE () { 6 }
27 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
28 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
29 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
30 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
31 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
32 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } # not defined; see the NOTE before |ENTITY_STATE| below
33 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
34 sub COMMENT_START_STATE () { 14 }
35 sub COMMENT_START_DASH_STATE () { 15 }
36 sub COMMENT_STATE () { 16 }
37 sub COMMENT_END_STATE () { 17 }
38 sub COMMENT_END_DASH_STATE () { 18 }
39 sub BOGUS_COMMENT_STATE () { 19 }
40 sub DOCTYPE_STATE () { 20 }
41 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
42 sub DOCTYPE_NAME_STATE () { 22 }
43 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
44 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
45 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
46 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
47 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
48 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
49 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
50 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
51 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
52 sub BOGUS_DOCTYPE_STATE () { 32 }
53 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
54 sub SELF_CLOSING_START_TAG_STATE () { 34 }
55 sub CDATA_SECTION_STATE () { 35 }
56 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
57 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
58 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
59 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
60 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
61 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
62 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
63 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
64 ## NOTE: The "entity data state", the "entity in attribute value state", and
65 ## the "consume a character reference" algorithm are jointly implemented
66 ## using the following six states:
67 sub ENTITY_STATE () { 44 }
68 sub ENTITY_HASH_STATE () { 45 }
69 sub NCR_NUM_STATE () { 46 }
70 sub HEXREF_X_STATE () { 47 }
71 sub HEXREF_HEX_STATE () { 48 }
72 sub ENTITY_NAME_STATE () { 49 }
73 sub PCDATA_STATE () { 50 } # "data state" in the spec, used only for the |PCDATA| content model
74
75 ## Token types (values of the |->{type}| field of a token)
76
77 sub DOCTYPE_TOKEN () { 1 }
78 sub COMMENT_TOKEN () { 2 }
79 sub START_TAG_TOKEN () { 3 }
80 sub END_TAG_TOKEN () { 4 }
81 sub END_OF_FILE_TOKEN () { 5 }
82 sub CHARACTER_TOKEN () { 6 }
83
84 ## Tree constructor state constants (see Whatpm::HTML for the full
85 ## list and descriptions)
86 ## NOTE(review): both constants below evaluate to the same number (2 ** 11); presumably they belong to different flag sets - confirm in Whatpm::HTML.
87 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
88 sub FOREIGN_EL () { 0b1_00000000000 }
89
90 ## Character reference mappings: code point => replacement code point
91 ## NOTE(review): presumably consulted when a numeric character reference is resolved - the lookup is not in this part of the file.
92 my $charref_map = {
93 0x0D => 0x000A, # CR => LF
94 0x80 => 0x20AC, # 0x80..0x9F => the Windows-1252 character at that byte, or U+FFFD where Windows-1252 leaves the byte undefined
95 0x81 => 0xFFFD,
96 0x82 => 0x201A,
97 0x83 => 0x0192,
98 0x84 => 0x201E,
99 0x85 => 0x2026,
100 0x86 => 0x2020,
101 0x87 => 0x2021,
102 0x88 => 0x02C6,
103 0x89 => 0x2030,
104 0x8A => 0x0160,
105 0x8B => 0x2039,
106 0x8C => 0x0152,
107 0x8D => 0xFFFD,
108 0x8E => 0x017D,
109 0x8F => 0xFFFD,
110 0x90 => 0xFFFD,
111 0x91 => 0x2018,
112 0x92 => 0x2019,
113 0x93 => 0x201C,
114 0x94 => 0x201D,
115 0x95 => 0x2022,
116 0x96 => 0x2013,
117 0x97 => 0x2014,
118 0x98 => 0x02DC,
119 0x99 => 0x2122,
120 0x9A => 0x0161,
121 0x9B => 0x203A,
122 0x9C => 0x0153,
123 0x9D => 0xFFFD,
124 0x9E => 0x017E,
125 0x9F => 0x0178,
126 }; # $charref_map
127 $charref_map->{$_} = 0xFFFD # the code points listed below are all mapped to U+FFFD REPLACEMENT CHARACTER
128 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
129 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF - the Unicode noncharacter range is U+FDD0..U+FDEF, but this list stops at 0xFDDF
130 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
131 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
132 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
133 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
134 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
135
136 ## Implementations MUST act as if they used the state machine in the spec
137
138 sub _initialize_tokenizer ($) { # resets the tokenizer fields of |$self| (the only argument) to their initial values
139 my $self = shift;
140
141 ## NOTE: Fields set by |new| constructor:
142 #$self->{level}
143 #$self->{set_nc}
144 #$self->{parse_error}
145
146 $self->{state} = DATA_STATE; # MUST
147 #$self->{s_kwd}; # state keyword - initialized when used
148 #$self->{entity__value}; # initialized when used
149 #$self->{entity__match}; # initialized when used
150 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
151 undef $self->{ct}; # current token
152 undef $self->{ca}; # current attribute
153 undef $self->{last_stag_name}; # last emitted start tag name
154 #$self->{prev_state}; # initialized when used
155 delete $self->{self_closing}; # self-closing flag; |_get_next_token| reports a 'nestc' parse error while it is still set
156 $self->{char_buffer} = ''; # NOTE(review): presumably the input buffer read by the |!!!next-input-character| macro - confirm in the macro definition
157 $self->{char_buffer_pos} = 0;
158 $self->{nc} = -1; # next input character (code point); -1 is handled as end of file by the tokenizer states
159 #$self->{next_nc} # NOTE(review): the macro on the next line is expanded by a preprocessor; presumably it loads the first character into |$self->{nc}|
160 !!!next-input-character;
161 $self->{token} = []; # queue of pending tokens; |_get_next_token| returns from it before tokenizing further
162 # $self->{escape} - set and deleted by the |DATA_STATE| code when it sees |<!--| / |-->| in a CDATA or RCDATA content model
163 } # _initialize_tokenizer
164
165 ## A token has:
166 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
167 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
168 ## ->{name} (DOCTYPE_TOKEN)
169 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
170 ## ->{pubid} (DOCTYPE_TOKEN)
171 ## ->{sysid} (DOCTYPE_TOKEN)
172 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
173 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
174 ## ->{name}
175 ## ->{value}
176 ## ->{has_reference} == 1 or 0
177 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
178 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
179 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
180 ## while the token is pushed back to the stack.
181
182 ## Emitted token MUST immediately be handled by the tree construction state.
183
184 ## Before each step, UA MAY check to see if either one of the scripts in
185 ## "list of scripts that will execute as soon as possible" or the first
186 ## script in the "list of scripts that will execute asynchronously",
187 ## has completed loading. If one has, then it MUST be executed
188 ## and removed from the list.
189
190 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
191 ## (This requirement was dropped from HTML5 spec, unfortunately.)
192
193 my $is_space = { # code point => 1 for the characters the tokenizer states treat as white space (looked up with |$self->{nc}|)
194 0x0009 => 1, # CHARACTER TABULATION (HT)
195 0x000A => 1, # LINE FEED (LF)
196 #0x000B => 0, # LINE TABULATION (VT)
197 0x000C => 1, # FORM FEED (FF)
198 #0x000D => 1, # CARRIAGE RETURN (CR) - NOTE(review): presumably CR never reaches the tokenizer (normalized earlier); confirm
199 0x0020 => 1, # SPACE (SP)
200 };
201
202 sub _get_next_token ($) {
203 my $self = shift;
204
205 if ($self->{self_closing}) {
206 !!!parse-error (type => 'nestc', token => $self->{ct});
207 ## NOTE: The |self_closing| flag is only set by start tag token.
208 ## In addition, when a start tag token is emitted, it is always set to
209 ## |ct|.
210 delete $self->{self_closing};
211 }
212
213 if (@{$self->{token}}) {
214 $self->{self_closing} = $self->{token}->[0]->{self_closing};
215 return shift @{$self->{token}};
216 }
217
218 A: {
219 if ($self->{state} == PCDATA_STATE) {
220 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
221
222 if ($self->{nc} == 0x0026) { # &
223 !!!cp (0.1);
224 ## NOTE: In the spec, the tokenizer is switched to the
225 ## "entity data state". In this implementation, the tokenizer
226 ## is switched to the |ENTITY_STATE|, which is an implementation
227 ## of the "consume a character reference" algorithm.
228 $self->{entity_add} = -1;
229 $self->{prev_state} = DATA_STATE;
230 $self->{state} = ENTITY_STATE;
231 !!!next-input-character;
232 redo A;
233 } elsif ($self->{nc} == 0x003C) { # <
234 !!!cp (0.2);
235 $self->{state} = TAG_OPEN_STATE;
236 !!!next-input-character;
237 redo A;
238 } elsif ($self->{nc} == -1) {
239 !!!cp (0.3);
240 !!!emit ({type => END_OF_FILE_TOKEN,
241 line => $self->{line}, column => $self->{column}});
242 last A; ## TODO: ok?
243 } else {
244 !!!cp (0.4);
245 #
246 }
247
248 # Anything else
249 my $token = {type => CHARACTER_TOKEN,
250 data => chr $self->{nc},
251 line => $self->{line}, column => $self->{column},
252 };
253 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
254
255 ## Stay in the state.
256 !!!next-input-character;
257 !!!emit ($token);
258 redo A;
259 } elsif ($self->{state} == DATA_STATE) {
260 $self->{s_kwd} = '' unless defined $self->{s_kwd};
261 if ($self->{nc} == 0x0026) { # &
262 $self->{s_kwd} = '';
263 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
264 not $self->{escape}) {
265 !!!cp (1);
266 ## NOTE: In the spec, the tokenizer is switched to the
267 ## "entity data state". In this implementation, the tokenizer
268 ## is switched to the |ENTITY_STATE|, which is an implementation
269 ## of the "consume a character reference" algorithm.
270 $self->{entity_add} = -1;
271 $self->{prev_state} = DATA_STATE;
272 $self->{state} = ENTITY_STATE;
273 !!!next-input-character;
274 redo A;
275 } else {
276 !!!cp (2);
277 #
278 }
279 } elsif ($self->{nc} == 0x002D) { # -
280 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
281 $self->{s_kwd} .= '-';
282
283 if ($self->{s_kwd} eq '<!--') {
284 !!!cp (3);
285 $self->{escape} = 1; # unless $self->{escape};
286 $self->{s_kwd} = '--';
287 #
288 } elsif ($self->{s_kwd} eq '---') {
289 !!!cp (4);
290 $self->{s_kwd} = '--';
291 #
292 } else {
293 !!!cp (5);
294 #
295 }
296 }
297
298 #
299 } elsif ($self->{nc} == 0x0021) { # !
300 if (length $self->{s_kwd}) {
301 !!!cp (5.1);
302 $self->{s_kwd} .= '!';
303 #
304 } else {
305 !!!cp (5.2);
306 #$self->{s_kwd} = '';
307 #
308 }
309 #
310 } elsif ($self->{nc} == 0x003C) { # <
311 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
312 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
313 not $self->{escape})) {
314 !!!cp (6);
315 $self->{state} = TAG_OPEN_STATE;
316 !!!next-input-character;
317 redo A;
318 } else {
319 !!!cp (7);
320 $self->{s_kwd} = '';
321 #
322 }
323 } elsif ($self->{nc} == 0x003E) { # >
324 if ($self->{escape} and
325 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
326 if ($self->{s_kwd} eq '--') {
327 !!!cp (8);
328 delete $self->{escape};
329 } else {
330 !!!cp (9);
331 }
332 } else {
333 !!!cp (10);
334 }
335
336 $self->{s_kwd} = '';
337 #
338 } elsif ($self->{nc} == -1) {
339 !!!cp (11);
340 $self->{s_kwd} = '';
341 !!!emit ({type => END_OF_FILE_TOKEN,
342 line => $self->{line}, column => $self->{column}});
343 last A; ## TODO: ok?
344 } else {
345 !!!cp (12);
346 $self->{s_kwd} = '';
347 #
348 }
349
350 # Anything else
351 my $token = {type => CHARACTER_TOKEN,
352 data => chr $self->{nc},
353 line => $self->{line}, column => $self->{column},
354 };
355 if ($self->{read_until}->($token->{data}, q[-!<>&],
356 length $token->{data})) {
357 $self->{s_kwd} = '';
358 }
359
360 ## Stay in the data state.
361 if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
362 !!!cp (13);
363 $self->{state} = PCDATA_STATE;
364 } else {
365 !!!cp (14);
366 ## Stay in the state.
367 }
368 !!!next-input-character;
369 !!!emit ($token);
370 redo A;
371 } elsif ($self->{state} == TAG_OPEN_STATE) {
372 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
373 if ($self->{nc} == 0x002F) { # /
374 !!!cp (15);
375 !!!next-input-character;
376 $self->{state} = CLOSE_TAG_OPEN_STATE;
377 redo A;
378 } elsif ($self->{nc} == 0x0021) { # !
379 !!!cp (15.1);
380 $self->{s_kwd} = '<' unless $self->{escape};
381 #
382 } else {
383 !!!cp (16);
384 #
385 }
386
387 ## reconsume
388 $self->{state} = DATA_STATE;
389 !!!emit ({type => CHARACTER_TOKEN, data => '<',
390 line => $self->{line_prev},
391 column => $self->{column_prev},
392 });
393 redo A;
394 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
395 if ($self->{nc} == 0x0021) { # !
396 !!!cp (17);
397 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
398 !!!next-input-character;
399 redo A;
400 } elsif ($self->{nc} == 0x002F) { # /
401 !!!cp (18);
402 $self->{state} = CLOSE_TAG_OPEN_STATE;
403 !!!next-input-character;
404 redo A;
405 } elsif (0x0041 <= $self->{nc} and
406 $self->{nc} <= 0x005A) { # A..Z
407 !!!cp (19);
408 $self->{ct}
409 = {type => START_TAG_TOKEN,
410 tag_name => chr ($self->{nc} + 0x0020),
411 line => $self->{line_prev},
412 column => $self->{column_prev}};
413 $self->{state} = TAG_NAME_STATE;
414 !!!next-input-character;
415 redo A;
416 } elsif (0x0061 <= $self->{nc} and
417 $self->{nc} <= 0x007A) { # a..z
418 !!!cp (20);
419 $self->{ct} = {type => START_TAG_TOKEN,
420 tag_name => chr ($self->{nc}),
421 line => $self->{line_prev},
422 column => $self->{column_prev}};
423 $self->{state} = TAG_NAME_STATE;
424 !!!next-input-character;
425 redo A;
426 } elsif ($self->{nc} == 0x003E) { # >
427 !!!cp (21);
428 !!!parse-error (type => 'empty start tag',
429 line => $self->{line_prev},
430 column => $self->{column_prev});
431 $self->{state} = DATA_STATE;
432 !!!next-input-character;
433
434 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
435 line => $self->{line_prev},
436 column => $self->{column_prev},
437 });
438
439 redo A;
440 } elsif ($self->{nc} == 0x003F) { # ?
441 !!!cp (22);
442 !!!parse-error (type => 'pio',
443 line => $self->{line_prev},
444 column => $self->{column_prev});
445 $self->{state} = BOGUS_COMMENT_STATE;
446 $self->{ct} = {type => COMMENT_TOKEN, data => '',
447 line => $self->{line_prev},
448 column => $self->{column_prev},
449 };
450 ## $self->{nc} is intentionally left as is
451 redo A;
452 } else {
453 !!!cp (23);
454 !!!parse-error (type => 'bare stago',
455 line => $self->{line_prev},
456 column => $self->{column_prev});
457 $self->{state} = DATA_STATE;
458 ## reconsume
459
460 !!!emit ({type => CHARACTER_TOKEN, data => '<',
461 line => $self->{line_prev},
462 column => $self->{column_prev},
463 });
464
465 redo A;
466 }
467 } else {
468 die "$0: $self->{content_model} in tag open";
469 }
470 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
471 ## NOTE: The "close tag open state" in the spec is implemented as
472 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
473
474 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
475 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
476 if (defined $self->{last_stag_name}) {
477 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
478 $self->{s_kwd} = '';
479 ## Reconsume.
480 redo A;
481 } else {
482 ## No start tag token has ever been emitted
483 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
484 !!!cp (28);
485 $self->{state} = DATA_STATE;
486 ## Reconsume.
487 !!!emit ({type => CHARACTER_TOKEN, data => '</',
488 line => $l, column => $c,
489 });
490 redo A;
491 }
492 }
493
494 if (0x0041 <= $self->{nc} and
495 $self->{nc} <= 0x005A) { # A..Z
496 !!!cp (29);
497 $self->{ct}
498 = {type => END_TAG_TOKEN,
499 tag_name => chr ($self->{nc} + 0x0020),
500 line => $l, column => $c};
501 $self->{state} = TAG_NAME_STATE;
502 !!!next-input-character;
503 redo A;
504 } elsif (0x0061 <= $self->{nc} and
505 $self->{nc} <= 0x007A) { # a..z
506 !!!cp (30);
507 $self->{ct} = {type => END_TAG_TOKEN,
508 tag_name => chr ($self->{nc}),
509 line => $l, column => $c};
510 $self->{state} = TAG_NAME_STATE;
511 !!!next-input-character;
512 redo A;
513 } elsif ($self->{nc} == 0x003E) { # >
514 !!!cp (31);
515 !!!parse-error (type => 'empty end tag',
516 line => $self->{line_prev}, ## "<" in "</>"
517 column => $self->{column_prev} - 1);
518 $self->{state} = DATA_STATE;
519 !!!next-input-character;
520 redo A;
521 } elsif ($self->{nc} == -1) {
522 !!!cp (32);
523 !!!parse-error (type => 'bare etago');
524 $self->{state} = DATA_STATE;
525 # reconsume
526
527 !!!emit ({type => CHARACTER_TOKEN, data => '</',
528 line => $l, column => $c,
529 });
530
531 redo A;
532 } else {
533 !!!cp (33);
534 !!!parse-error (type => 'bogus end tag');
535 $self->{state} = BOGUS_COMMENT_STATE;
536 $self->{ct} = {type => COMMENT_TOKEN, data => '',
537 line => $self->{line_prev}, # "<" of "</"
538 column => $self->{column_prev} - 1,
539 };
540 ## NOTE: $self->{nc} is intentionally left as is.
541 ## Although the "anything else" case of the spec not explicitly
542 ## states that the next input character is to be reconsumed,
543 ## it will be included to the |data| of the comment token
544 ## generated from the bogus end tag, as defined in the
545 ## "bogus comment state" entry.
546 redo A;
547 }
548 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
549 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
550 if (length $ch) {
551 my $CH = $ch;
552 $ch =~ tr/a-z/A-Z/;
553 my $nch = chr $self->{nc};
554 if ($nch eq $ch or $nch eq $CH) {
555 !!!cp (24);
556 ## Stay in the state.
557 $self->{s_kwd} .= $nch;
558 !!!next-input-character;
559 redo A;
560 } else {
561 !!!cp (25);
562 $self->{state} = DATA_STATE;
563 ## Reconsume.
564 !!!emit ({type => CHARACTER_TOKEN,
565 data => '</' . $self->{s_kwd},
566 line => $self->{line_prev},
567 column => $self->{column_prev} - 1 - length $self->{s_kwd},
568 });
569 redo A;
570 }
571 } else { # after "<{tag-name}"
572 unless ($is_space->{$self->{nc}} or
573 {
574 0x003E => 1, # >
575 0x002F => 1, # /
576 -1 => 1, # EOF
577 }->{$self->{nc}}) {
578 !!!cp (26);
579 ## Reconsume.
580 $self->{state} = DATA_STATE;
581 !!!emit ({type => CHARACTER_TOKEN,
582 data => '</' . $self->{s_kwd},
583 line => $self->{line_prev},
584 column => $self->{column_prev} - 1 - length $self->{s_kwd},
585 });
586 redo A;
587 } else {
588 !!!cp (27);
589 $self->{ct}
590 = {type => END_TAG_TOKEN,
591 tag_name => $self->{last_stag_name},
592 line => $self->{line_prev},
593 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
594 $self->{state} = TAG_NAME_STATE;
595 ## Reconsume.
596 redo A;
597 }
598 }
599 } elsif ($self->{state} == TAG_NAME_STATE) {
600 if ($is_space->{$self->{nc}}) {
601 !!!cp (34);
602 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
603 !!!next-input-character;
604 redo A;
605 } elsif ($self->{nc} == 0x003E) { # >
606 if ($self->{ct}->{type} == START_TAG_TOKEN) {
607 !!!cp (35);
608 $self->{last_stag_name} = $self->{ct}->{tag_name};
609 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
610 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
611 #if ($self->{ct}->{attributes}) {
612 # ## NOTE: This should never be reached.
613 # !!! cp (36);
614 # !!! parse-error (type => 'end tag attribute');
615 #} else {
616 !!!cp (37);
617 #}
618 } else {
619 die "$0: $self->{ct}->{type}: Unknown token type";
620 }
621 $self->{state} = DATA_STATE;
622 !!!next-input-character;
623
624 !!!emit ($self->{ct}); # start tag or end tag
625
626 redo A;
627 } elsif (0x0041 <= $self->{nc} and
628 $self->{nc} <= 0x005A) { # A..Z
629 !!!cp (38);
630 $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
631 # start tag or end tag
632 ## Stay in this state
633 !!!next-input-character;
634 redo A;
635 } elsif ($self->{nc} == -1) {
636 !!!parse-error (type => 'unclosed tag');
637 if ($self->{ct}->{type} == START_TAG_TOKEN) {
638 !!!cp (39);
639 $self->{last_stag_name} = $self->{ct}->{tag_name};
640 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
641 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
642 #if ($self->{ct}->{attributes}) {
643 # ## NOTE: This state should never be reached.
644 # !!! cp (40);
645 # !!! parse-error (type => 'end tag attribute');
646 #} else {
647 !!!cp (41);
648 #}
649 } else {
650 die "$0: $self->{ct}->{type}: Unknown token type";
651 }
652 $self->{state} = DATA_STATE;
653 # reconsume
654
655 !!!emit ($self->{ct}); # start tag or end tag
656
657 redo A;
658 } elsif ($self->{nc} == 0x002F) { # /
659 !!!cp (42);
660 $self->{state} = SELF_CLOSING_START_TAG_STATE;
661 !!!next-input-character;
662 redo A;
663 } else {
664 !!!cp (44);
665 $self->{ct}->{tag_name} .= chr $self->{nc};
666 # start tag or end tag
667 ## Stay in the state
668 !!!next-input-character;
669 redo A;
670 }
671 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
672 if ($is_space->{$self->{nc}}) {
673 !!!cp (45);
674 ## Stay in the state
675 !!!next-input-character;
676 redo A;
677 } elsif ($self->{nc} == 0x003E) { # >
678 if ($self->{ct}->{type} == START_TAG_TOKEN) {
679 !!!cp (46);
680 $self->{last_stag_name} = $self->{ct}->{tag_name};
681 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
682 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
683 if ($self->{ct}->{attributes}) {
684 !!!cp (47);
685 !!!parse-error (type => 'end tag attribute');
686 } else {
687 !!!cp (48);
688 }
689 } else {
690 die "$0: $self->{ct}->{type}: Unknown token type";
691 }
692 $self->{state} = DATA_STATE;
693 !!!next-input-character;
694
695 !!!emit ($self->{ct}); # start tag or end tag
696
697 redo A;
698 } elsif (0x0041 <= $self->{nc} and
699 $self->{nc} <= 0x005A) { # A..Z
700 !!!cp (49);
701 $self->{ca}
702 = {name => chr ($self->{nc} + 0x0020),
703 value => '',
704 line => $self->{line}, column => $self->{column}};
705 $self->{state} = ATTRIBUTE_NAME_STATE;
706 !!!next-input-character;
707 redo A;
708 } elsif ($self->{nc} == 0x002F) { # /
709 !!!cp (50);
710 $self->{state} = SELF_CLOSING_START_TAG_STATE;
711 !!!next-input-character;
712 redo A;
713 } elsif ($self->{nc} == -1) {
714 !!!parse-error (type => 'unclosed tag');
715 if ($self->{ct}->{type} == START_TAG_TOKEN) {
716 !!!cp (52);
717 $self->{last_stag_name} = $self->{ct}->{tag_name};
718 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
719 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
720 if ($self->{ct}->{attributes}) {
721 !!!cp (53);
722 !!!parse-error (type => 'end tag attribute');
723 } else {
724 !!!cp (54);
725 }
726 } else {
727 die "$0: $self->{ct}->{type}: Unknown token type";
728 }
729 $self->{state} = DATA_STATE;
730 # reconsume
731
732 !!!emit ($self->{ct}); # start tag or end tag
733
734 redo A;
735 } else {
736 if ({
737 0x0022 => 1, # "
738 0x0027 => 1, # '
739 0x003D => 1, # =
740 }->{$self->{nc}}) {
741 !!!cp (55);
742 !!!parse-error (type => 'bad attribute name');
743 } else {
744 !!!cp (56);
745 }
746 $self->{ca}
747 = {name => chr ($self->{nc}),
748 value => '',
749 line => $self->{line}, column => $self->{column}};
750 $self->{state} = ATTRIBUTE_NAME_STATE;
751 !!!next-input-character;
752 redo A;
753 }
754 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
755 my $before_leave = sub {
756 if (exists $self->{ct}->{attributes} # start tag or end tag
757 ->{$self->{ca}->{name}}) { # MUST
758 !!!cp (57);
759 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
760 ## Discard $self->{ca} # MUST
761 } else {
762 !!!cp (58);
763 $self->{ct}->{attributes}->{$self->{ca}->{name}}
764 = $self->{ca};
765 }
766 }; # $before_leave
767
768 if ($is_space->{$self->{nc}}) {
769 !!!cp (59);
770 $before_leave->();
771 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
772 !!!next-input-character;
773 redo A;
774 } elsif ($self->{nc} == 0x003D) { # =
775 !!!cp (60);
776 $before_leave->();
777 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
778 !!!next-input-character;
779 redo A;
780 } elsif ($self->{nc} == 0x003E) { # >
781 $before_leave->();
782 if ($self->{ct}->{type} == START_TAG_TOKEN) {
783 !!!cp (61);
784 $self->{last_stag_name} = $self->{ct}->{tag_name};
785 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
786 !!!cp (62);
787 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
788 if ($self->{ct}->{attributes}) {
789 !!!parse-error (type => 'end tag attribute');
790 }
791 } else {
792 die "$0: $self->{ct}->{type}: Unknown token type";
793 }
794 $self->{state} = DATA_STATE;
795 !!!next-input-character;
796
797 !!!emit ($self->{ct}); # start tag or end tag
798
799 redo A;
800 } elsif (0x0041 <= $self->{nc} and
801 $self->{nc} <= 0x005A) { # A..Z
802 !!!cp (63);
803 $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
804 ## Stay in the state
805 !!!next-input-character;
806 redo A;
807 } elsif ($self->{nc} == 0x002F) { # /
808 !!!cp (64);
809 $before_leave->();
810 $self->{state} = SELF_CLOSING_START_TAG_STATE;
811 !!!next-input-character;
812 redo A;
813 } elsif ($self->{nc} == -1) {
814 !!!parse-error (type => 'unclosed tag');
815 $before_leave->();
816 if ($self->{ct}->{type} == START_TAG_TOKEN) {
817 !!!cp (66);
818 $self->{last_stag_name} = $self->{ct}->{tag_name};
819 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
820 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
821 if ($self->{ct}->{attributes}) {
822 !!!cp (67);
823 !!!parse-error (type => 'end tag attribute');
824 } else {
825 ## NOTE: This state should never be reached.
826 !!!cp (68);
827 }
828 } else {
829 die "$0: $self->{ct}->{type}: Unknown token type";
830 }
831 $self->{state} = DATA_STATE;
832 # reconsume
833
834 !!!emit ($self->{ct}); # start tag or end tag
835
836 redo A;
837 } else {
838 if ($self->{nc} == 0x0022 or # "
839 $self->{nc} == 0x0027) { # '
840 !!!cp (69);
841 !!!parse-error (type => 'bad attribute name');
842 } else {
843 !!!cp (70);
844 }
845 $self->{ca}->{name} .= chr ($self->{nc});
846 ## Stay in the state
847 !!!next-input-character;
848 redo A;
849 }
850 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
851 if ($is_space->{$self->{nc}}) {
852 !!!cp (71);
853 ## Stay in the state
854 !!!next-input-character;
855 redo A;
856 } elsif ($self->{nc} == 0x003D) { # =
857 !!!cp (72);
858 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
859 !!!next-input-character;
860 redo A;
861 } elsif ($self->{nc} == 0x003E) { # >
862 if ($self->{ct}->{type} == START_TAG_TOKEN) {
863 !!!cp (73);
864 $self->{last_stag_name} = $self->{ct}->{tag_name};
865 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
866 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
867 if ($self->{ct}->{attributes}) {
868 !!!cp (74);
869 !!!parse-error (type => 'end tag attribute');
870 } else {
871 ## NOTE: This state should never be reached.
872 !!!cp (75);
873 }
874 } else {
875 die "$0: $self->{ct}->{type}: Unknown token type";
876 }
877 $self->{state} = DATA_STATE;
878 !!!next-input-character;
879
880 !!!emit ($self->{ct}); # start tag or end tag
881
882 redo A;
883 } elsif (0x0041 <= $self->{nc} and
884 $self->{nc} <= 0x005A) { # A..Z
885 !!!cp (76);
886 $self->{ca}
887 = {name => chr ($self->{nc} + 0x0020),
888 value => '',
889 line => $self->{line}, column => $self->{column}};
890 $self->{state} = ATTRIBUTE_NAME_STATE;
891 !!!next-input-character;
892 redo A;
893 } elsif ($self->{nc} == 0x002F) { # /
894 !!!cp (77);
895 $self->{state} = SELF_CLOSING_START_TAG_STATE;
896 !!!next-input-character;
897 redo A;
898 } elsif ($self->{nc} == -1) {
899 !!!parse-error (type => 'unclosed tag');
900 if ($self->{ct}->{type} == START_TAG_TOKEN) {
901 !!!cp (79);
902 $self->{last_stag_name} = $self->{ct}->{tag_name};
903 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
904 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
905 if ($self->{ct}->{attributes}) {
906 !!!cp (80);
907 !!!parse-error (type => 'end tag attribute');
908 } else {
909 ## NOTE: This state should never be reached.
910 !!!cp (81);
911 }
912 } else {
913 die "$0: $self->{ct}->{type}: Unknown token type";
914 }
915 $self->{state} = DATA_STATE;
916 # reconsume
917
918 !!!emit ($self->{ct}); # start tag or end tag
919
920 redo A;
921 } else {
922 if ($self->{nc} == 0x0022 or # "
923 $self->{nc} == 0x0027) { # '
924 !!!cp (78);
925 !!!parse-error (type => 'bad attribute name');
926 } else {
927 !!!cp (82);
928 }
929 $self->{ca}
930 = {name => chr ($self->{nc}),
931 value => '',
932 line => $self->{line}, column => $self->{column}};
933 $self->{state} = ATTRIBUTE_NAME_STATE;
934 !!!next-input-character;
935 redo A;
936 }
937 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
938 if ($is_space->{$self->{nc}}) {
939 !!!cp (83);
940 ## Stay in the state
941 !!!next-input-character;
942 redo A;
943 } elsif ($self->{nc} == 0x0022) { # "
944 !!!cp (84);
945 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
946 !!!next-input-character;
947 redo A;
948 } elsif ($self->{nc} == 0x0026) { # &
949 !!!cp (85);
950 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
951 ## reconsume
952 redo A;
953 } elsif ($self->{nc} == 0x0027) { # '
954 !!!cp (86);
955 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
956 !!!next-input-character;
957 redo A;
958 } elsif ($self->{nc} == 0x003E) { # >
959 !!!parse-error (type => 'empty unquoted attribute value');
960 if ($self->{ct}->{type} == START_TAG_TOKEN) {
961 !!!cp (87);
962 $self->{last_stag_name} = $self->{ct}->{tag_name};
963 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
964 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
965 if ($self->{ct}->{attributes}) {
966 !!!cp (88);
967 !!!parse-error (type => 'end tag attribute');
968 } else {
969 ## NOTE: This state should never be reached.
970 !!!cp (89);
971 }
972 } else {
973 die "$0: $self->{ct}->{type}: Unknown token type";
974 }
975 $self->{state} = DATA_STATE;
976 !!!next-input-character;
977
978 !!!emit ($self->{ct}); # start tag or end tag
979
980 redo A;
981 } elsif ($self->{nc} == -1) {
982 !!!parse-error (type => 'unclosed tag');
983 if ($self->{ct}->{type} == START_TAG_TOKEN) {
984 !!!cp (90);
985 $self->{last_stag_name} = $self->{ct}->{tag_name};
986 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
987 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
988 if ($self->{ct}->{attributes}) {
989 !!!cp (91);
990 !!!parse-error (type => 'end tag attribute');
991 } else {
992 ## NOTE: This state should never be reached.
993 !!!cp (92);
994 }
995 } else {
996 die "$0: $self->{ct}->{type}: Unknown token type";
997 }
998 $self->{state} = DATA_STATE;
999 ## reconsume
1000
1001 !!!emit ($self->{ct}); # start tag or end tag
1002
1003 redo A;
1004 } else {
1005 if ($self->{nc} == 0x003D) { # =
1006 !!!cp (93);
1007 !!!parse-error (type => 'bad attribute value');
1008 } else {
1009 !!!cp (94);
1010 }
1011 $self->{ca}->{value} .= chr ($self->{nc});
1012 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1013 !!!next-input-character;
1014 redo A;
1015 }
1016 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1017 if ($self->{nc} == 0x0022) { # "
1018 !!!cp (95);
1019 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1020 !!!next-input-character;
1021 redo A;
1022 } elsif ($self->{nc} == 0x0026) { # &
1023 !!!cp (96);
1024 ## NOTE: In the spec, the tokenizer is switched to the
1025 ## "entity in attribute value state". In this implementation, the
1026 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1027 ## implementation of the "consume a character reference" algorithm.
1028 $self->{prev_state} = $self->{state};
1029 $self->{entity_add} = 0x0022; # "
1030 $self->{state} = ENTITY_STATE;
1031 !!!next-input-character;
1032 redo A;
1033 } elsif ($self->{nc} == -1) {
1034 !!!parse-error (type => 'unclosed attribute value');
1035 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1036 !!!cp (97);
1037 $self->{last_stag_name} = $self->{ct}->{tag_name};
1038 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1039 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1040 if ($self->{ct}->{attributes}) {
1041 !!!cp (98);
1042 !!!parse-error (type => 'end tag attribute');
1043 } else {
1044 ## NOTE: This state should never be reached.
1045 !!!cp (99);
1046 }
1047 } else {
1048 die "$0: $self->{ct}->{type}: Unknown token type";
1049 }
1050 $self->{state} = DATA_STATE;
1051 ## reconsume
1052
1053 !!!emit ($self->{ct}); # start tag or end tag
1054
1055 redo A;
1056 } else {
1057 !!!cp (100);
1058 $self->{ca}->{value} .= chr ($self->{nc});
1059 $self->{read_until}->($self->{ca}->{value},
1060 q["&],
1061 length $self->{ca}->{value});
1062
1063 ## Stay in the state
1064 !!!next-input-character;
1065 redo A;
1066 }
1067 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1068 if ($self->{nc} == 0x0027) { # '
1069 !!!cp (101);
1070 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1071 !!!next-input-character;
1072 redo A;
1073 } elsif ($self->{nc} == 0x0026) { # &
1074 !!!cp (102);
1075 ## NOTE: In the spec, the tokenizer is switched to the
1076 ## "entity in attribute value state". In this implementation, the
1077 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1078 ## implementation of the "consume a character reference" algorithm.
1079 $self->{entity_add} = 0x0027; # '
1080 $self->{prev_state} = $self->{state};
1081 $self->{state} = ENTITY_STATE;
1082 !!!next-input-character;
1083 redo A;
1084 } elsif ($self->{nc} == -1) {
1085 !!!parse-error (type => 'unclosed attribute value');
1086 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1087 !!!cp (103);
1088 $self->{last_stag_name} = $self->{ct}->{tag_name};
1089 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1090 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1091 if ($self->{ct}->{attributes}) {
1092 !!!cp (104);
1093 !!!parse-error (type => 'end tag attribute');
1094 } else {
1095 ## NOTE: This state should never be reached.
1096 !!!cp (105);
1097 }
1098 } else {
1099 die "$0: $self->{ct}->{type}: Unknown token type";
1100 }
1101 $self->{state} = DATA_STATE;
1102 ## reconsume
1103
1104 !!!emit ($self->{ct}); # start tag or end tag
1105
1106 redo A;
1107 } else {
1108 !!!cp (106);
1109 $self->{ca}->{value} .= chr ($self->{nc});
1110 $self->{read_until}->($self->{ca}->{value},
1111 q['&],
1112 length $self->{ca}->{value});
1113
1114 ## Stay in the state
1115 !!!next-input-character;
1116 redo A;
1117 }
1118 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1119 if ($is_space->{$self->{nc}}) {
1120 !!!cp (107);
1121 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1122 !!!next-input-character;
1123 redo A;
1124 } elsif ($self->{nc} == 0x0026) { # &
1125 !!!cp (108);
1126 ## NOTE: In the spec, the tokenizer is switched to the
1127 ## "entity in attribute value state". In this implementation, the
1128 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1129 ## implementation of the "consume a character reference" algorithm.
1130 $self->{entity_add} = -1;
1131 $self->{prev_state} = $self->{state};
1132 $self->{state} = ENTITY_STATE;
1133 !!!next-input-character;
1134 redo A;
1135 } elsif ($self->{nc} == 0x003E) { # >
1136 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1137 !!!cp (109);
1138 $self->{last_stag_name} = $self->{ct}->{tag_name};
1139 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1140 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1141 if ($self->{ct}->{attributes}) {
1142 !!!cp (110);
1143 !!!parse-error (type => 'end tag attribute');
1144 } else {
1145 ## NOTE: This state should never be reached.
1146 !!!cp (111);
1147 }
1148 } else {
1149 die "$0: $self->{ct}->{type}: Unknown token type";
1150 }
1151 $self->{state} = DATA_STATE;
1152 !!!next-input-character;
1153
1154 !!!emit ($self->{ct}); # start tag or end tag
1155
1156 redo A;
1157 } elsif ($self->{nc} == -1) {
1158 !!!parse-error (type => 'unclosed tag');
1159 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1160 !!!cp (112);
1161 $self->{last_stag_name} = $self->{ct}->{tag_name};
1162 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1163 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1164 if ($self->{ct}->{attributes}) {
1165 !!!cp (113);
1166 !!!parse-error (type => 'end tag attribute');
1167 } else {
1168 ## NOTE: This state should never be reached.
1169 !!!cp (114);
1170 }
1171 } else {
1172 die "$0: $self->{ct}->{type}: Unknown token type";
1173 }
1174 $self->{state} = DATA_STATE;
1175 ## reconsume
1176
1177 !!!emit ($self->{ct}); # start tag or end tag
1178
1179 redo A;
1180 } else {
1181 if ({
1182 0x0022 => 1, # "
1183 0x0027 => 1, # '
1184 0x003D => 1, # =
1185 }->{$self->{nc}}) {
1186 !!!cp (115);
1187 !!!parse-error (type => 'bad attribute value');
1188 } else {
1189 !!!cp (116);
1190 }
1191 $self->{ca}->{value} .= chr ($self->{nc});
1192 $self->{read_until}->($self->{ca}->{value},
1193 q["'=& >],
1194 length $self->{ca}->{value});
1195
1196 ## Stay in the state
1197 !!!next-input-character;
1198 redo A;
1199 }
1200 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1201 if ($is_space->{$self->{nc}}) {
1202 !!!cp (118);
1203 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1204 !!!next-input-character;
1205 redo A;
1206 } elsif ($self->{nc} == 0x003E) { # >
1207 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1208 !!!cp (119);
1209 $self->{last_stag_name} = $self->{ct}->{tag_name};
1210 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1211 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1212 if ($self->{ct}->{attributes}) {
1213 !!!cp (120);
1214 !!!parse-error (type => 'end tag attribute');
1215 } else {
1216 ## NOTE: This state should never be reached.
1217 !!!cp (121);
1218 }
1219 } else {
1220 die "$0: $self->{ct}->{type}: Unknown token type";
1221 }
1222 $self->{state} = DATA_STATE;
1223 !!!next-input-character;
1224
1225 !!!emit ($self->{ct}); # start tag or end tag
1226
1227 redo A;
1228 } elsif ($self->{nc} == 0x002F) { # /
1229 !!!cp (122);
1230 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1231 !!!next-input-character;
1232 redo A;
1233 } elsif ($self->{nc} == -1) {
1234 !!!parse-error (type => 'unclosed tag');
1235 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1236 !!!cp (122.3);
1237 $self->{last_stag_name} = $self->{ct}->{tag_name};
1238 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1239 if ($self->{ct}->{attributes}) {
1240 !!!cp (122.1);
1241 !!!parse-error (type => 'end tag attribute');
1242 } else {
1243 ## NOTE: This state should never be reached.
1244 !!!cp (122.2);
1245 }
1246 } else {
1247 die "$0: $self->{ct}->{type}: Unknown token type";
1248 }
1249 $self->{state} = DATA_STATE;
1250 ## Reconsume.
1251 !!!emit ($self->{ct}); # start tag or end tag
1252 redo A;
1253 } else {
1254 !!!cp ('124.1');
1255 !!!parse-error (type => 'no space between attributes');
1256 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1257 ## reconsume
1258 redo A;
1259 }
1260 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1261 if ($self->{nc} == 0x003E) { # >
1262 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1263 !!!cp ('124.2');
1264 !!!parse-error (type => 'nestc', token => $self->{ct});
1265 ## TODO: Different type than slash in start tag
1266 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1267 if ($self->{ct}->{attributes}) {
1268 !!!cp ('124.4');
1269 !!!parse-error (type => 'end tag attribute');
1270 } else {
1271 !!!cp ('124.5');
1272 }
1273 ## TODO: Test |<title></title/>|
1274 } else {
1275 !!!cp ('124.3');
1276 $self->{self_closing} = 1;
1277 }
1278
1279 $self->{state} = DATA_STATE;
1280 !!!next-input-character;
1281
1282 !!!emit ($self->{ct}); # start tag or end tag
1283
1284 redo A;
1285 } elsif ($self->{nc} == -1) {
1286 !!!parse-error (type => 'unclosed tag');
1287 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1288 !!!cp (124.7);
1289 $self->{last_stag_name} = $self->{ct}->{tag_name};
1290 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1291 if ($self->{ct}->{attributes}) {
1292 !!!cp (124.5);
1293 !!!parse-error (type => 'end tag attribute');
1294 } else {
1295 ## NOTE: This state should never be reached.
1296 !!!cp (124.6);
1297 }
1298 } else {
1299 die "$0: $self->{ct}->{type}: Unknown token type";
1300 }
1301 $self->{state} = DATA_STATE;
1302 ## Reconsume.
1303 !!!emit ($self->{ct}); # start tag or end tag
1304 redo A;
1305 } else {
1306 !!!cp ('124.4');
1307 !!!parse-error (type => 'nestc');
1308 ## TODO: This error type is wrong.
1309 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1310 ## Reconsume.
1311 redo A;
1312 }
1313 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1314 ## (only happens in the PCDATA state)
1315
1316 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1317 ## consumes characters on a one-by-one basis.
1318
1319 if ($self->{nc} == 0x003E) { # >
1320 !!!cp (124);
1321 $self->{state} = DATA_STATE;
1322 !!!next-input-character;
1323
1324 !!!emit ($self->{ct}); # comment
1325 redo A;
1326 } elsif ($self->{nc} == -1) {
1327 !!!cp (125);
1328 $self->{state} = DATA_STATE;
1329 ## reconsume
1330
1331 !!!emit ($self->{ct}); # comment
1332 redo A;
1333 } else {
1334 !!!cp (126);
1335 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1336 $self->{read_until}->($self->{ct}->{data},
1337 q[>],
1338 length $self->{ct}->{data});
1339
1340 ## Stay in the state.
1341 !!!next-input-character;
1342 redo A;
1343 }
1344 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1345 ## (only happens in the PCDATA state)
1346
1347 if ($self->{nc} == 0x002D) { # -
1348 !!!cp (133);
1349 $self->{state} = MD_HYPHEN_STATE;
1350 !!!next-input-character;
1351 redo A;
1352 } elsif ($self->{nc} == 0x0044 or # D
1353 $self->{nc} == 0x0064) { # d
1354 ## ASCII case-insensitive.
1355 !!!cp (130);
1356 $self->{state} = MD_DOCTYPE_STATE;
1357 $self->{s_kwd} = chr $self->{nc};
1358 !!!next-input-character;
1359 redo A;
1360 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1361 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
1362 $self->{nc} == 0x005B) { # [
1363 !!!cp (135.4);
1364 $self->{state} = MD_CDATA_STATE;
1365 $self->{s_kwd} = '[';
1366 !!!next-input-character;
1367 redo A;
1368 } else {
1369 !!!cp (136);
1370 }
1371
1372 !!!parse-error (type => 'bogus comment',
1373 line => $self->{line_prev},
1374 column => $self->{column_prev} - 1);
1375 ## Reconsume.
1376 $self->{state} = BOGUS_COMMENT_STATE;
1377 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1378 line => $self->{line_prev},
1379 column => $self->{column_prev} - 1,
1380 };
1381 redo A;
1382 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1383 if ($self->{nc} == 0x002D) { # -
1384 !!!cp (127);
1385 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1386 line => $self->{line_prev},
1387 column => $self->{column_prev} - 2,
1388 };
1389 $self->{state} = COMMENT_START_STATE;
1390 !!!next-input-character;
1391 redo A;
1392 } else {
1393 !!!cp (128);
1394 !!!parse-error (type => 'bogus comment',
1395 line => $self->{line_prev},
1396 column => $self->{column_prev} - 2);
1397 $self->{state} = BOGUS_COMMENT_STATE;
1398 ## Reconsume.
1399 $self->{ct} = {type => COMMENT_TOKEN,
1400 data => '-',
1401 line => $self->{line_prev},
1402 column => $self->{column_prev} - 2,
1403 };
1404 redo A;
1405 }
1406 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1407 ## ASCII case-insensitive.
1408 if ($self->{nc} == [
1409 undef,
1410 0x004F, # O
1411 0x0043, # C
1412 0x0054, # T
1413 0x0059, # Y
1414 0x0050, # P
1415 ]->[length $self->{s_kwd}] or
1416 $self->{nc} == [
1417 undef,
1418 0x006F, # o
1419 0x0063, # c
1420 0x0074, # t
1421 0x0079, # y
1422 0x0070, # p
1423 ]->[length $self->{s_kwd}]) {
1424 !!!cp (131);
1425 ## Stay in the state.
1426 $self->{s_kwd} .= chr $self->{nc};
1427 !!!next-input-character;
1428 redo A;
1429 } elsif ((length $self->{s_kwd}) == 6 and
1430 ($self->{nc} == 0x0045 or # E
1431 $self->{nc} == 0x0065)) { # e
1432 !!!cp (129);
1433 $self->{state} = DOCTYPE_STATE;
1434 $self->{ct} = {type => DOCTYPE_TOKEN,
1435 quirks => 1,
1436 line => $self->{line_prev},
1437 column => $self->{column_prev} - 7,
1438 };
1439 !!!next-input-character;
1440 redo A;
1441 } else {
1442 !!!cp (132);
1443 !!!parse-error (type => 'bogus comment',
1444 line => $self->{line_prev},
1445 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1446 $self->{state} = BOGUS_COMMENT_STATE;
1447 ## Reconsume.
1448 $self->{ct} = {type => COMMENT_TOKEN,
1449 data => $self->{s_kwd},
1450 line => $self->{line_prev},
1451 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1452 };
1453 redo A;
1454 }
1455 } elsif ($self->{state} == MD_CDATA_STATE) {
1456 if ($self->{nc} == {
1457 '[' => 0x0043, # C
1458 '[C' => 0x0044, # D
1459 '[CD' => 0x0041, # A
1460 '[CDA' => 0x0054, # T
1461 '[CDAT' => 0x0041, # A
1462 }->{$self->{s_kwd}}) {
1463 !!!cp (135.1);
1464 ## Stay in the state.
1465 $self->{s_kwd} .= chr $self->{nc};
1466 !!!next-input-character;
1467 redo A;
1468 } elsif ($self->{s_kwd} eq '[CDATA' and
1469 $self->{nc} == 0x005B) { # [
1470 !!!cp (135.2);
1471 $self->{ct} = {type => CHARACTER_TOKEN,
1472 data => '',
1473 line => $self->{line_prev},
1474 column => $self->{column_prev} - 7};
1475 $self->{state} = CDATA_SECTION_STATE;
1476 !!!next-input-character;
1477 redo A;
1478 } else {
1479 !!!cp (135.3);
1480 !!!parse-error (type => 'bogus comment',
1481 line => $self->{line_prev},
1482 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1483 $self->{state} = BOGUS_COMMENT_STATE;
1484 ## Reconsume.
1485 $self->{ct} = {type => COMMENT_TOKEN,
1486 data => $self->{s_kwd},
1487 line => $self->{line_prev},
1488 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1489 };
1490 redo A;
1491 }
1492 } elsif ($self->{state} == COMMENT_START_STATE) {
1493 if ($self->{nc} == 0x002D) { # -
1494 !!!cp (137);
1495 $self->{state} = COMMENT_START_DASH_STATE;
1496 !!!next-input-character;
1497 redo A;
1498 } elsif ($self->{nc} == 0x003E) { # >
1499 !!!cp (138);
1500 !!!parse-error (type => 'bogus comment');
1501 $self->{state} = DATA_STATE;
1502 !!!next-input-character;
1503
1504 !!!emit ($self->{ct}); # comment
1505
1506 redo A;
1507 } elsif ($self->{nc} == -1) {
1508 !!!cp (139);
1509 !!!parse-error (type => 'unclosed comment');
1510 $self->{state} = DATA_STATE;
1511 ## reconsume
1512
1513 !!!emit ($self->{ct}); # comment
1514
1515 redo A;
1516 } else {
1517 !!!cp (140);
1518 $self->{ct}->{data} # comment
1519 .= chr ($self->{nc});
1520 $self->{state} = COMMENT_STATE;
1521 !!!next-input-character;
1522 redo A;
1523 }
1524 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1525 if ($self->{nc} == 0x002D) { # -
1526 !!!cp (141);
1527 $self->{state} = COMMENT_END_STATE;
1528 !!!next-input-character;
1529 redo A;
1530 } elsif ($self->{nc} == 0x003E) { # >
1531 !!!cp (142);
1532 !!!parse-error (type => 'bogus comment');
1533 $self->{state} = DATA_STATE;
1534 !!!next-input-character;
1535
1536 !!!emit ($self->{ct}); # comment
1537
1538 redo A;
1539 } elsif ($self->{nc} == -1) {
1540 !!!cp (143);
1541 !!!parse-error (type => 'unclosed comment');
1542 $self->{state} = DATA_STATE;
1543 ## reconsume
1544
1545 !!!emit ($self->{ct}); # comment
1546
1547 redo A;
1548 } else {
1549 !!!cp (144);
1550 $self->{ct}->{data} # comment
1551 .= '-' . chr ($self->{nc});
1552 $self->{state} = COMMENT_STATE;
1553 !!!next-input-character;
1554 redo A;
1555 }
1556 } elsif ($self->{state} == COMMENT_STATE) {
1557 if ($self->{nc} == 0x002D) { # -
1558 !!!cp (145);
1559 $self->{state} = COMMENT_END_DASH_STATE;
1560 !!!next-input-character;
1561 redo A;
1562 } elsif ($self->{nc} == -1) {
1563 !!!cp (146);
1564 !!!parse-error (type => 'unclosed comment');
1565 $self->{state} = DATA_STATE;
1566 ## reconsume
1567
1568 !!!emit ($self->{ct}); # comment
1569
1570 redo A;
1571 } else {
1572 !!!cp (147);
1573 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1574 $self->{read_until}->($self->{ct}->{data},
1575 q[-],
1576 length $self->{ct}->{data});
1577
1578 ## Stay in the state
1579 !!!next-input-character;
1580 redo A;
1581 }
1582 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1583 if ($self->{nc} == 0x002D) { # -
1584 !!!cp (148);
1585 $self->{state} = COMMENT_END_STATE;
1586 !!!next-input-character;
1587 redo A;
1588 } elsif ($self->{nc} == -1) {
1589 !!!cp (149);
1590 !!!parse-error (type => 'unclosed comment');
1591 $self->{state} = DATA_STATE;
1592 ## reconsume
1593
1594 !!!emit ($self->{ct}); # comment
1595
1596 redo A;
1597 } else {
1598 !!!cp (150);
1599 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1600 $self->{state} = COMMENT_STATE;
1601 !!!next-input-character;
1602 redo A;
1603 }
1604 } elsif ($self->{state} == COMMENT_END_STATE) {
1605 if ($self->{nc} == 0x003E) { # >
1606 !!!cp (151);
1607 $self->{state} = DATA_STATE;
1608 !!!next-input-character;
1609
1610 !!!emit ($self->{ct}); # comment
1611
1612 redo A;
1613 } elsif ($self->{nc} == 0x002D) { # -
1614 !!!cp (152);
1615 !!!parse-error (type => 'dash in comment',
1616 line => $self->{line_prev},
1617 column => $self->{column_prev});
1618 $self->{ct}->{data} .= '-'; # comment
1619 ## Stay in the state
1620 !!!next-input-character;
1621 redo A;
1622 } elsif ($self->{nc} == -1) {
1623 !!!cp (153);
1624 !!!parse-error (type => 'unclosed comment');
1625 $self->{state} = DATA_STATE;
1626 ## reconsume
1627
1628 !!!emit ($self->{ct}); # comment
1629
1630 redo A;
1631 } else {
1632 !!!cp (154);
1633 !!!parse-error (type => 'dash in comment',
1634 line => $self->{line_prev},
1635 column => $self->{column_prev});
1636 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1637 $self->{state} = COMMENT_STATE;
1638 !!!next-input-character;
1639 redo A;
1640 }
1641 } elsif ($self->{state} == DOCTYPE_STATE) {
1642 if ($is_space->{$self->{nc}}) {
1643 !!!cp (155);
1644 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1645 !!!next-input-character;
1646 redo A;
1647 } else {
1648 !!!cp (156);
1649 !!!parse-error (type => 'no space before DOCTYPE name');
1650 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1651 ## reconsume
1652 redo A;
1653 }
1654 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1655 if ($is_space->{$self->{nc}}) {
1656 !!!cp (157);
1657 ## Stay in the state
1658 !!!next-input-character;
1659 redo A;
1660 } elsif ($self->{nc} == 0x003E) { # >
1661 !!!cp (158);
1662 !!!parse-error (type => 'no DOCTYPE name');
1663 $self->{state} = DATA_STATE;
1664 !!!next-input-character;
1665
1666 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1667
1668 redo A;
1669 } elsif ($self->{nc} == -1) {
1670 !!!cp (159);
1671 !!!parse-error (type => 'no DOCTYPE name');
1672 $self->{state} = DATA_STATE;
1673 ## reconsume
1674
1675 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1676
1677 redo A;
1678 } else {
1679 !!!cp (160);
1680 $self->{ct}->{name} = chr $self->{nc};
1681 delete $self->{ct}->{quirks};
1682 $self->{state} = DOCTYPE_NAME_STATE;
1683 !!!next-input-character;
1684 redo A;
1685 }
1686 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1687 ## ISSUE: Redundant "First," in the spec.
1688 if ($is_space->{$self->{nc}}) {
1689 !!!cp (161);
1690 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1691 !!!next-input-character;
1692 redo A;
1693 } elsif ($self->{nc} == 0x003E) { # >
1694 !!!cp (162);
1695 $self->{state} = DATA_STATE;
1696 !!!next-input-character;
1697
1698 !!!emit ($self->{ct}); # DOCTYPE
1699
1700 redo A;
1701 } elsif ($self->{nc} == -1) {
1702 !!!cp (163);
1703 !!!parse-error (type => 'unclosed DOCTYPE');
1704 $self->{state} = DATA_STATE;
1705 ## reconsume
1706
1707 $self->{ct}->{quirks} = 1;
1708 !!!emit ($self->{ct}); # DOCTYPE
1709
1710 redo A;
1711 } else {
1712 !!!cp (164);
1713 $self->{ct}->{name}
1714 .= chr ($self->{nc}); # DOCTYPE
1715 ## Stay in the state
1716 !!!next-input-character;
1717 redo A;
1718 }
1719 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1720 if ($is_space->{$self->{nc}}) {
1721 !!!cp (165);
1722 ## Stay in the state
1723 !!!next-input-character;
1724 redo A;
1725 } elsif ($self->{nc} == 0x003E) { # >
1726 !!!cp (166);
1727 $self->{state} = DATA_STATE;
1728 !!!next-input-character;
1729
1730 !!!emit ($self->{ct}); # DOCTYPE
1731
1732 redo A;
1733 } elsif ($self->{nc} == -1) {
1734 !!!cp (167);
1735 !!!parse-error (type => 'unclosed DOCTYPE');
1736 $self->{state} = DATA_STATE;
1737 ## reconsume
1738
1739 $self->{ct}->{quirks} = 1;
1740 !!!emit ($self->{ct}); # DOCTYPE
1741
1742 redo A;
1743 } elsif ($self->{nc} == 0x0050 or # P
1744 $self->{nc} == 0x0070) { # p
1745 $self->{state} = PUBLIC_STATE;
1746 $self->{s_kwd} = chr $self->{nc};
1747 !!!next-input-character;
1748 redo A;
1749 } elsif ($self->{nc} == 0x0053 or # S
1750 $self->{nc} == 0x0073) { # s
1751 $self->{state} = SYSTEM_STATE;
1752 $self->{s_kwd} = chr $self->{nc};
1753 !!!next-input-character;
1754 redo A;
1755 } else {
1756 !!!cp (180);
1757 !!!parse-error (type => 'string after DOCTYPE name');
1758 $self->{ct}->{quirks} = 1;
1759
1760 $self->{state} = BOGUS_DOCTYPE_STATE;
1761 !!!next-input-character;
1762 redo A;
1763 }
1764 } elsif ($self->{state} == PUBLIC_STATE) {
1765 ## ASCII case-insensitive
1766 if ($self->{nc} == [
1767 undef,
1768 0x0055, # U
1769 0x0042, # B
1770 0x004C, # L
1771 0x0049, # I
1772 ]->[length $self->{s_kwd}] or
1773 $self->{nc} == [
1774 undef,
1775 0x0075, # u
1776 0x0062, # b
1777 0x006C, # l
1778 0x0069, # i
1779 ]->[length $self->{s_kwd}]) {
1780 !!!cp (175);
1781 ## Stay in the state.
1782 $self->{s_kwd} .= chr $self->{nc};
1783 !!!next-input-character;
1784 redo A;
1785 } elsif ((length $self->{s_kwd}) == 5 and
1786 ($self->{nc} == 0x0043 or # C
1787 $self->{nc} == 0x0063)) { # c
1788 !!!cp (168);
1789 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1790 !!!next-input-character;
1791 redo A;
1792 } else {
1793 !!!cp (169);
1794 !!!parse-error (type => 'string after DOCTYPE name',
1795 line => $self->{line_prev},
1796 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1797 $self->{ct}->{quirks} = 1;
1798
1799 $self->{state} = BOGUS_DOCTYPE_STATE;
1800 ## Reconsume.
1801 redo A;
1802 }
1803 } elsif ($self->{state} == SYSTEM_STATE) {
1804 ## ASCII case-insensitive
1805 if ($self->{nc} == [
1806 undef,
1807 0x0059, # Y
1808 0x0053, # S
1809 0x0054, # T
1810 0x0045, # E
1811 ]->[length $self->{s_kwd}] or
1812 $self->{nc} == [
1813 undef,
1814 0x0079, # y
1815 0x0073, # s
1816 0x0074, # t
1817 0x0065, # e
1818 ]->[length $self->{s_kwd}]) {
1819 !!!cp (170);
1820 ## Stay in the state.
1821 $self->{s_kwd} .= chr $self->{nc};
1822 !!!next-input-character;
1823 redo A;
1824 } elsif ((length $self->{s_kwd}) == 5 and
1825 ($self->{nc} == 0x004D or # M
1826 $self->{nc} == 0x006D)) { # m
1827 !!!cp (171);
1828 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1829 !!!next-input-character;
1830 redo A;
1831 } else {
1832 !!!cp (172);
1833 !!!parse-error (type => 'string after DOCTYPE name',
1834 line => $self->{line_prev},
1835 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1836 $self->{ct}->{quirks} = 1;
1837
1838 $self->{state} = BOGUS_DOCTYPE_STATE;
1839 ## Reconsume.
1840 redo A;
1841 }
1842 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1843 if ($is_space->{$self->{nc}}) {
1844 !!!cp (181);
1845 ## Stay in the state
1846 !!!next-input-character;
1847 redo A;
1848 } elsif ($self->{nc} eq 0x0022) { # "
1849 !!!cp (182);
1850 $self->{ct}->{pubid} = ''; # DOCTYPE
1851 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1852 !!!next-input-character;
1853 redo A;
1854 } elsif ($self->{nc} eq 0x0027) { # '
1855 !!!cp (183);
1856 $self->{ct}->{pubid} = ''; # DOCTYPE
1857 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1858 !!!next-input-character;
1859 redo A;
1860 } elsif ($self->{nc} eq 0x003E) { # >
1861 !!!cp (184);
1862 !!!parse-error (type => 'no PUBLIC literal');
1863
1864 $self->{state} = DATA_STATE;
1865 !!!next-input-character;
1866
1867 $self->{ct}->{quirks} = 1;
1868 !!!emit ($self->{ct}); # DOCTYPE
1869
1870 redo A;
1871 } elsif ($self->{nc} == -1) {
1872 !!!cp (185);
1873 !!!parse-error (type => 'unclosed DOCTYPE');
1874
1875 $self->{state} = DATA_STATE;
1876 ## reconsume
1877
1878 $self->{ct}->{quirks} = 1;
1879 !!!emit ($self->{ct}); # DOCTYPE
1880
1881 redo A;
1882 } else {
1883 !!!cp (186);
1884 !!!parse-error (type => 'string after PUBLIC');
1885 $self->{ct}->{quirks} = 1;
1886
1887 $self->{state} = BOGUS_DOCTYPE_STATE;
1888 !!!next-input-character;
1889 redo A;
1890 }
1891 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1892 if ($self->{nc} == 0x0022) { # "
1893 !!!cp (187);
1894 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1895 !!!next-input-character;
1896 redo A;
1897 } elsif ($self->{nc} == 0x003E) { # >
1898 !!!cp (188);
1899 !!!parse-error (type => 'unclosed PUBLIC literal');
1900
1901 $self->{state} = DATA_STATE;
1902 !!!next-input-character;
1903
1904 $self->{ct}->{quirks} = 1;
1905 !!!emit ($self->{ct}); # DOCTYPE
1906
1907 redo A;
1908 } elsif ($self->{nc} == -1) {
1909 !!!cp (189);
1910 !!!parse-error (type => 'unclosed PUBLIC literal');
1911
1912 $self->{state} = DATA_STATE;
1913 ## reconsume
1914
1915 $self->{ct}->{quirks} = 1;
1916 !!!emit ($self->{ct}); # DOCTYPE
1917
1918 redo A;
1919 } else {
1920 !!!cp (190);
1921 $self->{ct}->{pubid} # DOCTYPE
1922 .= chr $self->{nc};
1923 $self->{read_until}->($self->{ct}->{pubid}, q[">],
1924 length $self->{ct}->{pubid});
1925
1926 ## Stay in the state
1927 !!!next-input-character;
1928 redo A;
1929 }
1930 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1931 if ($self->{nc} == 0x0027) { # '
1932 !!!cp (191);
1933 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1934 !!!next-input-character;
1935 redo A;
1936 } elsif ($self->{nc} == 0x003E) { # >
1937 !!!cp (192);
1938 !!!parse-error (type => 'unclosed PUBLIC literal');
1939
1940 $self->{state} = DATA_STATE;
1941 !!!next-input-character;
1942
1943 $self->{ct}->{quirks} = 1;
1944 !!!emit ($self->{ct}); # DOCTYPE
1945
1946 redo A;
1947 } elsif ($self->{nc} == -1) {
1948 !!!cp (193);
1949 !!!parse-error (type => 'unclosed PUBLIC literal');
1950
1951 $self->{state} = DATA_STATE;
1952 ## reconsume
1953
1954 $self->{ct}->{quirks} = 1;
1955 !!!emit ($self->{ct}); # DOCTYPE
1956
1957 redo A;
1958 } else {
1959 !!!cp (194);
1960 $self->{ct}->{pubid} # DOCTYPE
1961 .= chr $self->{nc};
1962 $self->{read_until}->($self->{ct}->{pubid}, q['>],
1963 length $self->{ct}->{pubid});
1964
1965 ## Stay in the state
1966 !!!next-input-character;
1967 redo A;
1968 }
1969 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1970 if ($is_space->{$self->{nc}}) {
1971 !!!cp (195);
1972 ## Stay in the state
1973 !!!next-input-character;
1974 redo A;
1975 } elsif ($self->{nc} == 0x0022) { # "
1976 !!!cp (196);
1977 $self->{ct}->{sysid} = ''; # DOCTYPE
1978 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1979 !!!next-input-character;
1980 redo A;
1981 } elsif ($self->{nc} == 0x0027) { # '
1982 !!!cp (197);
1983 $self->{ct}->{sysid} = ''; # DOCTYPE
1984 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1985 !!!next-input-character;
1986 redo A;
1987 } elsif ($self->{nc} == 0x003E) { # >
1988 !!!cp (198);
1989 $self->{state} = DATA_STATE;
1990 !!!next-input-character;
1991
1992 !!!emit ($self->{ct}); # DOCTYPE
1993
1994 redo A;
1995 } elsif ($self->{nc} == -1) {
1996 !!!cp (199);
1997 !!!parse-error (type => 'unclosed DOCTYPE');
1998
1999 $self->{state} = DATA_STATE;
2000 ## reconsume
2001
2002 $self->{ct}->{quirks} = 1;
2003 !!!emit ($self->{ct}); # DOCTYPE
2004
2005 redo A;
2006 } else {
2007 !!!cp (200);
2008 !!!parse-error (type => 'string after PUBLIC literal');
2009 $self->{ct}->{quirks} = 1;
2010
2011 $self->{state} = BOGUS_DOCTYPE_STATE;
2012 !!!next-input-character;
2013 redo A;
2014 }
2015 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2016 if ($is_space->{$self->{nc}}) {
2017 !!!cp (201);
2018 ## Stay in the state
2019 !!!next-input-character;
2020 redo A;
2021 } elsif ($self->{nc} == 0x0022) { # "
2022 !!!cp (202);
2023 $self->{ct}->{sysid} = ''; # DOCTYPE
2024 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2025 !!!next-input-character;
2026 redo A;
2027 } elsif ($self->{nc} == 0x0027) { # '
2028 !!!cp (203);
2029 $self->{ct}->{sysid} = ''; # DOCTYPE
2030 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2031 !!!next-input-character;
2032 redo A;
2033 } elsif ($self->{nc} == 0x003E) { # >
2034 !!!cp (204);
2035 !!!parse-error (type => 'no SYSTEM literal');
2036 $self->{state} = DATA_STATE;
2037 !!!next-input-character;
2038
2039 $self->{ct}->{quirks} = 1;
2040 !!!emit ($self->{ct}); # DOCTYPE
2041
2042 redo A;
2043 } elsif ($self->{nc} == -1) {
2044 !!!cp (205);
2045 !!!parse-error (type => 'unclosed DOCTYPE');
2046
2047 $self->{state} = DATA_STATE;
2048 ## reconsume
2049
2050 $self->{ct}->{quirks} = 1;
2051 !!!emit ($self->{ct}); # DOCTYPE
2052
2053 redo A;
2054 } else {
2055 !!!cp (206);
2056 !!!parse-error (type => 'string after SYSTEM');
2057 $self->{ct}->{quirks} = 1;
2058
2059 $self->{state} = BOGUS_DOCTYPE_STATE;
2060 !!!next-input-character;
2061 redo A;
2062 }
2063 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2064 if ($self->{nc} == 0x0022) { # "
2065 !!!cp (207);
2066 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2067 !!!next-input-character;
2068 redo A;
2069 } elsif ($self->{nc} == 0x003E) { # >
2070 !!!cp (208);
2071 !!!parse-error (type => 'unclosed SYSTEM literal');
2072
2073 $self->{state} = DATA_STATE;
2074 !!!next-input-character;
2075
2076 $self->{ct}->{quirks} = 1;
2077 !!!emit ($self->{ct}); # DOCTYPE
2078
2079 redo A;
2080 } elsif ($self->{nc} == -1) {
2081 !!!cp (209);
2082 !!!parse-error (type => 'unclosed SYSTEM literal');
2083
2084 $self->{state} = DATA_STATE;
2085 ## reconsume
2086
2087 $self->{ct}->{quirks} = 1;
2088 !!!emit ($self->{ct}); # DOCTYPE
2089
2090 redo A;
2091 } else {
2092 !!!cp (210);
2093 $self->{ct}->{sysid} # DOCTYPE
2094 .= chr $self->{nc};
2095 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2096 length $self->{ct}->{sysid});
2097
2098 ## Stay in the state
2099 !!!next-input-character;
2100 redo A;
2101 }
2102 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2103 if ($self->{nc} == 0x0027) { # '
2104 !!!cp (211);
2105 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2106 !!!next-input-character;
2107 redo A;
2108 } elsif ($self->{nc} == 0x003E) { # >
2109 !!!cp (212);
2110 !!!parse-error (type => 'unclosed SYSTEM literal');
2111
2112 $self->{state} = DATA_STATE;
2113 !!!next-input-character;
2114
2115 $self->{ct}->{quirks} = 1;
2116 !!!emit ($self->{ct}); # DOCTYPE
2117
2118 redo A;
2119 } elsif ($self->{nc} == -1) {
2120 !!!cp (213);
2121 !!!parse-error (type => 'unclosed SYSTEM literal');
2122
2123 $self->{state} = DATA_STATE;
2124 ## reconsume
2125
2126 $self->{ct}->{quirks} = 1;
2127 !!!emit ($self->{ct}); # DOCTYPE
2128
2129 redo A;
2130 } else {
2131 !!!cp (214);
2132 $self->{ct}->{sysid} # DOCTYPE
2133 .= chr $self->{nc};
2134 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2135 length $self->{ct}->{sysid});
2136
2137 ## Stay in the state
2138 !!!next-input-character;
2139 redo A;
2140 }
2141 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2142 if ($is_space->{$self->{nc}}) {
2143 !!!cp (215);
2144 ## Stay in the state
2145 !!!next-input-character;
2146 redo A;
2147 } elsif ($self->{nc} == 0x003E) { # >
2148 !!!cp (216);
2149 $self->{state} = DATA_STATE;
2150 !!!next-input-character;
2151
2152 !!!emit ($self->{ct}); # DOCTYPE
2153
2154 redo A;
2155 } elsif ($self->{nc} == -1) {
2156 !!!cp (217);
2157 !!!parse-error (type => 'unclosed DOCTYPE');
2158 $self->{state} = DATA_STATE;
2159 ## reconsume
2160
2161 $self->{ct}->{quirks} = 1;
2162 !!!emit ($self->{ct}); # DOCTYPE
2163
2164 redo A;
2165 } else {
2166 !!!cp (218);
2167 !!!parse-error (type => 'string after SYSTEM literal');
2168 #$self->{ct}->{quirks} = 1;
2169
2170 $self->{state} = BOGUS_DOCTYPE_STATE;
2171 !!!next-input-character;
2172 redo A;
2173 }
2174 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2175 if ($self->{nc} == 0x003E) { # >
2176 !!!cp (219);
2177 $self->{state} = DATA_STATE;
2178 !!!next-input-character;
2179
2180 !!!emit ($self->{ct}); # DOCTYPE
2181
2182 redo A;
2183 } elsif ($self->{nc} == -1) {
2184 !!!cp (220);
2185 $self->{state} = DATA_STATE;
2186 ## reconsume
2187
2188 !!!emit ($self->{ct}); # DOCTYPE
2189
2190 redo A;
2191 } else {
2192 !!!cp (221);
2193 my $s = '';
2194 $self->{read_until}->($s, q[>], 0);
2195
2196 ## Stay in the state
2197 !!!next-input-character;
2198 redo A;
2199 }
2200 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2201 ## NOTE: "CDATA section state" in the spec is jointly implemented
2202 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2203 ## and |CDATA_SECTION_MSE2_STATE|.
2204
2205 if ($self->{nc} == 0x005D) { # ]
2206 !!!cp (221.1);
2207 $self->{state} = CDATA_SECTION_MSE1_STATE;
2208 !!!next-input-character;
2209 redo A;
2210 } elsif ($self->{nc} == -1) {
2211 $self->{state} = DATA_STATE;
2212 !!!next-input-character;
2213 if (length $self->{ct}->{data}) { # character
2214 !!!cp (221.2);
2215 !!!emit ($self->{ct}); # character
2216 } else {
2217 !!!cp (221.3);
2218 ## No token to emit. $self->{ct} is discarded.
2219 }
2220 redo A;
2221 } else {
2222 !!!cp (221.4);
2223 $self->{ct}->{data} .= chr $self->{nc};
2224 $self->{read_until}->($self->{ct}->{data},
2225 q<]>,
2226 length $self->{ct}->{data});
2227
2228 ## Stay in the state.
2229 !!!next-input-character;
2230 redo A;
2231 }
2232
2233 ## ISSUE: "text tokens" in spec.
2234 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2235 if ($self->{nc} == 0x005D) { # ]
2236 !!!cp (221.5);
2237 $self->{state} = CDATA_SECTION_MSE2_STATE;
2238 !!!next-input-character;
2239 redo A;
2240 } else {
2241 !!!cp (221.6);
2242 $self->{ct}->{data} .= ']';
2243 $self->{state} = CDATA_SECTION_STATE;
2244 ## Reconsume.
2245 redo A;
2246 }
2247 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2248 if ($self->{nc} == 0x003E) { # >
2249 $self->{state} = DATA_STATE;
2250 !!!next-input-character;
2251 if (length $self->{ct}->{data}) { # character
2252 !!!cp (221.7);
2253 !!!emit ($self->{ct}); # character
2254 } else {
2255 !!!cp (221.8);
2256 ## No token to emit. $self->{ct} is discarded.
2257 }
2258 redo A;
2259 } elsif ($self->{nc} == 0x005D) { # ]
2260 !!!cp (221.9); # character
2261 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2262 ## Stay in the state.
2263 !!!next-input-character;
2264 redo A;
2265 } else {
2266 !!!cp (221.11);
2267 $self->{ct}->{data} .= ']]'; # character
2268 $self->{state} = CDATA_SECTION_STATE;
2269 ## Reconsume.
2270 redo A;
2271 }
2272 } elsif ($self->{state} == ENTITY_STATE) {
2273 if ($is_space->{$self->{nc}} or
2274 {
2275 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2276 $self->{entity_add} => 1,
2277 }->{$self->{nc}}) {
2278 !!!cp (1001);
2279 ## Don't consume
2280 ## No error
2281 ## Return nothing.
2282 #
2283 } elsif ($self->{nc} == 0x0023) { # #
2284 !!!cp (999);
2285 $self->{state} = ENTITY_HASH_STATE;
2286 $self->{s_kwd} = '#';
2287 !!!next-input-character;
2288 redo A;
2289 } elsif ((0x0041 <= $self->{nc} and
2290 $self->{nc} <= 0x005A) or # A..Z
2291 (0x0061 <= $self->{nc} and
2292 $self->{nc} <= 0x007A)) { # a..z
2293 !!!cp (998);
2294 require Whatpm::_NamedEntityList;
2295 $self->{state} = ENTITY_NAME_STATE;
2296 $self->{s_kwd} = chr $self->{nc};
2297 $self->{entity__value} = $self->{s_kwd};
2298 $self->{entity__match} = 0;
2299 !!!next-input-character;
2300 redo A;
2301 } else {
2302 !!!cp (1027);
2303 !!!parse-error (type => 'bare ero');
2304 ## Return nothing.
2305 #
2306 }
2307
2308 ## NOTE: No character is consumed by the "consume a character
2309 ## reference" algorithm. In other words, there is an "&" character
2310 ## that does not introduce a character reference, which would be
2311 ## appended to the parent element or the attribute value by later
2312 ## processing of the tokenizer.
2313
2314 if ($self->{prev_state} == DATA_STATE) {
2315 !!!cp (997);
2316 $self->{state} = $self->{prev_state};
2317 ## Reconsume.
2318 !!!emit ({type => CHARACTER_TOKEN, data => '&',
2319 line => $self->{line_prev},
2320 column => $self->{column_prev},
2321 });
2322 redo A;
2323 } else {
2324 !!!cp (996);
2325 $self->{ca}->{value} .= '&';
2326 $self->{state} = $self->{prev_state};
2327 ## Reconsume.
2328 redo A;
2329 }
2330 } elsif ($self->{state} == ENTITY_HASH_STATE) {
2331 if ($self->{nc} == 0x0078 or # x
2332 $self->{nc} == 0x0058) { # X
2333 !!!cp (995);
2334 $self->{state} = HEXREF_X_STATE;
2335 $self->{s_kwd} .= chr $self->{nc};
2336 !!!next-input-character;
2337 redo A;
2338 } elsif (0x0030 <= $self->{nc} and
2339 $self->{nc} <= 0x0039) { # 0..9
2340 !!!cp (994);
2341 $self->{state} = NCR_NUM_STATE;
2342 $self->{s_kwd} = $self->{nc} - 0x0030;
2343 !!!next-input-character;
2344 redo A;
2345 } else {
2346 !!!parse-error (type => 'bare nero',
2347 line => $self->{line_prev},
2348 column => $self->{column_prev} - 1);
2349
2350 ## NOTE: According to the spec algorithm, nothing is returned,
2351 ## and then "&#" is appended to the parent element or the attribute
2352 ## value in the later processing.
2353
2354 if ($self->{prev_state} == DATA_STATE) {
2355 !!!cp (1019);
2356 $self->{state} = $self->{prev_state};
2357 ## Reconsume.
2358 !!!emit ({type => CHARACTER_TOKEN,
2359 data => '&#',
2360 line => $self->{line_prev},
2361 column => $self->{column_prev} - 1,
2362 });
2363 redo A;
2364 } else {
2365 !!!cp (993);
2366 $self->{ca}->{value} .= '&#';
2367 $self->{state} = $self->{prev_state};
2368 ## Reconsume.
2369 redo A;
2370 }
2371 }
2372 } elsif ($self->{state} == NCR_NUM_STATE) {
2373 if (0x0030 <= $self->{nc} and
2374 $self->{nc} <= 0x0039) { # 0..9
2375 !!!cp (1012);
2376 $self->{s_kwd} *= 10;
2377 $self->{s_kwd} += $self->{nc} - 0x0030;
2378
2379 ## Stay in the state.
2380 !!!next-input-character;
2381 redo A;
2382 } elsif ($self->{nc} == 0x003B) { # ;
2383 !!!cp (1013);
2384 !!!next-input-character;
2385 #
2386 } else {
2387 !!!cp (1014);
2388 !!!parse-error (type => 'no refc');
2389 ## Reconsume.
2390 #
2391 }
2392
2393 my $code = $self->{s_kwd};
2394 my $l = $self->{line_prev};
2395 my $c = $self->{column_prev};
2396 if ($charref_map->{$code}) {
2397 !!!cp (1015);
2398 !!!parse-error (type => 'invalid character reference',
2399 text => (sprintf 'U+%04X', $code),
2400 line => $l, column => $c);
2401 $code = $charref_map->{$code};
2402 } elsif ($code > 0x10FFFF) {
2403 !!!cp (1016);
2404 !!!parse-error (type => 'invalid character reference',
2405 text => (sprintf 'U-%08X', $code),
2406 line => $l, column => $c);
2407 $code = 0xFFFD;
2408 }
2409
2410 if ($self->{prev_state} == DATA_STATE) {
2411 !!!cp (992);
2412 $self->{state} = $self->{prev_state};
2413 ## Reconsume.
2414 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2415 line => $l, column => $c,
2416 });
2417 redo A;
2418 } else {
2419 !!!cp (991);
2420 $self->{ca}->{value} .= chr $code;
2421 $self->{ca}->{has_reference} = 1;
2422 $self->{state} = $self->{prev_state};
2423 ## Reconsume.
2424 redo A;
2425 }
2426 } elsif ($self->{state} == HEXREF_X_STATE) {
2427 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2428 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2429 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2430 # 0..9, A..F, a..f
2431 !!!cp (990);
2432 $self->{state} = HEXREF_HEX_STATE;
2433 $self->{s_kwd} = 0;
2434 ## Reconsume.
2435 redo A;
2436 } else {
2437 !!!parse-error (type => 'bare hcro',
2438 line => $self->{line_prev},
2439 column => $self->{column_prev} - 2);
2440
2441 ## NOTE: According to the spec algorithm, nothing is returned,
2442 ## and then "&#" followed by "X" or "x" is appended to the parent
2443 ## element or the attribute value in the later processing.
2444
2445 if ($self->{prev_state} == DATA_STATE) {
2446 !!!cp (1005);
2447 $self->{state} = $self->{prev_state};
2448 ## Reconsume.
2449 !!!emit ({type => CHARACTER_TOKEN,
2450 data => '&' . $self->{s_kwd},
2451 line => $self->{line_prev},
2452 column => $self->{column_prev} - length $self->{s_kwd},
2453 });
2454 redo A;
2455 } else {
2456 !!!cp (989);
2457 $self->{ca}->{value} .= '&' . $self->{s_kwd};
2458 $self->{state} = $self->{prev_state};
2459 ## Reconsume.
2460 redo A;
2461 }
2462 }
2463 } elsif ($self->{state} == HEXREF_HEX_STATE) {
2464 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2465 # 0..9
2466 !!!cp (1002);
2467 $self->{s_kwd} *= 0x10;
2468 $self->{s_kwd} += $self->{nc} - 0x0030;
2469 ## Stay in the state.
2470 !!!next-input-character;
2471 redo A;
2472 } elsif (0x0061 <= $self->{nc} and
2473 $self->{nc} <= 0x0066) { # a..f
2474 !!!cp (1003);
2475 $self->{s_kwd} *= 0x10;
2476 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2477 ## Stay in the state.
2478 !!!next-input-character;
2479 redo A;
2480 } elsif (0x0041 <= $self->{nc} and
2481 $self->{nc} <= 0x0046) { # A..F
2482 !!!cp (1004);
2483 $self->{s_kwd} *= 0x10;
2484 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2485 ## Stay in the state.
2486 !!!next-input-character;
2487 redo A;
2488 } elsif ($self->{nc} == 0x003B) { # ;
2489 !!!cp (1006);
2490 !!!next-input-character;
2491 #
2492 } else {
2493 !!!cp (1007);
2494 !!!parse-error (type => 'no refc',
2495 line => $self->{line},
2496 column => $self->{column});
2497 ## Reconsume.
2498 #
2499 }
2500
2501 my $code = $self->{s_kwd};
2502 my $l = $self->{line_prev};
2503 my $c = $self->{column_prev};
2504 if ($charref_map->{$code}) {
2505 !!!cp (1008);
2506 !!!parse-error (type => 'invalid character reference',
2507 text => (sprintf 'U+%04X', $code),
2508 line => $l, column => $c);
2509 $code = $charref_map->{$code};
2510 } elsif ($code > 0x10FFFF) {
2511 !!!cp (1009);
2512 !!!parse-error (type => 'invalid character reference',
2513 text => (sprintf 'U-%08X', $code),
2514 line => $l, column => $c);
2515 $code = 0xFFFD;
2516 }
2517
2518 if ($self->{prev_state} == DATA_STATE) {
2519 !!!cp (988);
2520 $self->{state} = $self->{prev_state};
2521 ## Reconsume.
2522 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2523 line => $l, column => $c,
2524 });
2525 redo A;
2526 } else {
2527 !!!cp (987);
2528 $self->{ca}->{value} .= chr $code;
2529 $self->{ca}->{has_reference} = 1;
2530 $self->{state} = $self->{prev_state};
2531 ## Reconsume.
2532 redo A;
2533 }
2534 } elsif ($self->{state} == ENTITY_NAME_STATE) {
2535 if (length $self->{s_kwd} < 30 and
2536 ## NOTE: Some number greater than the maximum length of entity name
2537 ((0x0041 <= $self->{nc} and # a
2538 $self->{nc} <= 0x005A) or # x
2539 (0x0061 <= $self->{nc} and # a
2540 $self->{nc} <= 0x007A) or # z
2541 (0x0030 <= $self->{nc} and # 0
2542 $self->{nc} <= 0x0039) or # 9
2543 $self->{nc} == 0x003B)) { # ;
2544 our $EntityChar;
2545 $self->{s_kwd} .= chr $self->{nc};
2546 if (defined $EntityChar->{$self->{s_kwd}}) {
2547 if ($self->{nc} == 0x003B) { # ;
2548 !!!cp (1020);
2549 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2550 $self->{entity__match} = 1;
2551 !!!next-input-character;
2552 #
2553 } else {
2554 !!!cp (1021);
2555 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2556 $self->{entity__match} = -1;
2557 ## Stay in the state.
2558 !!!next-input-character;
2559 redo A;
2560 }
2561 } else {
2562 !!!cp (1022);
2563 $self->{entity__value} .= chr $self->{nc};
2564 $self->{entity__match} *= 2;
2565 ## Stay in the state.
2566 !!!next-input-character;
2567 redo A;
2568 }
2569 }
2570
2571 my $data;
2572 my $has_ref;
2573 if ($self->{entity__match} > 0) {
2574 !!!cp (1023);
2575 $data = $self->{entity__value};
2576 $has_ref = 1;
2577 #
2578 } elsif ($self->{entity__match} < 0) {
2579 !!!parse-error (type => 'no refc');
2580 if ($self->{prev_state} != DATA_STATE and # in attribute
2581 $self->{entity__match} < -1) {
2582 !!!cp (1024);
2583 $data = '&' . $self->{s_kwd};
2584 #
2585 } else {
2586 !!!cp (1025);
2587 $data = $self->{entity__value};
2588 $has_ref = 1;
2589 #
2590 }
2591 } else {
2592 !!!cp (1026);
2593 !!!parse-error (type => 'bare ero',
2594 line => $self->{line_prev},
2595 column => $self->{column_prev} - length $self->{s_kwd});
2596 $data = '&' . $self->{s_kwd};
2597 #
2598 }
2599
2600 ## NOTE: In these cases, when a character reference is found,
2601 ## it is consumed and a character token is returned, or, otherwise,
2602 ## nothing is consumed and returned, according to the spec algorithm.
2603 ## In this implementation, anything that has been examined by the
2604 ## tokenizer is appended to the parent element or the attribute value
2605 ## as a string at this stage (the literal string when there is no
2606 ## character reference, the entity-replaced string otherwise), since any
2607 ## characters that would not be consumed are appended in the data state
2608 ## or in an appropriate attribute value state anyway.
2609
2610 if ($self->{prev_state} == DATA_STATE) {
2611 !!!cp (986);
2612 $self->{state} = $self->{prev_state};
2613 ## Reconsume.
2614 !!!emit ({type => CHARACTER_TOKEN,
2615 data => $data,
2616 line => $self->{line_prev},
2617 column => $self->{column_prev} + 1 - length $self->{s_kwd},
2618 });
2619 redo A;
2620 } else {
2621 !!!cp (985);
2622 $self->{ca}->{value} .= $data;
2623 $self->{ca}->{has_reference} = 1 if $has_ref;
2624 $self->{state} = $self->{prev_state};
2625 ## Reconsume.
2626 redo A;
2627 }
2628 } else {
2629 die "$0: $self->{state}: Unknown state";
2630 }
2631 } # A
2632
2633 die "$0: _get_next_token: unexpected case";
2634 } # _get_next_token
2635
2636 1;
2637 ## $Date:$

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24