/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (show annotations) (download) (as text)
Tue Oct 14 11:46:57 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.3: +10 -8 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	14 Oct 2008 11:46:38 -0000
	* XML-Parser.t: "xml/elements-1.dat" and "xml/doctypes-1.dat"
	added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	14 Oct 2008 11:46:52 -0000
	* elements-1.dat: New test data file.

	* doctypes-1.dat: New test data file.

	* attrs-1.dat: New test data on attribute name cases are added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 11:40:58 -0000
	* Tokenizer.pm.src: Support for case-insensitive XML attribute
	names.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.3 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; # version string built from the digit groups of the CVS Revision keyword (1.3 gives "1.03")
4
5 BEGIN { # compile-time setup, so that the BEGIN-time import in package Whatpm::HTML further down can use these lists
6 require Exporter;
7 push our @ISA, 'Exporter'; # inherit Exporter's |import| method
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 ); # names that may be imported individually on request
19
20 our %EXPORT_TAGS = ( # named groups, requested as ':name'
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )], # ':token' covers every token type constant defined in this package
31 );
32 }
33
34 ## Token types (values of a token's |type| field; the empty prototype lets perl inline each one as a constant)
35
36 sub DOCTYPE_TOKEN () { 1 } # carries |name|, |pubid|, |sysid| and |quirks|
37 sub COMMENT_TOKEN () { 2 } # carries |data|
38 sub START_TAG_TOKEN () { 3 } # carries |tag_name| and |attributes|
39 sub END_TAG_TOKEN () { 4 } # carries |tag_name| and |attributes|
40 sub END_OF_FILE_TOKEN () { 5 } # emitted when the next input character is -1
41 sub CHARACTER_TOKEN () { 6 } # carries |data|
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
45 package Whatpm::HTML;
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') } # bring the token type constants (the ':token' export tag) into this package at compile time
48
49 ## Content model flags (bit flags; the *_CONTENT_MODEL values below are combinations of them)
50
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54
55 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup and limited < markup
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup and any < markup; the value |_initialize_tokenizer| sets
59
60 ## Tokenizer states (values of |$self->{state}|)
61
62 sub DATA_STATE () { 0 } # the state |_initialize_tokenizer| starts in
63 #sub ENTITY_DATA_STATE () { 1 } # disabled; see the NOTE before |ENTITY_STATE|
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } # disabled; see the NOTE before |ENTITY_STATE|
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## Tree constructor state constants (see Whatpm::HTML for the full
118 ## list and descriptions)
119
120 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # a single bit, value 2048
121 sub FOREIGN_EL () { 0b1_00000000000 } # numerically equal to IN_FOREIGN_CONTENT_IM; NOTE(review): presumably a separate flag namespace - confirm in Whatpm::HTML
122
123 ## Character reference mappings (NOTE(review): presumably looked up with the code point of a numeric character reference; the lookup is not in this part of the file - confirm)
124
125 my $charref_map = { # code point => replacement code point
126 0x0D => 0x000A, # CR is replaced by LF
127 0x80 => 0x20AC, # 0x80..0x9F: the targets match the Windows-1252 assignments,
128 0x81 => 0xFFFD, # with U+FFFD where Windows-1252 leaves the byte unassigned
129 0x82 => 0x201A,
130 0x83 => 0x0192,
131 0x84 => 0x201E,
132 0x85 => 0x2026,
133 0x86 => 0x2020,
134 0x87 => 0x2021,
135 0x88 => 0x02C6,
136 0x89 => 0x2030,
137 0x8A => 0x0160,
138 0x8B => 0x2039,
139 0x8C => 0x0152,
140 0x8D => 0xFFFD,
141 0x8E => 0x017D,
142 0x8F => 0xFFFD,
143 0x90 => 0xFFFD,
144 0x91 => 0x2018,
145 0x92 => 0x2019,
146 0x93 => 0x201C,
147 0x94 => 0x201D,
148 0x95 => 0x2022,
149 0x96 => 0x2013,
150 0x97 => 0x2014,
151 0x98 => 0x02DC,
152 0x99 => 0x2122,
153 0x9A => 0x0161,
154 0x9B => 0x203A,
155 0x9C => 0x0153,
156 0x9D => 0xFFFD,
157 0x9E => 0x017E,
158 0x9F => 0x0178,
159 }; # $charref_map
160 $charref_map->{$_} = 0xFFFD # every code point in the list below is mapped to U+FFFD
161 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168
169 ## Implementations MUST act as if they used the state machine in the spec.
170
171 sub _initialize_tokenizer ($) { # Puts the tokenizer fields of $self (the only argument) into their initial values.
172 my $self = shift;
173
174 ## NOTE: Fields set by |new| constructor:
175 #$self->{level}
176 #$self->{set_nc}
177 #$self->{parse_error}
178 #$self->{is_xml} (if XML)
179
180 $self->{state} = DATA_STATE; # MUST
181 #$self->{s_kwd}; # state keyword - initialized when used
182 #$self->{entity__value}; # initialized when used
183 #$self->{entity__match}; # initialized when used
184 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185 undef $self->{ct}; # current token
186 undef $self->{ca}; # current attribute
187 undef $self->{last_stag_name}; # last emitted start tag name
188 #$self->{prev_state}; # initialized when used
189 delete $self->{self_closing}; # no self-closing flag is pending
190 $self->{char_buffer} = ''; # NOTE(review): presumably the input buffer read by the next-input-character macro - confirm
191 $self->{char_buffer_pos} = 0; # position within |char_buffer| (same assumption as above)
192 $self->{nc} = -1; # next input character; -1 is the value the states treat as EOF. NOTE(review): the |!!!| line below is not Perl - presumably a macro expanded when the .pm is generated from this .src
193 #$self->{next_nc}
194 !!!next-input-character;
195 $self->{token} = []; # queue of pending tokens; |_get_next_token| returns from it before tokenizing further
196 # $self->{escape} is not touched here
197 } # _initialize_tokenizer
198
199 ## A token has:
200 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
201 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
202 ## ->{name} (DOCTYPE_TOKEN)
203 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
204 ## ->{pubid} (DOCTYPE_TOKEN)
205 ## ->{sysid} (DOCTYPE_TOKEN)
206 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
207 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
208 ## ->{name}
209 ## ->{value}
210 ## ->{has_reference} == 1 or 0
211 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
212 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
213 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
214 ## while the token is pushed back to the stack.
215
216 ## Emitted token MUST immediately be handled by the tree construction state.
217
218 ## Before each step, UA MAY check to see if either one of the scripts in
219 ## "list of scripts that will execute as soon as possible" or the first
220 ## script in the "list of scripts that will execute asynchronously",
221 ## has completed loading. If one has, then it MUST be executed
222 ## and removed from the list.
223
224 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
225 ## (This requirement was dropped from HTML5 spec, unfortunately.)
226
227 my $is_space = { # code point => true, for the characters the tokenizer states handle as white space
228 0x0009 => 1, # CHARACTER TABULATION (HT)
229 0x000A => 1, # LINE FEED (LF)
230 #0x000B => 0, # LINE TABULATION (VT) - disabled, so VT is not white space here
231 0x000C => 1, # FORM FEED (FF)
232 #0x000D => 1, # CARRIAGE RETURN (CR) - disabled; NOTE(review): presumably CR never reaches these states - confirm
233 0x0020 => 1, # SPACE (SP)
234 }; # $is_space
235
236 sub _get_next_token ($) {
237 my $self = shift;
238
239 if ($self->{self_closing}) {
240 !!!parse-error (type => 'nestc', token => $self->{ct});
241 ## NOTE: The |self_closing| flag is only set by start tag token.
242 ## In addition, when a start tag token is emitted, it is always set to
243 ## |ct|.
244 delete $self->{self_closing};
245 }
246
247 if (@{$self->{token}}) {
248 $self->{self_closing} = $self->{token}->[0]->{self_closing};
249 return shift @{$self->{token}};
250 }
251
252 A: {
253 if ($self->{state} == PCDATA_STATE) {
254 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
255
256 if ($self->{nc} == 0x0026) { # &
257 !!!cp (0.1);
258 ## NOTE: In the spec, the tokenizer is switched to the
259 ## "entity data state". In this implementation, the tokenizer
260 ## is switched to the |ENTITY_STATE|, which is an implementation
261 ## of the "consume a character reference" algorithm.
262 $self->{entity_add} = -1;
263 $self->{prev_state} = DATA_STATE;
264 $self->{state} = ENTITY_STATE;
265 !!!next-input-character;
266 redo A;
267 } elsif ($self->{nc} == 0x003C) { # <
268 !!!cp (0.2);
269 $self->{state} = TAG_OPEN_STATE;
270 !!!next-input-character;
271 redo A;
272 } elsif ($self->{nc} == -1) {
273 !!!cp (0.3);
274 !!!emit ({type => END_OF_FILE_TOKEN,
275 line => $self->{line}, column => $self->{column}});
276 last A; ## TODO: ok?
277 } else {
278 !!!cp (0.4);
279 #
280 }
281
282 # Anything else
283 my $token = {type => CHARACTER_TOKEN,
284 data => chr $self->{nc},
285 line => $self->{line}, column => $self->{column},
286 };
287 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
288
289 ## Stay in the state.
290 !!!next-input-character;
291 !!!emit ($token);
292 redo A;
293 } elsif ($self->{state} == DATA_STATE) {
294 $self->{s_kwd} = '' unless defined $self->{s_kwd};
295 if ($self->{nc} == 0x0026) { # &
296 $self->{s_kwd} = '';
297 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
298 not $self->{escape}) {
299 !!!cp (1);
300 ## NOTE: In the spec, the tokenizer is switched to the
301 ## "entity data state". In this implementation, the tokenizer
302 ## is switched to the |ENTITY_STATE|, which is an implementation
303 ## of the "consume a character reference" algorithm.
304 $self->{entity_add} = -1;
305 $self->{prev_state} = DATA_STATE;
306 $self->{state} = ENTITY_STATE;
307 !!!next-input-character;
308 redo A;
309 } else {
310 !!!cp (2);
311 #
312 }
313 } elsif ($self->{nc} == 0x002D) { # -
314 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
315 $self->{s_kwd} .= '-';
316
317 if ($self->{s_kwd} eq '<!--') {
318 !!!cp (3);
319 $self->{escape} = 1; # unless $self->{escape};
320 $self->{s_kwd} = '--';
321 #
322 } elsif ($self->{s_kwd} eq '---') {
323 !!!cp (4);
324 $self->{s_kwd} = '--';
325 #
326 } else {
327 !!!cp (5);
328 #
329 }
330 }
331
332 #
333 } elsif ($self->{nc} == 0x0021) { # !
334 if (length $self->{s_kwd}) {
335 !!!cp (5.1);
336 $self->{s_kwd} .= '!';
337 #
338 } else {
339 !!!cp (5.2);
340 #$self->{s_kwd} = '';
341 #
342 }
343 #
344 } elsif ($self->{nc} == 0x003C) { # <
345 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
346 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
347 not $self->{escape})) {
348 !!!cp (6);
349 $self->{state} = TAG_OPEN_STATE;
350 !!!next-input-character;
351 redo A;
352 } else {
353 !!!cp (7);
354 $self->{s_kwd} = '';
355 #
356 }
357 } elsif ($self->{nc} == 0x003E) { # >
358 if ($self->{escape} and
359 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
360 if ($self->{s_kwd} eq '--') {
361 !!!cp (8);
362 delete $self->{escape};
363 } else {
364 !!!cp (9);
365 }
366 } else {
367 !!!cp (10);
368 }
369
370 $self->{s_kwd} = '';
371 #
372 } elsif ($self->{nc} == -1) {
373 !!!cp (11);
374 $self->{s_kwd} = '';
375 !!!emit ({type => END_OF_FILE_TOKEN,
376 line => $self->{line}, column => $self->{column}});
377 last A; ## TODO: ok?
378 } else {
379 !!!cp (12);
380 $self->{s_kwd} = '';
381 #
382 }
383
384 # Anything else
385 my $token = {type => CHARACTER_TOKEN,
386 data => chr $self->{nc},
387 line => $self->{line}, column => $self->{column},
388 };
389 if ($self->{read_until}->($token->{data}, q[-!<>&],
390 length $token->{data})) {
391 $self->{s_kwd} = '';
392 }
393
394 ## Stay in the data state.
395 if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
396 !!!cp (13);
397 $self->{state} = PCDATA_STATE;
398 } else {
399 !!!cp (14);
400 ## Stay in the state.
401 }
402 !!!next-input-character;
403 !!!emit ($token);
404 redo A;
405 } elsif ($self->{state} == TAG_OPEN_STATE) {
406 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
407 if ($self->{nc} == 0x002F) { # /
408 !!!cp (15);
409 !!!next-input-character;
410 $self->{state} = CLOSE_TAG_OPEN_STATE;
411 redo A;
412 } elsif ($self->{nc} == 0x0021) { # !
413 !!!cp (15.1);
414 $self->{s_kwd} = '<' unless $self->{escape};
415 #
416 } else {
417 !!!cp (16);
418 #
419 }
420
421 ## reconsume
422 $self->{state} = DATA_STATE;
423 !!!emit ({type => CHARACTER_TOKEN, data => '<',
424 line => $self->{line_prev},
425 column => $self->{column_prev},
426 });
427 redo A;
428 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
429 if ($self->{nc} == 0x0021) { # !
430 !!!cp (17);
431 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
432 !!!next-input-character;
433 redo A;
434 } elsif ($self->{nc} == 0x002F) { # /
435 !!!cp (18);
436 $self->{state} = CLOSE_TAG_OPEN_STATE;
437 !!!next-input-character;
438 redo A;
439 } elsif (0x0041 <= $self->{nc} and
440 $self->{nc} <= 0x005A) { # A..Z
441 !!!cp (19);
442 $self->{ct}
443 = {type => START_TAG_TOKEN,
444 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
445 line => $self->{line_prev},
446 column => $self->{column_prev}};
447 $self->{state} = TAG_NAME_STATE;
448 !!!next-input-character;
449 redo A;
450 } elsif (0x0061 <= $self->{nc} and
451 $self->{nc} <= 0x007A) { # a..z
452 !!!cp (20);
453 $self->{ct} = {type => START_TAG_TOKEN,
454 tag_name => chr ($self->{nc}),
455 line => $self->{line_prev},
456 column => $self->{column_prev}};
457 $self->{state} = TAG_NAME_STATE;
458 !!!next-input-character;
459 redo A;
460 } elsif ($self->{nc} == 0x003E) { # >
461 !!!cp (21);
462 !!!parse-error (type => 'empty start tag',
463 line => $self->{line_prev},
464 column => $self->{column_prev});
465 $self->{state} = DATA_STATE;
466 !!!next-input-character;
467
468 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
469 line => $self->{line_prev},
470 column => $self->{column_prev},
471 });
472
473 redo A;
474 } elsif ($self->{nc} == 0x003F) { # ?
475 !!!cp (22);
476 !!!parse-error (type => 'pio',
477 line => $self->{line_prev},
478 column => $self->{column_prev});
479 $self->{state} = BOGUS_COMMENT_STATE;
480 $self->{ct} = {type => COMMENT_TOKEN, data => '',
481 line => $self->{line_prev},
482 column => $self->{column_prev},
483 };
484 ## $self->{nc} is intentionally left as is
485 redo A;
486 } else {
487 !!!cp (23);
488 !!!parse-error (type => 'bare stago',
489 line => $self->{line_prev},
490 column => $self->{column_prev});
491 $self->{state} = DATA_STATE;
492 ## reconsume
493
494 !!!emit ({type => CHARACTER_TOKEN, data => '<',
495 line => $self->{line_prev},
496 column => $self->{column_prev},
497 });
498
499 redo A;
500 }
501 } else {
502 die "$0: $self->{content_model} in tag open";
503 }
504 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
505 ## NOTE: The "close tag open state" in the spec is implemented as
506 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
507
508 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
509 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
510 if (defined $self->{last_stag_name}) {
511 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
512 $self->{s_kwd} = '';
513 ## Reconsume.
514 redo A;
515 } else {
516 ## No start tag token has ever been emitted
517 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
518 !!!cp (28);
519 $self->{state} = DATA_STATE;
520 ## Reconsume.
521 !!!emit ({type => CHARACTER_TOKEN, data => '</',
522 line => $l, column => $c,
523 });
524 redo A;
525 }
526 }
527
528 if (0x0041 <= $self->{nc} and
529 $self->{nc} <= 0x005A) { # A..Z
530 !!!cp (29);
531 $self->{ct}
532 = {type => END_TAG_TOKEN,
533 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
534 line => $l, column => $c};
535 $self->{state} = TAG_NAME_STATE;
536 !!!next-input-character;
537 redo A;
538 } elsif (0x0061 <= $self->{nc} and
539 $self->{nc} <= 0x007A) { # a..z
540 !!!cp (30);
541 $self->{ct} = {type => END_TAG_TOKEN,
542 tag_name => chr ($self->{nc}),
543 line => $l, column => $c};
544 $self->{state} = TAG_NAME_STATE;
545 !!!next-input-character;
546 redo A;
547 } elsif ($self->{nc} == 0x003E) { # >
548 !!!cp (31);
549 !!!parse-error (type => 'empty end tag',
550 line => $self->{line_prev}, ## "<" in "</>"
551 column => $self->{column_prev} - 1);
552 $self->{state} = DATA_STATE;
553 !!!next-input-character;
554 redo A;
555 } elsif ($self->{nc} == -1) {
556 !!!cp (32);
557 !!!parse-error (type => 'bare etago');
558 $self->{state} = DATA_STATE;
559 # reconsume
560
561 !!!emit ({type => CHARACTER_TOKEN, data => '</',
562 line => $l, column => $c,
563 });
564
565 redo A;
566 } else {
567 !!!cp (33);
568 !!!parse-error (type => 'bogus end tag');
569 $self->{state} = BOGUS_COMMENT_STATE;
570 $self->{ct} = {type => COMMENT_TOKEN, data => '',
571 line => $self->{line_prev}, # "<" of "</"
572 column => $self->{column_prev} - 1,
573 };
574 ## NOTE: $self->{nc} is intentionally left as is.
575 ## Although the "anything else" case of the spec not explicitly
576 ## states that the next input character is to be reconsumed,
577 ## it will be included to the |data| of the comment token
578 ## generated from the bogus end tag, as defined in the
579 ## "bogus comment state" entry.
580 redo A;
581 }
582 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
583 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
584 if (length $ch) {
585 my $CH = $ch;
586 $ch =~ tr/a-z/A-Z/;
587 my $nch = chr $self->{nc};
588 if ($nch eq $ch or $nch eq $CH) {
589 !!!cp (24);
590 ## Stay in the state.
591 $self->{s_kwd} .= $nch;
592 !!!next-input-character;
593 redo A;
594 } else {
595 !!!cp (25);
596 $self->{state} = DATA_STATE;
597 ## Reconsume.
598 !!!emit ({type => CHARACTER_TOKEN,
599 data => '</' . $self->{s_kwd},
600 line => $self->{line_prev},
601 column => $self->{column_prev} - 1 - length $self->{s_kwd},
602 });
603 redo A;
604 }
605 } else { # after "<{tag-name}"
606 unless ($is_space->{$self->{nc}} or
607 {
608 0x003E => 1, # >
609 0x002F => 1, # /
610 -1 => 1, # EOF
611 }->{$self->{nc}}) {
612 !!!cp (26);
613 ## Reconsume.
614 $self->{state} = DATA_STATE;
615 !!!emit ({type => CHARACTER_TOKEN,
616 data => '</' . $self->{s_kwd},
617 line => $self->{line_prev},
618 column => $self->{column_prev} - 1 - length $self->{s_kwd},
619 });
620 redo A;
621 } else {
622 !!!cp (27);
623 $self->{ct}
624 = {type => END_TAG_TOKEN,
625 tag_name => $self->{last_stag_name},
626 line => $self->{line_prev},
627 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
628 $self->{state} = TAG_NAME_STATE;
629 ## Reconsume.
630 redo A;
631 }
632 }
633 } elsif ($self->{state} == TAG_NAME_STATE) {
634 if ($is_space->{$self->{nc}}) {
635 !!!cp (34);
636 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
637 !!!next-input-character;
638 redo A;
639 } elsif ($self->{nc} == 0x003E) { # >
640 if ($self->{ct}->{type} == START_TAG_TOKEN) {
641 !!!cp (35);
642 $self->{last_stag_name} = $self->{ct}->{tag_name};
643 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
644 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
645 #if ($self->{ct}->{attributes}) {
646 # ## NOTE: This should never be reached.
647 # !!! cp (36);
648 # !!! parse-error (type => 'end tag attribute');
649 #} else {
650 !!!cp (37);
651 #}
652 } else {
653 die "$0: $self->{ct}->{type}: Unknown token type";
654 }
655 $self->{state} = DATA_STATE;
656 !!!next-input-character;
657
658 !!!emit ($self->{ct}); # start tag or end tag
659
660 redo A;
661 } elsif (0x0041 <= $self->{nc} and
662 $self->{nc} <= 0x005A) { # A..Z
663 !!!cp (38);
664 $self->{ct}->{tag_name}
665 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
666 # start tag or end tag
667 ## Stay in this state
668 !!!next-input-character;
669 redo A;
670 } elsif ($self->{nc} == -1) {
671 !!!parse-error (type => 'unclosed tag');
672 if ($self->{ct}->{type} == START_TAG_TOKEN) {
673 !!!cp (39);
674 $self->{last_stag_name} = $self->{ct}->{tag_name};
675 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
676 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
677 #if ($self->{ct}->{attributes}) {
678 # ## NOTE: This state should never be reached.
679 # !!! cp (40);
680 # !!! parse-error (type => 'end tag attribute');
681 #} else {
682 !!!cp (41);
683 #}
684 } else {
685 die "$0: $self->{ct}->{type}: Unknown token type";
686 }
687 $self->{state} = DATA_STATE;
688 # reconsume
689
690 !!!emit ($self->{ct}); # start tag or end tag
691
692 redo A;
693 } elsif ($self->{nc} == 0x002F) { # /
694 !!!cp (42);
695 $self->{state} = SELF_CLOSING_START_TAG_STATE;
696 !!!next-input-character;
697 redo A;
698 } else {
699 !!!cp (44);
700 $self->{ct}->{tag_name} .= chr $self->{nc};
701 # start tag or end tag
702 ## Stay in the state
703 !!!next-input-character;
704 redo A;
705 }
706 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
707 if ($is_space->{$self->{nc}}) {
708 !!!cp (45);
709 ## Stay in the state
710 !!!next-input-character;
711 redo A;
712 } elsif ($self->{nc} == 0x003E) { # >
713 if ($self->{ct}->{type} == START_TAG_TOKEN) {
714 !!!cp (46);
715 $self->{last_stag_name} = $self->{ct}->{tag_name};
716 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
717 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
718 if ($self->{ct}->{attributes}) {
719 !!!cp (47);
720 !!!parse-error (type => 'end tag attribute');
721 } else {
722 !!!cp (48);
723 }
724 } else {
725 die "$0: $self->{ct}->{type}: Unknown token type";
726 }
727 $self->{state} = DATA_STATE;
728 !!!next-input-character;
729
730 !!!emit ($self->{ct}); # start tag or end tag
731
732 redo A;
733 } elsif (0x0041 <= $self->{nc} and
734 $self->{nc} <= 0x005A) { # A..Z
735 !!!cp (49);
736 $self->{ca}
737 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
738 value => '',
739 line => $self->{line}, column => $self->{column}};
740 $self->{state} = ATTRIBUTE_NAME_STATE;
741 !!!next-input-character;
742 redo A;
743 } elsif ($self->{nc} == 0x002F) { # /
744 !!!cp (50);
745 $self->{state} = SELF_CLOSING_START_TAG_STATE;
746 !!!next-input-character;
747 redo A;
748 } elsif ($self->{nc} == -1) {
749 !!!parse-error (type => 'unclosed tag');
750 if ($self->{ct}->{type} == START_TAG_TOKEN) {
751 !!!cp (52);
752 $self->{last_stag_name} = $self->{ct}->{tag_name};
753 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
754 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
755 if ($self->{ct}->{attributes}) {
756 !!!cp (53);
757 !!!parse-error (type => 'end tag attribute');
758 } else {
759 !!!cp (54);
760 }
761 } else {
762 die "$0: $self->{ct}->{type}: Unknown token type";
763 }
764 $self->{state} = DATA_STATE;
765 # reconsume
766
767 !!!emit ($self->{ct}); # start tag or end tag
768
769 redo A;
770 } else {
771 if ({
772 0x0022 => 1, # "
773 0x0027 => 1, # '
774 0x003D => 1, # =
775 }->{$self->{nc}}) {
776 !!!cp (55);
777 !!!parse-error (type => 'bad attribute name');
778 } else {
779 !!!cp (56);
780 }
781 $self->{ca}
782 = {name => chr ($self->{nc}),
783 value => '',
784 line => $self->{line}, column => $self->{column}};
785 $self->{state} = ATTRIBUTE_NAME_STATE;
786 !!!next-input-character;
787 redo A;
788 }
789 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
790 my $before_leave = sub {
791 if (exists $self->{ct}->{attributes} # start tag or end tag
792 ->{$self->{ca}->{name}}) { # MUST
793 !!!cp (57);
794 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
795 ## Discard $self->{ca} # MUST
796 } else {
797 !!!cp (58);
798 $self->{ct}->{attributes}->{$self->{ca}->{name}}
799 = $self->{ca};
800 }
801 }; # $before_leave
802
803 if ($is_space->{$self->{nc}}) {
804 !!!cp (59);
805 $before_leave->();
806 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
807 !!!next-input-character;
808 redo A;
809 } elsif ($self->{nc} == 0x003D) { # =
810 !!!cp (60);
811 $before_leave->();
812 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
813 !!!next-input-character;
814 redo A;
815 } elsif ($self->{nc} == 0x003E) { # >
816 $before_leave->();
817 if ($self->{ct}->{type} == START_TAG_TOKEN) {
818 !!!cp (61);
819 $self->{last_stag_name} = $self->{ct}->{tag_name};
820 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
821 !!!cp (62);
822 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
823 if ($self->{ct}->{attributes}) {
824 !!!parse-error (type => 'end tag attribute');
825 }
826 } else {
827 die "$0: $self->{ct}->{type}: Unknown token type";
828 }
829 $self->{state} = DATA_STATE;
830 !!!next-input-character;
831
832 !!!emit ($self->{ct}); # start tag or end tag
833
834 redo A;
835 } elsif (0x0041 <= $self->{nc} and
836 $self->{nc} <= 0x005A) { # A..Z
837 !!!cp (63);
838 $self->{ca}->{name}
839 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
840 ## Stay in the state
841 !!!next-input-character;
842 redo A;
843 } elsif ($self->{nc} == 0x002F) { # /
844 !!!cp (64);
845 $before_leave->();
846 $self->{state} = SELF_CLOSING_START_TAG_STATE;
847 !!!next-input-character;
848 redo A;
849 } elsif ($self->{nc} == -1) {
850 !!!parse-error (type => 'unclosed tag');
851 $before_leave->();
852 if ($self->{ct}->{type} == START_TAG_TOKEN) {
853 !!!cp (66);
854 $self->{last_stag_name} = $self->{ct}->{tag_name};
855 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
856 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
857 if ($self->{ct}->{attributes}) {
858 !!!cp (67);
859 !!!parse-error (type => 'end tag attribute');
860 } else {
861 ## NOTE: This state should never be reached.
862 !!!cp (68);
863 }
864 } else {
865 die "$0: $self->{ct}->{type}: Unknown token type";
866 }
867 $self->{state} = DATA_STATE;
868 # reconsume
869
870 !!!emit ($self->{ct}); # start tag or end tag
871
872 redo A;
873 } else {
874 if ($self->{nc} == 0x0022 or # "
875 $self->{nc} == 0x0027) { # '
876 !!!cp (69);
877 !!!parse-error (type => 'bad attribute name');
878 } else {
879 !!!cp (70);
880 }
881 $self->{ca}->{name} .= chr ($self->{nc});
882 ## Stay in the state
883 !!!next-input-character;
884 redo A;
885 }
886 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
887 if ($is_space->{$self->{nc}}) {
888 !!!cp (71);
889 ## Stay in the state
890 !!!next-input-character;
891 redo A;
892 } elsif ($self->{nc} == 0x003D) { # =
893 !!!cp (72);
894 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
895 !!!next-input-character;
896 redo A;
897 } elsif ($self->{nc} == 0x003E) { # >
898 if ($self->{ct}->{type} == START_TAG_TOKEN) {
899 !!!cp (73);
900 $self->{last_stag_name} = $self->{ct}->{tag_name};
901 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
902 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
903 if ($self->{ct}->{attributes}) {
904 !!!cp (74);
905 !!!parse-error (type => 'end tag attribute');
906 } else {
907 ## NOTE: This state should never be reached.
908 !!!cp (75);
909 }
910 } else {
911 die "$0: $self->{ct}->{type}: Unknown token type";
912 }
913 $self->{state} = DATA_STATE;
914 !!!next-input-character;
915
916 !!!emit ($self->{ct}); # start tag or end tag
917
918 redo A;
919 } elsif (0x0041 <= $self->{nc} and
920 $self->{nc} <= 0x005A) { # A..Z
921 !!!cp (76);
922 $self->{ca}
923 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
924 value => '',
925 line => $self->{line}, column => $self->{column}};
926 $self->{state} = ATTRIBUTE_NAME_STATE;
927 !!!next-input-character;
928 redo A;
929 } elsif ($self->{nc} == 0x002F) { # /
930 !!!cp (77);
931 $self->{state} = SELF_CLOSING_START_TAG_STATE;
932 !!!next-input-character;
933 redo A;
934 } elsif ($self->{nc} == -1) {
935 !!!parse-error (type => 'unclosed tag');
936 if ($self->{ct}->{type} == START_TAG_TOKEN) {
937 !!!cp (79);
938 $self->{last_stag_name} = $self->{ct}->{tag_name};
939 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
940 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
941 if ($self->{ct}->{attributes}) {
942 !!!cp (80);
943 !!!parse-error (type => 'end tag attribute');
944 } else {
945 ## NOTE: This state should never be reached.
946 !!!cp (81);
947 }
948 } else {
949 die "$0: $self->{ct}->{type}: Unknown token type";
950 }
951 $self->{state} = DATA_STATE;
952 # reconsume
953
954 !!!emit ($self->{ct}); # start tag or end tag
955
956 redo A;
957 } else {
958 if ($self->{nc} == 0x0022 or # "
959 $self->{nc} == 0x0027) { # '
960 !!!cp (78);
961 !!!parse-error (type => 'bad attribute name');
962 } else {
963 !!!cp (82);
964 }
965 $self->{ca}
966 = {name => chr ($self->{nc}),
967 value => '',
968 line => $self->{line}, column => $self->{column}};
969 $self->{state} = ATTRIBUTE_NAME_STATE;
970 !!!next-input-character;
971 redo A;
972 }
973 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
974 if ($is_space->{$self->{nc}}) {
975 !!!cp (83);
976 ## Stay in the state
977 !!!next-input-character;
978 redo A;
979 } elsif ($self->{nc} == 0x0022) { # "
980 !!!cp (84);
981 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
982 !!!next-input-character;
983 redo A;
984 } elsif ($self->{nc} == 0x0026) { # &
985 !!!cp (85);
986 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
987 ## reconsume
988 redo A;
989 } elsif ($self->{nc} == 0x0027) { # '
990 !!!cp (86);
991 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
992 !!!next-input-character;
993 redo A;
994 } elsif ($self->{nc} == 0x003E) { # >
995 !!!parse-error (type => 'empty unquoted attribute value');
996 if ($self->{ct}->{type} == START_TAG_TOKEN) {
997 !!!cp (87);
998 $self->{last_stag_name} = $self->{ct}->{tag_name};
999 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1000 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1001 if ($self->{ct}->{attributes}) {
1002 !!!cp (88);
1003 !!!parse-error (type => 'end tag attribute');
1004 } else {
1005 ## NOTE: This state should never be reached.
1006 !!!cp (89);
1007 }
1008 } else {
1009 die "$0: $self->{ct}->{type}: Unknown token type";
1010 }
1011 $self->{state} = DATA_STATE;
1012 !!!next-input-character;
1013
1014 !!!emit ($self->{ct}); # start tag or end tag
1015
1016 redo A;
1017 } elsif ($self->{nc} == -1) {
1018 !!!parse-error (type => 'unclosed tag');
1019 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1020 !!!cp (90);
1021 $self->{last_stag_name} = $self->{ct}->{tag_name};
1022 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1023 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1024 if ($self->{ct}->{attributes}) {
1025 !!!cp (91);
1026 !!!parse-error (type => 'end tag attribute');
1027 } else {
1028 ## NOTE: This state should never be reached.
1029 !!!cp (92);
1030 }
1031 } else {
1032 die "$0: $self->{ct}->{type}: Unknown token type";
1033 }
1034 $self->{state} = DATA_STATE;
1035 ## reconsume
1036
1037 !!!emit ($self->{ct}); # start tag or end tag
1038
1039 redo A;
1040 } else {
1041 if ($self->{nc} == 0x003D) { # =
1042 !!!cp (93);
1043 !!!parse-error (type => 'bad attribute value');
1044 } else {
1045 !!!cp (94);
1046 }
1047 $self->{ca}->{value} .= chr ($self->{nc});
1048 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1049 !!!next-input-character;
1050 redo A;
1051 }
1052 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1053 if ($self->{nc} == 0x0022) { # "
1054 !!!cp (95);
1055 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1056 !!!next-input-character;
1057 redo A;
1058 } elsif ($self->{nc} == 0x0026) { # &
1059 !!!cp (96);
1060 ## NOTE: In the spec, the tokenizer is switched to the
1061 ## "entity in attribute value state". In this implementation, the
1062 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1063 ## implementation of the "consume a character reference" algorithm.
1064 $self->{prev_state} = $self->{state};
1065 $self->{entity_add} = 0x0022; # "
1066 $self->{state} = ENTITY_STATE;
1067 !!!next-input-character;
1068 redo A;
1069 } elsif ($self->{nc} == -1) {
1070 !!!parse-error (type => 'unclosed attribute value');
1071 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1072 !!!cp (97);
1073 $self->{last_stag_name} = $self->{ct}->{tag_name};
1074 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1075 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1076 if ($self->{ct}->{attributes}) {
1077 !!!cp (98);
1078 !!!parse-error (type => 'end tag attribute');
1079 } else {
1080 ## NOTE: This state should never be reached.
1081 !!!cp (99);
1082 }
1083 } else {
1084 die "$0: $self->{ct}->{type}: Unknown token type";
1085 }
1086 $self->{state} = DATA_STATE;
1087 ## reconsume
1088
1089 !!!emit ($self->{ct}); # start tag or end tag
1090
1091 redo A;
1092 } else {
1093 !!!cp (100);
1094 $self->{ca}->{value} .= chr ($self->{nc});
1095 $self->{read_until}->($self->{ca}->{value},
1096 q["&],
1097 length $self->{ca}->{value});
1098
1099 ## Stay in the state
1100 !!!next-input-character;
1101 redo A;
1102 }
1103 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1104 if ($self->{nc} == 0x0027) { # '
1105 !!!cp (101);
1106 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1107 !!!next-input-character;
1108 redo A;
1109 } elsif ($self->{nc} == 0x0026) { # &
1110 !!!cp (102);
1111 ## NOTE: In the spec, the tokenizer is switched to the
1112 ## "entity in attribute value state". In this implementation, the
1113 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1114 ## implementation of the "consume a character reference" algorithm.
1115 $self->{entity_add} = 0x0027; # '
1116 $self->{prev_state} = $self->{state};
1117 $self->{state} = ENTITY_STATE;
1118 !!!next-input-character;
1119 redo A;
1120 } elsif ($self->{nc} == -1) {
1121 !!!parse-error (type => 'unclosed attribute value');
1122 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1123 !!!cp (103);
1124 $self->{last_stag_name} = $self->{ct}->{tag_name};
1125 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1126 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1127 if ($self->{ct}->{attributes}) {
1128 !!!cp (104);
1129 !!!parse-error (type => 'end tag attribute');
1130 } else {
1131 ## NOTE: This state should never be reached.
1132 !!!cp (105);
1133 }
1134 } else {
1135 die "$0: $self->{ct}->{type}: Unknown token type";
1136 }
1137 $self->{state} = DATA_STATE;
1138 ## reconsume
1139
1140 !!!emit ($self->{ct}); # start tag or end tag
1141
1142 redo A;
1143 } else {
1144 !!!cp (106);
1145 $self->{ca}->{value} .= chr ($self->{nc});
1146 $self->{read_until}->($self->{ca}->{value},
1147 q['&],
1148 length $self->{ca}->{value});
1149
1150 ## Stay in the state
1151 !!!next-input-character;
1152 redo A;
1153 }
1154 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1155 if ($is_space->{$self->{nc}}) {
1156 !!!cp (107);
1157 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1158 !!!next-input-character;
1159 redo A;
1160 } elsif ($self->{nc} == 0x0026) { # &
1161 !!!cp (108);
1162 ## NOTE: In the spec, the tokenizer is switched to the
1163 ## "entity in attribute value state". In this implementation, the
1164 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1165 ## implementation of the "consume a character reference" algorithm.
1166 $self->{entity_add} = -1;
1167 $self->{prev_state} = $self->{state};
1168 $self->{state} = ENTITY_STATE;
1169 !!!next-input-character;
1170 redo A;
1171 } elsif ($self->{nc} == 0x003E) { # >
1172 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1173 !!!cp (109);
1174 $self->{last_stag_name} = $self->{ct}->{tag_name};
1175 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1176 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1177 if ($self->{ct}->{attributes}) {
1178 !!!cp (110);
1179 !!!parse-error (type => 'end tag attribute');
1180 } else {
1181 ## NOTE: This state should never be reached.
1182 !!!cp (111);
1183 }
1184 } else {
1185 die "$0: $self->{ct}->{type}: Unknown token type";
1186 }
1187 $self->{state} = DATA_STATE;
1188 !!!next-input-character;
1189
1190 !!!emit ($self->{ct}); # start tag or end tag
1191
1192 redo A;
1193 } elsif ($self->{nc} == -1) {
1194 !!!parse-error (type => 'unclosed tag');
1195 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1196 !!!cp (112);
1197 $self->{last_stag_name} = $self->{ct}->{tag_name};
1198 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1199 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1200 if ($self->{ct}->{attributes}) {
1201 !!!cp (113);
1202 !!!parse-error (type => 'end tag attribute');
1203 } else {
1204 ## NOTE: This state should never be reached.
1205 !!!cp (114);
1206 }
1207 } else {
1208 die "$0: $self->{ct}->{type}: Unknown token type";
1209 }
1210 $self->{state} = DATA_STATE;
1211 ## reconsume
1212
1213 !!!emit ($self->{ct}); # start tag or end tag
1214
1215 redo A;
1216 } else {
1217 if ({
1218 0x0022 => 1, # "
1219 0x0027 => 1, # '
1220 0x003D => 1, # =
1221 }->{$self->{nc}}) {
1222 !!!cp (115);
1223 !!!parse-error (type => 'bad attribute value');
1224 } else {
1225 !!!cp (116);
1226 }
1227 $self->{ca}->{value} .= chr ($self->{nc});
1228 $self->{read_until}->($self->{ca}->{value},
1229 q["'=& >],
1230 length $self->{ca}->{value});
1231
1232 ## Stay in the state
1233 !!!next-input-character;
1234 redo A;
1235 }
1236 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1237 if ($is_space->{$self->{nc}}) {
1238 !!!cp (118);
1239 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1240 !!!next-input-character;
1241 redo A;
1242 } elsif ($self->{nc} == 0x003E) { # >
1243 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1244 !!!cp (119);
1245 $self->{last_stag_name} = $self->{ct}->{tag_name};
1246 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1247 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1248 if ($self->{ct}->{attributes}) {
1249 !!!cp (120);
1250 !!!parse-error (type => 'end tag attribute');
1251 } else {
1252 ## NOTE: This state should never be reached.
1253 !!!cp (121);
1254 }
1255 } else {
1256 die "$0: $self->{ct}->{type}: Unknown token type";
1257 }
1258 $self->{state} = DATA_STATE;
1259 !!!next-input-character;
1260
1261 !!!emit ($self->{ct}); # start tag or end tag
1262
1263 redo A;
1264 } elsif ($self->{nc} == 0x002F) { # /
1265 !!!cp (122);
1266 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1267 !!!next-input-character;
1268 redo A;
1269 } elsif ($self->{nc} == -1) {
1270 !!!parse-error (type => 'unclosed tag');
1271 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1272 !!!cp (122.3);
1273 $self->{last_stag_name} = $self->{ct}->{tag_name};
1274 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1275 if ($self->{ct}->{attributes}) {
1276 !!!cp (122.1);
1277 !!!parse-error (type => 'end tag attribute');
1278 } else {
1279 ## NOTE: This state should never be reached.
1280 !!!cp (122.2);
1281 }
1282 } else {
1283 die "$0: $self->{ct}->{type}: Unknown token type";
1284 }
1285 $self->{state} = DATA_STATE;
1286 ## Reconsume.
1287 !!!emit ($self->{ct}); # start tag or end tag
1288 redo A;
1289 } else {
1290 !!!cp ('124.1');
1291 !!!parse-error (type => 'no space between attributes');
1292 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1293 ## reconsume
1294 redo A;
1295 }
1296 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1297 if ($self->{nc} == 0x003E) { # >
1298 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1299 !!!cp ('124.2');
1300 !!!parse-error (type => 'nestc', token => $self->{ct});
1301 ## TODO: Different type than slash in start tag
1302 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1303 if ($self->{ct}->{attributes}) {
1304 !!!cp ('124.4');
1305 !!!parse-error (type => 'end tag attribute');
1306 } else {
1307 !!!cp ('124.5');
1308 }
1309 ## TODO: Test |<title></title/>|
1310 } else {
1311 !!!cp ('124.3');
1312 $self->{self_closing} = 1;
1313 }
1314
1315 $self->{state} = DATA_STATE;
1316 !!!next-input-character;
1317
1318 !!!emit ($self->{ct}); # start tag or end tag
1319
1320 redo A;
1321 } elsif ($self->{nc} == -1) {
1322 !!!parse-error (type => 'unclosed tag');
1323 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1324 !!!cp (124.7);
1325 $self->{last_stag_name} = $self->{ct}->{tag_name};
1326 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1327 if ($self->{ct}->{attributes}) {
1328 !!!cp (124.5);
1329 !!!parse-error (type => 'end tag attribute');
1330 } else {
1331 ## NOTE: This state should never be reached.
1332 !!!cp (124.6);
1333 }
1334 } else {
1335 die "$0: $self->{ct}->{type}: Unknown token type";
1336 }
1337 $self->{state} = DATA_STATE;
1338 ## Reconsume.
1339 !!!emit ($self->{ct}); # start tag or end tag
1340 redo A;
1341 } else {
1342 !!!cp ('124.4');
1343 !!!parse-error (type => 'nestc');
1344 ## TODO: This error type is wrong.
1345 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1346 ## Reconsume.
1347 redo A;
1348 }
1349 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1350 ## (only happen if PCDATA state)
1351
1352 ## NOTE: Unlike spec's "bogus comment state", this implementation
1353 ## consumes characters one-by-one basis.
1354
1355 if ($self->{nc} == 0x003E) { # >
1356 !!!cp (124);
1357 $self->{state} = DATA_STATE;
1358 !!!next-input-character;
1359
1360 !!!emit ($self->{ct}); # comment
1361 redo A;
1362 } elsif ($self->{nc} == -1) {
1363 !!!cp (125);
1364 $self->{state} = DATA_STATE;
1365 ## reconsume
1366
1367 !!!emit ($self->{ct}); # comment
1368 redo A;
1369 } else {
1370 !!!cp (126);
1371 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1372 $self->{read_until}->($self->{ct}->{data},
1373 q[>],
1374 length $self->{ct}->{data});
1375
1376 ## Stay in the state.
1377 !!!next-input-character;
1378 redo A;
1379 }
1380 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1381 ## (only happen if PCDATA state)
1382
1383 if ($self->{nc} == 0x002D) { # -
1384 !!!cp (133);
1385 $self->{state} = MD_HYPHEN_STATE;
1386 !!!next-input-character;
1387 redo A;
1388 } elsif ($self->{nc} == 0x0044 or # D
1389 $self->{nc} == 0x0064) { # d
1390 ## ASCII case-insensitive.
1391 !!!cp (130);
1392 $self->{state} = MD_DOCTYPE_STATE;
1393 $self->{s_kwd} = chr $self->{nc};
1394 !!!next-input-character;
1395 redo A;
1396 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1397 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1398 $self->{is_xml}) and
1399 $self->{nc} == 0x005B) { # [
1400 !!!cp (135.4);
1401 $self->{state} = MD_CDATA_STATE;
1402 $self->{s_kwd} = '[';
1403 !!!next-input-character;
1404 redo A;
1405 } else {
1406 !!!cp (136);
1407 }
1408
1409 !!!parse-error (type => 'bogus comment',
1410 line => $self->{line_prev},
1411 column => $self->{column_prev} - 1);
1412 ## Reconsume.
1413 $self->{state} = BOGUS_COMMENT_STATE;
1414 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1415 line => $self->{line_prev},
1416 column => $self->{column_prev} - 1,
1417 };
1418 redo A;
1419 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1420 if ($self->{nc} == 0x002D) { # -
1421 !!!cp (127);
1422 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1423 line => $self->{line_prev},
1424 column => $self->{column_prev} - 2,
1425 };
1426 $self->{state} = COMMENT_START_STATE;
1427 !!!next-input-character;
1428 redo A;
1429 } else {
1430 !!!cp (128);
1431 !!!parse-error (type => 'bogus comment',
1432 line => $self->{line_prev},
1433 column => $self->{column_prev} - 2);
1434 $self->{state} = BOGUS_COMMENT_STATE;
1435 ## Reconsume.
1436 $self->{ct} = {type => COMMENT_TOKEN,
1437 data => '-',
1438 line => $self->{line_prev},
1439 column => $self->{column_prev} - 2,
1440 };
1441 redo A;
1442 }
1443 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1444 ## ASCII case-insensitive.
1445 if ($self->{nc} == [
1446 undef,
1447 0x004F, # O
1448 0x0043, # C
1449 0x0054, # T
1450 0x0059, # Y
1451 0x0050, # P
1452 ]->[length $self->{s_kwd}] or
1453 $self->{nc} == [
1454 undef,
1455 0x006F, # o
1456 0x0063, # c
1457 0x0074, # t
1458 0x0079, # y
1459 0x0070, # p
1460 ]->[length $self->{s_kwd}]) {
1461 !!!cp (131);
1462 ## Stay in the state.
1463 $self->{s_kwd} .= chr $self->{nc};
1464 !!!next-input-character;
1465 redo A;
1466 } elsif ((length $self->{s_kwd}) == 6 and
1467 ($self->{nc} == 0x0045 or # E
1468 $self->{nc} == 0x0065)) { # e
1469 !!!cp (129);
1470 $self->{state} = DOCTYPE_STATE;
1471 $self->{ct} = {type => DOCTYPE_TOKEN,
1472 quirks => 1,
1473 line => $self->{line_prev},
1474 column => $self->{column_prev} - 7,
1475 };
1476 !!!next-input-character;
1477 redo A;
1478 } else {
1479 !!!cp (132);
1480 !!!parse-error (type => 'bogus comment',
1481 line => $self->{line_prev},
1482 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1483 $self->{state} = BOGUS_COMMENT_STATE;
1484 ## Reconsume.
1485 $self->{ct} = {type => COMMENT_TOKEN,
1486 data => $self->{s_kwd},
1487 line => $self->{line_prev},
1488 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1489 };
1490 redo A;
1491 }
1492 } elsif ($self->{state} == MD_CDATA_STATE) {
1493 if ($self->{nc} == {
1494 '[' => 0x0043, # C
1495 '[C' => 0x0044, # D
1496 '[CD' => 0x0041, # A
1497 '[CDA' => 0x0054, # T
1498 '[CDAT' => 0x0041, # A
1499 }->{$self->{s_kwd}}) {
1500 !!!cp (135.1);
1501 ## Stay in the state.
1502 $self->{s_kwd} .= chr $self->{nc};
1503 !!!next-input-character;
1504 redo A;
1505 } elsif ($self->{s_kwd} eq '[CDATA' and
1506 $self->{nc} == 0x005B) { # [
1507 !!!cp (135.2);
1508 $self->{ct} = {type => CHARACTER_TOKEN,
1509 data => '',
1510 line => $self->{line_prev},
1511 column => $self->{column_prev} - 7};
1512 $self->{state} = CDATA_SECTION_STATE;
1513 !!!next-input-character;
1514 redo A;
1515 } else {
1516 !!!cp (135.3);
1517 !!!parse-error (type => 'bogus comment',
1518 line => $self->{line_prev},
1519 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1520 $self->{state} = BOGUS_COMMENT_STATE;
1521 ## Reconsume.
1522 $self->{ct} = {type => COMMENT_TOKEN,
1523 data => $self->{s_kwd},
1524 line => $self->{line_prev},
1525 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1526 };
1527 redo A;
1528 }
1529 } elsif ($self->{state} == COMMENT_START_STATE) {
1530 if ($self->{nc} == 0x002D) { # -
1531 !!!cp (137);
1532 $self->{state} = COMMENT_START_DASH_STATE;
1533 !!!next-input-character;
1534 redo A;
1535 } elsif ($self->{nc} == 0x003E) { # >
1536 !!!cp (138);
1537 !!!parse-error (type => 'bogus comment');
1538 $self->{state} = DATA_STATE;
1539 !!!next-input-character;
1540
1541 !!!emit ($self->{ct}); # comment
1542
1543 redo A;
1544 } elsif ($self->{nc} == -1) {
1545 !!!cp (139);
1546 !!!parse-error (type => 'unclosed comment');
1547 $self->{state} = DATA_STATE;
1548 ## reconsume
1549
1550 !!!emit ($self->{ct}); # comment
1551
1552 redo A;
1553 } else {
1554 !!!cp (140);
1555 $self->{ct}->{data} # comment
1556 .= chr ($self->{nc});
1557 $self->{state} = COMMENT_STATE;
1558 !!!next-input-character;
1559 redo A;
1560 }
1561 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1562 if ($self->{nc} == 0x002D) { # -
1563 !!!cp (141);
1564 $self->{state} = COMMENT_END_STATE;
1565 !!!next-input-character;
1566 redo A;
1567 } elsif ($self->{nc} == 0x003E) { # >
1568 !!!cp (142);
1569 !!!parse-error (type => 'bogus comment');
1570 $self->{state} = DATA_STATE;
1571 !!!next-input-character;
1572
1573 !!!emit ($self->{ct}); # comment
1574
1575 redo A;
1576 } elsif ($self->{nc} == -1) {
1577 !!!cp (143);
1578 !!!parse-error (type => 'unclosed comment');
1579 $self->{state} = DATA_STATE;
1580 ## reconsume
1581
1582 !!!emit ($self->{ct}); # comment
1583
1584 redo A;
1585 } else {
1586 !!!cp (144);
1587 $self->{ct}->{data} # comment
1588 .= '-' . chr ($self->{nc});
1589 $self->{state} = COMMENT_STATE;
1590 !!!next-input-character;
1591 redo A;
1592 }
1593 } elsif ($self->{state} == COMMENT_STATE) {
1594 if ($self->{nc} == 0x002D) { # -
1595 !!!cp (145);
1596 $self->{state} = COMMENT_END_DASH_STATE;
1597 !!!next-input-character;
1598 redo A;
1599 } elsif ($self->{nc} == -1) {
1600 !!!cp (146);
1601 !!!parse-error (type => 'unclosed comment');
1602 $self->{state} = DATA_STATE;
1603 ## reconsume
1604
1605 !!!emit ($self->{ct}); # comment
1606
1607 redo A;
1608 } else {
1609 !!!cp (147);
1610 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1611 $self->{read_until}->($self->{ct}->{data},
1612 q[-],
1613 length $self->{ct}->{data});
1614
1615 ## Stay in the state
1616 !!!next-input-character;
1617 redo A;
1618 }
1619 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1620 if ($self->{nc} == 0x002D) { # -
1621 !!!cp (148);
1622 $self->{state} = COMMENT_END_STATE;
1623 !!!next-input-character;
1624 redo A;
1625 } elsif ($self->{nc} == -1) {
1626 !!!cp (149);
1627 !!!parse-error (type => 'unclosed comment');
1628 $self->{state} = DATA_STATE;
1629 ## reconsume
1630
1631 !!!emit ($self->{ct}); # comment
1632
1633 redo A;
1634 } else {
1635 !!!cp (150);
1636 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1637 $self->{state} = COMMENT_STATE;
1638 !!!next-input-character;
1639 redo A;
1640 }
1641 } elsif ($self->{state} == COMMENT_END_STATE) {
1642 if ($self->{nc} == 0x003E) { # >
1643 !!!cp (151);
1644 $self->{state} = DATA_STATE;
1645 !!!next-input-character;
1646
1647 !!!emit ($self->{ct}); # comment
1648
1649 redo A;
1650 } elsif ($self->{nc} == 0x002D) { # -
1651 !!!cp (152);
1652 !!!parse-error (type => 'dash in comment',
1653 line => $self->{line_prev},
1654 column => $self->{column_prev});
1655 $self->{ct}->{data} .= '-'; # comment
1656 ## Stay in the state
1657 !!!next-input-character;
1658 redo A;
1659 } elsif ($self->{nc} == -1) {
1660 !!!cp (153);
1661 !!!parse-error (type => 'unclosed comment');
1662 $self->{state} = DATA_STATE;
1663 ## reconsume
1664
1665 !!!emit ($self->{ct}); # comment
1666
1667 redo A;
1668 } else {
1669 !!!cp (154);
1670 !!!parse-error (type => 'dash in comment',
1671 line => $self->{line_prev},
1672 column => $self->{column_prev});
1673 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1674 $self->{state} = COMMENT_STATE;
1675 !!!next-input-character;
1676 redo A;
1677 }
1678 } elsif ($self->{state} == DOCTYPE_STATE) {
1679 if ($is_space->{$self->{nc}}) {
1680 !!!cp (155);
1681 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1682 !!!next-input-character;
1683 redo A;
1684 } else {
1685 !!!cp (156);
1686 !!!parse-error (type => 'no space before DOCTYPE name');
1687 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1688 ## reconsume
1689 redo A;
1690 }
1691 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1692 if ($is_space->{$self->{nc}}) {
1693 !!!cp (157);
1694 ## Stay in the state
1695 !!!next-input-character;
1696 redo A;
1697 } elsif ($self->{nc} == 0x003E) { # >
1698 !!!cp (158);
1699 !!!parse-error (type => 'no DOCTYPE name');
1700 $self->{state} = DATA_STATE;
1701 !!!next-input-character;
1702
1703 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1704
1705 redo A;
1706 } elsif ($self->{nc} == -1) {
1707 !!!cp (159);
1708 !!!parse-error (type => 'no DOCTYPE name');
1709 $self->{state} = DATA_STATE;
1710 ## reconsume
1711
1712 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1713
1714 redo A;
1715 } else {
1716 !!!cp (160);
1717 $self->{ct}->{name} = chr $self->{nc};
1718 delete $self->{ct}->{quirks};
1719 $self->{state} = DOCTYPE_NAME_STATE;
1720 !!!next-input-character;
1721 redo A;
1722 }
1723 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1724 ## ISSUE: Redundant "First," in the spec.
1725 if ($is_space->{$self->{nc}}) {
1726 !!!cp (161);
1727 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1728 !!!next-input-character;
1729 redo A;
1730 } elsif ($self->{nc} == 0x003E) { # >
1731 !!!cp (162);
1732 $self->{state} = DATA_STATE;
1733 !!!next-input-character;
1734
1735 !!!emit ($self->{ct}); # DOCTYPE
1736
1737 redo A;
1738 } elsif ($self->{nc} == -1) {
1739 !!!cp (163);
1740 !!!parse-error (type => 'unclosed DOCTYPE');
1741 $self->{state} = DATA_STATE;
1742 ## reconsume
1743
1744 $self->{ct}->{quirks} = 1;
1745 !!!emit ($self->{ct}); # DOCTYPE
1746
1747 redo A;
1748 } else {
1749 !!!cp (164);
1750 $self->{ct}->{name}
1751 .= chr ($self->{nc}); # DOCTYPE
1752 ## Stay in the state
1753 !!!next-input-character;
1754 redo A;
1755 }
1756 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1757 if ($is_space->{$self->{nc}}) {
1758 !!!cp (165);
1759 ## Stay in the state
1760 !!!next-input-character;
1761 redo A;
1762 } elsif ($self->{nc} == 0x003E) { # >
1763 !!!cp (166);
1764 $self->{state} = DATA_STATE;
1765 !!!next-input-character;
1766
1767 !!!emit ($self->{ct}); # DOCTYPE
1768
1769 redo A;
1770 } elsif ($self->{nc} == -1) {
1771 !!!cp (167);
1772 !!!parse-error (type => 'unclosed DOCTYPE');
1773 $self->{state} = DATA_STATE;
1774 ## reconsume
1775
1776 $self->{ct}->{quirks} = 1;
1777 !!!emit ($self->{ct}); # DOCTYPE
1778
1779 redo A;
1780 } elsif ($self->{nc} == 0x0050 or # P
1781 $self->{nc} == 0x0070) { # p
1782 $self->{state} = PUBLIC_STATE;
1783 $self->{s_kwd} = chr $self->{nc};
1784 !!!next-input-character;
1785 redo A;
1786 } elsif ($self->{nc} == 0x0053 or # S
1787 $self->{nc} == 0x0073) { # s
1788 $self->{state} = SYSTEM_STATE;
1789 $self->{s_kwd} = chr $self->{nc};
1790 !!!next-input-character;
1791 redo A;
1792 } else {
1793 !!!cp (180);
1794 !!!parse-error (type => 'string after DOCTYPE name');
1795 $self->{ct}->{quirks} = 1;
1796
1797 $self->{state} = BOGUS_DOCTYPE_STATE;
1798 !!!next-input-character;
1799 redo A;
1800 }
1801 } elsif ($self->{state} == PUBLIC_STATE) {
1802 ## ASCII case-insensitive
1803 if ($self->{nc} == [
1804 undef,
1805 0x0055, # U
1806 0x0042, # B
1807 0x004C, # L
1808 0x0049, # I
1809 ]->[length $self->{s_kwd}] or
1810 $self->{nc} == [
1811 undef,
1812 0x0075, # u
1813 0x0062, # b
1814 0x006C, # l
1815 0x0069, # i
1816 ]->[length $self->{s_kwd}]) {
1817 !!!cp (175);
1818 ## Stay in the state.
1819 $self->{s_kwd} .= chr $self->{nc};
1820 !!!next-input-character;
1821 redo A;
1822 } elsif ((length $self->{s_kwd}) == 5 and
1823 ($self->{nc} == 0x0043 or # C
1824 $self->{nc} == 0x0063)) { # c
1825 !!!cp (168);
1826 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1827 !!!next-input-character;
1828 redo A;
1829 } else {
1830 !!!cp (169);
1831 !!!parse-error (type => 'string after DOCTYPE name',
1832 line => $self->{line_prev},
1833 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1834 $self->{ct}->{quirks} = 1;
1835
1836 $self->{state} = BOGUS_DOCTYPE_STATE;
1837 ## Reconsume.
1838 redo A;
1839 }
1840 } elsif ($self->{state} == SYSTEM_STATE) {
1841 ## ASCII case-insensitive
1842 if ($self->{nc} == [
1843 undef,
1844 0x0059, # Y
1845 0x0053, # S
1846 0x0054, # T
1847 0x0045, # E
1848 ]->[length $self->{s_kwd}] or
1849 $self->{nc} == [
1850 undef,
1851 0x0079, # y
1852 0x0073, # s
1853 0x0074, # t
1854 0x0065, # e
1855 ]->[length $self->{s_kwd}]) {
1856 !!!cp (170);
1857 ## Stay in the state.
1858 $self->{s_kwd} .= chr $self->{nc};
1859 !!!next-input-character;
1860 redo A;
1861 } elsif ((length $self->{s_kwd}) == 5 and
1862 ($self->{nc} == 0x004D or # M
1863 $self->{nc} == 0x006D)) { # m
1864 !!!cp (171);
1865 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1866 !!!next-input-character;
1867 redo A;
1868 } else {
1869 !!!cp (172);
1870 !!!parse-error (type => 'string after DOCTYPE name',
1871 line => $self->{line_prev},
1872 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1873 $self->{ct}->{quirks} = 1;
1874
1875 $self->{state} = BOGUS_DOCTYPE_STATE;
1876 ## Reconsume.
1877 redo A;
1878 }
1879 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1880 if ($is_space->{$self->{nc}}) {
1881 !!!cp (181);
1882 ## Stay in the state
1883 !!!next-input-character;
1884 redo A;
1885 } elsif ($self->{nc} eq 0x0022) { # "
1886 !!!cp (182);
1887 $self->{ct}->{pubid} = ''; # DOCTYPE
1888 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1889 !!!next-input-character;
1890 redo A;
1891 } elsif ($self->{nc} eq 0x0027) { # '
1892 !!!cp (183);
1893 $self->{ct}->{pubid} = ''; # DOCTYPE
1894 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1895 !!!next-input-character;
1896 redo A;
1897 } elsif ($self->{nc} eq 0x003E) { # >
1898 !!!cp (184);
1899 !!!parse-error (type => 'no PUBLIC literal');
1900
1901 $self->{state} = DATA_STATE;
1902 !!!next-input-character;
1903
1904 $self->{ct}->{quirks} = 1;
1905 !!!emit ($self->{ct}); # DOCTYPE
1906
1907 redo A;
1908 } elsif ($self->{nc} == -1) {
1909 !!!cp (185);
1910 !!!parse-error (type => 'unclosed DOCTYPE');
1911
1912 $self->{state} = DATA_STATE;
1913 ## reconsume
1914
1915 $self->{ct}->{quirks} = 1;
1916 !!!emit ($self->{ct}); # DOCTYPE
1917
1918 redo A;
1919 } else {
1920 !!!cp (186);
1921 !!!parse-error (type => 'string after PUBLIC');
1922 $self->{ct}->{quirks} = 1;
1923
1924 $self->{state} = BOGUS_DOCTYPE_STATE;
1925 !!!next-input-character;
1926 redo A;
1927 }
1928 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1929 if ($self->{nc} == 0x0022) { # "
1930 !!!cp (187);
1931 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1932 !!!next-input-character;
1933 redo A;
1934 } elsif ($self->{nc} == 0x003E) { # >
1935 !!!cp (188);
1936 !!!parse-error (type => 'unclosed PUBLIC literal');
1937
1938 $self->{state} = DATA_STATE;
1939 !!!next-input-character;
1940
1941 $self->{ct}->{quirks} = 1;
1942 !!!emit ($self->{ct}); # DOCTYPE
1943
1944 redo A;
1945 } elsif ($self->{nc} == -1) {
1946 !!!cp (189);
1947 !!!parse-error (type => 'unclosed PUBLIC literal');
1948
1949 $self->{state} = DATA_STATE;
1950 ## reconsume
1951
1952 $self->{ct}->{quirks} = 1;
1953 !!!emit ($self->{ct}); # DOCTYPE
1954
1955 redo A;
1956 } else {
1957 !!!cp (190);
1958 $self->{ct}->{pubid} # DOCTYPE
1959 .= chr $self->{nc};
1960 $self->{read_until}->($self->{ct}->{pubid}, q[">],
1961 length $self->{ct}->{pubid});
1962
1963 ## Stay in the state
1964 !!!next-input-character;
1965 redo A;
1966 }
1967 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1968 if ($self->{nc} == 0x0027) { # '
1969 !!!cp (191);
1970 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1971 !!!next-input-character;
1972 redo A;
1973 } elsif ($self->{nc} == 0x003E) { # >
1974 !!!cp (192);
1975 !!!parse-error (type => 'unclosed PUBLIC literal');
1976
1977 $self->{state} = DATA_STATE;
1978 !!!next-input-character;
1979
1980 $self->{ct}->{quirks} = 1;
1981 !!!emit ($self->{ct}); # DOCTYPE
1982
1983 redo A;
1984 } elsif ($self->{nc} == -1) {
1985 !!!cp (193);
1986 !!!parse-error (type => 'unclosed PUBLIC literal');
1987
1988 $self->{state} = DATA_STATE;
1989 ## reconsume
1990
1991 $self->{ct}->{quirks} = 1;
1992 !!!emit ($self->{ct}); # DOCTYPE
1993
1994 redo A;
1995 } else {
1996 !!!cp (194);
1997 $self->{ct}->{pubid} # DOCTYPE
1998 .= chr $self->{nc};
1999 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2000 length $self->{ct}->{pubid});
2001
2002 ## Stay in the state
2003 !!!next-input-character;
2004 redo A;
2005 }
2006 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2007 if ($is_space->{$self->{nc}}) {
2008 !!!cp (195);
2009 ## Stay in the state
2010 !!!next-input-character;
2011 redo A;
2012 } elsif ($self->{nc} == 0x0022) { # "
2013 !!!cp (196);
2014 $self->{ct}->{sysid} = ''; # DOCTYPE
2015 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2016 !!!next-input-character;
2017 redo A;
2018 } elsif ($self->{nc} == 0x0027) { # '
2019 !!!cp (197);
2020 $self->{ct}->{sysid} = ''; # DOCTYPE
2021 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2022 !!!next-input-character;
2023 redo A;
2024 } elsif ($self->{nc} == 0x003E) { # >
2025 !!!cp (198);
2026 $self->{state} = DATA_STATE;
2027 !!!next-input-character;
2028
2029 !!!emit ($self->{ct}); # DOCTYPE
2030
2031 redo A;
2032 } elsif ($self->{nc} == -1) {
2033 !!!cp (199);
2034 !!!parse-error (type => 'unclosed DOCTYPE');
2035
2036 $self->{state} = DATA_STATE;
2037 ## reconsume
2038
2039 $self->{ct}->{quirks} = 1;
2040 !!!emit ($self->{ct}); # DOCTYPE
2041
2042 redo A;
2043 } else {
2044 !!!cp (200);
2045 !!!parse-error (type => 'string after PUBLIC literal');
2046 $self->{ct}->{quirks} = 1;
2047
2048 $self->{state} = BOGUS_DOCTYPE_STATE;
2049 !!!next-input-character;
2050 redo A;
2051 }
2052 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2053 if ($is_space->{$self->{nc}}) {
2054 !!!cp (201);
2055 ## Stay in the state
2056 !!!next-input-character;
2057 redo A;
2058 } elsif ($self->{nc} == 0x0022) { # "
2059 !!!cp (202);
2060 $self->{ct}->{sysid} = ''; # DOCTYPE
2061 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2062 !!!next-input-character;
2063 redo A;
2064 } elsif ($self->{nc} == 0x0027) { # '
2065 !!!cp (203);
2066 $self->{ct}->{sysid} = ''; # DOCTYPE
2067 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2068 !!!next-input-character;
2069 redo A;
2070 } elsif ($self->{nc} == 0x003E) { # >
2071 !!!cp (204);
2072 !!!parse-error (type => 'no SYSTEM literal');
2073 $self->{state} = DATA_STATE;
2074 !!!next-input-character;
2075
2076 $self->{ct}->{quirks} = 1;
2077 !!!emit ($self->{ct}); # DOCTYPE
2078
2079 redo A;
2080 } elsif ($self->{nc} == -1) {
2081 !!!cp (205);
2082 !!!parse-error (type => 'unclosed DOCTYPE');
2083
2084 $self->{state} = DATA_STATE;
2085 ## reconsume
2086
2087 $self->{ct}->{quirks} = 1;
2088 !!!emit ($self->{ct}); # DOCTYPE
2089
2090 redo A;
2091 } else {
2092 !!!cp (206);
2093 !!!parse-error (type => 'string after SYSTEM');
2094 $self->{ct}->{quirks} = 1;
2095
2096 $self->{state} = BOGUS_DOCTYPE_STATE;
2097 !!!next-input-character;
2098 redo A;
2099 }
2100 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2101 if ($self->{nc} == 0x0022) { # "
2102 !!!cp (207);
2103 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2104 !!!next-input-character;
2105 redo A;
2106 } elsif ($self->{nc} == 0x003E) { # >
2107 !!!cp (208);
2108 !!!parse-error (type => 'unclosed SYSTEM literal');
2109
2110 $self->{state} = DATA_STATE;
2111 !!!next-input-character;
2112
2113 $self->{ct}->{quirks} = 1;
2114 !!!emit ($self->{ct}); # DOCTYPE
2115
2116 redo A;
2117 } elsif ($self->{nc} == -1) {
2118 !!!cp (209);
2119 !!!parse-error (type => 'unclosed SYSTEM literal');
2120
2121 $self->{state} = DATA_STATE;
2122 ## reconsume
2123
2124 $self->{ct}->{quirks} = 1;
2125 !!!emit ($self->{ct}); # DOCTYPE
2126
2127 redo A;
2128 } else {
2129 !!!cp (210);
2130 $self->{ct}->{sysid} # DOCTYPE
2131 .= chr $self->{nc};
2132 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2133 length $self->{ct}->{sysid});
2134
2135 ## Stay in the state
2136 !!!next-input-character;
2137 redo A;
2138 }
2139 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2140 if ($self->{nc} == 0x0027) { # '
2141 !!!cp (211);
2142 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2143 !!!next-input-character;
2144 redo A;
2145 } elsif ($self->{nc} == 0x003E) { # >
2146 !!!cp (212);
2147 !!!parse-error (type => 'unclosed SYSTEM literal');
2148
2149 $self->{state} = DATA_STATE;
2150 !!!next-input-character;
2151
2152 $self->{ct}->{quirks} = 1;
2153 !!!emit ($self->{ct}); # DOCTYPE
2154
2155 redo A;
2156 } elsif ($self->{nc} == -1) {
2157 !!!cp (213);
2158 !!!parse-error (type => 'unclosed SYSTEM literal');
2159
2160 $self->{state} = DATA_STATE;
2161 ## reconsume
2162
2163 $self->{ct}->{quirks} = 1;
2164 !!!emit ($self->{ct}); # DOCTYPE
2165
2166 redo A;
2167 } else {
2168 !!!cp (214);
2169 $self->{ct}->{sysid} # DOCTYPE
2170 .= chr $self->{nc};
2171 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2172 length $self->{ct}->{sysid});
2173
2174 ## Stay in the state
2175 !!!next-input-character;
2176 redo A;
2177 }
2178 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2179 if ($is_space->{$self->{nc}}) {
2180 !!!cp (215);
2181 ## Stay in the state
2182 !!!next-input-character;
2183 redo A;
2184 } elsif ($self->{nc} == 0x003E) { # >
2185 !!!cp (216);
2186 $self->{state} = DATA_STATE;
2187 !!!next-input-character;
2188
2189 !!!emit ($self->{ct}); # DOCTYPE
2190
2191 redo A;
2192 } elsif ($self->{nc} == -1) {
2193 !!!cp (217);
2194 !!!parse-error (type => 'unclosed DOCTYPE');
2195 $self->{state} = DATA_STATE;
2196 ## reconsume
2197
2198 $self->{ct}->{quirks} = 1;
2199 !!!emit ($self->{ct}); # DOCTYPE
2200
2201 redo A;
2202 } else {
2203 !!!cp (218);
2204 !!!parse-error (type => 'string after SYSTEM literal');
2205 #$self->{ct}->{quirks} = 1;
2206
2207 $self->{state} = BOGUS_DOCTYPE_STATE;
2208 !!!next-input-character;
2209 redo A;
2210 }
2211 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2212 if ($self->{nc} == 0x003E) { # >
2213 !!!cp (219);
2214 $self->{state} = DATA_STATE;
2215 !!!next-input-character;
2216
2217 !!!emit ($self->{ct}); # DOCTYPE
2218
2219 redo A;
2220 } elsif ($self->{nc} == -1) {
2221 !!!cp (220);
2222 $self->{state} = DATA_STATE;
2223 ## reconsume
2224
2225 !!!emit ($self->{ct}); # DOCTYPE
2226
2227 redo A;
2228 } else {
2229 !!!cp (221);
2230 my $s = '';
2231 $self->{read_until}->($s, q[>], 0);
2232
2233 ## Stay in the state
2234 !!!next-input-character;
2235 redo A;
2236 }
2237 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2238 ## NOTE: "CDATA section state" in the spec is jointly implemented
2239 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2240 ## and |CDATA_SECTION_MSE2_STATE|.
2241
2242 if ($self->{nc} == 0x005D) { # ]
2243 !!!cp (221.1);
2244 $self->{state} = CDATA_SECTION_MSE1_STATE;
2245 !!!next-input-character;
2246 redo A;
2247 } elsif ($self->{nc} == -1) {
2248 $self->{state} = DATA_STATE;
2249 !!!next-input-character;
2250 if (length $self->{ct}->{data}) { # character
2251 !!!cp (221.2);
2252 !!!emit ($self->{ct}); # character
2253 } else {
2254 !!!cp (221.3);
2255 ## No token to emit. $self->{ct} is discarded.
2256 }
2257 redo A;
2258 } else {
2259 !!!cp (221.4);
2260 $self->{ct}->{data} .= chr $self->{nc};
2261 $self->{read_until}->($self->{ct}->{data},
2262 q<]>,
2263 length $self->{ct}->{data});
2264
2265 ## Stay in the state.
2266 !!!next-input-character;
2267 redo A;
2268 }
2269
2270 ## ISSUE: "text tokens" in spec.
2271 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2272 if ($self->{nc} == 0x005D) { # ]
2273 !!!cp (221.5);
2274 $self->{state} = CDATA_SECTION_MSE2_STATE;
2275 !!!next-input-character;
2276 redo A;
2277 } else {
2278 !!!cp (221.6);
2279 $self->{ct}->{data} .= ']';
2280 $self->{state} = CDATA_SECTION_STATE;
2281 ## Reconsume.
2282 redo A;
2283 }
2284 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2285 if ($self->{nc} == 0x003E) { # >
2286 $self->{state} = DATA_STATE;
2287 !!!next-input-character;
2288 if (length $self->{ct}->{data}) { # character
2289 !!!cp (221.7);
2290 !!!emit ($self->{ct}); # character
2291 } else {
2292 !!!cp (221.8);
2293 ## No token to emit. $self->{ct} is discarded.
2294 }
2295 redo A;
2296 } elsif ($self->{nc} == 0x005D) { # ]
2297 !!!cp (221.9); # character
2298 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2299 ## Stay in the state.
2300 !!!next-input-character;
2301 redo A;
2302 } else {
2303 !!!cp (221.11);
2304 $self->{ct}->{data} .= ']]'; # character
2305 $self->{state} = CDATA_SECTION_STATE;
2306 ## Reconsume.
2307 redo A;
2308 }
2309 } elsif ($self->{state} == ENTITY_STATE) {
2310 if ($is_space->{$self->{nc}} or
2311 {
2312 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2313 $self->{entity_add} => 1,
2314 }->{$self->{nc}}) {
2315 !!!cp (1001);
2316 ## Don't consume
2317 ## No error
2318 ## Return nothing.
2319 #
2320 } elsif ($self->{nc} == 0x0023) { # #
2321 !!!cp (999);
2322 $self->{state} = ENTITY_HASH_STATE;
2323 $self->{s_kwd} = '#';
2324 !!!next-input-character;
2325 redo A;
2326 } elsif ((0x0041 <= $self->{nc} and
2327 $self->{nc} <= 0x005A) or # A..Z
2328 (0x0061 <= $self->{nc} and
2329 $self->{nc} <= 0x007A)) { # a..z
2330 !!!cp (998);
2331 require Whatpm::_NamedEntityList;
2332 $self->{state} = ENTITY_NAME_STATE;
2333 $self->{s_kwd} = chr $self->{nc};
2334 $self->{entity__value} = $self->{s_kwd};
2335 $self->{entity__match} = 0;
2336 !!!next-input-character;
2337 redo A;
2338 } else {
2339 !!!cp (1027);
2340 !!!parse-error (type => 'bare ero');
2341 ## Return nothing.
2342 #
2343 }
2344
2345 ## NOTE: No character is consumed by the "consume a character
2346 ## reference" algorithm. In other words, there is an "&" character
2347 ## that does not introduce a character reference, which would be
2348 ## appended to the parent element or the attribute value in later
2349 ## processing by the tokenizer.
2350
2351 if ($self->{prev_state} == DATA_STATE) {
2352 !!!cp (997);
2353 $self->{state} = $self->{prev_state};
2354 ## Reconsume.
2355 !!!emit ({type => CHARACTER_TOKEN, data => '&',
2356 line => $self->{line_prev},
2357 column => $self->{column_prev},
2358 });
2359 redo A;
2360 } else {
2361 !!!cp (996);
2362 $self->{ca}->{value} .= '&';
2363 $self->{state} = $self->{prev_state};
2364 ## Reconsume.
2365 redo A;
2366 }
2367 } elsif ($self->{state} == ENTITY_HASH_STATE) {
2368 if ($self->{nc} == 0x0078 or # x
2369 $self->{nc} == 0x0058) { # X
2370 !!!cp (995);
2371 $self->{state} = HEXREF_X_STATE;
2372 $self->{s_kwd} .= chr $self->{nc};
2373 !!!next-input-character;
2374 redo A;
2375 } elsif (0x0030 <= $self->{nc} and
2376 $self->{nc} <= 0x0039) { # 0..9
2377 !!!cp (994);
2378 $self->{state} = NCR_NUM_STATE;
2379 $self->{s_kwd} = $self->{nc} - 0x0030;
2380 !!!next-input-character;
2381 redo A;
2382 } else {
2383 !!!parse-error (type => 'bare nero',
2384 line => $self->{line_prev},
2385 column => $self->{column_prev} - 1);
2386
2387 ## NOTE: According to the spec algorithm, nothing is returned,
2388 ## and then "&#" is appended to the parent element or the attribute
2389 ## value in the later processing.
2390
2391 if ($self->{prev_state} == DATA_STATE) {
2392 !!!cp (1019);
2393 $self->{state} = $self->{prev_state};
2394 ## Reconsume.
2395 !!!emit ({type => CHARACTER_TOKEN,
2396 data => '&#',
2397 line => $self->{line_prev},
2398 column => $self->{column_prev} - 1,
2399 });
2400 redo A;
2401 } else {
2402 !!!cp (993);
2403 $self->{ca}->{value} .= '&#';
2404 $self->{state} = $self->{prev_state};
2405 ## Reconsume.
2406 redo A;
2407 }
2408 }
2409 } elsif ($self->{state} == NCR_NUM_STATE) {
2410 if (0x0030 <= $self->{nc} and
2411 $self->{nc} <= 0x0039) { # 0..9
2412 !!!cp (1012);
2413 $self->{s_kwd} *= 10;
2414 $self->{s_kwd} += $self->{nc} - 0x0030;
2415
2416 ## Stay in the state.
2417 !!!next-input-character;
2418 redo A;
2419 } elsif ($self->{nc} == 0x003B) { # ;
2420 !!!cp (1013);
2421 !!!next-input-character;
2422 #
2423 } else {
2424 !!!cp (1014);
2425 !!!parse-error (type => 'no refc');
2426 ## Reconsume.
2427 #
2428 }
2429
2430 my $code = $self->{s_kwd};
2431 my $l = $self->{line_prev};
2432 my $c = $self->{column_prev};
2433 if ($charref_map->{$code}) {
2434 !!!cp (1015);
2435 !!!parse-error (type => 'invalid character reference',
2436 text => (sprintf 'U+%04X', $code),
2437 line => $l, column => $c);
2438 $code = $charref_map->{$code};
2439 } elsif ($code > 0x10FFFF) {
2440 !!!cp (1016);
2441 !!!parse-error (type => 'invalid character reference',
2442 text => (sprintf 'U-%08X', $code),
2443 line => $l, column => $c);
2444 $code = 0xFFFD;
2445 }
2446
2447 if ($self->{prev_state} == DATA_STATE) {
2448 !!!cp (992);
2449 $self->{state} = $self->{prev_state};
2450 ## Reconsume.
2451 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2452 line => $l, column => $c,
2453 });
2454 redo A;
2455 } else {
2456 !!!cp (991);
2457 $self->{ca}->{value} .= chr $code;
2458 $self->{ca}->{has_reference} = 1;
2459 $self->{state} = $self->{prev_state};
2460 ## Reconsume.
2461 redo A;
2462 }
2463 } elsif ($self->{state} == HEXREF_X_STATE) {
2464 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2465 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2466 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2467 # 0..9, A..F, a..f
2468 !!!cp (990);
2469 $self->{state} = HEXREF_HEX_STATE;
2470 $self->{s_kwd} = 0;
2471 ## Reconsume.
2472 redo A;
2473 } else {
2474 !!!parse-error (type => 'bare hcro',
2475 line => $self->{line_prev},
2476 column => $self->{column_prev} - 2);
2477
2478 ## NOTE: According to the spec algorithm, nothing is returned,
2479 ## and then "&#" followed by "X" or "x" is appended to the parent
2480 ## element or the attribute value in the later processing.
2481
2482 if ($self->{prev_state} == DATA_STATE) {
2483 !!!cp (1005);
2484 $self->{state} = $self->{prev_state};
2485 ## Reconsume.
2486 !!!emit ({type => CHARACTER_TOKEN,
2487 data => '&' . $self->{s_kwd},
2488 line => $self->{line_prev},
2489 column => $self->{column_prev} - length $self->{s_kwd},
2490 });
2491 redo A;
2492 } else {
2493 !!!cp (989);
2494 $self->{ca}->{value} .= '&' . $self->{s_kwd};
2495 $self->{state} = $self->{prev_state};
2496 ## Reconsume.
2497 redo A;
2498 }
2499 }
2500 } elsif ($self->{state} == HEXREF_HEX_STATE) {
2501 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2502 # 0..9
2503 !!!cp (1002);
2504 $self->{s_kwd} *= 0x10;
2505 $self->{s_kwd} += $self->{nc} - 0x0030;
2506 ## Stay in the state.
2507 !!!next-input-character;
2508 redo A;
2509 } elsif (0x0061 <= $self->{nc} and
2510 $self->{nc} <= 0x0066) { # a..f
2511 !!!cp (1003);
2512 $self->{s_kwd} *= 0x10;
2513 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2514 ## Stay in the state.
2515 !!!next-input-character;
2516 redo A;
2517 } elsif (0x0041 <= $self->{nc} and
2518 $self->{nc} <= 0x0046) { # A..F
2519 !!!cp (1004);
2520 $self->{s_kwd} *= 0x10;
2521 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2522 ## Stay in the state.
2523 !!!next-input-character;
2524 redo A;
2525 } elsif ($self->{nc} == 0x003B) { # ;
2526 !!!cp (1006);
2527 !!!next-input-character;
2528 #
2529 } else {
2530 !!!cp (1007);
2531 !!!parse-error (type => 'no refc',
2532 line => $self->{line},
2533 column => $self->{column});
2534 ## Reconsume.
2535 #
2536 }
2537
2538 my $code = $self->{s_kwd};
2539 my $l = $self->{line_prev};
2540 my $c = $self->{column_prev};
2541 if ($charref_map->{$code}) {
2542 !!!cp (1008);
2543 !!!parse-error (type => 'invalid character reference',
2544 text => (sprintf 'U+%04X', $code),
2545 line => $l, column => $c);
2546 $code = $charref_map->{$code};
2547 } elsif ($code > 0x10FFFF) {
2548 !!!cp (1009);
2549 !!!parse-error (type => 'invalid character reference',
2550 text => (sprintf 'U-%08X', $code),
2551 line => $l, column => $c);
2552 $code = 0xFFFD;
2553 }
2554
2555 if ($self->{prev_state} == DATA_STATE) {
2556 !!!cp (988);
2557 $self->{state} = $self->{prev_state};
2558 ## Reconsume.
2559 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2560 line => $l, column => $c,
2561 });
2562 redo A;
2563 } else {
2564 !!!cp (987);
2565 $self->{ca}->{value} .= chr $code;
2566 $self->{ca}->{has_reference} = 1;
2567 $self->{state} = $self->{prev_state};
2568 ## Reconsume.
2569 redo A;
2570 }
2571 } elsif ($self->{state} == ENTITY_NAME_STATE) {
2572 if (length $self->{s_kwd} < 30 and
2573 ## NOTE: Some number greater than the maximum length of entity name
2574 ((0x0041 <= $self->{nc} and # A
2575 $self->{nc} <= 0x005A) or # Z
2576 (0x0061 <= $self->{nc} and # a
2577 $self->{nc} <= 0x007A) or # z
2578 (0x0030 <= $self->{nc} and # 0
2579 $self->{nc} <= 0x0039) or # 9
2580 $self->{nc} == 0x003B)) { # ;
2581 our $EntityChar;
2582 $self->{s_kwd} .= chr $self->{nc};
2583 if (defined $EntityChar->{$self->{s_kwd}}) {
2584 if ($self->{nc} == 0x003B) { # ;
2585 !!!cp (1020);
2586 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2587 $self->{entity__match} = 1;
2588 !!!next-input-character;
2589 #
2590 } else {
2591 !!!cp (1021);
2592 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2593 $self->{entity__match} = -1;
2594 ## Stay in the state.
2595 !!!next-input-character;
2596 redo A;
2597 }
2598 } else {
2599 !!!cp (1022);
2600 $self->{entity__value} .= chr $self->{nc};
2601 $self->{entity__match} *= 2;
2602 ## Stay in the state.
2603 !!!next-input-character;
2604 redo A;
2605 }
2606 }
2607
2608 my $data;
2609 my $has_ref;
2610 if ($self->{entity__match} > 0) {
2611 !!!cp (1023);
2612 $data = $self->{entity__value};
2613 $has_ref = 1;
2614 #
2615 } elsif ($self->{entity__match} < 0) {
2616 !!!parse-error (type => 'no refc');
2617 if ($self->{prev_state} != DATA_STATE and # in attribute
2618 $self->{entity__match} < -1) {
2619 !!!cp (1024);
2620 $data = '&' . $self->{s_kwd};
2621 #
2622 } else {
2623 !!!cp (1025);
2624 $data = $self->{entity__value};
2625 $has_ref = 1;
2626 #
2627 }
2628 } else {
2629 !!!cp (1026);
2630 !!!parse-error (type => 'bare ero',
2631 line => $self->{line_prev},
2632 column => $self->{column_prev} - length $self->{s_kwd});
2633 $data = '&' . $self->{s_kwd};
2634 #
2635 }
2636
2637 ## NOTE: In these cases, when a character reference is found,
2638 ## it is consumed and a character token is returned, or, otherwise,
2639 ## nothing is consumed and returned, according to the spec algorithm.
2640 ## In this implementation, anything that has been examined by the
2641 ## tokenizer is appended to the parent element or the attribute value
2642 ## as string, either literal string when no character reference or
2643 ## entity-replaced string otherwise, in this stage, since any characters
2644 ## that would not be consumed are appended in the data state or in an
2645 ## appropriate attribute value state anyway.
2646
2647 if ($self->{prev_state} == DATA_STATE) {
2648 !!!cp (986);
2649 $self->{state} = $self->{prev_state};
2650 ## Reconsume.
2651 !!!emit ({type => CHARACTER_TOKEN,
2652 data => $data,
2653 line => $self->{line_prev},
2654 column => $self->{column_prev} + 1 - length $self->{s_kwd},
2655 });
2656 redo A;
2657 } else {
2658 !!!cp (985);
2659 $self->{ca}->{value} .= $data;
2660 $self->{ca}->{has_reference} = 1 if $has_ref;
2661 $self->{state} = $self->{prev_state};
2662 ## Reconsume.
2663 redo A;
2664 }
2665 } else {
2666 die "$0: $self->{state}: Unknown state";
2667 }
2668 } # A
2669
2670 die "$0: _get_next_token: unexpected case";
2671 } # _get_next_token
2672
2673 1;
2674 ## $Date: 2008/10/14 05:34:05 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24