/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (show annotations) (download) (as text)
Wed Oct 15 10:50:38 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.10: +101 -10 lines
File MIME type: application/x-wais-source
++ whatpm/t/xml/ChangeLog	15 Oct 2008 10:50:31 -0000
	* attrs-1.dat: Test cases for tokenizing errors are added.

	* elements-1.dat: A test result updated.

	* ns-attrs-1.dat: Test results updated.  New test cases for
	duplicate namespaced attributes are added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 10:48:03 -0000
	* Tokenizer.pm.src: Set index attribute to each attribute token,
	for ignoring namespaced duplicate attribute at the XML namespace
	parser layer.  Raise a parse error if the attribute value is
	omitted, in XML mode.  Raise a parse error if the attribute value
	is not quoted, in XML mode.  Raise a parse error if "<" character
	is found in a quoted attribute value, in XML mode.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 10:49:16 -0000
	* Parser.pm.src: Use source order to determine which attribute is
	duplicate.  Preserve duplicate namespaced attributes as
	non-namespaced attributes.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

package Whatpm::HTML::Tokenizer;
use strict;

## Module version, derived from the CVS |Revision| keyword: the first
## number is printed as is, every following number is zero-padded to
## two digits.
our $VERSION=do{my @r=(q$Revision: 1.10 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## Names of the token type constants offered for export.  The list is
  ## written once and shared by |@EXPORT_OK| (import by name) and by the
  ## |:token| tag (import all of them), so the two cannot drift apart.
  my @token_types = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
  );

  our @EXPORT_OK = @token_types;

  our %EXPORT_TAGS = (
    token => [@token_types],
  );
}

## Token types
##
## Values of the |type| field of a token.  Each constant is a sub with
## an empty prototype, so it can be used as a bareword once imported.

sub DOCTYPE_TOKEN () { 1 }
sub COMMENT_TOKEN () { 2 }
sub START_TAG_TOKEN () { 3 }
sub END_TAG_TOKEN () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN () { 6 }
sub PI_TOKEN () { 7 } # XML5
sub ABORT_TOKEN () { 8 } # Not a token actually
44
package Whatpm::HTML;

## Import the token type constants (|DOCTYPE_TOKEN| etc.) at compile
## time, so that the code below can use them as barewords.
BEGIN { Whatpm::HTML::Tokenizer->import (':token') }

## Content model flags
##
## Bit flags that say which kinds of markup are recognized in character
## data.  They are tested with |&| against |$self->{content_model}|.

sub CM_ENTITY () { 0b001 } # & markup in data
sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)

## Content models, each one a combination of the flags above.

sub PLAINTEXT_CONTENT_MODEL () { 0 } # neither "&" nor "<" is markup
sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59
## Tokenizer states
##
## Values of |$self->{state}|.  The numbers are arbitrary but must be
## distinct.  Numbers 1 and 12 are retired: the two states that used
## them are now implemented by the |ENTITY_STATE| family (see the NOTE
## further down).

sub DATA_STATE () { 0 }
#sub ENTITY_DATA_STATE () { 1 }
sub TAG_OPEN_STATE () { 2 }
sub CLOSE_TAG_OPEN_STATE () { 3 }
sub TAG_NAME_STATE () { 4 }
sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
sub ATTRIBUTE_NAME_STATE () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub COMMENT_START_STATE () { 14 }
sub COMMENT_START_DASH_STATE () { 15 }
sub COMMENT_STATE () { 16 }
sub COMMENT_END_STATE () { 17 }
sub COMMENT_END_DASH_STATE () { 18 }
sub BOGUS_COMMENT_STATE () { 19 }
sub DOCTYPE_STATE () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
sub DOCTYPE_NAME_STATE () { 22 }
sub AFTER_DOCTYPE_NAME_STATE () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
sub BOGUS_DOCTYPE_STATE () { 32 }
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
sub SELF_CLOSING_START_TAG_STATE () { 34 }
sub CDATA_SECTION_STATE () { 35 }

## The states below split a single state of the spec into several
## implementation states; the trailing comment names the spec state.
sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
## NOTE: "Entity data state", "entity in attribute value state", and
## "consume a character reference" algorithm are jointly implemented
## using the following six states:
sub ENTITY_STATE () { 44 }
sub ENTITY_HASH_STATE () { 45 }
sub NCR_NUM_STATE () { 46 }
sub HEXREF_X_STATE () { 47 }
sub HEXREF_HEX_STATE () { 48 }
sub ENTITY_NAME_STATE () { 49 }
sub PCDATA_STATE () { 50 } # "data state" in the spec

## XML states
sub PI_STATE () { 51 }
sub PI_TARGET_STATE () { 52 }
sub PI_TARGET_AFTER_STATE () { 53 }
sub PI_DATA_STATE () { 54 }
sub PI_AFTER_STATE () { 55 }
sub PI_DATA_AFTER_STATE () { 56 }
124
## Tree constructor state constants (see Whatpm::HTML for the full
## list and descriptions)
##
## NOTE(review): both constants have the same numeric value (bit 11,
## 0x800).  Presumably they belong to two different flag sets, so the
## overlap is harmless -- confirm against the full list in Whatpm::HTML.

sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
sub FOREIGN_EL () { 0b1_00000000000 }
130
## Character reference mappings
##
## |$charref_map| maps a code point to the code point that replaces it.
## Keys and values are numbers (code points), not characters.
## Presumably it is consulted when a numeric character reference is
## resolved; the code that reads it is outside this part of the file.
##
## The explicit table maps CR to LF and the range 0x80..0x9F to the
## characters that Windows-1252 assigns to those bytes (U+FFFD where
## Windows-1252 leaves the byte unassigned).

my $charref_map = {
  0x0D => 0x000A,
  0x80 => 0x20AC,
  0x81 => 0xFFFD,
  0x82 => 0x201A,
  0x83 => 0x0192,
  0x84 => 0x201E,
  0x85 => 0x2026,
  0x86 => 0x2020,
  0x87 => 0x2021,
  0x88 => 0x02C6,
  0x89 => 0x2030,
  0x8A => 0x0160,
  0x8B => 0x2039,
  0x8C => 0x0152,
  0x8D => 0xFFFD,
  0x8E => 0x017D,
  0x8F => 0xFFFD,
  0x90 => 0xFFFD,
  0x91 => 0x2018,
  0x92 => 0x2019,
  0x93 => 0x201C,
  0x94 => 0x201D,
  0x95 => 0x2022,
  0x96 => 0x2013,
  0x97 => 0x2014,
  0x98 => 0x02DC,
  0x99 => 0x2122,
  0x9A => 0x0161,
  0x9B => 0x203A,
  0x9C => 0x0153,
  0x9D => 0xFFFD,
  0x9E => 0x017E,
  0x9F => 0x0178,
}; # $charref_map

## Everything below is replaced by U+FFFD: C0 controls other than HT, LF,
## FF and CR, DEL, the surrogates, a block of noncharacters, and the last
## two code points (U+xxFFFE, U+xxFFFF) of each of the 17 planes.
##
## ISSUE: Unicode defines the noncharacter block as U+FDD0..U+FDEF; only
## U+FDD0..U+FDDF is covered here, so the upper bound may have to become
## 0xFDEF.
$charref_map->{$_} = 0xFFFD
    for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
        (map { ($_ << 16) | 0xFFFE, ($_ << 16) | 0xFFFF } 0x00..0x10);
176
177 ## Implementations MUST act as if they used the state machine in the spec.
178
## Reset the tokenizer to its initial condition.
##
## Invoked with the parser object as the only argument.  The fields
## assigned below get their starting values; fields that appear only in
## comments are not touched here.  The return value is not meaningful.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # state keyword
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character (-1 is tested as end of input)
  #$self->{next_nc}
  ## |!!!...| is a macro expanded when the .pm file is generated from
  ## this .src file.  Presumably it loads the first input character into
  ## |$self->{nc}|; its definition is outside this part of the file.
  !!!next-input-character;
  $self->{token} = []; # tokens that |_get_next_token| returns first
  # $self->{escape}
} # _initialize_tokenizer
206
207 ## A token has:
208 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
209 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
210 ## ->{name} (DOCTYPE_TOKEN)
211 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
212 ## ->{target} (PI_TOKEN)
213 ## ->{pubid} (DOCTYPE_TOKEN)
214 ## ->{sysid} (DOCTYPE_TOKEN)
215 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
216 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
217 ## ->{name}
218 ## ->{value}
219 ## ->{has_reference} == 1 or 0
220 ## ->{index}: Index of the attribute in a tag.
221 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
222 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
223 ## ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Index of the attribute added last, i.e. next attribute's index - 1.
224 ## NOTE: The "self-closing flag" is held in |$self->{self_closing}|.
225 ## The |->{self_closing}| field of a token is used to save the value of
226 ## |$self->{self_closing}| while the token is pushed back to the stack.
227
228 ## Emitted token MUST immediately be handled by the tree construction state.
229
230 ## Before each step, UA MAY check to see if either one of the scripts in
231 ## "list of scripts that will execute as soon as possible" or the first
232 ## script in the "list of scripts that will execute asynchronously",
233 ## has completed loading. If one has, then it MUST be executed
234 ## and removed from the list.
235
236 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
237 ## (This requirement was dropped from HTML5 spec, unfortunately.)
238
## Code points that the tokenizer treats as white space.  The table is
## keyed by code point and looked up with the next input character
## (|$is_space->{$self->{nc}}|); a missing key means "not a space".
## VT and CR are deliberately left out (their entries are commented out).
## NOTE(review): CR is presumably normalized before it reaches the
## tokenizer -- confirm against the input stream code.
my $is_space = {
  0x0009 => 1, # CHARACTER TABULATION (HT)
  0x000A => 1, # LINE FEED (LF)
  #0x000B => 0, # LINE TABULATION (VT)
  0x000C => 1, # FORM FEED (FF)
  #0x000D => 1, # CARRIAGE RETURN (CR)
  0x0020 => 1, # SPACE (SP)
};
247
248 sub _get_next_token ($) {
249 my $self = shift;
250
251 if ($self->{self_closing}) {
252 !!!parse-error (type => 'nestc', token => $self->{ct});
253 ## NOTE: The |self_closing| flag is only set by start tag token.
254 ## In addition, when a start tag token is emitted, it is always set to
255 ## |ct|.
256 delete $self->{self_closing};
257 }
258
259 if (@{$self->{token}}) {
260 $self->{self_closing} = $self->{token}->[0]->{self_closing};
261 return shift @{$self->{token}};
262 }
263
264 A: {
265 if ($self->{state} == PCDATA_STATE) {
266 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
267
268 if ($self->{nc} == 0x0026) { # &
269 !!!cp (0.1);
270 ## NOTE: In the spec, the tokenizer is switched to the
271 ## "entity data state". In this implementation, the tokenizer
272 ## is switched to the |ENTITY_STATE|, which is an implementation
273 ## of the "consume a character reference" algorithm.
274 $self->{entity_add} = -1;
275 $self->{prev_state} = DATA_STATE;
276 $self->{state} = ENTITY_STATE;
277 !!!next-input-character;
278 redo A;
279 } elsif ($self->{nc} == 0x003C) { # <
280 !!!cp (0.2);
281 $self->{state} = TAG_OPEN_STATE;
282 !!!next-input-character;
283 redo A;
284 } elsif ($self->{nc} == -1) {
285 !!!cp (0.3);
286 !!!emit ({type => END_OF_FILE_TOKEN,
287 line => $self->{line}, column => $self->{column}});
288 last A; ## TODO: ok?
289 } else {
290 !!!cp (0.4);
291 #
292 }
293
294 # Anything else
295 my $token = {type => CHARACTER_TOKEN,
296 data => chr $self->{nc},
297 line => $self->{line}, column => $self->{column},
298 };
299 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
300
301 ## Stay in the state.
302 !!!next-input-character;
303 !!!emit ($token);
304 redo A;
305 } elsif ($self->{state} == DATA_STATE) {
306 $self->{s_kwd} = '' unless defined $self->{s_kwd};
307 if ($self->{nc} == 0x0026) { # &
308 $self->{s_kwd} = '';
309 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
310 not $self->{escape}) {
311 !!!cp (1);
312 ## NOTE: In the spec, the tokenizer is switched to the
313 ## "entity data state". In this implementation, the tokenizer
314 ## is switched to the |ENTITY_STATE|, which is an implementation
315 ## of the "consume a character reference" algorithm.
316 $self->{entity_add} = -1;
317 $self->{prev_state} = DATA_STATE;
318 $self->{state} = ENTITY_STATE;
319 !!!next-input-character;
320 redo A;
321 } else {
322 !!!cp (2);
323 #
324 }
325 } elsif ($self->{nc} == 0x002D) { # -
326 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
327 if ($self->{s_kwd} eq '<!-') {
328 !!!cp (3);
329 $self->{escape} = 1; # unless $self->{escape};
330 $self->{s_kwd} = '--';
331 #
332 } elsif ($self->{s_kwd} eq '-') {
333 !!!cp (4);
334 $self->{s_kwd} = '--';
335 #
336 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
337 !!!cp (4.1);
338 $self->{s_kwd} .= '-';
339 #
340 } else {
341 !!!cp (5);
342 $self->{s_kwd} = '-';
343 #
344 }
345 }
346
347 #
348 } elsif ($self->{nc} == 0x0021) { # !
349 if (length $self->{s_kwd}) {
350 !!!cp (5.1);
351 $self->{s_kwd} .= '!';
352 #
353 } else {
354 !!!cp (5.2);
355 #$self->{s_kwd} = '';
356 #
357 }
358 #
359 } elsif ($self->{nc} == 0x003C) { # <
360 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
361 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
362 not $self->{escape})) {
363 !!!cp (6);
364 $self->{state} = TAG_OPEN_STATE;
365 !!!next-input-character;
366 redo A;
367 } else {
368 !!!cp (7);
369 $self->{s_kwd} = '';
370 #
371 }
372 } elsif ($self->{nc} == 0x003E) { # >
373 if ($self->{escape} and
374 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
375 if ($self->{s_kwd} eq '--') {
376 !!!cp (8);
377 delete $self->{escape};
378 #
379 } else {
380 !!!cp (9);
381 #
382 }
383 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
384 !!!cp (9.1);
385 !!!parse-error (type => 'unmatched mse', ## TODO: type
386 line => $self->{line_prev},
387 column => $self->{column_prev} - 1);
388 #
389 } else {
390 !!!cp (10);
391 #
392 }
393
394 $self->{s_kwd} = '';
395 #
396 } elsif ($self->{nc} == 0x005D) { # ]
397 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
398 !!!cp (10.1);
399 $self->{s_kwd} .= ']';
400 } elsif ($self->{s_kwd} eq ']]') {
401 !!!cp (10.2);
402 #
403 } else {
404 !!!cp (10.3);
405 $self->{s_kwd} = '';
406 }
407 #
408 } elsif ($self->{nc} == -1) {
409 !!!cp (11);
410 $self->{s_kwd} = '';
411 !!!emit ({type => END_OF_FILE_TOKEN,
412 line => $self->{line}, column => $self->{column}});
413 last A; ## TODO: ok?
414 } else {
415 !!!cp (12);
416 $self->{s_kwd} = '';
417 #
418 }
419
420 # Anything else
421 my $token = {type => CHARACTER_TOKEN,
422 data => chr $self->{nc},
423 line => $self->{line}, column => $self->{column},
424 };
425 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
426 length $token->{data})) {
427 $self->{s_kwd} = '';
428 }
429
430 ## Stay in the data state.
431 if (not $self->{is_xml} and
432 $self->{content_model} == PCDATA_CONTENT_MODEL) {
433 !!!cp (13);
434 $self->{state} = PCDATA_STATE;
435 } else {
436 !!!cp (14);
437 ## Stay in the state.
438 }
439 !!!next-input-character;
440 !!!emit ($token);
441 redo A;
442 } elsif ($self->{state} == TAG_OPEN_STATE) {
443 ## XML5: "tag state".
444
445 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
446 if ($self->{nc} == 0x002F) { # /
447 !!!cp (15);
448 !!!next-input-character;
449 $self->{state} = CLOSE_TAG_OPEN_STATE;
450 redo A;
451 } elsif ($self->{nc} == 0x0021) { # !
452 !!!cp (15.1);
453 $self->{s_kwd} = '<' unless $self->{escape};
454 #
455 } else {
456 !!!cp (16);
457 #
458 }
459
460 ## reconsume
461 $self->{state} = DATA_STATE;
462 $self->{s_kwd} = '';
463 !!!emit ({type => CHARACTER_TOKEN, data => '<',
464 line => $self->{line_prev},
465 column => $self->{column_prev},
466 });
467 redo A;
468 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
469 if ($self->{nc} == 0x0021) { # !
470 !!!cp (17);
471 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
472 !!!next-input-character;
473 redo A;
474 } elsif ($self->{nc} == 0x002F) { # /
475 !!!cp (18);
476 $self->{state} = CLOSE_TAG_OPEN_STATE;
477 !!!next-input-character;
478 redo A;
479 } elsif (0x0041 <= $self->{nc} and
480 $self->{nc} <= 0x005A) { # A..Z
481 !!!cp (19);
482 $self->{ct}
483 = {type => START_TAG_TOKEN,
484 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
485 line => $self->{line_prev},
486 column => $self->{column_prev}};
487 $self->{state} = TAG_NAME_STATE;
488 !!!next-input-character;
489 redo A;
490 } elsif (0x0061 <= $self->{nc} and
491 $self->{nc} <= 0x007A) { # a..z
492 !!!cp (20);
493 $self->{ct} = {type => START_TAG_TOKEN,
494 tag_name => chr ($self->{nc}),
495 line => $self->{line_prev},
496 column => $self->{column_prev}};
497 $self->{state} = TAG_NAME_STATE;
498 !!!next-input-character;
499 redo A;
500 } elsif ($self->{nc} == 0x003E) { # >
501 !!!cp (21);
502 !!!parse-error (type => 'empty start tag',
503 line => $self->{line_prev},
504 column => $self->{column_prev});
505 $self->{state} = DATA_STATE;
506 $self->{s_kwd} = '';
507 !!!next-input-character;
508
509 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
510 line => $self->{line_prev},
511 column => $self->{column_prev},
512 });
513
514 redo A;
515 } elsif ($self->{nc} == 0x003F) { # ?
516 if ($self->{is_xml}) {
517 !!!cp (22.1);
518 $self->{state} = PI_STATE;
519 !!!next-input-character;
520 redo A;
521 } else {
522 !!!cp (22);
523 !!!parse-error (type => 'pio',
524 line => $self->{line_prev},
525 column => $self->{column_prev});
526 $self->{state} = BOGUS_COMMENT_STATE;
527 $self->{ct} = {type => COMMENT_TOKEN, data => '',
528 line => $self->{line_prev},
529 column => $self->{column_prev},
530 };
531 ## $self->{nc} is intentionally left as is
532 redo A;
533 }
534 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
535 !!!cp (23);
536 !!!parse-error (type => 'bare stago',
537 line => $self->{line_prev},
538 column => $self->{column_prev});
539 $self->{state} = DATA_STATE;
540 $self->{s_kwd} = '';
541 ## reconsume
542
543 !!!emit ({type => CHARACTER_TOKEN, data => '<',
544 line => $self->{line_prev},
545 column => $self->{column_prev},
546 });
547
548 redo A;
549 } else {
550 ## XML5: "<:" is a parse error.
551 !!!cp (23.1);
552 $self->{ct} = {type => START_TAG_TOKEN,
553 tag_name => chr ($self->{nc}),
554 line => $self->{line_prev},
555 column => $self->{column_prev}};
556 $self->{state} = TAG_NAME_STATE;
557 !!!next-input-character;
558 redo A;
559 }
560 } else {
561 die "$0: $self->{content_model} in tag open";
562 }
563 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
564 ## NOTE: The "close tag open state" in the spec is implemented as
565 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
566
567 ## XML5: "end tag state".
568
569 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
570 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
571 if (defined $self->{last_stag_name}) {
572 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
573 $self->{s_kwd} = '';
574 ## Reconsume.
575 redo A;
576 } else {
577 ## No start tag token has ever been emitted
578 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
579 !!!cp (28);
580 $self->{state} = DATA_STATE;
581 $self->{s_kwd} = '';
582 ## Reconsume.
583 !!!emit ({type => CHARACTER_TOKEN, data => '</',
584 line => $l, column => $c,
585 });
586 redo A;
587 }
588 }
589
590 if (0x0041 <= $self->{nc} and
591 $self->{nc} <= 0x005A) { # A..Z
592 !!!cp (29);
593 $self->{ct}
594 = {type => END_TAG_TOKEN,
595 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
596 line => $l, column => $c};
597 $self->{state} = TAG_NAME_STATE;
598 !!!next-input-character;
599 redo A;
600 } elsif (0x0061 <= $self->{nc} and
601 $self->{nc} <= 0x007A) { # a..z
602 !!!cp (30);
603 $self->{ct} = {type => END_TAG_TOKEN,
604 tag_name => chr ($self->{nc}),
605 line => $l, column => $c};
606 $self->{state} = TAG_NAME_STATE;
607 !!!next-input-character;
608 redo A;
609 } elsif ($self->{nc} == 0x003E) { # >
610 !!!parse-error (type => 'empty end tag',
611 line => $self->{line_prev}, ## "<" in "</>"
612 column => $self->{column_prev} - 1);
613 $self->{state} = DATA_STATE;
614 $self->{s_kwd} = '';
615 if ($self->{is_xml}) {
616 !!!cp (31);
617 ## XML5: No parse error.
618
619 ## NOTE: This parser raises a parse error, since it supports
620 ## XML1, not XML5.
621
622 ## NOTE: A short end tag token.
623 my $ct = {type => END_TAG_TOKEN,
624 tag_name => '',
625 line => $self->{line_prev},
626 column => $self->{column_prev} - 1,
627 };
628 !!!next-input-character;
629 !!!emit ($ct);
630 } else {
631 !!!cp (31.1);
632 !!!next-input-character;
633 }
634 redo A;
635 } elsif ($self->{nc} == -1) {
636 !!!cp (32);
637 !!!parse-error (type => 'bare etago');
638 $self->{s_kwd} = '';
639 $self->{state} = DATA_STATE;
640 # reconsume
641
642 !!!emit ({type => CHARACTER_TOKEN, data => '</',
643 line => $l, column => $c,
644 });
645
646 redo A;
647 } elsif (not $self->{is_xml} or
648 $is_space->{$self->{nc}}) {
649 !!!cp (33);
650 !!!parse-error (type => 'bogus end tag',
651 line => $self->{line_prev}, # "<" of "</"
652 column => $self->{column_prev} - 1);
653 $self->{state} = BOGUS_COMMENT_STATE;
654 $self->{ct} = {type => COMMENT_TOKEN, data => '',
655 line => $self->{line_prev}, # "<" of "</"
656 column => $self->{column_prev} - 1,
657 };
658 ## NOTE: $self->{nc} is intentionally left as is.
659 ## Although the "anything else" case of the spec not explicitly
660 ## states that the next input character is to be reconsumed,
661 ## it will be included to the |data| of the comment token
662 ## generated from the bogus end tag, as defined in the
663 ## "bogus comment state" entry.
664 redo A;
665 } else {
666 ## XML5: "</:" is a parse error.
667 !!!cp (30.1);
668 $self->{ct} = {type => END_TAG_TOKEN,
669 tag_name => chr ($self->{nc}),
670 line => $l, column => $c};
671 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
672 !!!next-input-character;
673 redo A;
674 }
675 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
676 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
677 if (length $ch) {
678 my $CH = $ch;
679 $ch =~ tr/a-z/A-Z/;
680 my $nch = chr $self->{nc};
681 if ($nch eq $ch or $nch eq $CH) {
682 !!!cp (24);
683 ## Stay in the state.
684 $self->{s_kwd} .= $nch;
685 !!!next-input-character;
686 redo A;
687 } else {
688 !!!cp (25);
689 $self->{state} = DATA_STATE;
690 $self->{s_kwd} = '';
691 ## Reconsume.
692 !!!emit ({type => CHARACTER_TOKEN,
693 data => '</' . $self->{s_kwd},
694 line => $self->{line_prev},
695 column => $self->{column_prev} - 1 - length $self->{s_kwd},
696 });
697 redo A;
698 }
699 } else { # after "<{tag-name}"
700 unless ($is_space->{$self->{nc}} or
701 {
702 0x003E => 1, # >
703 0x002F => 1, # /
704 -1 => 1, # EOF
705 }->{$self->{nc}}) {
706 !!!cp (26);
707 ## Reconsume.
708 $self->{state} = DATA_STATE;
709 $self->{s_kwd} = '';
710 !!!emit ({type => CHARACTER_TOKEN,
711 data => '</' . $self->{s_kwd},
712 line => $self->{line_prev},
713 column => $self->{column_prev} - 1 - length $self->{s_kwd},
714 });
715 redo A;
716 } else {
717 !!!cp (27);
718 $self->{ct}
719 = {type => END_TAG_TOKEN,
720 tag_name => $self->{last_stag_name},
721 line => $self->{line_prev},
722 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
723 $self->{state} = TAG_NAME_STATE;
724 ## Reconsume.
725 redo A;
726 }
727 }
728 } elsif ($self->{state} == TAG_NAME_STATE) {
729 if ($is_space->{$self->{nc}}) {
730 !!!cp (34);
731 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
732 !!!next-input-character;
733 redo A;
734 } elsif ($self->{nc} == 0x003E) { # >
735 if ($self->{ct}->{type} == START_TAG_TOKEN) {
736 !!!cp (35);
737 $self->{last_stag_name} = $self->{ct}->{tag_name};
738 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
739 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
740 #if ($self->{ct}->{attributes}) {
741 # ## NOTE: This should never be reached.
742 # !!! cp (36);
743 # !!! parse-error (type => 'end tag attribute');
744 #} else {
745 !!!cp (37);
746 #}
747 } else {
748 die "$0: $self->{ct}->{type}: Unknown token type";
749 }
750 $self->{state} = DATA_STATE;
751 $self->{s_kwd} = '';
752 !!!next-input-character;
753
754 !!!emit ($self->{ct}); # start tag or end tag
755
756 redo A;
757 } elsif (0x0041 <= $self->{nc} and
758 $self->{nc} <= 0x005A) { # A..Z
759 !!!cp (38);
760 $self->{ct}->{tag_name}
761 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
762 # start tag or end tag
763 ## Stay in this state
764 !!!next-input-character;
765 redo A;
766 } elsif ($self->{nc} == -1) {
767 !!!parse-error (type => 'unclosed tag');
768 if ($self->{ct}->{type} == START_TAG_TOKEN) {
769 !!!cp (39);
770 $self->{last_stag_name} = $self->{ct}->{tag_name};
771 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
772 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
773 #if ($self->{ct}->{attributes}) {
774 # ## NOTE: This state should never be reached.
775 # !!! cp (40);
776 # !!! parse-error (type => 'end tag attribute');
777 #} else {
778 !!!cp (41);
779 #}
780 } else {
781 die "$0: $self->{ct}->{type}: Unknown token type";
782 }
783 $self->{state} = DATA_STATE;
784 $self->{s_kwd} = '';
785 # reconsume
786
787 !!!emit ($self->{ct}); # start tag or end tag
788
789 redo A;
790 } elsif ($self->{nc} == 0x002F) { # /
791 !!!cp (42);
792 $self->{state} = SELF_CLOSING_START_TAG_STATE;
793 !!!next-input-character;
794 redo A;
795 } else {
796 !!!cp (44);
797 $self->{ct}->{tag_name} .= chr $self->{nc};
798 # start tag or end tag
799 ## Stay in the state
800 !!!next-input-character;
801 redo A;
802 }
803 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
804 ## XML5: "Tag attribute name before state".
805
806 if ($is_space->{$self->{nc}}) {
807 !!!cp (45);
808 ## Stay in the state
809 !!!next-input-character;
810 redo A;
811 } elsif ($self->{nc} == 0x003E) { # >
812 if ($self->{ct}->{type} == START_TAG_TOKEN) {
813 !!!cp (46);
814 $self->{last_stag_name} = $self->{ct}->{tag_name};
815 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
816 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
817 if ($self->{ct}->{attributes}) {
818 !!!cp (47);
819 !!!parse-error (type => 'end tag attribute');
820 } else {
821 !!!cp (48);
822 }
823 } else {
824 die "$0: $self->{ct}->{type}: Unknown token type";
825 }
826 $self->{state} = DATA_STATE;
827 $self->{s_kwd} = '';
828 !!!next-input-character;
829
830 !!!emit ($self->{ct}); # start tag or end tag
831
832 redo A;
833 } elsif (0x0041 <= $self->{nc} and
834 $self->{nc} <= 0x005A) { # A..Z
835 !!!cp (49);
836 $self->{ca}
837 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
838 value => '',
839 line => $self->{line}, column => $self->{column}};
840 $self->{state} = ATTRIBUTE_NAME_STATE;
841 !!!next-input-character;
842 redo A;
843 } elsif ($self->{nc} == 0x002F) { # /
844 !!!cp (50);
845 $self->{state} = SELF_CLOSING_START_TAG_STATE;
846 !!!next-input-character;
847 redo A;
848 } elsif ($self->{nc} == -1) {
849 !!!parse-error (type => 'unclosed tag');
850 if ($self->{ct}->{type} == START_TAG_TOKEN) {
851 !!!cp (52);
852 $self->{last_stag_name} = $self->{ct}->{tag_name};
853 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
854 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
855 if ($self->{ct}->{attributes}) {
856 !!!cp (53);
857 !!!parse-error (type => 'end tag attribute');
858 } else {
859 !!!cp (54);
860 }
861 } else {
862 die "$0: $self->{ct}->{type}: Unknown token type";
863 }
864 $self->{state} = DATA_STATE;
865 $self->{s_kwd} = '';
866 # reconsume
867
868 !!!emit ($self->{ct}); # start tag or end tag
869
870 redo A;
871 } else {
872 if ({
873 0x0022 => 1, # "
874 0x0027 => 1, # '
875 0x003D => 1, # =
876 }->{$self->{nc}}) {
877 !!!cp (55);
878 ## XML5: Not a parse error.
879 !!!parse-error (type => 'bad attribute name');
880 } else {
881 !!!cp (56);
882 ## XML5: ":" raises a parse error and is ignored.
883 }
884 $self->{ca}
885 = {name => chr ($self->{nc}),
886 value => '',
887 line => $self->{line}, column => $self->{column}};
888 $self->{state} = ATTRIBUTE_NAME_STATE;
889 !!!next-input-character;
890 redo A;
891 }
892 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
893 ## XML5: "Tag attribute name state".
894
895 my $before_leave = sub {
896 if (exists $self->{ct}->{attributes} # start tag or end tag
897 ->{$self->{ca}->{name}}) { # MUST
898 !!!cp (57);
899 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
900 ## Discard $self->{ca} # MUST
901 } else {
902 !!!cp (58);
903 $self->{ct}->{attributes}->{$self->{ca}->{name}}
904 = $self->{ca};
905 $self->{ca}->{index} = ++$self->{ct}->{last_index};
906 }
907 }; # $before_leave
908
909 if ($is_space->{$self->{nc}}) {
910 !!!cp (59);
911 $before_leave->();
912 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
913 !!!next-input-character;
914 redo A;
915 } elsif ($self->{nc} == 0x003D) { # =
916 !!!cp (60);
917 $before_leave->();
918 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
919 !!!next-input-character;
920 redo A;
921 } elsif ($self->{nc} == 0x003E) { # >
922 if ($self->{is_xml}) {
923 !!!cp (60.1);
924 ## XML5: Not a parse error.
925 !!!parse-error (type => 'no attr value'); ## TODO: type
926 } else {
927 !!!cp (60.2);
928 }
929
930 $before_leave->();
931 if ($self->{ct}->{type} == START_TAG_TOKEN) {
932 !!!cp (61);
933 $self->{last_stag_name} = $self->{ct}->{tag_name};
934 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
935 !!!cp (62);
936 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
937 if ($self->{ct}->{attributes}) {
938 !!!parse-error (type => 'end tag attribute');
939 }
940 } else {
941 die "$0: $self->{ct}->{type}: Unknown token type";
942 }
943 $self->{state} = DATA_STATE;
944 $self->{s_kwd} = '';
945 !!!next-input-character;
946
947 !!!emit ($self->{ct}); # start tag or end tag
948
949 redo A;
950 } elsif (0x0041 <= $self->{nc} and
951 $self->{nc} <= 0x005A) { # A..Z
952 !!!cp (63);
953 $self->{ca}->{name}
954 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
955 ## Stay in the state
956 !!!next-input-character;
957 redo A;
958 } elsif ($self->{nc} == 0x002F) { # /
959 if ($self->{is_xml}) {
960 !!!cp (64);
961 ## XML5: Not a parse error.
962 !!!parse-error (type => 'no attr value'); ## TODO: type
963 } else {
964 !!!cp (64.1);
965 }
966
967 $before_leave->();
968 $self->{state} = SELF_CLOSING_START_TAG_STATE;
969 !!!next-input-character;
970 redo A;
971 } elsif ($self->{nc} == -1) {
972 !!!parse-error (type => 'unclosed tag');
973 $before_leave->();
974 if ($self->{ct}->{type} == START_TAG_TOKEN) {
975 !!!cp (66);
976 $self->{last_stag_name} = $self->{ct}->{tag_name};
977 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
978 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
979 if ($self->{ct}->{attributes}) {
980 !!!cp (67);
981 !!!parse-error (type => 'end tag attribute');
982 } else {
983 ## NOTE: This state should never be reached.
984 !!!cp (68);
985 }
986 } else {
987 die "$0: $self->{ct}->{type}: Unknown token type";
988 }
989 $self->{state} = DATA_STATE;
990 $self->{s_kwd} = '';
991 # reconsume
992
993 !!!emit ($self->{ct}); # start tag or end tag
994
995 redo A;
996 } else {
997 if ($self->{nc} == 0x0022 or # "
998 $self->{nc} == 0x0027) { # '
999 !!!cp (69);
1000 ## XML5: Not a parse error.
1001 !!!parse-error (type => 'bad attribute name');
1002 } else {
1003 !!!cp (70);
1004 }
1005 $self->{ca}->{name} .= chr ($self->{nc});
1006 ## Stay in the state
1007 !!!next-input-character;
1008 redo A;
1009 }
1010 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1011 ## XML5: "Tag attribute name after state".
1012
1013 if ($is_space->{$self->{nc}}) {
1014 !!!cp (71);
1015 ## Stay in the state
1016 !!!next-input-character;
1017 redo A;
1018 } elsif ($self->{nc} == 0x003D) { # =
1019 !!!cp (72);
1020 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1021 !!!next-input-character;
1022 redo A;
1023 } elsif ($self->{nc} == 0x003E) { # >
1024 if ($self->{is_xml}) {
1025 !!!cp (72.1);
1026 ## XML5: Not a parse error.
1027 !!!parse-error (type => 'no attr value'); ## TODO: type
1028 } else {
1029 !!!cp (72.2);
1030 }
1031
1032 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1033 !!!cp (73);
1034 $self->{last_stag_name} = $self->{ct}->{tag_name};
1035 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1036 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1037 if ($self->{ct}->{attributes}) {
1038 !!!cp (74);
1039 !!!parse-error (type => 'end tag attribute');
1040 } else {
1041 ## NOTE: This state should never be reached.
1042 !!!cp (75);
1043 }
1044 } else {
1045 die "$0: $self->{ct}->{type}: Unknown token type";
1046 }
1047 $self->{state} = DATA_STATE;
1048 $self->{s_kwd} = '';
1049 !!!next-input-character;
1050
1051 !!!emit ($self->{ct}); # start tag or end tag
1052
1053 redo A;
1054 } elsif (0x0041 <= $self->{nc} and
1055 $self->{nc} <= 0x005A) { # A..Z
1056 !!!cp (76);
1057 $self->{ca}
1058 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1059 value => '',
1060 line => $self->{line}, column => $self->{column}};
1061 $self->{state} = ATTRIBUTE_NAME_STATE;
1062 !!!next-input-character;
1063 redo A;
1064 } elsif ($self->{nc} == 0x002F) { # /
1065 if ($self->{is_xml}) {
1066 !!!cp (77);
1067 ## XML5: Not a parse error.
1068 !!!parse-error (type => 'no attr value'); ## TODO: type
1069 } else {
1070 !!!cp (77.1);
1071 }
1072
1073 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1074 !!!next-input-character;
1075 redo A;
1076 } elsif ($self->{nc} == -1) {
1077 !!!parse-error (type => 'unclosed tag');
1078 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1079 !!!cp (79);
1080 $self->{last_stag_name} = $self->{ct}->{tag_name};
1081 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1082 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1083 if ($self->{ct}->{attributes}) {
1084 !!!cp (80);
1085 !!!parse-error (type => 'end tag attribute');
1086 } else {
1087 ## NOTE: This state should never be reached.
1088 !!!cp (81);
1089 }
1090 } else {
1091 die "$0: $self->{ct}->{type}: Unknown token type";
1092 }
1093 $self->{s_kwd} = '';
1094 $self->{state} = DATA_STATE;
1095 # reconsume
1096
1097 !!!emit ($self->{ct}); # start tag or end tag
1098
1099 redo A;
1100 } else {
1101 if ($self->{is_xml}) {
1102 !!!cp (78.1);
1103 ## XML5: Not a parse error.
1104 !!!parse-error (type => 'no attr value'); ## TODO: type
1105 } else {
1106 !!!cp (78.2);
1107 }
1108
1109 if ($self->{nc} == 0x0022 or # "
1110 $self->{nc} == 0x0027) { # '
1111 !!!cp (78);
1112 ## XML5: Not a parse error.
1113 !!!parse-error (type => 'bad attribute name');
1114 } else {
1115 !!!cp (82);
1116 }
1117 $self->{ca}
1118 = {name => chr ($self->{nc}),
1119 value => '',
1120 line => $self->{line}, column => $self->{column}};
1121 $self->{state} = ATTRIBUTE_NAME_STATE;
1122 !!!next-input-character;
1123 redo A;
1124 }
1125 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1126 ## XML5: "Tag attribute value before state".
1127
1128 if ($is_space->{$self->{nc}}) {
1129 !!!cp (83);
1130 ## Stay in the state
1131 !!!next-input-character;
1132 redo A;
1133 } elsif ($self->{nc} == 0x0022) { # "
1134 !!!cp (84);
1135 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1136 !!!next-input-character;
1137 redo A;
1138 } elsif ($self->{nc} == 0x0026) { # &
1139 !!!cp (85);
1140 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1141 ## reconsume
1142 redo A;
1143 } elsif ($self->{nc} == 0x0027) { # '
1144 !!!cp (86);
1145 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1146 !!!next-input-character;
1147 redo A;
1148 } elsif ($self->{nc} == 0x003E) { # >
1149 !!!parse-error (type => 'empty unquoted attribute value');
1150 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1151 !!!cp (87);
1152 $self->{last_stag_name} = $self->{ct}->{tag_name};
1153 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1154 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1155 if ($self->{ct}->{attributes}) {
1156 !!!cp (88);
1157 !!!parse-error (type => 'end tag attribute');
1158 } else {
1159 ## NOTE: This state should never be reached.
1160 !!!cp (89);
1161 }
1162 } else {
1163 die "$0: $self->{ct}->{type}: Unknown token type";
1164 }
1165 $self->{state} = DATA_STATE;
1166 $self->{s_kwd} = '';
1167 !!!next-input-character;
1168
1169 !!!emit ($self->{ct}); # start tag or end tag
1170
1171 redo A;
1172 } elsif ($self->{nc} == -1) {
1173 !!!parse-error (type => 'unclosed tag');
1174 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1175 !!!cp (90);
1176 $self->{last_stag_name} = $self->{ct}->{tag_name};
1177 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1178 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1179 if ($self->{ct}->{attributes}) {
1180 !!!cp (91);
1181 !!!parse-error (type => 'end tag attribute');
1182 } else {
1183 ## NOTE: This state should never be reached.
1184 !!!cp (92);
1185 }
1186 } else {
1187 die "$0: $self->{ct}->{type}: Unknown token type";
1188 }
1189 $self->{state} = DATA_STATE;
1190 $self->{s_kwd} = '';
1191 ## reconsume
1192
1193 !!!emit ($self->{ct}); # start tag or end tag
1194
1195 redo A;
1196 } else {
1197 if ($self->{nc} == 0x003D) { # =
1198 !!!cp (93);
1199 ## XML5: Not a parse error.
1200 !!!parse-error (type => 'bad attribute value');
1201 } elsif ($self->{is_xml}) {
1202 !!!cp (93.1);
1203 ## XML5: No parse error.
1204 !!!parse-error (type => 'unquoted attr value'); ## TODO
1205 } else {
1206 !!!cp (94);
1207 }
1208 $self->{ca}->{value} .= chr ($self->{nc});
1209 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1210 !!!next-input-character;
1211 redo A;
1212 }
1213 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1214 ## XML5: "Tag attribute value double quoted state".
1215
1216 if ($self->{nc} == 0x0022) { # "
1217 !!!cp (95);
1218 ## XML5: "Tag attribute name before state".
1219 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1220 !!!next-input-character;
1221 redo A;
1222 } elsif ($self->{nc} == 0x0026) { # &
1223 !!!cp (96);
1224 ## XML5: Not defined yet.
1225
1226 ## NOTE: In the spec, the tokenizer is switched to the
1227 ## "entity in attribute value state". In this implementation, the
1228 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1229 ## implementation of the "consume a character reference" algorithm.
1230 $self->{prev_state} = $self->{state};
1231 $self->{entity_add} = 0x0022; # "
1232 $self->{state} = ENTITY_STATE;
1233 !!!next-input-character;
1234 redo A;
1235 } elsif ($self->{nc} == -1) {
1236 !!!parse-error (type => 'unclosed attribute value');
1237 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1238 !!!cp (97);
1239 $self->{last_stag_name} = $self->{ct}->{tag_name};
1240 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1241 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1242 if ($self->{ct}->{attributes}) {
1243 !!!cp (98);
1244 !!!parse-error (type => 'end tag attribute');
1245 } else {
1246 ## NOTE: This state should never be reached.
1247 !!!cp (99);
1248 }
1249 } else {
1250 die "$0: $self->{ct}->{type}: Unknown token type";
1251 }
1252 $self->{state} = DATA_STATE;
1253 $self->{s_kwd} = '';
1254 ## reconsume
1255
1256 !!!emit ($self->{ct}); # start tag or end tag
1257
1258 redo A;
1259 } else {
1260 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1261 !!!cp (100);
1262 ## XML5: Not a parse error.
1263 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1264 } else {
1265 !!!cp (100.1);
1266 }
1267 $self->{ca}->{value} .= chr ($self->{nc});
1268 $self->{read_until}->($self->{ca}->{value},
1269 q["&<],
1270 length $self->{ca}->{value});
1271
1272 ## Stay in the state
1273 !!!next-input-character;
1274 redo A;
1275 }
1276 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1277 ## XML5: "Tag attribute value single quoted state".
1278
1279 if ($self->{nc} == 0x0027) { # '
1280 !!!cp (101);
1281 ## XML5: "Before attribute name state" (sic).
1282 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1283 !!!next-input-character;
1284 redo A;
1285 } elsif ($self->{nc} == 0x0026) { # &
1286 !!!cp (102);
1287 ## XML5: Not defined yet.
1288
1289 ## NOTE: In the spec, the tokenizer is switched to the
1290 ## "entity in attribute value state". In this implementation, the
1291 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1292 ## implementation of the "consume a character reference" algorithm.
1293 $self->{entity_add} = 0x0027; # '
1294 $self->{prev_state} = $self->{state};
1295 $self->{state} = ENTITY_STATE;
1296 !!!next-input-character;
1297 redo A;
1298 } elsif ($self->{nc} == -1) {
1299 !!!parse-error (type => 'unclosed attribute value');
1300 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1301 !!!cp (103);
1302 $self->{last_stag_name} = $self->{ct}->{tag_name};
1303 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1304 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1305 if ($self->{ct}->{attributes}) {
1306 !!!cp (104);
1307 !!!parse-error (type => 'end tag attribute');
1308 } else {
1309 ## NOTE: This state should never be reached.
1310 !!!cp (105);
1311 }
1312 } else {
1313 die "$0: $self->{ct}->{type}: Unknown token type";
1314 }
1315 $self->{state} = DATA_STATE;
1316 $self->{s_kwd} = '';
1317 ## reconsume
1318
1319 !!!emit ($self->{ct}); # start tag or end tag
1320
1321 redo A;
1322 } else {
1323 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1324 !!!cp (106);
1325 ## XML5: Not a parse error.
1326 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1327 } else {
1328 !!!cp (106.1);
1329 }
1330 $self->{ca}->{value} .= chr ($self->{nc});
1331 $self->{read_until}->($self->{ca}->{value},
1332 q['&<],
1333 length $self->{ca}->{value});
1334
1335 ## Stay in the state
1336 !!!next-input-character;
1337 redo A;
1338 }
1339 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1340 ## XML5: "Tag attribute value unquoted state".
1341
1342 if ($is_space->{$self->{nc}}) {
1343 !!!cp (107);
1344 ## XML5: "Tag attribute name before state".
1345 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1346 !!!next-input-character;
1347 redo A;
1348 } elsif ($self->{nc} == 0x0026) { # &
1349 !!!cp (108);
1350
1351 ## XML5: Not defined yet.
1352
1353 ## NOTE: In the spec, the tokenizer is switched to the
1354 ## "entity in attribute value state". In this implementation, the
1355 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1356 ## implementation of the "consume a character reference" algorithm.
1357 $self->{entity_add} = -1;
1358 $self->{prev_state} = $self->{state};
1359 $self->{state} = ENTITY_STATE;
1360 !!!next-input-character;
1361 redo A;
1362 } elsif ($self->{nc} == 0x003E) { # >
1363 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1364 !!!cp (109);
1365 $self->{last_stag_name} = $self->{ct}->{tag_name};
1366 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1367 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1368 if ($self->{ct}->{attributes}) {
1369 !!!cp (110);
1370 !!!parse-error (type => 'end tag attribute');
1371 } else {
1372 ## NOTE: This state should never be reached.
1373 !!!cp (111);
1374 }
1375 } else {
1376 die "$0: $self->{ct}->{type}: Unknown token type";
1377 }
1378 $self->{state} = DATA_STATE;
1379 $self->{s_kwd} = '';
1380 !!!next-input-character;
1381
1382 !!!emit ($self->{ct}); # start tag or end tag
1383
1384 redo A;
1385 } elsif ($self->{nc} == -1) {
1386 !!!parse-error (type => 'unclosed tag');
1387 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1388 !!!cp (112);
1389 $self->{last_stag_name} = $self->{ct}->{tag_name};
1390 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1391 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1392 if ($self->{ct}->{attributes}) {
1393 !!!cp (113);
1394 !!!parse-error (type => 'end tag attribute');
1395 } else {
1396 ## NOTE: This state should never be reached.
1397 !!!cp (114);
1398 }
1399 } else {
1400 die "$0: $self->{ct}->{type}: Unknown token type";
1401 }
1402 $self->{state} = DATA_STATE;
1403 $self->{s_kwd} = '';
1404 ## reconsume
1405
1406 !!!emit ($self->{ct}); # start tag or end tag
1407
1408 redo A;
1409 } else {
1410 if ({
1411 0x0022 => 1, # "
1412 0x0027 => 1, # '
1413 0x003D => 1, # =
1414 }->{$self->{nc}}) {
1415 !!!cp (115);
1416 ## XML5: Not a parse error.
1417 !!!parse-error (type => 'bad attribute value');
1418 } else {
1419 !!!cp (116);
1420 }
1421 $self->{ca}->{value} .= chr ($self->{nc});
1422 $self->{read_until}->($self->{ca}->{value},
1423 q["'=& >],
1424 length $self->{ca}->{value});
1425
1426 ## Stay in the state
1427 !!!next-input-character;
1428 redo A;
1429 }
1430 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1431 if ($is_space->{$self->{nc}}) {
1432 !!!cp (118);
1433 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1434 !!!next-input-character;
1435 redo A;
1436 } elsif ($self->{nc} == 0x003E) { # >
1437 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1438 !!!cp (119);
1439 $self->{last_stag_name} = $self->{ct}->{tag_name};
1440 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1441 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1442 if ($self->{ct}->{attributes}) {
1443 !!!cp (120);
1444 !!!parse-error (type => 'end tag attribute');
1445 } else {
1446 ## NOTE: This state should never be reached.
1447 !!!cp (121);
1448 }
1449 } else {
1450 die "$0: $self->{ct}->{type}: Unknown token type";
1451 }
1452 $self->{state} = DATA_STATE;
1453 $self->{s_kwd} = '';
1454 !!!next-input-character;
1455
1456 !!!emit ($self->{ct}); # start tag or end tag
1457
1458 redo A;
1459 } elsif ($self->{nc} == 0x002F) { # /
1460 !!!cp (122);
1461 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1462 !!!next-input-character;
1463 redo A;
1464 } elsif ($self->{nc} == -1) {
1465 !!!parse-error (type => 'unclosed tag');
1466 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1467 !!!cp (122.3);
1468 $self->{last_stag_name} = $self->{ct}->{tag_name};
1469 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1470 if ($self->{ct}->{attributes}) {
1471 !!!cp (122.1);
1472 !!!parse-error (type => 'end tag attribute');
1473 } else {
1474 ## NOTE: This state should never be reached.
1475 !!!cp (122.2);
1476 }
1477 } else {
1478 die "$0: $self->{ct}->{type}: Unknown token type";
1479 }
1480 $self->{state} = DATA_STATE;
1481 $self->{s_kwd} = '';
1482 ## Reconsume.
1483 !!!emit ($self->{ct}); # start tag or end tag
1484 redo A;
1485 } else {
1486 !!!cp ('124.1');
1487 !!!parse-error (type => 'no space between attributes');
1488 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1489 ## reconsume
1490 redo A;
1491 }
1492 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1493 ## XML5: "Empty tag state".
1494
1495 if ($self->{nc} == 0x003E) { # >
1496 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1497 !!!cp ('124.2');
1498 !!!parse-error (type => 'nestc', token => $self->{ct});
1499 ## TODO: Different type than slash in start tag
1500 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1501 if ($self->{ct}->{attributes}) {
1502 !!!cp ('124.4');
1503 !!!parse-error (type => 'end tag attribute');
1504 } else {
1505 !!!cp ('124.5');
1506 }
1507 ## TODO: Test |<title></title/>|
1508 } else {
1509 !!!cp ('124.3');
1510 $self->{self_closing} = 1;
1511 }
1512
1513 $self->{state} = DATA_STATE;
1514 $self->{s_kwd} = '';
1515 !!!next-input-character;
1516
1517 !!!emit ($self->{ct}); # start tag or end tag
1518
1519 redo A;
1520 } elsif ($self->{nc} == -1) {
1521 !!!parse-error (type => 'unclosed tag');
1522 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1523 !!!cp (124.7);
1524 $self->{last_stag_name} = $self->{ct}->{tag_name};
1525 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1526 if ($self->{ct}->{attributes}) {
1527 !!!cp (124.5);
1528 !!!parse-error (type => 'end tag attribute');
1529 } else {
1530 ## NOTE: This state should never be reached.
1531 !!!cp (124.6);
1532 }
1533 } else {
1534 die "$0: $self->{ct}->{type}: Unknown token type";
1535 }
1536 ## XML5: "Tag attribute name before state".
1537 $self->{state} = DATA_STATE;
1538 $self->{s_kwd} = '';
1539 ## Reconsume.
1540 !!!emit ($self->{ct}); # start tag or end tag
1541 redo A;
1542 } else {
1543 !!!cp ('124.4');
1544 !!!parse-error (type => 'nestc');
1545 ## TODO: This error type is wrong.
1546 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1547 ## Reconsume.
1548 redo A;
1549 }
1550 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1551 ## (only happen if PCDATA state)
1552
1553 ## NOTE: Unlike spec's "bogus comment state", this implementation
1554 ## consumes characters one-by-one basis.
1555
1556 if ($self->{nc} == 0x003E) { # >
1557 !!!cp (124);
1558 $self->{state} = DATA_STATE;
1559 $self->{s_kwd} = '';
1560 !!!next-input-character;
1561
1562 !!!emit ($self->{ct}); # comment
1563 redo A;
1564 } elsif ($self->{nc} == -1) {
1565 !!!cp (125);
1566 $self->{state} = DATA_STATE;
1567 $self->{s_kwd} = '';
1568 ## reconsume
1569
1570 !!!emit ($self->{ct}); # comment
1571 redo A;
1572 } else {
1573 !!!cp (126);
1574 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1575 $self->{read_until}->($self->{ct}->{data},
1576 q[>],
1577 length $self->{ct}->{data});
1578
1579 ## Stay in the state.
1580 !!!next-input-character;
1581 redo A;
1582 }
1583 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1584 ## (only happen if PCDATA state)
1585
1586 if ($self->{nc} == 0x002D) { # -
1587 !!!cp (133);
1588 $self->{state} = MD_HYPHEN_STATE;
1589 !!!next-input-character;
1590 redo A;
1591 } elsif ($self->{nc} == 0x0044 or # D
1592 $self->{nc} == 0x0064) { # d
1593 ## ASCII case-insensitive.
1594 !!!cp (130);
1595 $self->{state} = MD_DOCTYPE_STATE;
1596 $self->{s_kwd} = chr $self->{nc};
1597 !!!next-input-character;
1598 redo A;
1599 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1600 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1601 $self->{is_xml}) and
1602 $self->{nc} == 0x005B) { # [
1603 !!!cp (135.4);
1604 $self->{state} = MD_CDATA_STATE;
1605 $self->{s_kwd} = '[';
1606 !!!next-input-character;
1607 redo A;
1608 } else {
1609 !!!cp (136);
1610 }
1611
1612 !!!parse-error (type => 'bogus comment',
1613 line => $self->{line_prev},
1614 column => $self->{column_prev} - 1);
1615 ## Reconsume.
1616 $self->{state} = BOGUS_COMMENT_STATE;
1617 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1618 line => $self->{line_prev},
1619 column => $self->{column_prev} - 1,
1620 };
1621 redo A;
1622 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1623 if ($self->{nc} == 0x002D) { # -
1624 !!!cp (127);
1625 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1626 line => $self->{line_prev},
1627 column => $self->{column_prev} - 2,
1628 };
1629 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1630 !!!next-input-character;
1631 redo A;
1632 } else {
1633 !!!cp (128);
1634 !!!parse-error (type => 'bogus comment',
1635 line => $self->{line_prev},
1636 column => $self->{column_prev} - 2);
1637 $self->{state} = BOGUS_COMMENT_STATE;
1638 ## Reconsume.
1639 $self->{ct} = {type => COMMENT_TOKEN,
1640 data => '-',
1641 line => $self->{line_prev},
1642 column => $self->{column_prev} - 2,
1643 };
1644 redo A;
1645 }
1646 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1647 ## ASCII case-insensitive.
1648 if ($self->{nc} == [
1649 undef,
1650 0x004F, # O
1651 0x0043, # C
1652 0x0054, # T
1653 0x0059, # Y
1654 0x0050, # P
1655 ]->[length $self->{s_kwd}] or
1656 $self->{nc} == [
1657 undef,
1658 0x006F, # o
1659 0x0063, # c
1660 0x0074, # t
1661 0x0079, # y
1662 0x0070, # p
1663 ]->[length $self->{s_kwd}]) {
1664 !!!cp (131);
1665 ## Stay in the state.
1666 $self->{s_kwd} .= chr $self->{nc};
1667 !!!next-input-character;
1668 redo A;
1669 } elsif ((length $self->{s_kwd}) == 6 and
1670 ($self->{nc} == 0x0045 or # E
1671 $self->{nc} == 0x0065)) { # e
1672 if ($self->{s_kwd} ne 'DOCTYP') {
1673 !!!cp (129);
1674 ## XML5: case-sensitive.
1675 !!!parse-error (type => 'lowercase keyword', ## TODO
1676 text => 'DOCTYPE',
1677 line => $self->{line_prev},
1678 column => $self->{column_prev} - 5);
1679 } else {
1680 !!!cp (129.1);
1681 }
1682 $self->{state} = DOCTYPE_STATE;
1683 $self->{ct} = {type => DOCTYPE_TOKEN,
1684 quirks => 1,
1685 line => $self->{line_prev},
1686 column => $self->{column_prev} - 7,
1687 };
1688 !!!next-input-character;
1689 redo A;
1690 } else {
1691 !!!cp (132);
1692 !!!parse-error (type => 'bogus comment',
1693 line => $self->{line_prev},
1694 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1695 $self->{state} = BOGUS_COMMENT_STATE;
1696 ## Reconsume.
1697 $self->{ct} = {type => COMMENT_TOKEN,
1698 data => $self->{s_kwd},
1699 line => $self->{line_prev},
1700 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1701 };
1702 redo A;
1703 }
1704 } elsif ($self->{state} == MD_CDATA_STATE) {
1705 if ($self->{nc} == {
1706 '[' => 0x0043, # C
1707 '[C' => 0x0044, # D
1708 '[CD' => 0x0041, # A
1709 '[CDA' => 0x0054, # T
1710 '[CDAT' => 0x0041, # A
1711 }->{$self->{s_kwd}}) {
1712 !!!cp (135.1);
1713 ## Stay in the state.
1714 $self->{s_kwd} .= chr $self->{nc};
1715 !!!next-input-character;
1716 redo A;
1717 } elsif ($self->{s_kwd} eq '[CDATA' and
1718 $self->{nc} == 0x005B) { # [
1719 if ($self->{is_xml} and
1720 not $self->{tainted} and
1721 @{$self->{open_elements} or []} == 0) {
1722 !!!cp (135.2);
1723 !!!parse-error (type => 'cdata outside of root element',
1724 line => $self->{line_prev},
1725 column => $self->{column_prev} - 7);
1726 $self->{tainted} = 1;
1727 } else {
1728 !!!cp (135.21);
1729 }
1730
1731 $self->{ct} = {type => CHARACTER_TOKEN,
1732 data => '',
1733 line => $self->{line_prev},
1734 column => $self->{column_prev} - 7};
1735 $self->{state} = CDATA_SECTION_STATE;
1736 !!!next-input-character;
1737 redo A;
1738 } else {
1739 !!!cp (135.3);
1740 !!!parse-error (type => 'bogus comment',
1741 line => $self->{line_prev},
1742 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1743 $self->{state} = BOGUS_COMMENT_STATE;
1744 ## Reconsume.
1745 $self->{ct} = {type => COMMENT_TOKEN,
1746 data => $self->{s_kwd},
1747 line => $self->{line_prev},
1748 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1749 };
1750 redo A;
1751 }
1752 } elsif ($self->{state} == COMMENT_START_STATE) {
1753 if ($self->{nc} == 0x002D) { # -
1754 !!!cp (137);
1755 $self->{state} = COMMENT_START_DASH_STATE;
1756 !!!next-input-character;
1757 redo A;
1758 } elsif ($self->{nc} == 0x003E) { # >
1759 !!!cp (138);
1760 !!!parse-error (type => 'bogus comment');
1761 $self->{state} = DATA_STATE;
1762 $self->{s_kwd} = '';
1763 !!!next-input-character;
1764
1765 !!!emit ($self->{ct}); # comment
1766
1767 redo A;
1768 } elsif ($self->{nc} == -1) {
1769 !!!cp (139);
1770 !!!parse-error (type => 'unclosed comment');
1771 $self->{state} = DATA_STATE;
1772 $self->{s_kwd} = '';
1773 ## reconsume
1774
1775 !!!emit ($self->{ct}); # comment
1776
1777 redo A;
1778 } else {
1779 !!!cp (140);
1780 $self->{ct}->{data} # comment
1781 .= chr ($self->{nc});
1782 $self->{state} = COMMENT_STATE;
1783 !!!next-input-character;
1784 redo A;
1785 }
1786 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1787 if ($self->{nc} == 0x002D) { # -
1788 !!!cp (141);
1789 $self->{state} = COMMENT_END_STATE;
1790 !!!next-input-character;
1791 redo A;
1792 } elsif ($self->{nc} == 0x003E) { # >
1793 !!!cp (142);
1794 !!!parse-error (type => 'bogus comment');
1795 $self->{state} = DATA_STATE;
1796 $self->{s_kwd} = '';
1797 !!!next-input-character;
1798
1799 !!!emit ($self->{ct}); # comment
1800
1801 redo A;
1802 } elsif ($self->{nc} == -1) {
1803 !!!cp (143);
1804 !!!parse-error (type => 'unclosed comment');
1805 $self->{state} = DATA_STATE;
1806 $self->{s_kwd} = '';
1807 ## reconsume
1808
1809 !!!emit ($self->{ct}); # comment
1810
1811 redo A;
1812 } else {
1813 !!!cp (144);
1814 $self->{ct}->{data} # comment
1815 .= '-' . chr ($self->{nc});
1816 $self->{state} = COMMENT_STATE;
1817 !!!next-input-character;
1818 redo A;
1819 }
1820 } elsif ($self->{state} == COMMENT_STATE) {
1821 if ($self->{nc} == 0x002D) { # -
1822 !!!cp (145);
1823 $self->{state} = COMMENT_END_DASH_STATE;
1824 !!!next-input-character;
1825 redo A;
1826 } elsif ($self->{nc} == -1) {
1827 !!!cp (146);
1828 !!!parse-error (type => 'unclosed comment');
1829 $self->{state} = DATA_STATE;
1830 $self->{s_kwd} = '';
1831 ## reconsume
1832
1833 !!!emit ($self->{ct}); # comment
1834
1835 redo A;
1836 } else {
1837 !!!cp (147);
1838 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1839 $self->{read_until}->($self->{ct}->{data},
1840 q[-],
1841 length $self->{ct}->{data});
1842
1843 ## Stay in the state
1844 !!!next-input-character;
1845 redo A;
1846 }
1847 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1848 ## XML5: "comment dash state".
1849
1850 if ($self->{nc} == 0x002D) { # -
1851 !!!cp (148);
1852 $self->{state} = COMMENT_END_STATE;
1853 !!!next-input-character;
1854 redo A;
1855 } elsif ($self->{nc} == -1) {
1856 !!!cp (149);
1857 !!!parse-error (type => 'unclosed comment');
1858 $self->{s_kwd} = '';
1859 $self->{state} = DATA_STATE;
1860 $self->{s_kwd} = '';
1861 ## reconsume
1862
1863 !!!emit ($self->{ct}); # comment
1864
1865 redo A;
1866 } else {
1867 !!!cp (150);
1868 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1869 $self->{state} = COMMENT_STATE;
1870 !!!next-input-character;
1871 redo A;
1872 }
1873 } elsif ($self->{state} == COMMENT_END_STATE) {
1874 if ($self->{nc} == 0x003E) { # >
1875 !!!cp (151);
1876 $self->{state} = DATA_STATE;
1877 $self->{s_kwd} = '';
1878 !!!next-input-character;
1879
1880 !!!emit ($self->{ct}); # comment
1881
1882 redo A;
1883 } elsif ($self->{nc} == 0x002D) { # -
1884 !!!cp (152);
1885 ## XML5: Not a parse error.
1886 !!!parse-error (type => 'dash in comment',
1887 line => $self->{line_prev},
1888 column => $self->{column_prev});
1889 $self->{ct}->{data} .= '-'; # comment
1890 ## Stay in the state
1891 !!!next-input-character;
1892 redo A;
1893 } elsif ($self->{nc} == -1) {
1894 !!!cp (153);
1895 !!!parse-error (type => 'unclosed comment');
1896 $self->{state} = DATA_STATE;
1897 $self->{s_kwd} = '';
1898 ## reconsume
1899
1900 !!!emit ($self->{ct}); # comment
1901
1902 redo A;
1903 } else {
1904 !!!cp (154);
1905 ## XML5: Not a parse error.
1906 !!!parse-error (type => 'dash in comment',
1907 line => $self->{line_prev},
1908 column => $self->{column_prev});
1909 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1910 $self->{state} = COMMENT_STATE;
1911 !!!next-input-character;
1912 redo A;
1913 }
1914 } elsif ($self->{state} == DOCTYPE_STATE) {
1915 if ($is_space->{$self->{nc}}) {
1916 !!!cp (155);
1917 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1918 !!!next-input-character;
1919 redo A;
1920 } else {
1921 !!!cp (156);
1922 !!!parse-error (type => 'no space before DOCTYPE name');
1923 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1924 ## reconsume
1925 redo A;
1926 }
1927 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1928 if ($is_space->{$self->{nc}}) {
1929 !!!cp (157);
1930 ## Stay in the state
1931 !!!next-input-character;
1932 redo A;
1933 } elsif ($self->{nc} == 0x003E) { # >
1934 !!!cp (158);
1935 !!!parse-error (type => 'no DOCTYPE name');
1936 $self->{state} = DATA_STATE;
1937 $self->{s_kwd} = '';
1938 !!!next-input-character;
1939
1940 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1941
1942 redo A;
1943 } elsif ($self->{nc} == -1) {
1944 !!!cp (159);
1945 !!!parse-error (type => 'no DOCTYPE name');
1946 $self->{state} = DATA_STATE;
1947 $self->{s_kwd} = '';
1948 ## reconsume
1949
1950 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1951
1952 redo A;
1953 } else {
1954 !!!cp (160);
1955 $self->{ct}->{name} = chr $self->{nc};
1956 delete $self->{ct}->{quirks};
1957 $self->{state} = DOCTYPE_NAME_STATE;
1958 !!!next-input-character;
1959 redo A;
1960 }
1961 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1962 ## ISSUE: Redundant "First," in the spec.
1963 if ($is_space->{$self->{nc}}) {
1964 !!!cp (161);
1965 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1966 !!!next-input-character;
1967 redo A;
1968 } elsif ($self->{nc} == 0x003E) { # >
1969 !!!cp (162);
1970 $self->{state} = DATA_STATE;
1971 $self->{s_kwd} = '';
1972 !!!next-input-character;
1973
1974 !!!emit ($self->{ct}); # DOCTYPE
1975
1976 redo A;
1977 } elsif ($self->{nc} == -1) {
1978 !!!cp (163);
1979 !!!parse-error (type => 'unclosed DOCTYPE');
1980 $self->{state} = DATA_STATE;
1981 $self->{s_kwd} = '';
1982 ## reconsume
1983
1984 $self->{ct}->{quirks} = 1;
1985 !!!emit ($self->{ct}); # DOCTYPE
1986
1987 redo A;
1988 } else {
1989 !!!cp (164);
1990 $self->{ct}->{name}
1991 .= chr ($self->{nc}); # DOCTYPE
1992 ## Stay in the state
1993 !!!next-input-character;
1994 redo A;
1995 }
1996 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1997 if ($is_space->{$self->{nc}}) {
1998 !!!cp (165);
1999 ## Stay in the state
2000 !!!next-input-character;
2001 redo A;
2002 } elsif ($self->{nc} == 0x003E) { # >
2003 !!!cp (166);
2004 $self->{state} = DATA_STATE;
2005 $self->{s_kwd} = '';
2006 !!!next-input-character;
2007
2008 !!!emit ($self->{ct}); # DOCTYPE
2009
2010 redo A;
2011 } elsif ($self->{nc} == -1) {
2012 !!!cp (167);
2013 !!!parse-error (type => 'unclosed DOCTYPE');
2014 $self->{state} = DATA_STATE;
2015 $self->{s_kwd} = '';
2016 ## reconsume
2017
2018 $self->{ct}->{quirks} = 1;
2019 !!!emit ($self->{ct}); # DOCTYPE
2020
2021 redo A;
2022 } elsif ($self->{nc} == 0x0050 or # P
2023 $self->{nc} == 0x0070) { # p
2024 $self->{state} = PUBLIC_STATE;
2025 $self->{s_kwd} = chr $self->{nc};
2026 !!!next-input-character;
2027 redo A;
2028 } elsif ($self->{nc} == 0x0053 or # S
2029 $self->{nc} == 0x0073) { # s
2030 $self->{state} = SYSTEM_STATE;
2031 $self->{s_kwd} = chr $self->{nc};
2032 !!!next-input-character;
2033 redo A;
2034 } else {
2035 !!!cp (180);
2036 !!!parse-error (type => 'string after DOCTYPE name');
2037 $self->{ct}->{quirks} = 1;
2038
2039 $self->{state} = BOGUS_DOCTYPE_STATE;
2040 !!!next-input-character;
2041 redo A;
2042 }
2043 } elsif ($self->{state} == PUBLIC_STATE) {
2044 ## ASCII case-insensitive
2045 if ($self->{nc} == [
2046 undef,
2047 0x0055, # U
2048 0x0042, # B
2049 0x004C, # L
2050 0x0049, # I
2051 ]->[length $self->{s_kwd}] or
2052 $self->{nc} == [
2053 undef,
2054 0x0075, # u
2055 0x0062, # b
2056 0x006C, # l
2057 0x0069, # i
2058 ]->[length $self->{s_kwd}]) {
2059 !!!cp (175);
2060 ## Stay in the state.
2061 $self->{s_kwd} .= chr $self->{nc};
2062 !!!next-input-character;
2063 redo A;
2064 } elsif ((length $self->{s_kwd}) == 5 and
2065 ($self->{nc} == 0x0043 or # C
2066 $self->{nc} == 0x0063)) { # c
2067 !!!cp (168);
2068 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2069 !!!next-input-character;
2070 redo A;
2071 } else {
2072 !!!cp (169);
2073 !!!parse-error (type => 'string after DOCTYPE name',
2074 line => $self->{line_prev},
2075 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2076 $self->{ct}->{quirks} = 1;
2077
2078 $self->{state} = BOGUS_DOCTYPE_STATE;
2079 ## Reconsume.
2080 redo A;
2081 }
2082 } elsif ($self->{state} == SYSTEM_STATE) {
2083 ## ASCII case-insensitive
2084 if ($self->{nc} == [
2085 undef,
2086 0x0059, # Y
2087 0x0053, # S
2088 0x0054, # T
2089 0x0045, # E
2090 ]->[length $self->{s_kwd}] or
2091 $self->{nc} == [
2092 undef,
2093 0x0079, # y
2094 0x0073, # s
2095 0x0074, # t
2096 0x0065, # e
2097 ]->[length $self->{s_kwd}]) {
2098 !!!cp (170);
2099 ## Stay in the state.
2100 $self->{s_kwd} .= chr $self->{nc};
2101 !!!next-input-character;
2102 redo A;
2103 } elsif ((length $self->{s_kwd}) == 5 and
2104 ($self->{nc} == 0x004D or # M
2105 $self->{nc} == 0x006D)) { # m
2106 !!!cp (171);
2107 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2108 !!!next-input-character;
2109 redo A;
2110 } else {
2111 !!!cp (172);
2112 !!!parse-error (type => 'string after DOCTYPE name',
2113 line => $self->{line_prev},
2114 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2115 $self->{ct}->{quirks} = 1;
2116
2117 $self->{state} = BOGUS_DOCTYPE_STATE;
2118 ## Reconsume.
2119 redo A;
2120 }
2121 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2122 if ($is_space->{$self->{nc}}) {
2123 !!!cp (181);
2124 ## Stay in the state
2125 !!!next-input-character;
2126 redo A;
2127 } elsif ($self->{nc} eq 0x0022) { # "
2128 !!!cp (182);
2129 $self->{ct}->{pubid} = ''; # DOCTYPE
2130 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2131 !!!next-input-character;
2132 redo A;
2133 } elsif ($self->{nc} eq 0x0027) { # '
2134 !!!cp (183);
2135 $self->{ct}->{pubid} = ''; # DOCTYPE
2136 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2137 !!!next-input-character;
2138 redo A;
2139 } elsif ($self->{nc} eq 0x003E) { # >
2140 !!!cp (184);
2141 !!!parse-error (type => 'no PUBLIC literal');
2142
2143 $self->{state} = DATA_STATE;
2144 $self->{s_kwd} = '';
2145 !!!next-input-character;
2146
2147 $self->{ct}->{quirks} = 1;
2148 !!!emit ($self->{ct}); # DOCTYPE
2149
2150 redo A;
2151 } elsif ($self->{nc} == -1) {
2152 !!!cp (185);
2153 !!!parse-error (type => 'unclosed DOCTYPE');
2154
2155 $self->{state} = DATA_STATE;
2156 $self->{s_kwd} = '';
2157 ## reconsume
2158
2159 $self->{ct}->{quirks} = 1;
2160 !!!emit ($self->{ct}); # DOCTYPE
2161
2162 redo A;
2163 } else {
2164 !!!cp (186);
2165 !!!parse-error (type => 'string after PUBLIC');
2166 $self->{ct}->{quirks} = 1;
2167
2168 $self->{state} = BOGUS_DOCTYPE_STATE;
2169 !!!next-input-character;
2170 redo A;
2171 }
2172 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2173 if ($self->{nc} == 0x0022) { # "
2174 !!!cp (187);
2175 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2176 !!!next-input-character;
2177 redo A;
2178 } elsif ($self->{nc} == 0x003E) { # >
2179 !!!cp (188);
2180 !!!parse-error (type => 'unclosed PUBLIC literal');
2181
2182 $self->{state} = DATA_STATE;
2183 $self->{s_kwd} = '';
2184 !!!next-input-character;
2185
2186 $self->{ct}->{quirks} = 1;
2187 !!!emit ($self->{ct}); # DOCTYPE
2188
2189 redo A;
2190 } elsif ($self->{nc} == -1) {
2191 !!!cp (189);
2192 !!!parse-error (type => 'unclosed PUBLIC literal');
2193
2194 $self->{state} = DATA_STATE;
2195 $self->{s_kwd} = '';
2196 ## reconsume
2197
2198 $self->{ct}->{quirks} = 1;
2199 !!!emit ($self->{ct}); # DOCTYPE
2200
2201 redo A;
2202 } else {
2203 !!!cp (190);
2204 $self->{ct}->{pubid} # DOCTYPE
2205 .= chr $self->{nc};
2206 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2207 length $self->{ct}->{pubid});
2208
2209 ## Stay in the state
2210 !!!next-input-character;
2211 redo A;
2212 }
2213 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2214 if ($self->{nc} == 0x0027) { # '
2215 !!!cp (191);
2216 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2217 !!!next-input-character;
2218 redo A;
2219 } elsif ($self->{nc} == 0x003E) { # >
2220 !!!cp (192);
2221 !!!parse-error (type => 'unclosed PUBLIC literal');
2222
2223 $self->{state} = DATA_STATE;
2224 $self->{s_kwd} = '';
2225 !!!next-input-character;
2226
2227 $self->{ct}->{quirks} = 1;
2228 !!!emit ($self->{ct}); # DOCTYPE
2229
2230 redo A;
2231 } elsif ($self->{nc} == -1) {
2232 !!!cp (193);
2233 !!!parse-error (type => 'unclosed PUBLIC literal');
2234
2235 $self->{state} = DATA_STATE;
2236 $self->{s_kwd} = '';
2237 ## reconsume
2238
2239 $self->{ct}->{quirks} = 1;
2240 !!!emit ($self->{ct}); # DOCTYPE
2241
2242 redo A;
2243 } else {
2244 !!!cp (194);
2245 $self->{ct}->{pubid} # DOCTYPE
2246 .= chr $self->{nc};
2247 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2248 length $self->{ct}->{pubid});
2249
2250 ## Stay in the state
2251 !!!next-input-character;
2252 redo A;
2253 }
2254 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2255 if ($is_space->{$self->{nc}}) {
2256 !!!cp (195);
2257 ## Stay in the state
2258 !!!next-input-character;
2259 redo A;
2260 } elsif ($self->{nc} == 0x0022) { # "
2261 !!!cp (196);
2262 $self->{ct}->{sysid} = ''; # DOCTYPE
2263 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2264 !!!next-input-character;
2265 redo A;
2266 } elsif ($self->{nc} == 0x0027) { # '
2267 !!!cp (197);
2268 $self->{ct}->{sysid} = ''; # DOCTYPE
2269 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2270 !!!next-input-character;
2271 redo A;
2272 } elsif ($self->{nc} == 0x003E) { # >
2273 !!!cp (198);
2274 $self->{state} = DATA_STATE;
2275 $self->{s_kwd} = '';
2276 !!!next-input-character;
2277
2278 !!!emit ($self->{ct}); # DOCTYPE
2279
2280 redo A;
2281 } elsif ($self->{nc} == -1) {
2282 !!!cp (199);
2283 !!!parse-error (type => 'unclosed DOCTYPE');
2284
2285 $self->{state} = DATA_STATE;
2286 $self->{s_kwd} = '';
2287 ## reconsume
2288
2289 $self->{ct}->{quirks} = 1;
2290 !!!emit ($self->{ct}); # DOCTYPE
2291
2292 redo A;
2293 } else {
2294 !!!cp (200);
2295 !!!parse-error (type => 'string after PUBLIC literal');
2296 $self->{ct}->{quirks} = 1;
2297
2298 $self->{state} = BOGUS_DOCTYPE_STATE;
2299 !!!next-input-character;
2300 redo A;
2301 }
2302 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2303 if ($is_space->{$self->{nc}}) {
2304 !!!cp (201);
2305 ## Stay in the state
2306 !!!next-input-character;
2307 redo A;
2308 } elsif ($self->{nc} == 0x0022) { # "
2309 !!!cp (202);
2310 $self->{ct}->{sysid} = ''; # DOCTYPE
2311 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2312 !!!next-input-character;
2313 redo A;
2314 } elsif ($self->{nc} == 0x0027) { # '
2315 !!!cp (203);
2316 $self->{ct}->{sysid} = ''; # DOCTYPE
2317 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2318 !!!next-input-character;
2319 redo A;
2320 } elsif ($self->{nc} == 0x003E) { # >
2321 !!!cp (204);
2322 !!!parse-error (type => 'no SYSTEM literal');
2323 $self->{state} = DATA_STATE;
2324 $self->{s_kwd} = '';
2325 !!!next-input-character;
2326
2327 $self->{ct}->{quirks} = 1;
2328 !!!emit ($self->{ct}); # DOCTYPE
2329
2330 redo A;
2331 } elsif ($self->{nc} == -1) {
2332 !!!cp (205);
2333 !!!parse-error (type => 'unclosed DOCTYPE');
2334
2335 $self->{state} = DATA_STATE;
2336 $self->{s_kwd} = '';
2337 ## reconsume
2338
2339 $self->{ct}->{quirks} = 1;
2340 !!!emit ($self->{ct}); # DOCTYPE
2341
2342 redo A;
2343 } else {
2344 !!!cp (206);
2345 !!!parse-error (type => 'string after SYSTEM');
2346 $self->{ct}->{quirks} = 1;
2347
2348 $self->{state} = BOGUS_DOCTYPE_STATE;
2349 !!!next-input-character;
2350 redo A;
2351 }
2352 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2353 if ($self->{nc} == 0x0022) { # "
2354 !!!cp (207);
2355 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2356 !!!next-input-character;
2357 redo A;
2358 } elsif ($self->{nc} == 0x003E) { # >
2359 !!!cp (208);
2360 !!!parse-error (type => 'unclosed SYSTEM literal');
2361
2362 $self->{state} = DATA_STATE;
2363 $self->{s_kwd} = '';
2364 !!!next-input-character;
2365
2366 $self->{ct}->{quirks} = 1;
2367 !!!emit ($self->{ct}); # DOCTYPE
2368
2369 redo A;
2370 } elsif ($self->{nc} == -1) {
2371 !!!cp (209);
2372 !!!parse-error (type => 'unclosed SYSTEM literal');
2373
2374 $self->{state} = DATA_STATE;
2375 $self->{s_kwd} = '';
2376 ## reconsume
2377
2378 $self->{ct}->{quirks} = 1;
2379 !!!emit ($self->{ct}); # DOCTYPE
2380
2381 redo A;
2382 } else {
2383 !!!cp (210);
2384 $self->{ct}->{sysid} # DOCTYPE
2385 .= chr $self->{nc};
2386 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2387 length $self->{ct}->{sysid});
2388
2389 ## Stay in the state
2390 !!!next-input-character;
2391 redo A;
2392 }
2393 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2394 if ($self->{nc} == 0x0027) { # '
2395 !!!cp (211);
2396 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2397 !!!next-input-character;
2398 redo A;
2399 } elsif ($self->{nc} == 0x003E) { # >
2400 !!!cp (212);
2401 !!!parse-error (type => 'unclosed SYSTEM literal');
2402
2403 $self->{state} = DATA_STATE;
2404 $self->{s_kwd} = '';
2405 !!!next-input-character;
2406
2407 $self->{ct}->{quirks} = 1;
2408 !!!emit ($self->{ct}); # DOCTYPE
2409
2410 redo A;
2411 } elsif ($self->{nc} == -1) {
2412 !!!cp (213);
2413 !!!parse-error (type => 'unclosed SYSTEM literal');
2414
2415 $self->{state} = DATA_STATE;
2416 $self->{s_kwd} = '';
2417 ## reconsume
2418
2419 $self->{ct}->{quirks} = 1;
2420 !!!emit ($self->{ct}); # DOCTYPE
2421
2422 redo A;
2423 } else {
2424 !!!cp (214);
2425 $self->{ct}->{sysid} # DOCTYPE
2426 .= chr $self->{nc};
2427 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2428 length $self->{ct}->{sysid});
2429
2430 ## Stay in the state
2431 !!!next-input-character;
2432 redo A;
2433 }
2434 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2435 if ($is_space->{$self->{nc}}) {
2436 !!!cp (215);
2437 ## Stay in the state
2438 !!!next-input-character;
2439 redo A;
2440 } elsif ($self->{nc} == 0x003E) { # >
2441 !!!cp (216);
2442 $self->{state} = DATA_STATE;
2443 $self->{s_kwd} = '';
2444 !!!next-input-character;
2445
2446 !!!emit ($self->{ct}); # DOCTYPE
2447
2448 redo A;
2449 } elsif ($self->{nc} == -1) {
2450 !!!cp (217);
2451 !!!parse-error (type => 'unclosed DOCTYPE');
2452 $self->{state} = DATA_STATE;
2453 $self->{s_kwd} = '';
2454 ## reconsume
2455
2456 $self->{ct}->{quirks} = 1;
2457 !!!emit ($self->{ct}); # DOCTYPE
2458
2459 redo A;
2460 } else {
2461 !!!cp (218);
2462 !!!parse-error (type => 'string after SYSTEM literal');
2463 #$self->{ct}->{quirks} = 1;
2464
2465 $self->{state} = BOGUS_DOCTYPE_STATE;
2466 !!!next-input-character;
2467 redo A;
2468 }
2469 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2470 if ($self->{nc} == 0x003E) { # >
2471 !!!cp (219);
2472 $self->{state} = DATA_STATE;
2473 $self->{s_kwd} = '';
2474 !!!next-input-character;
2475
2476 !!!emit ($self->{ct}); # DOCTYPE
2477
2478 redo A;
2479 } elsif ($self->{nc} == -1) {
2480 !!!cp (220);
2481 $self->{state} = DATA_STATE;
2482 $self->{s_kwd} = '';
2483 ## reconsume
2484
2485 !!!emit ($self->{ct}); # DOCTYPE
2486
2487 redo A;
2488 } else {
2489 !!!cp (221);
2490 my $s = '';
2491 $self->{read_until}->($s, q[>], 0);
2492
2493 ## Stay in the state
2494 !!!next-input-character;
2495 redo A;
2496 }
2497 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2498 ## NOTE: "CDATA section state" in the spec is jointly implemented
2499 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2500 ## and |CDATA_SECTION_MSE2_STATE|.
2501
2502 ## XML5: "CDATA state".
2503
2504 if ($self->{nc} == 0x005D) { # ]
2505 !!!cp (221.1);
2506 $self->{state} = CDATA_SECTION_MSE1_STATE;
2507 !!!next-input-character;
2508 redo A;
2509 } elsif ($self->{nc} == -1) {
2510 if ($self->{is_xml}) {
2511 !!!cp (221.11);
2512 !!!parse-error (type => 'no mse'); ## TODO: type
2513 } else {
2514 !!!cp (221.12);
2515 }
2516
2517 $self->{state} = DATA_STATE;
2518 $self->{s_kwd} = '';
2519 ## Reconsume.
2520 if (length $self->{ct}->{data}) { # character
2521 !!!cp (221.2);
2522 !!!emit ($self->{ct}); # character
2523 } else {
2524 !!!cp (221.3);
2525 ## No token to emit. $self->{ct} is discarded.
2526 }
2527 redo A;
2528 } else {
2529 !!!cp (221.4);
2530 $self->{ct}->{data} .= chr $self->{nc};
2531 $self->{read_until}->($self->{ct}->{data},
2532 q<]>,
2533 length $self->{ct}->{data});
2534
2535 ## Stay in the state.
2536 !!!next-input-character;
2537 redo A;
2538 }
2539
2540 ## ISSUE: "text tokens" in spec.
2541 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2542 ## XML5: "CDATA bracket state".
2543
2544 if ($self->{nc} == 0x005D) { # ]
2545 !!!cp (221.5);
2546 $self->{state} = CDATA_SECTION_MSE2_STATE;
2547 !!!next-input-character;
2548 redo A;
2549 } else {
2550 !!!cp (221.6);
2551 ## XML5: If EOF, "]" is not appended and changed to the data state.
2552 $self->{ct}->{data} .= ']';
2553 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2554 ## Reconsume.
2555 redo A;
2556 }
2557 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2558 ## XML5: "CDATA end state".
2559
2560 if ($self->{nc} == 0x003E) { # >
2561 $self->{state} = DATA_STATE;
2562 $self->{s_kwd} = '';
2563 !!!next-input-character;
2564 if (length $self->{ct}->{data}) { # character
2565 !!!cp (221.7);
2566 !!!emit ($self->{ct}); # character
2567 } else {
2568 !!!cp (221.8);
2569 ## No token to emit. $self->{ct} is discarded.
2570 }
2571 redo A;
2572 } elsif ($self->{nc} == 0x005D) { # ]
2573 !!!cp (221.9); # character
2574 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2575 ## Stay in the state.
2576 !!!next-input-character;
2577 redo A;
2578 } else {
2579 !!!cp (221.11);
2580 $self->{ct}->{data} .= ']]'; # character
2581 $self->{state} = CDATA_SECTION_STATE;
2582 ## Reconsume. ## XML5: Emit.
2583 redo A;
2584 }
2585 } elsif ($self->{state} == ENTITY_STATE) {
2586 if ($is_space->{$self->{nc}} or
2587 {
2588 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2589 $self->{entity_add} => 1,
2590 }->{$self->{nc}}) {
2591 !!!cp (1001);
2592 ## Don't consume
2593 ## No error
2594 ## Return nothing.
2595 #
2596 } elsif ($self->{nc} == 0x0023) { # #
2597 !!!cp (999);
2598 $self->{state} = ENTITY_HASH_STATE;
2599 $self->{s_kwd} = '#';
2600 !!!next-input-character;
2601 redo A;
2602 } elsif ((0x0041 <= $self->{nc} and
2603 $self->{nc} <= 0x005A) or # A..Z
2604 (0x0061 <= $self->{nc} and
2605 $self->{nc} <= 0x007A)) { # a..z
2606 !!!cp (998);
2607 require Whatpm::_NamedEntityList;
2608 $self->{state} = ENTITY_NAME_STATE;
2609 $self->{s_kwd} = chr $self->{nc};
2610 $self->{entity__value} = $self->{s_kwd};
2611 $self->{entity__match} = 0;
2612 !!!next-input-character;
2613 redo A;
2614 } else {
2615 !!!cp (1027);
2616 !!!parse-error (type => 'bare ero');
2617 ## Return nothing.
2618 #
2619 }
2620
2621 ## NOTE: No character is consumed by the "consume a character
2622 ## reference" algorithm. In other words, there is an "&" character
2623 ## that does not introduce a character reference, which would be
2624 ## appended to the parent element or the attribute value in later
2625 ## process of the tokenizer.
2626
2627 if ($self->{prev_state} == DATA_STATE) {
2628 !!!cp (997);
2629 $self->{state} = $self->{prev_state};
2630 $self->{s_kwd} = '';
2631 ## Reconsume.
2632 !!!emit ({type => CHARACTER_TOKEN, data => '&',
2633 line => $self->{line_prev},
2634 column => $self->{column_prev},
2635 });
2636 redo A;
2637 } else {
2638 !!!cp (996);
2639 $self->{ca}->{value} .= '&';
2640 $self->{state} = $self->{prev_state};
2641 $self->{s_kwd} = '';
2642 ## Reconsume.
2643 redo A;
2644 }
2645 } elsif ($self->{state} == ENTITY_HASH_STATE) {
2646 if ($self->{nc} == 0x0078 or # x
2647 $self->{nc} == 0x0058) { # X
2648 !!!cp (995);
2649 $self->{state} = HEXREF_X_STATE;
2650 $self->{s_kwd} .= chr $self->{nc};
2651 !!!next-input-character;
2652 redo A;
2653 } elsif (0x0030 <= $self->{nc} and
2654 $self->{nc} <= 0x0039) { # 0..9
2655 !!!cp (994);
2656 $self->{state} = NCR_NUM_STATE;
2657 $self->{s_kwd} = $self->{nc} - 0x0030;
2658 !!!next-input-character;
2659 redo A;
2660 } else {
2661 !!!parse-error (type => 'bare nero',
2662 line => $self->{line_prev},
2663 column => $self->{column_prev} - 1);
2664
2665 ## NOTE: According to the spec algorithm, nothing is returned,
2666 ## and then "&#" is appended to the parent element or the attribute
2667 ## value in the later processing.
2668
2669 if ($self->{prev_state} == DATA_STATE) {
2670 !!!cp (1019);
2671 $self->{state} = $self->{prev_state};
2672 $self->{s_kwd} = '';
2673 ## Reconsume.
2674 !!!emit ({type => CHARACTER_TOKEN,
2675 data => '&#',
2676 line => $self->{line_prev},
2677 column => $self->{column_prev} - 1,
2678 });
2679 redo A;
2680 } else {
2681 !!!cp (993);
2682 $self->{ca}->{value} .= '&#';
2683 $self->{state} = $self->{prev_state};
2684 $self->{s_kwd} = '';
2685 ## Reconsume.
2686 redo A;
2687 }
2688 }
2689 } elsif ($self->{state} == NCR_NUM_STATE) {
2690 if (0x0030 <= $self->{nc} and
2691 $self->{nc} <= 0x0039) { # 0..9
2692 !!!cp (1012);
2693 $self->{s_kwd} *= 10;
2694 $self->{s_kwd} += $self->{nc} - 0x0030;
2695
2696 ## Stay in the state.
2697 !!!next-input-character;
2698 redo A;
2699 } elsif ($self->{nc} == 0x003B) { # ;
2700 !!!cp (1013);
2701 !!!next-input-character;
2702 #
2703 } else {
2704 !!!cp (1014);
2705 !!!parse-error (type => 'no refc');
2706 ## Reconsume.
2707 #
2708 }
2709
2710 my $code = $self->{s_kwd};
2711 my $l = $self->{line_prev};
2712 my $c = $self->{column_prev};
2713 if ($charref_map->{$code}) {
2714 !!!cp (1015);
2715 !!!parse-error (type => 'invalid character reference',
2716 text => (sprintf 'U+%04X', $code),
2717 line => $l, column => $c);
2718 $code = $charref_map->{$code};
2719 } elsif ($code > 0x10FFFF) {
2720 !!!cp (1016);
2721 !!!parse-error (type => 'invalid character reference',
2722 text => (sprintf 'U-%08X', $code),
2723 line => $l, column => $c);
2724 $code = 0xFFFD;
2725 }
2726
2727 if ($self->{prev_state} == DATA_STATE) {
2728 !!!cp (992);
2729 $self->{state} = $self->{prev_state};
2730 $self->{s_kwd} = '';
2731 ## Reconsume.
2732 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2733 has_reference => 1,
2734 line => $l, column => $c,
2735 });
2736 redo A;
2737 } else {
2738 !!!cp (991);
2739 $self->{ca}->{value} .= chr $code;
2740 $self->{ca}->{has_reference} = 1;
2741 $self->{state} = $self->{prev_state};
2742 $self->{s_kwd} = '';
2743 ## Reconsume.
2744 redo A;
2745 }
2746 } elsif ($self->{state} == HEXREF_X_STATE) {
2747 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2748 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2749 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2750 # 0..9, A..F, a..f
2751 !!!cp (990);
2752 $self->{state} = HEXREF_HEX_STATE;
2753 $self->{s_kwd} = 0;
2754 ## Reconsume.
2755 redo A;
2756 } else {
2757 !!!parse-error (type => 'bare hcro',
2758 line => $self->{line_prev},
2759 column => $self->{column_prev} - 2);
2760
2761 ## NOTE: According to the spec algorithm, nothing is returned,
2762 ## and then "&#" followed by "X" or "x" is appended to the parent
2763 ## element or the attribute value in the later processing.
2764
2765 if ($self->{prev_state} == DATA_STATE) {
2766 !!!cp (1005);
2767 $self->{state} = $self->{prev_state};
2768 $self->{s_kwd} = '';
2769 ## Reconsume.
2770 !!!emit ({type => CHARACTER_TOKEN,
2771 data => '&' . $self->{s_kwd},
2772 line => $self->{line_prev},
2773 column => $self->{column_prev} - length $self->{s_kwd},
2774 });
2775 redo A;
2776 } else {
2777 !!!cp (989);
2778 $self->{ca}->{value} .= '&' . $self->{s_kwd};
2779 $self->{state} = $self->{prev_state};
2780 $self->{s_kwd} = '';
2781 ## Reconsume.
2782 redo A;
2783 }
2784 }
2785 } elsif ($self->{state} == HEXREF_HEX_STATE) {
2786 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2787 # 0..9
2788 !!!cp (1002);
2789 $self->{s_kwd} *= 0x10;
2790 $self->{s_kwd} += $self->{nc} - 0x0030;
2791 ## Stay in the state.
2792 !!!next-input-character;
2793 redo A;
2794 } elsif (0x0061 <= $self->{nc} and
2795 $self->{nc} <= 0x0066) { # a..f
2796 !!!cp (1003);
2797 $self->{s_kwd} *= 0x10;
2798 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2799 ## Stay in the state.
2800 !!!next-input-character;
2801 redo A;
2802 } elsif (0x0041 <= $self->{nc} and
2803 $self->{nc} <= 0x0046) { # A..F
2804 !!!cp (1004);
2805 $self->{s_kwd} *= 0x10;
2806 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2807 ## Stay in the state.
2808 !!!next-input-character;
2809 redo A;
2810 } elsif ($self->{nc} == 0x003B) { # ;
2811 !!!cp (1006);
2812 !!!next-input-character;
2813 #
2814 } else {
2815 !!!cp (1007);
2816 !!!parse-error (type => 'no refc',
2817 line => $self->{line},
2818 column => $self->{column});
2819 ## Reconsume.
2820 #
2821 }
2822
2823 my $code = $self->{s_kwd};
2824 my $l = $self->{line_prev};
2825 my $c = $self->{column_prev};
2826 if ($charref_map->{$code}) {
2827 !!!cp (1008);
2828 !!!parse-error (type => 'invalid character reference',
2829 text => (sprintf 'U+%04X', $code),
2830 line => $l, column => $c);
2831 $code = $charref_map->{$code};
2832 } elsif ($code > 0x10FFFF) {
2833 !!!cp (1009);
2834 !!!parse-error (type => 'invalid character reference',
2835 text => (sprintf 'U-%08X', $code),
2836 line => $l, column => $c);
2837 $code = 0xFFFD;
2838 }
2839
2840 if ($self->{prev_state} == DATA_STATE) {
2841 !!!cp (988);
2842 $self->{state} = $self->{prev_state};
2843 $self->{s_kwd} = '';
2844 ## Reconsume.
2845 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2846 has_reference => 1,
2847 line => $l, column => $c,
2848 });
2849 redo A;
2850 } else {
2851 !!!cp (987);
2852 $self->{ca}->{value} .= chr $code;
2853 $self->{ca}->{has_reference} = 1;
2854 $self->{state} = $self->{prev_state};
2855 $self->{s_kwd} = '';
2856 ## Reconsume.
2857 redo A;
2858 }
2859 } elsif ($self->{state} == ENTITY_NAME_STATE) {
2860 if (length $self->{s_kwd} < 30 and
2861 ## NOTE: Some number greater than the maximum length of entity name
2862 ((0x0041 <= $self->{nc} and # a
2863 $self->{nc} <= 0x005A) or # x
2864 (0x0061 <= $self->{nc} and # a
2865 $self->{nc} <= 0x007A) or # z
2866 (0x0030 <= $self->{nc} and # 0
2867 $self->{nc} <= 0x0039) or # 9
2868 $self->{nc} == 0x003B)) { # ;
2869 our $EntityChar;
2870 $self->{s_kwd} .= chr $self->{nc};
2871 if (defined $EntityChar->{$self->{s_kwd}}) {
2872 if ($self->{nc} == 0x003B) { # ;
2873 !!!cp (1020);
2874 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2875 $self->{entity__match} = 1;
2876 !!!next-input-character;
2877 #
2878 } else {
2879 !!!cp (1021);
2880 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2881 $self->{entity__match} = -1;
2882 ## Stay in the state.
2883 !!!next-input-character;
2884 redo A;
2885 }
2886 } else {
2887 !!!cp (1022);
2888 $self->{entity__value} .= chr $self->{nc};
2889 $self->{entity__match} *= 2;
2890 ## Stay in the state.
2891 !!!next-input-character;
2892 redo A;
2893 }
2894 }
2895
2896 my $data;
2897 my $has_ref;
2898 if ($self->{entity__match} > 0) {
2899 !!!cp (1023);
2900 $data = $self->{entity__value};
2901 $has_ref = 1;
2902 #
2903 } elsif ($self->{entity__match} < 0) {
2904 !!!parse-error (type => 'no refc');
2905 if ($self->{prev_state} != DATA_STATE and # in attribute
2906 $self->{entity__match} < -1) {
2907 !!!cp (1024);
2908 $data = '&' . $self->{s_kwd};
2909 #
2910 } else {
2911 !!!cp (1025);
2912 $data = $self->{entity__value};
2913 $has_ref = 1;
2914 #
2915 }
2916 } else {
2917 !!!cp (1026);
2918 !!!parse-error (type => 'bare ero',
2919 line => $self->{line_prev},
2920 column => $self->{column_prev} - length $self->{s_kwd});
2921 $data = '&' . $self->{s_kwd};
2922 #
2923 }
2924
2925 ## NOTE: In these cases, when a character reference is found,
2926 ## it is consumed and a character token is returned, or, otherwise,
2927 ## nothing is consumed and returned, according to the spec algorithm.
2928 ## In this implementation, anything that has been examined by the
2929 ## tokenizer is appended to the parent element or the attribute value
2930 ## as string, either literal string when no character reference or
2931 ## entity-replaced string otherwise, in this stage, since any characters
2932 ## that would not be consumed are appended in the data state or in an
2933 ## appropriate attribute value state anyway.
2934
2935 if ($self->{prev_state} == DATA_STATE) {
2936 !!!cp (986);
2937 $self->{state} = $self->{prev_state};
2938 $self->{s_kwd} = '';
2939 ## Reconsume.
2940 !!!emit ({type => CHARACTER_TOKEN,
2941 data => $data,
2942 has_reference => $has_ref,
2943 line => $self->{line_prev},
2944 column => $self->{column_prev} + 1 - length $self->{s_kwd},
2945 });
2946 redo A;
2947 } else {
2948 !!!cp (985);
2949 $self->{ca}->{value} .= $data;
2950 $self->{ca}->{has_reference} = 1 if $has_ref;
2951 $self->{state} = $self->{prev_state};
2952 $self->{s_kwd} = '';
2953 ## Reconsume.
2954 redo A;
2955 }
2956
2957 ## XML-only states
2958
2959 } elsif ($self->{state} == PI_STATE) {
2960 if ($is_space->{$self->{nc}} or
2961 $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
2962 $self->{nc} == -1) {
2963 !!!parse-error (type => 'bare pio', ## TODO: type
2964 line => $self->{line_prev},
2965 column => $self->{column_prev}
2966 - 1 * ($self->{nc} != -1));
2967 $self->{state} = BOGUS_COMMENT_STATE;
2968 ## Reconsume.
2969 $self->{ct} = {type => COMMENT_TOKEN,
2970 data => '?',
2971 line => $self->{line_prev},
2972 column => $self->{column_prev}
2973 - 1 * ($self->{nc} != -1),
2974 };
2975 redo A;
2976 } else {
2977 $self->{ct} = {type => PI_TOKEN,
2978 target => chr $self->{nc},
2979 data => '',
2980 line => $self->{line_prev},
2981 column => $self->{column_prev} - 1,
2982 };
2983 $self->{state} = PI_TARGET_STATE;
2984 !!!next-input-character;
2985 redo A;
2986 }
2987 } elsif ($self->{state} == PI_TARGET_STATE) {
2988 if ($is_space->{$self->{nc}}) {
2989 $self->{state} = PI_TARGET_AFTER_STATE;
2990 !!!next-input-character;
2991 redo A;
2992 } elsif ($self->{nc} == -1) {
2993 !!!parse-error (type => 'no pic'); ## TODO: type
2994 $self->{state} = DATA_STATE;
2995 $self->{s_kwd} = '';
2996 ## Reconsume.
2997 !!!emit ($self->{ct}); # pi
2998 redo A;
2999 } elsif ($self->{nc} == 0x003F) { # ?
3000 $self->{state} = PI_AFTER_STATE;
3001 !!!next-input-character;
3002 redo A;
3003 } else {
3004 ## XML5: typo ("tag name" -> "target")
3005 $self->{ct}->{target} .= chr $self->{nc}; # pi
3006 !!!next-input-character;
3007 redo A;
3008 }
3009 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3010 if ($is_space->{$self->{nc}}) {
3011 ## Stay in the state.
3012 !!!next-input-character;
3013 redo A;
3014 } else {
3015 $self->{state} = PI_DATA_STATE;
3016 ## Reprocess.
3017 redo A;
3018 }
3019 } elsif ($self->{state} == PI_DATA_STATE) {
3020 if ($self->{nc} == 0x003F) { # ?
3021 $self->{state} = PI_DATA_AFTER_STATE;
3022 !!!next-input-character;
3023 redo A;
3024 } elsif ($self->{nc} == -1) {
3025 !!!parse-error (type => 'no pic'); ## TODO: type
3026 $self->{state} = DATA_STATE;
3027 $self->{s_kwd} = '';
3028 ## Reprocess.
3029 !!!emit ($self->{ct}); # pi
3030 redo A;
3031 } else {
3032 $self->{ct}->{data} .= chr $self->{nc}; # pi
3033 $self->{read_until}->($self->{ct}->{data}, q[?],
3034 length $self->{ct}->{data});
3035 ## Stay in the state.
3036 !!!next-input-character;
3037 ## Reprocess.
3038 redo A;
3039 }
3040 } elsif ($self->{state} == PI_AFTER_STATE) {
3041 if ($self->{nc} == 0x003E) { # >
3042 $self->{state} = DATA_STATE;
3043 $self->{s_kwd} = '';
3044 !!!next-input-character;
3045 !!!emit ($self->{ct}); # pi
3046 redo A;
3047 } elsif ($self->{nc} == 0x003F) { # ?
3048 !!!parse-error (type => 'no s after target', ## TODO: type
3049 line => $self->{line_prev},
3050 column => $self->{column_prev}); ## XML5: no error
3051 $self->{ct}->{data} .= '?';
3052 $self->{state} = PI_DATA_AFTER_STATE;
3053 !!!next-input-character;
3054 redo A;
3055 } else {
3056 !!!parse-error (type => 'no s after target', ## TODO: type
3057 line => $self->{line_prev},
3058 column => $self->{column_prev}
3059 + 1 * ($self->{nc} == -1)); ## XML5: no error
3060 $self->{ct}->{data} .= '?'; ## XML5: not appended
3061 $self->{state} = PI_DATA_STATE;
3062 ## Reprocess.
3063 redo A;
3064 }
3065 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3066 ## XML5: Same as "pi after state" in XML5
3067 if ($self->{nc} == 0x003E) { # >
3068 $self->{state} = DATA_STATE;
3069 $self->{s_kwd} = '';
3070 !!!next-input-character;
3071 !!!emit ($self->{ct}); # pi
3072 redo A;
3073 } elsif ($self->{nc} == 0x003F) { # ?
3074 $self->{ct}->{data} .= '?';
3075 ## Stay in the state.
3076 !!!next-input-character;
3077 redo A;
3078 } else {
3079 $self->{ct}->{data} .= '?'; ## XML5: not appended
3080 $self->{state} = PI_DATA_STATE;
3081 ## Reprocess.
3082 redo A;
3083 }
3084
3085 } else {
3086 die "$0: $self->{state}: Unknown state";
3087 }
3088 } # A
3089
3090 die "$0: _get_next_token: unexpected case";
3091 } # _get_next_token
3092
3093 1;
3094 ## $Date: 2008/10/15 08:51:02 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24