/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (show annotations) (download) (as text)
Thu Oct 16 03:39:57 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.12: +187 -60 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	16 Oct 2008 03:39:39 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/pis-2.dat" and "xml/comments-2.dat" are added.

++ whatpm/t/xml/ChangeLog	16 Oct 2008 03:39:53 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* doctypes-2.dat: New test added.

	* comments-2.dat, pis-2.dat: New test data files.

++ whatpm/Whatpm/HTML/ChangeLog	16 Oct 2008 03:36:51 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New token type END_OF_DOCTYPE_TOKEN added.
	New states DOCTYPE_TAG_STATE and
	BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE are added.  (Bogus
	string after the internal subset, which was handled by the state
	BOGUS_DOCTYPE_STATE, are now handled by the new state.)  Support
	for comments, bogus comments, and processing instructions in the
	internal subset.  If there is the internal subset, then emit the
	doctype token before the internal subset (with its
	$token->{has_internal_subset} flag set) and an
	END_OF_DOCTYPE_TOKEN after the internal subset.

++ whatpm/Whatpm/XML/ChangeLog	16 Oct 2008 03:39:19 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Insertion mode IN_SUBSET_IM added.  In the
	"initial" insertion mode, if the DOCTYPE token's "has internal
	subset" flag is set, then switch to the "in subset" insertion
	mode.

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.12 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 ## $VERSION above is built from the digits of the CVS "Revision" keyword.
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8 ## Names importable on request: all nine token type constants defined below.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 );
20 ## The ":token" tag repeats the same nine names; keep both lists in sync.
21 our %EXPORT_TAGS = (
22 token => [qw(
23 DOCTYPE_TOKEN
24 COMMENT_TOKEN
25 START_TAG_TOKEN
26 END_TAG_TOKEN
27 END_OF_FILE_TOKEN
28 CHARACTER_TOKEN
29 PI_TOKEN
30 ABORT_TOKEN
31 END_OF_DOCTYPE_TOKEN
32 )],
33 );
34 }
35
36 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
37
38 ## Token types
39 ## Values of a token's |type| field; each is a constant sub with an empty prototype.
40 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
41 sub COMMENT_TOKEN () { 2 } ## Carries |data|.
42 sub START_TAG_TOKEN () { 3 } ## Carries |tag_name| and |attributes|.
43 sub END_TAG_TOKEN () { 4 } ## Carries |tag_name| and |attributes|.
44 sub END_OF_FILE_TOKEN () { 5 } ## Emitted when the next input character is -1.
45 sub CHARACTER_TOKEN () { 6 } ## Carries |data|.
46 sub PI_TOKEN () { 7 } ## NOTE: XML only.
47 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
48 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only
49
50 ## XML5: XML5 has "empty tag token". In this implementation, it is
51 ## represented as a start tag token with $self->{self_closing} flag
52 ## set to true.
53
54 ## XML5: XML5 has "short end tag token". In this implementation, it
55 ## is represented as an end tag token with $token->{tag_name} set
56 ## to an empty string.
57
58 package Whatpm::HTML;
59 ## NOTE: From here on, definitions are compiled into the |Whatpm::HTML| package.
60 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
61 ## (The import above brings the token type constants into this package.)
62 ## Content model flags
63 ## Bit flags; the *_CONTENT_MODEL constants below are combinations of them.
64 sub CM_ENTITY () { 0b001 } # & markup in data
65 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
66 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
67 ## Content models; PLAINTEXT has no flag set (neither & nor < is markup).
68 sub PLAINTEXT_CONTENT_MODEL () { 0 }
69 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
70 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
71 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
72
73 ## Tokenizer states
74 ## Values of |$self->{state}|. Numbers 1 and 12 are unused (their states are commented out).
75 sub DATA_STATE () { 0 }
76 #sub ENTITY_DATA_STATE () { 1 }
77 sub TAG_OPEN_STATE () { 2 }
78 sub CLOSE_TAG_OPEN_STATE () { 3 }
79 sub TAG_NAME_STATE () { 4 }
80 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
81 sub ATTRIBUTE_NAME_STATE () { 6 }
82 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
83 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
84 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
85 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
86 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
87 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
88 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
89 sub COMMENT_START_STATE () { 14 }
90 sub COMMENT_START_DASH_STATE () { 15 }
91 sub COMMENT_STATE () { 16 }
92 sub COMMENT_END_STATE () { 17 }
93 sub COMMENT_END_DASH_STATE () { 18 }
94 sub BOGUS_COMMENT_STATE () { 19 }
95 sub DOCTYPE_STATE () { 20 }
96 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
97 sub DOCTYPE_NAME_STATE () { 22 }
98 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
99 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
100 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
101 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
102 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
103 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
104 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
105 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
106 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
107 sub BOGUS_DOCTYPE_STATE () { 32 }
108 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
109 sub SELF_CLOSING_START_TAG_STATE () { 34 }
110 sub CDATA_SECTION_STATE () { 35 }
111 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
112 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
113 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
114 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
115 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
116 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
117 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
118 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
119 ## NOTE: "Entity data state", "entity in attribute value state", and
120 ## "consume a character reference" algorithm are jointly implemented
121 ## using the following six states:
122 sub ENTITY_STATE () { 44 }
123 sub ENTITY_HASH_STATE () { 45 }
124 sub NCR_NUM_STATE () { 46 }
125 sub HEXREF_X_STATE () { 47 }
126 sub HEXREF_HEX_STATE () { 48 }
127 sub ENTITY_NAME_STATE () { 49 }
128 sub PCDATA_STATE () { 50 } # "data state" in the spec
129
130 ## XML-only states
131 sub PI_STATE () { 51 }
132 sub PI_TARGET_STATE () { 52 }
133 sub PI_TARGET_AFTER_STATE () { 53 }
134 sub PI_DATA_STATE () { 54 }
135 sub PI_AFTER_STATE () { 55 }
136 sub PI_DATA_AFTER_STATE () { 56 }
137 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
138 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
139 sub DOCTYPE_TAG_STATE () { 59 }
140 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 60 }
141
142 ## Tree constructor state constants (see Whatpm::HTML for the full
143 ## list and descriptions)
144 ## NOTE(review): Both literals below evaluate to the same number (2048); presumably they belong to different flag sets - confirm against Whatpm::HTML.
145 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
146 sub FOREIGN_EL () { 0b1_00000000000 }
147
148 ## Character reference mappings
149 ## Maps a code point to the code point that replaces it (presumably applied to numeric character references; the use is outside this table).
150 my $charref_map = {
151 0x0D => 0x000A,
152 0x80 => 0x20AC,
153 0x81 => 0xFFFD,
154 0x82 => 0x201A,
155 0x83 => 0x0192,
156 0x84 => 0x201E,
157 0x85 => 0x2026,
158 0x86 => 0x2020,
159 0x87 => 0x2021,
160 0x88 => 0x02C6,
161 0x89 => 0x2030,
162 0x8A => 0x0160,
163 0x8B => 0x2039,
164 0x8C => 0x0152,
165 0x8D => 0xFFFD,
166 0x8E => 0x017D,
167 0x8F => 0xFFFD,
168 0x90 => 0xFFFD,
169 0x91 => 0x2018,
170 0x92 => 0x2019,
171 0x93 => 0x201C,
172 0x94 => 0x201D,
173 0x95 => 0x2022,
174 0x96 => 0x2013,
175 0x97 => 0x2014,
176 0x98 => 0x02DC,
177 0x99 => 0x2122,
178 0x9A => 0x0161,
179 0x9B => 0x203A,
180 0x9C => 0x0153,
181 0x9D => 0xFFFD,
182 0x9E => 0x017E,
183 0x9F => 0x0178,
184 }; # $charref_map
185 $charref_map->{$_} = 0xFFFD # U+FFFD REPLACEMENT CHARACTER for every code point listed below
186 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
187 0xD800..0xDFFF, 0xFDD0..0xFDEF, ## surrogates; noncharacters U+FDD0..U+FDEF (range previously stopped at 0xFDDF)
188 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
189 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
190 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
191 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
192 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
193
194 ## Implementations MUST act as if they used the state machine in the spec.
195
196 sub _initialize_tokenizer ($) {
197 my $self = shift;
198 ## Resets the tokenizer-related fields of |$self| to their initial values.
199 ## NOTE: Fields set by |new| constructor:
200 #$self->{level}
201 #$self->{set_nc}
202 #$self->{parse_error}
203 #$self->{is_xml} (if XML)
204 ## Fields (re)initialized here:
205 $self->{state} = DATA_STATE; # MUST
206 $self->{s_kwd} = ''; # Data state keyword
207 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
208 #$self->{entity__value}; # initialized when used
209 #$self->{entity__match}; # initialized when used
210 $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
211 undef $self->{ct}; # current token
212 undef $self->{ca}; # current attribute
213 undef $self->{last_stag_name}; # last emitted start tag name
214 #$self->{prev_state}; # initialized when used
215 delete $self->{self_closing}; # no self-closing flag is pending
216 $self->{char_buffer} = '';
217 $self->{char_buffer_pos} = 0;
218 $self->{nc} = -1; # next input character (-1 is what the states test for end of input)
219 #$self->{next_nc}
220 !!!next-input-character;
221 $self->{token} = []; # tokens queued for return by |_get_next_token|
222 # $self->{escape}
223 } # _initialize_tokenizer
224
225 ## A token has:
226 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
227 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, ABORT_TOKEN, or END_OF_DOCTYPE_TOKEN
228 ## ->{name} (DOCTYPE_TOKEN)
229 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
230 ## ->{target} (PI_TOKEN)
231 ## ->{pubid} (DOCTYPE_TOKEN)
232 ## ->{sysid} (DOCTYPE_TOKEN)
233 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
234 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
235 ## ->{name}
236 ## ->{value}
237 ## ->{has_reference} == 1 or 0
238 ## ->{index}: Index of the attribute in a tag.
239 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
240 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
241 ## ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Next attribute's index - 1.
242 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
243
244 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
245 ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
246 ## while the token is pushed back to the stack.
247
248 ## Emitted token MUST immediately be handled by the tree construction stage.
249
250 ## Before each step, UA MAY check to see if either one of the scripts in
251 ## "list of scripts that will execute as soon as possible" or the first
252 ## script in the "list of scripts that will execute asynchronously",
253 ## has completed loading. If one has, then it MUST be executed
254 ## and removed from the list.
255
256 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
257 ## (This requirement was dropped from HTML5 spec, unfortunately.)
258 ## Lookup table of code points treated as space characters (code point => 1).
259 my $is_space = {
260 0x0009 => 1, # CHARACTER TABULATION (HT)
261 0x000A => 1, # LINE FEED (LF)
262 #0x000B => 0, # LINE TABULATION (VT)
263 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
264 #0x000D => 1, # CARRIAGE RETURN (CR) -- deliberately not listed; presumably CR is normalized before tokenization (confirm)
265 0x0020 => 1, # SPACE (SP)
266 };
267
268 sub _get_next_token ($) {
269 my $self = shift;
270
271 if ($self->{self_closing}) {
272 !!!parse-error (type => 'nestc', token => $self->{ct});
273 ## NOTE: The |self_closing| flag is only set by start tag token.
274 ## In addition, when a start tag token is emitted, it is always set to
275 ## |ct|.
276 delete $self->{self_closing};
277 }
278
279 if (@{$self->{token}}) {
280 $self->{self_closing} = $self->{token}->[0]->{self_closing};
281 return shift @{$self->{token}};
282 }
283
284 A: {
285 if ($self->{state} == PCDATA_STATE) {
286 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
287
288 if ($self->{nc} == 0x0026) { # &
289 !!!cp (0.1);
290 ## NOTE: In the spec, the tokenizer is switched to the
291 ## "entity data state". In this implementation, the tokenizer
292 ## is switched to the |ENTITY_STATE|, which is an implementation
293 ## of the "consume a character reference" algorithm.
294 $self->{entity_add} = -1;
295 $self->{prev_state} = DATA_STATE;
296 $self->{state} = ENTITY_STATE;
297 !!!next-input-character;
298 redo A;
299 } elsif ($self->{nc} == 0x003C) { # <
300 !!!cp (0.2);
301 $self->{state} = TAG_OPEN_STATE;
302 !!!next-input-character;
303 redo A;
304 } elsif ($self->{nc} == -1) {
305 !!!cp (0.3);
306 !!!emit ({type => END_OF_FILE_TOKEN,
307 line => $self->{line}, column => $self->{column}});
308 last A; ## TODO: ok?
309 } else {
310 !!!cp (0.4);
311 #
312 }
313
314 # Anything else
315 my $token = {type => CHARACTER_TOKEN,
316 data => chr $self->{nc},
317 line => $self->{line}, column => $self->{column},
318 };
319 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
320
321 ## Stay in the state.
322 !!!next-input-character;
323 !!!emit ($token);
324 redo A;
325 } elsif ($self->{state} == DATA_STATE) {
326 $self->{s_kwd} = '' unless defined $self->{s_kwd};
327 if ($self->{nc} == 0x0026) { # &
328 $self->{s_kwd} = '';
329 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
330 not $self->{escape}) {
331 !!!cp (1);
332 ## NOTE: In the spec, the tokenizer is switched to the
333 ## "entity data state". In this implementation, the tokenizer
334 ## is switched to the |ENTITY_STATE|, which is an implementation
335 ## of the "consume a character reference" algorithm.
336 $self->{entity_add} = -1;
337 $self->{prev_state} = DATA_STATE;
338 $self->{state} = ENTITY_STATE;
339 !!!next-input-character;
340 redo A;
341 } else {
342 !!!cp (2);
343 #
344 }
345 } elsif ($self->{nc} == 0x002D) { # -
346 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
347 if ($self->{s_kwd} eq '<!-') {
348 !!!cp (3);
349 $self->{escape} = 1; # unless $self->{escape};
350 $self->{s_kwd} = '--';
351 #
352 } elsif ($self->{s_kwd} eq '-') {
353 !!!cp (4);
354 $self->{s_kwd} = '--';
355 #
356 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
357 !!!cp (4.1);
358 $self->{s_kwd} .= '-';
359 #
360 } else {
361 !!!cp (5);
362 $self->{s_kwd} = '-';
363 #
364 }
365 }
366
367 #
368 } elsif ($self->{nc} == 0x0021) { # !
369 if (length $self->{s_kwd}) {
370 !!!cp (5.1);
371 $self->{s_kwd} .= '!';
372 #
373 } else {
374 !!!cp (5.2);
375 #$self->{s_kwd} = '';
376 #
377 }
378 #
379 } elsif ($self->{nc} == 0x003C) { # <
380 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
381 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
382 not $self->{escape})) {
383 !!!cp (6);
384 $self->{state} = TAG_OPEN_STATE;
385 !!!next-input-character;
386 redo A;
387 } else {
388 !!!cp (7);
389 $self->{s_kwd} = '';
390 #
391 }
392 } elsif ($self->{nc} == 0x003E) { # >
393 if ($self->{escape} and
394 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
395 if ($self->{s_kwd} eq '--') {
396 !!!cp (8);
397 delete $self->{escape};
398 #
399 } else {
400 !!!cp (9);
401 #
402 }
403 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
404 !!!cp (9.1);
405 !!!parse-error (type => 'unmatched mse', ## TODO: type
406 line => $self->{line_prev},
407 column => $self->{column_prev} - 1);
408 #
409 } else {
410 !!!cp (10);
411 #
412 }
413
414 $self->{s_kwd} = '';
415 #
416 } elsif ($self->{nc} == 0x005D) { # ]
417 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
418 !!!cp (10.1);
419 $self->{s_kwd} .= ']';
420 } elsif ($self->{s_kwd} eq ']]') {
421 !!!cp (10.2);
422 #
423 } else {
424 !!!cp (10.3);
425 $self->{s_kwd} = '';
426 }
427 #
428 } elsif ($self->{nc} == -1) {
429 !!!cp (11);
430 $self->{s_kwd} = '';
431 !!!emit ({type => END_OF_FILE_TOKEN,
432 line => $self->{line}, column => $self->{column}});
433 last A; ## TODO: ok?
434 } else {
435 !!!cp (12);
436 $self->{s_kwd} = '';
437 #
438 }
439
440 # Anything else
441 my $token = {type => CHARACTER_TOKEN,
442 data => chr $self->{nc},
443 line => $self->{line}, column => $self->{column},
444 };
445 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
446 length $token->{data})) {
447 $self->{s_kwd} = '';
448 }
449
450 ## Stay in the data state.
451 if (not $self->{is_xml} and
452 $self->{content_model} == PCDATA_CONTENT_MODEL) {
453 !!!cp (13);
454 $self->{state} = PCDATA_STATE;
455 } else {
456 !!!cp (14);
457 ## Stay in the state.
458 }
459 !!!next-input-character;
460 !!!emit ($token);
461 redo A;
462 } elsif ($self->{state} == TAG_OPEN_STATE) {
463 ## XML5: "tag state".
464
465 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
466 if ($self->{nc} == 0x002F) { # /
467 !!!cp (15);
468 !!!next-input-character;
469 $self->{state} = CLOSE_TAG_OPEN_STATE;
470 redo A;
471 } elsif ($self->{nc} == 0x0021) { # !
472 !!!cp (15.1);
473 $self->{s_kwd} = $self->{escaped} ? '' : '<';
474 #
475 } else {
476 !!!cp (16);
477 $self->{s_kwd} = '';
478 #
479 }
480
481 ## reconsume
482 $self->{state} = DATA_STATE;
483 !!!emit ({type => CHARACTER_TOKEN, data => '<',
484 line => $self->{line_prev},
485 column => $self->{column_prev},
486 });
487 redo A;
488 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
489 if ($self->{nc} == 0x0021) { # !
490 !!!cp (17);
491 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
492 !!!next-input-character;
493 redo A;
494 } elsif ($self->{nc} == 0x002F) { # /
495 !!!cp (18);
496 $self->{state} = CLOSE_TAG_OPEN_STATE;
497 !!!next-input-character;
498 redo A;
499 } elsif (0x0041 <= $self->{nc} and
500 $self->{nc} <= 0x005A) { # A..Z
501 !!!cp (19);
502 $self->{ct}
503 = {type => START_TAG_TOKEN,
504 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
505 line => $self->{line_prev},
506 column => $self->{column_prev}};
507 $self->{state} = TAG_NAME_STATE;
508 !!!next-input-character;
509 redo A;
510 } elsif (0x0061 <= $self->{nc} and
511 $self->{nc} <= 0x007A) { # a..z
512 !!!cp (20);
513 $self->{ct} = {type => START_TAG_TOKEN,
514 tag_name => chr ($self->{nc}),
515 line => $self->{line_prev},
516 column => $self->{column_prev}};
517 $self->{state} = TAG_NAME_STATE;
518 !!!next-input-character;
519 redo A;
520 } elsif ($self->{nc} == 0x003E) { # >
521 !!!cp (21);
522 !!!parse-error (type => 'empty start tag',
523 line => $self->{line_prev},
524 column => $self->{column_prev});
525 $self->{state} = DATA_STATE;
526 $self->{s_kwd} = '';
527 !!!next-input-character;
528
529 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
530 line => $self->{line_prev},
531 column => $self->{column_prev},
532 });
533
534 redo A;
535 } elsif ($self->{nc} == 0x003F) { # ?
536 if ($self->{is_xml}) {
537 !!!cp (22.1);
538 $self->{state} = PI_STATE;
539 !!!next-input-character;
540 redo A;
541 } else {
542 !!!cp (22);
543 !!!parse-error (type => 'pio',
544 line => $self->{line_prev},
545 column => $self->{column_prev});
546 $self->{state} = BOGUS_COMMENT_STATE;
547 $self->{ct} = {type => COMMENT_TOKEN, data => '',
548 line => $self->{line_prev},
549 column => $self->{column_prev},
550 };
551 ## $self->{nc} is intentionally left as is
552 redo A;
553 }
554 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
555 !!!cp (23);
556 !!!parse-error (type => 'bare stago',
557 line => $self->{line_prev},
558 column => $self->{column_prev});
559 $self->{state} = DATA_STATE;
560 $self->{s_kwd} = '';
561 ## reconsume
562
563 !!!emit ({type => CHARACTER_TOKEN, data => '<',
564 line => $self->{line_prev},
565 column => $self->{column_prev},
566 });
567
568 redo A;
569 } else {
570 ## XML5: "<:" is a parse error.
571 !!!cp (23.1);
572 $self->{ct} = {type => START_TAG_TOKEN,
573 tag_name => chr ($self->{nc}),
574 line => $self->{line_prev},
575 column => $self->{column_prev}};
576 $self->{state} = TAG_NAME_STATE;
577 !!!next-input-character;
578 redo A;
579 }
580 } else {
581 die "$0: $self->{content_model} in tag open";
582 }
583 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
584 ## NOTE: The "close tag open state" in the spec is implemented as
585 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
586
587 ## XML5: "end tag state".
588
589 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
590 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
591 if (defined $self->{last_stag_name}) {
592 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
593 $self->{kwd} = '';
594 ## Reconsume.
595 redo A;
596 } else {
597 ## No start tag token has ever been emitted
598 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
599 !!!cp (28);
600 $self->{state} = DATA_STATE;
601 $self->{s_kwd} = '';
602 ## Reconsume.
603 !!!emit ({type => CHARACTER_TOKEN, data => '</',
604 line => $l, column => $c,
605 });
606 redo A;
607 }
608 }
609
610 if (0x0041 <= $self->{nc} and
611 $self->{nc} <= 0x005A) { # A..Z
612 !!!cp (29);
613 $self->{ct}
614 = {type => END_TAG_TOKEN,
615 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
616 line => $l, column => $c};
617 $self->{state} = TAG_NAME_STATE;
618 !!!next-input-character;
619 redo A;
620 } elsif (0x0061 <= $self->{nc} and
621 $self->{nc} <= 0x007A) { # a..z
622 !!!cp (30);
623 $self->{ct} = {type => END_TAG_TOKEN,
624 tag_name => chr ($self->{nc}),
625 line => $l, column => $c};
626 $self->{state} = TAG_NAME_STATE;
627 !!!next-input-character;
628 redo A;
629 } elsif ($self->{nc} == 0x003E) { # >
630 !!!parse-error (type => 'empty end tag',
631 line => $self->{line_prev}, ## "<" in "</>"
632 column => $self->{column_prev} - 1);
633 $self->{state} = DATA_STATE;
634 $self->{s_kwd} = '';
635 if ($self->{is_xml}) {
636 !!!cp (31);
637 ## XML5: No parse error.
638
639 ## NOTE: This parser raises a parse error, since it supports
640 ## XML1, not XML5.
641
642 ## NOTE: A short end tag token.
643 my $ct = {type => END_TAG_TOKEN,
644 tag_name => '',
645 line => $self->{line_prev},
646 column => $self->{column_prev} - 1,
647 };
648 !!!next-input-character;
649 !!!emit ($ct);
650 } else {
651 !!!cp (31.1);
652 !!!next-input-character;
653 }
654 redo A;
655 } elsif ($self->{nc} == -1) {
656 !!!cp (32);
657 !!!parse-error (type => 'bare etago');
658 $self->{s_kwd} = '';
659 $self->{state} = DATA_STATE;
660 # reconsume
661
662 !!!emit ({type => CHARACTER_TOKEN, data => '</',
663 line => $l, column => $c,
664 });
665
666 redo A;
667 } elsif (not $self->{is_xml} or
668 $is_space->{$self->{nc}}) {
669 !!!cp (33);
670 !!!parse-error (type => 'bogus end tag',
671 line => $self->{line_prev}, # "<" of "</"
672 column => $self->{column_prev} - 1);
673 $self->{state} = BOGUS_COMMENT_STATE;
674 $self->{ct} = {type => COMMENT_TOKEN, data => '',
675 line => $self->{line_prev}, # "<" of "</"
676 column => $self->{column_prev} - 1,
677 };
678 ## NOTE: $self->{nc} is intentionally left as is.
679 ## Although the "anything else" case of the spec not explicitly
680 ## states that the next input character is to be reconsumed,
681 ## it will be included to the |data| of the comment token
682 ## generated from the bogus end tag, as defined in the
683 ## "bogus comment state" entry.
684 redo A;
685 } else {
686 ## XML5: "</:" is a parse error.
687 !!!cp (30.1);
688 $self->{ct} = {type => END_TAG_TOKEN,
689 tag_name => chr ($self->{nc}),
690 line => $l, column => $c};
691 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
692 !!!next-input-character;
693 redo A;
694 }
695 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
696 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
697 if (length $ch) {
698 my $CH = $ch;
699 $ch =~ tr/a-z/A-Z/;
700 my $nch = chr $self->{nc};
701 if ($nch eq $ch or $nch eq $CH) {
702 !!!cp (24);
703 ## Stay in the state.
704 $self->{kwd} .= $nch;
705 !!!next-input-character;
706 redo A;
707 } else {
708 !!!cp (25);
709 $self->{state} = DATA_STATE;
710 $self->{s_kwd} = '';
711 ## Reconsume.
712 !!!emit ({type => CHARACTER_TOKEN,
713 data => '</' . $self->{kwd},
714 line => $self->{line_prev},
715 column => $self->{column_prev} - 1 - length $self->{kwd},
716 });
717 redo A;
718 }
719 } else { # after "<{tag-name}"
720 unless ($is_space->{$self->{nc}} or
721 {
722 0x003E => 1, # >
723 0x002F => 1, # /
724 -1 => 1, # EOF
725 }->{$self->{nc}}) {
726 !!!cp (26);
727 ## Reconsume.
728 $self->{state} = DATA_STATE;
729 $self->{s_kwd} = '';
730 !!!emit ({type => CHARACTER_TOKEN,
731 data => '</' . $self->{kwd},
732 line => $self->{line_prev},
733 column => $self->{column_prev} - 1 - length $self->{kwd},
734 });
735 redo A;
736 } else {
737 !!!cp (27);
738 $self->{ct}
739 = {type => END_TAG_TOKEN,
740 tag_name => $self->{last_stag_name},
741 line => $self->{line_prev},
742 column => $self->{column_prev} - 1 - length $self->{kwd}};
743 $self->{state} = TAG_NAME_STATE;
744 ## Reconsume.
745 redo A;
746 }
747 }
748 } elsif ($self->{state} == TAG_NAME_STATE) {
749 if ($is_space->{$self->{nc}}) {
750 !!!cp (34);
751 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
752 !!!next-input-character;
753 redo A;
754 } elsif ($self->{nc} == 0x003E) { # >
755 if ($self->{ct}->{type} == START_TAG_TOKEN) {
756 !!!cp (35);
757 $self->{last_stag_name} = $self->{ct}->{tag_name};
758 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
759 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
760 #if ($self->{ct}->{attributes}) {
761 # ## NOTE: This should never be reached.
762 # !!! cp (36);
763 # !!! parse-error (type => 'end tag attribute');
764 #} else {
765 !!!cp (37);
766 #}
767 } else {
768 die "$0: $self->{ct}->{type}: Unknown token type";
769 }
770 $self->{state} = DATA_STATE;
771 $self->{s_kwd} = '';
772 !!!next-input-character;
773
774 !!!emit ($self->{ct}); # start tag or end tag
775
776 redo A;
777 } elsif (0x0041 <= $self->{nc} and
778 $self->{nc} <= 0x005A) { # A..Z
779 !!!cp (38);
780 $self->{ct}->{tag_name}
781 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
782 # start tag or end tag
783 ## Stay in this state
784 !!!next-input-character;
785 redo A;
786 } elsif ($self->{nc} == -1) {
787 !!!parse-error (type => 'unclosed tag');
788 if ($self->{ct}->{type} == START_TAG_TOKEN) {
789 !!!cp (39);
790 $self->{last_stag_name} = $self->{ct}->{tag_name};
791 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
792 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
793 #if ($self->{ct}->{attributes}) {
794 # ## NOTE: This state should never be reached.
795 # !!! cp (40);
796 # !!! parse-error (type => 'end tag attribute');
797 #} else {
798 !!!cp (41);
799 #}
800 } else {
801 die "$0: $self->{ct}->{type}: Unknown token type";
802 }
803 $self->{state} = DATA_STATE;
804 $self->{s_kwd} = '';
805 # reconsume
806
807 !!!emit ($self->{ct}); # start tag or end tag
808
809 redo A;
810 } elsif ($self->{nc} == 0x002F) { # /
811 !!!cp (42);
812 $self->{state} = SELF_CLOSING_START_TAG_STATE;
813 !!!next-input-character;
814 redo A;
815 } else {
816 !!!cp (44);
817 $self->{ct}->{tag_name} .= chr $self->{nc};
818 # start tag or end tag
819 ## Stay in the state
820 !!!next-input-character;
821 redo A;
822 }
823 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
824 ## XML5: "Tag attribute name before state".
825
826 if ($is_space->{$self->{nc}}) {
827 !!!cp (45);
828 ## Stay in the state
829 !!!next-input-character;
830 redo A;
831 } elsif ($self->{nc} == 0x003E) { # >
832 if ($self->{ct}->{type} == START_TAG_TOKEN) {
833 !!!cp (46);
834 $self->{last_stag_name} = $self->{ct}->{tag_name};
835 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
836 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
837 if ($self->{ct}->{attributes}) {
838 !!!cp (47);
839 !!!parse-error (type => 'end tag attribute');
840 } else {
841 !!!cp (48);
842 }
843 } else {
844 die "$0: $self->{ct}->{type}: Unknown token type";
845 }
846 $self->{state} = DATA_STATE;
847 $self->{s_kwd} = '';
848 !!!next-input-character;
849
850 !!!emit ($self->{ct}); # start tag or end tag
851
852 redo A;
853 } elsif (0x0041 <= $self->{nc} and
854 $self->{nc} <= 0x005A) { # A..Z
855 !!!cp (49);
856 $self->{ca}
857 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
858 value => '',
859 line => $self->{line}, column => $self->{column}};
860 $self->{state} = ATTRIBUTE_NAME_STATE;
861 !!!next-input-character;
862 redo A;
863 } elsif ($self->{nc} == 0x002F) { # /
864 !!!cp (50);
865 $self->{state} = SELF_CLOSING_START_TAG_STATE;
866 !!!next-input-character;
867 redo A;
868 } elsif ($self->{nc} == -1) {
869 !!!parse-error (type => 'unclosed tag');
870 if ($self->{ct}->{type} == START_TAG_TOKEN) {
871 !!!cp (52);
872 $self->{last_stag_name} = $self->{ct}->{tag_name};
873 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
874 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
875 if ($self->{ct}->{attributes}) {
876 !!!cp (53);
877 !!!parse-error (type => 'end tag attribute');
878 } else {
879 !!!cp (54);
880 }
881 } else {
882 die "$0: $self->{ct}->{type}: Unknown token type";
883 }
884 $self->{state} = DATA_STATE;
885 $self->{s_kwd} = '';
886 # reconsume
887
888 !!!emit ($self->{ct}); # start tag or end tag
889
890 redo A;
891 } else {
892 if ({
893 0x0022 => 1, # "
894 0x0027 => 1, # '
895 0x003D => 1, # =
896 }->{$self->{nc}}) {
897 !!!cp (55);
898 ## XML5: Not a parse error.
899 !!!parse-error (type => 'bad attribute name');
900 } else {
901 !!!cp (56);
902 ## XML5: ":" raises a parse error and is ignored.
903 }
904 $self->{ca}
905 = {name => chr ($self->{nc}),
906 value => '',
907 line => $self->{line}, column => $self->{column}};
908 $self->{state} = ATTRIBUTE_NAME_STATE;
909 !!!next-input-character;
910 redo A;
911 }
912 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
913 ## XML5: "Tag attribute name state".
914
915 my $before_leave = sub {
916 if (exists $self->{ct}->{attributes} # start tag or end tag
917 ->{$self->{ca}->{name}}) { # MUST
918 !!!cp (57);
919 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
920 ## Discard $self->{ca} # MUST
921 } else {
922 !!!cp (58);
923 $self->{ct}->{attributes}->{$self->{ca}->{name}}
924 = $self->{ca};
925 $self->{ca}->{index} = ++$self->{ct}->{last_index};
926 }
927 }; # $before_leave
928
929 if ($is_space->{$self->{nc}}) {
930 !!!cp (59);
931 $before_leave->();
932 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
933 !!!next-input-character;
934 redo A;
935 } elsif ($self->{nc} == 0x003D) { # =
936 !!!cp (60);
937 $before_leave->();
938 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
939 !!!next-input-character;
940 redo A;
941 } elsif ($self->{nc} == 0x003E) { # >
942 if ($self->{is_xml}) {
943 !!!cp (60.1);
944 ## XML5: Not a parse error.
945 !!!parse-error (type => 'no attr value'); ## TODO: type
946 } else {
947 !!!cp (60.2);
948 }
949
950 $before_leave->();
951 if ($self->{ct}->{type} == START_TAG_TOKEN) {
952 !!!cp (61);
953 $self->{last_stag_name} = $self->{ct}->{tag_name};
954 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
955 !!!cp (62);
956 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
957 if ($self->{ct}->{attributes}) {
958 !!!parse-error (type => 'end tag attribute');
959 }
960 } else {
961 die "$0: $self->{ct}->{type}: Unknown token type";
962 }
963 $self->{state} = DATA_STATE;
964 $self->{s_kwd} = '';
965 !!!next-input-character;
966
967 !!!emit ($self->{ct}); # start tag or end tag
968
969 redo A;
970 } elsif (0x0041 <= $self->{nc} and
971 $self->{nc} <= 0x005A) { # A..Z
972 !!!cp (63);
973 $self->{ca}->{name}
974 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
975 ## Stay in the state
976 !!!next-input-character;
977 redo A;
978 } elsif ($self->{nc} == 0x002F) { # /
979 if ($self->{is_xml}) {
980 !!!cp (64);
981 ## XML5: Not a parse error.
982 !!!parse-error (type => 'no attr value'); ## TODO: type
983 } else {
984 !!!cp (64.1);
985 }
986
987 $before_leave->();
988 $self->{state} = SELF_CLOSING_START_TAG_STATE;
989 !!!next-input-character;
990 redo A;
991 } elsif ($self->{nc} == -1) {
992 !!!parse-error (type => 'unclosed tag');
993 $before_leave->();
994 if ($self->{ct}->{type} == START_TAG_TOKEN) {
995 !!!cp (66);
996 $self->{last_stag_name} = $self->{ct}->{tag_name};
997 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
998 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
999 if ($self->{ct}->{attributes}) {
1000 !!!cp (67);
1001 !!!parse-error (type => 'end tag attribute');
1002 } else {
1003 ## NOTE: This state should never be reached.
1004 !!!cp (68);
1005 }
1006 } else {
1007 die "$0: $self->{ct}->{type}: Unknown token type";
1008 }
1009 $self->{state} = DATA_STATE;
1010 $self->{s_kwd} = '';
1011 # reconsume
1012
1013 !!!emit ($self->{ct}); # start tag or end tag
1014
1015 redo A;
1016 } else {
1017 if ($self->{nc} == 0x0022 or # "
1018 $self->{nc} == 0x0027) { # '
1019 !!!cp (69);
1020 ## XML5: Not a parse error.
1021 !!!parse-error (type => 'bad attribute name');
1022 } else {
1023 !!!cp (70);
1024 }
1025 $self->{ca}->{name} .= chr ($self->{nc});
1026 ## Stay in the state
1027 !!!next-input-character;
1028 redo A;
1029 }
1030 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1031 ## XML5: "Tag attribute name after state".
1032
1033 if ($is_space->{$self->{nc}}) {
1034 !!!cp (71);
1035 ## Stay in the state
1036 !!!next-input-character;
1037 redo A;
1038 } elsif ($self->{nc} == 0x003D) { # =
1039 !!!cp (72);
1040 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1041 !!!next-input-character;
1042 redo A;
1043 } elsif ($self->{nc} == 0x003E) { # >
1044 if ($self->{is_xml}) {
1045 !!!cp (72.1);
1046 ## XML5: Not a parse error.
1047 !!!parse-error (type => 'no attr value'); ## TODO: type
1048 } else {
1049 !!!cp (72.2);
1050 }
1051
1052 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1053 !!!cp (73);
1054 $self->{last_stag_name} = $self->{ct}->{tag_name};
1055 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1056 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1057 if ($self->{ct}->{attributes}) {
1058 !!!cp (74);
1059 !!!parse-error (type => 'end tag attribute');
1060 } else {
1061 ## NOTE: This state should never be reached.
1062 !!!cp (75);
1063 }
1064 } else {
1065 die "$0: $self->{ct}->{type}: Unknown token type";
1066 }
1067 $self->{state} = DATA_STATE;
1068 $self->{s_kwd} = '';
1069 !!!next-input-character;
1070
1071 !!!emit ($self->{ct}); # start tag or end tag
1072
1073 redo A;
1074 } elsif (0x0041 <= $self->{nc} and
1075 $self->{nc} <= 0x005A) { # A..Z
1076 !!!cp (76);
1077 $self->{ca}
1078 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1079 value => '',
1080 line => $self->{line}, column => $self->{column}};
1081 $self->{state} = ATTRIBUTE_NAME_STATE;
1082 !!!next-input-character;
1083 redo A;
1084 } elsif ($self->{nc} == 0x002F) { # /
1085 if ($self->{is_xml}) {
1086 !!!cp (77);
1087 ## XML5: Not a parse error.
1088 !!!parse-error (type => 'no attr value'); ## TODO: type
1089 } else {
1090 !!!cp (77.1);
1091 }
1092
1093 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1094 !!!next-input-character;
1095 redo A;
1096 } elsif ($self->{nc} == -1) {
1097 !!!parse-error (type => 'unclosed tag');
1098 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1099 !!!cp (79);
1100 $self->{last_stag_name} = $self->{ct}->{tag_name};
1101 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1102 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1103 if ($self->{ct}->{attributes}) {
1104 !!!cp (80);
1105 !!!parse-error (type => 'end tag attribute');
1106 } else {
1107 ## NOTE: This state should never be reached.
1108 !!!cp (81);
1109 }
1110 } else {
1111 die "$0: $self->{ct}->{type}: Unknown token type";
1112 }
1113 $self->{s_kwd} = '';
1114 $self->{state} = DATA_STATE;
1115 # reconsume
1116
1117 !!!emit ($self->{ct}); # start tag or end tag
1118
1119 redo A;
1120 } else {
1121 if ($self->{is_xml}) {
1122 !!!cp (78.1);
1123 ## XML5: Not a parse error.
1124 !!!parse-error (type => 'no attr value'); ## TODO: type
1125 } else {
1126 !!!cp (78.2);
1127 }
1128
1129 if ($self->{nc} == 0x0022 or # "
1130 $self->{nc} == 0x0027) { # '
1131 !!!cp (78);
1132 ## XML5: Not a parse error.
1133 !!!parse-error (type => 'bad attribute name');
1134 } else {
1135 !!!cp (82);
1136 }
1137 $self->{ca}
1138 = {name => chr ($self->{nc}),
1139 value => '',
1140 line => $self->{line}, column => $self->{column}};
1141 $self->{state} = ATTRIBUTE_NAME_STATE;
1142 !!!next-input-character;
1143 redo A;
1144 }
1145 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1146 ## XML5: "Tag attribute value before state".
1147
1148 if ($is_space->{$self->{nc}}) {
1149 !!!cp (83);
1150 ## Stay in the state
1151 !!!next-input-character;
1152 redo A;
1153 } elsif ($self->{nc} == 0x0022) { # "
1154 !!!cp (84);
1155 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1156 !!!next-input-character;
1157 redo A;
1158 } elsif ($self->{nc} == 0x0026) { # &
1159 !!!cp (85);
1160 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1161 ## reconsume
1162 redo A;
1163 } elsif ($self->{nc} == 0x0027) { # '
1164 !!!cp (86);
1165 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1166 !!!next-input-character;
1167 redo A;
1168 } elsif ($self->{nc} == 0x003E) { # >
1169 !!!parse-error (type => 'empty unquoted attribute value');
1170 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1171 !!!cp (87);
1172 $self->{last_stag_name} = $self->{ct}->{tag_name};
1173 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1174 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1175 if ($self->{ct}->{attributes}) {
1176 !!!cp (88);
1177 !!!parse-error (type => 'end tag attribute');
1178 } else {
1179 ## NOTE: This state should never be reached.
1180 !!!cp (89);
1181 }
1182 } else {
1183 die "$0: $self->{ct}->{type}: Unknown token type";
1184 }
1185 $self->{state} = DATA_STATE;
1186 $self->{s_kwd} = '';
1187 !!!next-input-character;
1188
1189 !!!emit ($self->{ct}); # start tag or end tag
1190
1191 redo A;
1192 } elsif ($self->{nc} == -1) {
1193 !!!parse-error (type => 'unclosed tag');
1194 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1195 !!!cp (90);
1196 $self->{last_stag_name} = $self->{ct}->{tag_name};
1197 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1198 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1199 if ($self->{ct}->{attributes}) {
1200 !!!cp (91);
1201 !!!parse-error (type => 'end tag attribute');
1202 } else {
1203 ## NOTE: This state should never be reached.
1204 !!!cp (92);
1205 }
1206 } else {
1207 die "$0: $self->{ct}->{type}: Unknown token type";
1208 }
1209 $self->{state} = DATA_STATE;
1210 $self->{s_kwd} = '';
1211 ## reconsume
1212
1213 !!!emit ($self->{ct}); # start tag or end tag
1214
1215 redo A;
1216 } else {
1217 if ($self->{nc} == 0x003D) { # =
1218 !!!cp (93);
1219 ## XML5: Not a parse error.
1220 !!!parse-error (type => 'bad attribute value');
1221 } elsif ($self->{is_xml}) {
1222 !!!cp (93.1);
1223 ## XML5: No parse error.
1224 !!!parse-error (type => 'unquoted attr value'); ## TODO
1225 } else {
1226 !!!cp (94);
1227 }
1228 $self->{ca}->{value} .= chr ($self->{nc});
1229 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1230 !!!next-input-character;
1231 redo A;
1232 }
1233 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1234 ## XML5: "Tag attribute value double quoted state".
1235
1236 if ($self->{nc} == 0x0022) { # "
1237 !!!cp (95);
1238 ## XML5: "Tag attribute name before state".
1239 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1240 !!!next-input-character;
1241 redo A;
1242 } elsif ($self->{nc} == 0x0026) { # &
1243 !!!cp (96);
1244 ## XML5: Not defined yet.
1245
1246 ## NOTE: In the spec, the tokenizer is switched to the
1247 ## "entity in attribute value state". In this implementation, the
1248 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1249 ## implementation of the "consume a character reference" algorithm.
1250 $self->{prev_state} = $self->{state};
1251 $self->{entity_add} = 0x0022; # "
1252 $self->{state} = ENTITY_STATE;
1253 !!!next-input-character;
1254 redo A;
1255 } elsif ($self->{nc} == -1) {
1256 !!!parse-error (type => 'unclosed attribute value');
1257 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1258 !!!cp (97);
1259 $self->{last_stag_name} = $self->{ct}->{tag_name};
1260 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1261 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1262 if ($self->{ct}->{attributes}) {
1263 !!!cp (98);
1264 !!!parse-error (type => 'end tag attribute');
1265 } else {
1266 ## NOTE: This state should never be reached.
1267 !!!cp (99);
1268 }
1269 } else {
1270 die "$0: $self->{ct}->{type}: Unknown token type";
1271 }
1272 $self->{state} = DATA_STATE;
1273 $self->{s_kwd} = '';
1274 ## reconsume
1275
1276 !!!emit ($self->{ct}); # start tag or end tag
1277
1278 redo A;
1279 } else {
1280 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1281 !!!cp (100);
1282 ## XML5: Not a parse error.
1283 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1284 } else {
1285 !!!cp (100.1);
1286 }
1287 $self->{ca}->{value} .= chr ($self->{nc});
1288 $self->{read_until}->($self->{ca}->{value},
1289 q["&<],
1290 length $self->{ca}->{value});
1291
1292 ## Stay in the state
1293 !!!next-input-character;
1294 redo A;
1295 }
1296 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1297 ## XML5: "Tag attribute value single quoted state".
1298
1299 if ($self->{nc} == 0x0027) { # '
1300 !!!cp (101);
1301 ## XML5: "Before attribute name state" (sic).
1302 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303 !!!next-input-character;
1304 redo A;
1305 } elsif ($self->{nc} == 0x0026) { # &
1306 !!!cp (102);
1307 ## XML5: Not defined yet.
1308
1309 ## NOTE: In the spec, the tokenizer is switched to the
1310 ## "entity in attribute value state". In this implementation, the
1311 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1312 ## implementation of the "consume a character reference" algorithm.
1313 $self->{entity_add} = 0x0027; # '
1314 $self->{prev_state} = $self->{state};
1315 $self->{state} = ENTITY_STATE;
1316 !!!next-input-character;
1317 redo A;
1318 } elsif ($self->{nc} == -1) {
1319 !!!parse-error (type => 'unclosed attribute value');
1320 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1321 !!!cp (103);
1322 $self->{last_stag_name} = $self->{ct}->{tag_name};
1323 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1324 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1325 if ($self->{ct}->{attributes}) {
1326 !!!cp (104);
1327 !!!parse-error (type => 'end tag attribute');
1328 } else {
1329 ## NOTE: This state should never be reached.
1330 !!!cp (105);
1331 }
1332 } else {
1333 die "$0: $self->{ct}->{type}: Unknown token type";
1334 }
1335 $self->{state} = DATA_STATE;
1336 $self->{s_kwd} = '';
1337 ## reconsume
1338
1339 !!!emit ($self->{ct}); # start tag or end tag
1340
1341 redo A;
1342 } else {
1343 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1344 !!!cp (106);
1345 ## XML5: Not a parse error.
1346 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1347 } else {
1348 !!!cp (106.1);
1349 }
1350 $self->{ca}->{value} .= chr ($self->{nc});
1351 $self->{read_until}->($self->{ca}->{value},
1352 q['&<],
1353 length $self->{ca}->{value});
1354
1355 ## Stay in the state
1356 !!!next-input-character;
1357 redo A;
1358 }
1359 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1360 ## XML5: "Tag attribute value unquoted state".
1361
1362 if ($is_space->{$self->{nc}}) {
1363 !!!cp (107);
1364 ## XML5: "Tag attribute name before state".
1365 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1366 !!!next-input-character;
1367 redo A;
1368 } elsif ($self->{nc} == 0x0026) { # &
1369 !!!cp (108);
1370
1371 ## XML5: Not defined yet.
1372
1373 ## NOTE: In the spec, the tokenizer is switched to the
1374 ## "entity in attribute value state". In this implementation, the
1375 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1376 ## implementation of the "consume a character reference" algorithm.
1377 $self->{entity_add} = -1;
1378 $self->{prev_state} = $self->{state};
1379 $self->{state} = ENTITY_STATE;
1380 !!!next-input-character;
1381 redo A;
1382 } elsif ($self->{nc} == 0x003E) { # >
1383 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1384 !!!cp (109);
1385 $self->{last_stag_name} = $self->{ct}->{tag_name};
1386 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1387 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1388 if ($self->{ct}->{attributes}) {
1389 !!!cp (110);
1390 !!!parse-error (type => 'end tag attribute');
1391 } else {
1392 ## NOTE: This state should never be reached.
1393 !!!cp (111);
1394 }
1395 } else {
1396 die "$0: $self->{ct}->{type}: Unknown token type";
1397 }
1398 $self->{state} = DATA_STATE;
1399 $self->{s_kwd} = '';
1400 !!!next-input-character;
1401
1402 !!!emit ($self->{ct}); # start tag or end tag
1403
1404 redo A;
1405 } elsif ($self->{nc} == -1) {
1406 !!!parse-error (type => 'unclosed tag');
1407 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1408 !!!cp (112);
1409 $self->{last_stag_name} = $self->{ct}->{tag_name};
1410 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1411 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1412 if ($self->{ct}->{attributes}) {
1413 !!!cp (113);
1414 !!!parse-error (type => 'end tag attribute');
1415 } else {
1416 ## NOTE: This state should never be reached.
1417 !!!cp (114);
1418 }
1419 } else {
1420 die "$0: $self->{ct}->{type}: Unknown token type";
1421 }
1422 $self->{state} = DATA_STATE;
1423 $self->{s_kwd} = '';
1424 ## reconsume
1425
1426 !!!emit ($self->{ct}); # start tag or end tag
1427
1428 redo A;
1429 } else {
1430 if ({
1431 0x0022 => 1, # "
1432 0x0027 => 1, # '
1433 0x003D => 1, # =
1434 }->{$self->{nc}}) {
1435 !!!cp (115);
1436 ## XML5: Not a parse error.
1437 !!!parse-error (type => 'bad attribute value');
1438 } else {
1439 !!!cp (116);
1440 }
1441 $self->{ca}->{value} .= chr ($self->{nc});
1442 $self->{read_until}->($self->{ca}->{value},
1443 q["'=& >],
1444 length $self->{ca}->{value});
1445
1446 ## Stay in the state
1447 !!!next-input-character;
1448 redo A;
1449 }
1450 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1451 if ($is_space->{$self->{nc}}) {
1452 !!!cp (118);
1453 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1454 !!!next-input-character;
1455 redo A;
1456 } elsif ($self->{nc} == 0x003E) { # >
1457 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1458 !!!cp (119);
1459 $self->{last_stag_name} = $self->{ct}->{tag_name};
1460 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1461 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1462 if ($self->{ct}->{attributes}) {
1463 !!!cp (120);
1464 !!!parse-error (type => 'end tag attribute');
1465 } else {
1466 ## NOTE: This state should never be reached.
1467 !!!cp (121);
1468 }
1469 } else {
1470 die "$0: $self->{ct}->{type}: Unknown token type";
1471 }
1472 $self->{state} = DATA_STATE;
1473 $self->{s_kwd} = '';
1474 !!!next-input-character;
1475
1476 !!!emit ($self->{ct}); # start tag or end tag
1477
1478 redo A;
1479 } elsif ($self->{nc} == 0x002F) { # /
1480 !!!cp (122);
1481 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1482 !!!next-input-character;
1483 redo A;
1484 } elsif ($self->{nc} == -1) {
1485 !!!parse-error (type => 'unclosed tag');
1486 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1487 !!!cp (122.3);
1488 $self->{last_stag_name} = $self->{ct}->{tag_name};
1489 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1490 if ($self->{ct}->{attributes}) {
1491 !!!cp (122.1);
1492 !!!parse-error (type => 'end tag attribute');
1493 } else {
1494 ## NOTE: This state should never be reached.
1495 !!!cp (122.2);
1496 }
1497 } else {
1498 die "$0: $self->{ct}->{type}: Unknown token type";
1499 }
1500 $self->{state} = DATA_STATE;
1501 $self->{s_kwd} = '';
1502 ## Reconsume.
1503 !!!emit ($self->{ct}); # start tag or end tag
1504 redo A;
1505 } else {
1506 !!!cp ('124.1');
1507 !!!parse-error (type => 'no space between attributes');
1508 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1509 ## reconsume
1510 redo A;
1511 }
1512 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1513 ## XML5: "Empty tag state".
1514
1515 if ($self->{nc} == 0x003E) { # >
1516 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1517 !!!cp ('124.2');
1518 !!!parse-error (type => 'nestc', token => $self->{ct});
1519 ## TODO: Different type than slash in start tag
1520 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1521 if ($self->{ct}->{attributes}) {
1522 !!!cp ('124.4');
1523 !!!parse-error (type => 'end tag attribute');
1524 } else {
1525 !!!cp ('124.5');
1526 }
1527 ## TODO: Test |<title></title/>|
1528 } else {
1529 !!!cp ('124.3');
1530 $self->{self_closing} = 1;
1531 }
1532
1533 $self->{state} = DATA_STATE;
1534 $self->{s_kwd} = '';
1535 !!!next-input-character;
1536
1537 !!!emit ($self->{ct}); # start tag or end tag
1538
1539 redo A;
1540 } elsif ($self->{nc} == -1) {
1541 !!!parse-error (type => 'unclosed tag');
1542 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1543 !!!cp (124.7);
1544 $self->{last_stag_name} = $self->{ct}->{tag_name};
1545 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1546 if ($self->{ct}->{attributes}) {
1547 !!!cp (124.5);
1548 !!!parse-error (type => 'end tag attribute');
1549 } else {
1550 ## NOTE: This state should never be reached.
1551 !!!cp (124.6);
1552 }
1553 } else {
1554 die "$0: $self->{ct}->{type}: Unknown token type";
1555 }
1556 ## XML5: "Tag attribute name before state".
1557 $self->{state} = DATA_STATE;
1558 $self->{s_kwd} = '';
1559 ## Reconsume.
1560 !!!emit ($self->{ct}); # start tag or end tag
1561 redo A;
1562 } else {
1563 !!!cp ('124.4');
1564 !!!parse-error (type => 'nestc');
1565 ## TODO: This error type is wrong.
1566 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1567 ## Reconsume.
1568 redo A;
1569 }
1570 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1571 ## NOTE: Unlike spec's "bogus comment state", this implementation
1572 ## consumes characters on a one-by-one basis.
1573
1574 if ($self->{nc} == 0x003E) { # >
1575 if ($self->{in_subset}) {
1576 !!!cp (123);
1577 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1578 } else {
1579 !!!cp (124);
1580 $self->{state} = DATA_STATE;
1581 $self->{s_kwd} = '';
1582 }
1583 !!!next-input-character;
1584
1585 !!!emit ($self->{ct}); # comment
1586 redo A;
1587 } elsif ($self->{nc} == -1) {
1588 if ($self->{in_subset}) {
1589 !!!cp (125.1);
1590 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1591 } else {
1592 !!!cp (125);
1593 $self->{state} = DATA_STATE;
1594 $self->{s_kwd} = '';
1595 }
1596 ## reconsume
1597
1598 !!!emit ($self->{ct}); # comment
1599 redo A;
1600 } else {
1601 !!!cp (126);
1602 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1603 $self->{read_until}->($self->{ct}->{data},
1604 q[>],
1605 length $self->{ct}->{data});
1606
1607 ## Stay in the state.
1608 !!!next-input-character;
1609 redo A;
1610 }
1611 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1612 ## XML5: "Markup declaration state" and "DOCTYPE markup
1613 ## declaration state".
1614
1615 if ($self->{nc} == 0x002D) { # -
1616 !!!cp (133);
1617 $self->{state} = MD_HYPHEN_STATE;
1618 !!!next-input-character;
1619 redo A;
1620 } elsif ($self->{nc} == 0x0044 or # D
1621 $self->{nc} == 0x0064) { # d
1622 ## ASCII case-insensitive.
1623 !!!cp (130);
1624 $self->{state} = MD_DOCTYPE_STATE;
1625 $self->{kwd} = chr $self->{nc};
1626 !!!next-input-character;
1627 redo A;
1628 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1629 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1630 $self->{is_xml}) and
1631 $self->{nc} == 0x005B) { # [
1632 !!!cp (135.4);
1633 $self->{state} = MD_CDATA_STATE;
1634 $self->{kwd} = '[';
1635 !!!next-input-character;
1636 redo A;
1637 } else {
1638 !!!cp (136);
1639 }
1640
1641 !!!parse-error (type => 'bogus comment',
1642 line => $self->{line_prev},
1643 column => $self->{column_prev} - 1);
1644 ## Reconsume.
1645 $self->{state} = BOGUS_COMMENT_STATE;
1646 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1647 line => $self->{line_prev},
1648 column => $self->{column_prev} - 1,
1649 };
1650 redo A;
1651 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1652 if ($self->{nc} == 0x002D) { # -
1653 !!!cp (127);
1654 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1655 line => $self->{line_prev},
1656 column => $self->{column_prev} - 2,
1657 };
1658 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1659 !!!next-input-character;
1660 redo A;
1661 } else {
1662 !!!cp (128);
1663 !!!parse-error (type => 'bogus comment',
1664 line => $self->{line_prev},
1665 column => $self->{column_prev} - 2);
1666 $self->{state} = BOGUS_COMMENT_STATE;
1667 ## Reconsume.
1668 $self->{ct} = {type => COMMENT_TOKEN,
1669 data => '-',
1670 line => $self->{line_prev},
1671 column => $self->{column_prev} - 2,
1672 };
1673 redo A;
1674 }
1675 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1676 ## ASCII case-insensitive.
1677 if ($self->{nc} == [
1678 undef,
1679 0x004F, # O
1680 0x0043, # C
1681 0x0054, # T
1682 0x0059, # Y
1683 0x0050, # P
1684 ]->[length $self->{kwd}] or
1685 $self->{nc} == [
1686 undef,
1687 0x006F, # o
1688 0x0063, # c
1689 0x0074, # t
1690 0x0079, # y
1691 0x0070, # p
1692 ]->[length $self->{kwd}]) {
1693 !!!cp (131);
1694 ## Stay in the state.
1695 $self->{kwd} .= chr $self->{nc};
1696 !!!next-input-character;
1697 redo A;
1698 } elsif ((length $self->{kwd}) == 6 and
1699 ($self->{nc} == 0x0045 or # E
1700 $self->{nc} == 0x0065)) { # e
1701 if ($self->{is_xml} and
1702 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1703 !!!cp (129);
1704 ## XML5: case-sensitive.
1705 !!!parse-error (type => 'lowercase keyword', ## TODO
1706 text => 'DOCTYPE',
1707 line => $self->{line_prev},
1708 column => $self->{column_prev} - 5);
1709 } else {
1710 !!!cp (129.1);
1711 }
1712 $self->{state} = DOCTYPE_STATE;
1713 $self->{ct} = {type => DOCTYPE_TOKEN,
1714 quirks => 1,
1715 line => $self->{line_prev},
1716 column => $self->{column_prev} - 7,
1717 };
1718 !!!next-input-character;
1719 redo A;
1720 } else {
1721 !!!cp (132);
1722 !!!parse-error (type => 'bogus comment',
1723 line => $self->{line_prev},
1724 column => $self->{column_prev} - 1 - length $self->{kwd});
1725 $self->{state} = BOGUS_COMMENT_STATE;
1726 ## Reconsume.
1727 $self->{ct} = {type => COMMENT_TOKEN,
1728 data => $self->{kwd},
1729 line => $self->{line_prev},
1730 column => $self->{column_prev} - 1 - length $self->{kwd},
1731 };
1732 redo A;
1733 }
1734 } elsif ($self->{state} == MD_CDATA_STATE) {
1735 if ($self->{nc} == {
1736 '[' => 0x0043, # C
1737 '[C' => 0x0044, # D
1738 '[CD' => 0x0041, # A
1739 '[CDA' => 0x0054, # T
1740 '[CDAT' => 0x0041, # A
1741 }->{$self->{kwd}}) {
1742 !!!cp (135.1);
1743 ## Stay in the state.
1744 $self->{kwd} .= chr $self->{nc};
1745 !!!next-input-character;
1746 redo A;
1747 } elsif ($self->{kwd} eq '[CDATA' and
1748 $self->{nc} == 0x005B) { # [
1749 if ($self->{is_xml} and
1750 not $self->{tainted} and
1751 @{$self->{open_elements} or []} == 0) {
1752 !!!cp (135.2);
1753 !!!parse-error (type => 'cdata outside of root element',
1754 line => $self->{line_prev},
1755 column => $self->{column_prev} - 7);
1756 $self->{tainted} = 1;
1757 } else {
1758 !!!cp (135.21);
1759 }
1760
1761 $self->{ct} = {type => CHARACTER_TOKEN,
1762 data => '',
1763 line => $self->{line_prev},
1764 column => $self->{column_prev} - 7};
1765 $self->{state} = CDATA_SECTION_STATE;
1766 !!!next-input-character;
1767 redo A;
1768 } else {
1769 !!!cp (135.3);
1770 !!!parse-error (type => 'bogus comment',
1771 line => $self->{line_prev},
1772 column => $self->{column_prev} - 1 - length $self->{kwd});
1773 $self->{state} = BOGUS_COMMENT_STATE;
1774 ## Reconsume.
1775 $self->{ct} = {type => COMMENT_TOKEN,
1776 data => $self->{kwd},
1777 line => $self->{line_prev},
1778 column => $self->{column_prev} - 1 - length $self->{kwd},
1779 };
1780 redo A;
1781 }
1782 } elsif ($self->{state} == COMMENT_START_STATE) {
1783 if ($self->{nc} == 0x002D) { # -
1784 !!!cp (137);
1785 $self->{state} = COMMENT_START_DASH_STATE;
1786 !!!next-input-character;
1787 redo A;
1788 } elsif ($self->{nc} == 0x003E) { # >
1789 !!!parse-error (type => 'bogus comment');
1790 if ($self->{in_subset}) {
1791 !!!cp (138.1);
1792 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1793 } else {
1794 !!!cp (138);
1795 $self->{state} = DATA_STATE;
1796 $self->{s_kwd} = '';
1797 }
1798 !!!next-input-character;
1799
1800 !!!emit ($self->{ct}); # comment
1801
1802 redo A;
1803 } elsif ($self->{nc} == -1) {
1804 !!!parse-error (type => 'unclosed comment');
1805 if ($self->{in_subset}) {
1806 !!!cp (139.1);
1807 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1808 } else {
1809 !!!cp (139);
1810 $self->{state} = DATA_STATE;
1811 $self->{s_kwd} = '';
1812 }
1813 ## reconsume
1814
1815 !!!emit ($self->{ct}); # comment
1816
1817 redo A;
1818 } else {
1819 !!!cp (140);
1820 $self->{ct}->{data} # comment
1821 .= chr ($self->{nc});
1822 $self->{state} = COMMENT_STATE;
1823 !!!next-input-character;
1824 redo A;
1825 }
1826 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1827 if ($self->{nc} == 0x002D) { # -
1828 !!!cp (141);
1829 $self->{state} = COMMENT_END_STATE;
1830 !!!next-input-character;
1831 redo A;
1832 } elsif ($self->{nc} == 0x003E) { # >
1833 !!!parse-error (type => 'bogus comment');
1834 if ($self->{in_subset}) {
1835 !!!cp (142.1);
1836 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1837 } else {
1838 !!!cp (142);
1839 $self->{state} = DATA_STATE;
1840 $self->{s_kwd} = '';
1841 }
1842 !!!next-input-character;
1843
1844 !!!emit ($self->{ct}); # comment
1845
1846 redo A;
1847 } elsif ($self->{nc} == -1) {
1848 !!!parse-error (type => 'unclosed comment');
1849 if ($self->{in_subset}) {
1850 !!!cp (143.1);
1851 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1852 } else {
1853 !!!cp (143);
1854 $self->{state} = DATA_STATE;
1855 $self->{s_kwd} = '';
1856 }
1857 ## reconsume
1858
1859 !!!emit ($self->{ct}); # comment
1860
1861 redo A;
1862 } else {
1863 !!!cp (144);
1864 $self->{ct}->{data} # comment
1865 .= '-' . chr ($self->{nc});
1866 $self->{state} = COMMENT_STATE;
1867 !!!next-input-character;
1868 redo A;
1869 }
1870 } elsif ($self->{state} == COMMENT_STATE) {
1871 if ($self->{nc} == 0x002D) { # -
1872 !!!cp (145);
1873 $self->{state} = COMMENT_END_DASH_STATE;
1874 !!!next-input-character;
1875 redo A;
1876 } elsif ($self->{nc} == -1) {
1877 !!!parse-error (type => 'unclosed comment');
1878 if ($self->{in_subset}) {
1879 !!!cp (146.1);
1880 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1881 } else {
1882 !!!cp (146);
1883 $self->{state} = DATA_STATE;
1884 $self->{s_kwd} = '';
1885 }
1886 ## reconsume
1887
1888 !!!emit ($self->{ct}); # comment
1889
1890 redo A;
1891 } else {
1892 !!!cp (147);
1893 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1894 $self->{read_until}->($self->{ct}->{data},
1895 q[-],
1896 length $self->{ct}->{data});
1897
1898 ## Stay in the state
1899 !!!next-input-character;
1900 redo A;
1901 }
1902 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1903 ## XML5: "comment dash state".
1904
1905 if ($self->{nc} == 0x002D) { # -
1906 !!!cp (148);
1907 $self->{state} = COMMENT_END_STATE;
1908 !!!next-input-character;
1909 redo A;
1910 } elsif ($self->{nc} == -1) {
1911 !!!parse-error (type => 'unclosed comment');
1912 if ($self->{in_subset}) {
1913 !!!cp (149.1);
1914 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1915 } else {
1916 !!!cp (149);
1917 $self->{state} = DATA_STATE;
1918 $self->{s_kwd} = '';
1919 }
1920 ## reconsume
1921
1922 !!!emit ($self->{ct}); # comment
1923
1924 redo A;
1925 } else {
1926 !!!cp (150);
1927 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1928 $self->{state} = COMMENT_STATE;
1929 !!!next-input-character;
1930 redo A;
1931 }
1932 } elsif ($self->{state} == COMMENT_END_STATE) {
1933 if ($self->{nc} == 0x003E) { # >
1934 if ($self->{in_subset}) {
1935 !!!cp (151.1);
1936 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937 } else {
1938 !!!cp (151);
1939 $self->{state} = DATA_STATE;
1940 $self->{s_kwd} = '';
1941 }
1942 !!!next-input-character;
1943
1944 !!!emit ($self->{ct}); # comment
1945
1946 redo A;
1947 } elsif ($self->{nc} == 0x002D) { # -
1948 !!!cp (152);
1949 ## XML5: Not a parse error.
1950 !!!parse-error (type => 'dash in comment',
1951 line => $self->{line_prev},
1952 column => $self->{column_prev});
1953 $self->{ct}->{data} .= '-'; # comment
1954 ## Stay in the state
1955 !!!next-input-character;
1956 redo A;
1957 } elsif ($self->{nc} == -1) {
1958 !!!parse-error (type => 'unclosed comment');
1959 if ($self->{in_subset}) {
1960 !!!cp (153.1);
1961 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1962 } else {
1963 !!!cp (153);
1964 $self->{state} = DATA_STATE;
1965 $self->{s_kwd} = '';
1966 }
1967 ## reconsume
1968
1969 !!!emit ($self->{ct}); # comment
1970
1971 redo A;
1972 } else {
1973 !!!cp (154);
1974 ## XML5: Not a parse error.
1975 !!!parse-error (type => 'dash in comment',
1976 line => $self->{line_prev},
1977 column => $self->{column_prev});
1978 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1979 $self->{state} = COMMENT_STATE;
1980 !!!next-input-character;
1981 redo A;
1982 }
1983 } elsif ($self->{state} == DOCTYPE_STATE) {
1984 if ($is_space->{$self->{nc}}) {
1985 !!!cp (155);
1986 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1987 !!!next-input-character;
1988 redo A;
1989 } else {
1990 !!!cp (156);
1991 ## XML5: Unless EOF, switch to the bogus comment state.
1992 !!!parse-error (type => 'no space before DOCTYPE name');
1993 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1994 ## reconsume
1995 redo A;
1996 }
1997 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1998 ## XML5: "DOCTYPE root name before state".
1999
2000 if ($is_space->{$self->{nc}}) {
2001 !!!cp (157);
2002 ## Stay in the state
2003 !!!next-input-character;
2004 redo A;
2005 } elsif ($self->{nc} == 0x003E) { # >
2006 !!!cp (158);
2007 ## XML5: No parse error.
2008 !!!parse-error (type => 'no DOCTYPE name');
2009 $self->{state} = DATA_STATE;
2010 $self->{s_kwd} = '';
2011 !!!next-input-character;
2012
2013 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2014
2015 redo A;
2016 } elsif ($self->{nc} == -1) {
2017 !!!cp (159);
2018 !!!parse-error (type => 'no DOCTYPE name');
2019 $self->{state} = DATA_STATE;
2020 $self->{s_kwd} = '';
2021 ## reconsume
2022
2023 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2024
2025 redo A;
2026 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2027 !!!cp (159.1);
2028 !!!parse-error (type => 'no DOCTYPE name');
2029 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2030 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2031 $self->{in_subset} = 1;
2032 !!!next-input-character;
2033 !!!emit ($self->{ct}); # DOCTYPE
2034 redo A;
2035 } else {
2036 !!!cp (160);
2037 $self->{ct}->{name} = chr $self->{nc};
2038 delete $self->{ct}->{quirks};
2039 $self->{state} = DOCTYPE_NAME_STATE;
2040 !!!next-input-character;
2041 redo A;
2042 }
2043 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2044 ## XML5: "DOCTYPE root name state".
2045
2046 ## ISSUE: Redundant "First," in the spec.
2047
2048 if ($is_space->{$self->{nc}}) {
2049 !!!cp (161);
2050 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2051 !!!next-input-character;
2052 redo A;
2053 } elsif ($self->{nc} == 0x003E) { # >
2054 !!!cp (162);
2055 $self->{state} = DATA_STATE;
2056 $self->{s_kwd} = '';
2057 !!!next-input-character;
2058
2059 !!!emit ($self->{ct}); # DOCTYPE
2060
2061 redo A;
2062 } elsif ($self->{nc} == -1) {
2063 !!!cp (163);
2064 !!!parse-error (type => 'unclosed DOCTYPE');
2065 $self->{state} = DATA_STATE;
2066 $self->{s_kwd} = '';
2067 ## reconsume
2068
2069 $self->{ct}->{quirks} = 1;
2070 !!!emit ($self->{ct}); # DOCTYPE
2071
2072 redo A;
2073 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2074 !!!cp (163.1);
2075 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2076 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2077 $self->{in_subset} = 1;
2078 !!!next-input-character;
2079 !!!emit ($self->{ct}); # DOCTYPE
2080 redo A;
2081 } else {
2082 !!!cp (164);
2083 $self->{ct}->{name}
2084 .= chr ($self->{nc}); # DOCTYPE
2085 ## Stay in the state
2086 !!!next-input-character;
2087 redo A;
2088 }
2089 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
## Decides what follows the DOCTYPE name: the end of the DOCTYPE, the
## first letter of a PUBLIC or SYSTEM keyword (saved in $self->{kwd}),
## or (XML only) an internal subset.  Any other character marks the
## token as quirks and switches to BOGUS_DOCTYPE_STATE.
2090 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2091 ## state", but implemented differently.
2092
2093 if ($is_space->{$self->{nc}}) {
2094 !!!cp (165);
2095 ## Stay in the state
2096 !!!next-input-character;
2097 redo A;
2098 } elsif ($self->{nc} == 0x003E) { # >
2099 !!!cp (166);
2100 $self->{state} = DATA_STATE;
2101 $self->{s_kwd} = '';
2102 !!!next-input-character;
2103
2104 !!!emit ($self->{ct}); # DOCTYPE
2105
2106 redo A;
2107 } elsif ($self->{nc} == -1) {
2108 !!!cp (167);
2109 !!!parse-error (type => 'unclosed DOCTYPE');
2110 $self->{state} = DATA_STATE;
2111 $self->{s_kwd} = '';
2112 ## reconsume
2113
2114 $self->{ct}->{quirks} = 1;
2115 !!!emit ($self->{ct}); # DOCTYPE
2116
2117 redo A;
2118 } elsif ($self->{nc} == 0x0050 or # P
2119 $self->{nc} == 0x0070) { # p
2120 !!!cp (167.1);
2121 $self->{state} = PUBLIC_STATE;
## Keep the letter as typed; PUBLIC_STATE checks the case later.
2122 $self->{kwd} = chr $self->{nc};
2123 !!!next-input-character;
2124 redo A;
2125 } elsif ($self->{nc} == 0x0053 or # S
2126 $self->{nc} == 0x0073) { # s
2127 !!!cp (167.2);
2128 $self->{state} = SYSTEM_STATE;
## Keep the letter as typed; SYSTEM_STATE checks the case later.
2129 $self->{kwd} = chr $self->{nc};
2130 !!!next-input-character;
2131 redo A;
2132 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2133 !!!cp (167.3);
2134 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2135 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2136 $self->{in_subset} = 1;
2137 !!!next-input-character;
2138 !!!emit ($self->{ct}); # DOCTYPE
2139 redo A;
2140 } else {
2141 !!!cp (180);
2142 !!!parse-error (type => 'string after DOCTYPE name');
2143 $self->{ct}->{quirks} = 1;
2144
2145 $self->{state} = BOGUS_DOCTYPE_STATE;
2146 !!!next-input-character;
2147 redo A;
2148 }
2149 } elsif ($self->{state} == PUBLIC_STATE) {
## Matches the remaining letters of the keyword "PUBLIC".
## $self->{kwd} holds the letters matched so far, so its length is used
## as the index of the next expected code point in the two tables below
## (upper-case and lower-case).  Index 0 is an unused placeholder.
## The final "C"/"c" is handled separately once five letters are held.
2150 ## ASCII case-insensitive
2151 if ($self->{nc} == [
2152 undef,
2153 0x0055, # U
2154 0x0042, # B
2155 0x004C, # L
2156 0x0049, # I
2157 ]->[length $self->{kwd}] or
2158 $self->{nc} == [
2159 undef,
2160 0x0075, # u
2161 0x0062, # b
2162 0x006C, # l
2163 0x0069, # i
2164 ]->[length $self->{kwd}]) {
2165 !!!cp (175);
2166 ## Stay in the state.
2167 $self->{kwd} .= chr $self->{nc};
2168 !!!next-input-character;
2169 redo A;
2170 } elsif ((length $self->{kwd}) == 5 and
2171 ($self->{nc} == 0x0043 or # C
2172 $self->{nc} == 0x0063)) { # c
## In XML, anything other than the exact upper-case spelling is
## reported, but the keyword is still accepted.
2173 if ($self->{is_xml} and
2174 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2175 !!!cp (168.1);
2176 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2177 text => 'PUBLIC',
2178 line => $self->{line_prev},
2179 column => $self->{column_prev} - 4);
2180 } else {
2181 !!!cp (168);
2182 }
2183 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2184 !!!next-input-character;
2185 redo A;
2186 } else {
## Not the keyword after all: report the error at the position
## computed from the number of letters already consumed.
2187 !!!cp (169);
2188 !!!parse-error (type => 'string after DOCTYPE name',
2189 line => $self->{line_prev},
2190 column => $self->{column_prev} + 1 - length $self->{kwd});
2191 $self->{ct}->{quirks} = 1;
2192
2193 $self->{state} = BOGUS_DOCTYPE_STATE;
2194 ## Reconsume.
2195 redo A;
2196 }
2197 } elsif ($self->{state} == SYSTEM_STATE) {
## Matches the remaining letters of the keyword "SYSTEM".
## $self->{kwd} holds the letters matched so far, so its length is used
## as the index of the next expected code point in the two tables below
## (upper-case and lower-case).  Index 0 is an unused placeholder.
## The final "M"/"m" is handled separately once five letters are held.
2198 ## ASCII case-insensitive
2199 if ($self->{nc} == [
2200 undef,
2201 0x0059, # Y
2202 0x0053, # S
2203 0x0054, # T
2204 0x0045, # E
2205 ]->[length $self->{kwd}] or
2206 $self->{nc} == [
2207 undef,
2208 0x0079, # y
2209 0x0073, # s
2210 0x0074, # t
2211 0x0065, # e
2212 ]->[length $self->{kwd}]) {
2213 !!!cp (170);
2214 ## Stay in the state.
2215 $self->{kwd} .= chr $self->{nc};
2216 !!!next-input-character;
2217 redo A;
2218 } elsif ((length $self->{kwd}) == 5 and
2219 ($self->{nc} == 0x004D or # M
2220 $self->{nc} == 0x006D)) { # m
## In XML, anything other than the exact upper-case spelling is
## reported, but the keyword is still accepted.
2221 if ($self->{is_xml} and
2222 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2223 !!!cp (171.1);
2224 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2225 text => 'SYSTEM',
2226 line => $self->{line_prev},
2227 column => $self->{column_prev} - 4);
2228 } else {
2229 !!!cp (171);
2230 }
2231 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2232 !!!next-input-character;
2233 redo A;
2234 } else {
## Not the keyword after all: report the error at the position
## computed from the number of letters already consumed.
2235 !!!cp (172);
2236 !!!parse-error (type => 'string after DOCTYPE name',
2237 line => $self->{line_prev},
2238 column => $self->{column_prev} + 1 - length $self->{kwd});
2239 $self->{ct}->{quirks} = 1;
2240
2241 $self->{state} = BOGUS_DOCTYPE_STATE;
2242 ## Reconsume.
2243 redo A;
2244 }
2245 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
## Entered after the PUBLIC keyword.  Skips white space and waits for
## the quotation mark that opens the public identifier.  A ">", nc == -1
## (treated as end of input), or (XML only) "[" at this point means the
## public identifier is missing; any other character switches to
## BOGUS_DOCTYPE_STATE with the quirks flag set.
##
## $self->{nc} is a numeric code point, so it is compared with "=="
## throughout, as in the other states.
2246 if ($is_space->{$self->{nc}}) {
2247 !!!cp (181);
2248 ## Stay in the state
2249 !!!next-input-character;
2250 redo A;
2251 } elsif ($self->{nc} == 0x0022) { # "
2252 !!!cp (182);
2253 $self->{ct}->{pubid} = ''; # DOCTYPE
2254 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2255 !!!next-input-character;
2256 redo A;
2257 } elsif ($self->{nc} == 0x0027) { # '
2258 !!!cp (183);
2259 $self->{ct}->{pubid} = ''; # DOCTYPE
2260 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2261 !!!next-input-character;
2262 redo A;
2263 } elsif ($self->{nc} == 0x003E) { # >
2264 !!!cp (184);
2265 !!!parse-error (type => 'no PUBLIC literal');
2266
2267 $self->{state} = DATA_STATE;
2268 $self->{s_kwd} = '';
2269 !!!next-input-character;
2270
2271 $self->{ct}->{quirks} = 1;
2272 !!!emit ($self->{ct}); # DOCTYPE
2273
2274 redo A;
2275 } elsif ($self->{nc} == -1) {
2276 !!!cp (185);
2277 !!!parse-error (type => 'unclosed DOCTYPE');
2278
2279 $self->{state} = DATA_STATE;
2280 $self->{s_kwd} = '';
2281 ## reconsume
2282
2283 $self->{ct}->{quirks} = 1;
2284 !!!emit ($self->{ct}); # DOCTYPE
2285
2286 redo A;
2287 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
## XML only: the internal subset starts although no public
## identifier was given.  The DOCTYPE token is emitted right away
## with its has_internal_subset flag set.
2288 !!!cp (186.1);
2289 !!!parse-error (type => 'no PUBLIC literal');
2290 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2291 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2292 $self->{in_subset} = 1;
2293 !!!next-input-character;
2294 !!!emit ($self->{ct}); # DOCTYPE
2295 redo A;
2296 } else {
2297 !!!cp (186);
2298 !!!parse-error (type => 'string after PUBLIC');
2299 $self->{ct}->{quirks} = 1;
2300
2301 $self->{state} = BOGUS_DOCTYPE_STATE;
2302 !!!next-input-character;
2303 redo A;
2304 }
2305 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
## Collects the public identifier into $self->{ct}->{pubid} until the
## closing quotation mark (").  A ">" or nc == -1 (treated as end of
## input) before that ends the DOCTYPE with the quirks flag set.
2306 if ($self->{nc} == 0x0022) { # "
2307 !!!cp (187);
2308 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2309 !!!next-input-character;
2310 redo A;
2311 } elsif ($self->{nc} == 0x003E) { # >
2312 !!!cp (188);
2313 !!!parse-error (type => 'unclosed PUBLIC literal');
2314
2315 $self->{state} = DATA_STATE;
2316 $self->{s_kwd} = '';
2317 !!!next-input-character;
2318
2319 $self->{ct}->{quirks} = 1;
2320 !!!emit ($self->{ct}); # DOCTYPE
2321
2322 redo A;
2323 } elsif ($self->{nc} == -1) {
2324 !!!cp (189);
2325 !!!parse-error (type => 'unclosed PUBLIC literal');
2326
2327 $self->{state} = DATA_STATE;
2328 $self->{s_kwd} = '';
2329 ## reconsume
2330
2331 $self->{ct}->{quirks} = 1;
2332 !!!emit ($self->{ct}); # DOCTYPE
2333
2334 redo A;
2335 } else {
2336 !!!cp (190);
2337 $self->{ct}->{pubid} # DOCTYPE
2338 .= chr $self->{nc};
## NOTE(review): read_until presumably appends the following run of
## characters up to (not including) one of the listed delimiters;
## its definition is outside this view -- confirm.
2339 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2340 length $self->{ct}->{pubid});
2341
2342 ## Stay in the state
2343 !!!next-input-character;
2344 redo A;
2345 }
2346 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
## Collects the public identifier into $self->{ct}->{pubid} until the
## closing apostrophe (').  A ">" or nc == -1 (treated as end of input)
## before that ends the DOCTYPE with the quirks flag set.
2347 if ($self->{nc} == 0x0027) { # '
2348 !!!cp (191);
2349 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2350 !!!next-input-character;
2351 redo A;
2352 } elsif ($self->{nc} == 0x003E) { # >
2353 !!!cp (192);
2354 !!!parse-error (type => 'unclosed PUBLIC literal');
2355
2356 $self->{state} = DATA_STATE;
2357 $self->{s_kwd} = '';
2358 !!!next-input-character;
2359
2360 $self->{ct}->{quirks} = 1;
2361 !!!emit ($self->{ct}); # DOCTYPE
2362
2363 redo A;
2364 } elsif ($self->{nc} == -1) {
2365 !!!cp (193);
2366 !!!parse-error (type => 'unclosed PUBLIC literal');
2367
2368 $self->{state} = DATA_STATE;
2369 $self->{s_kwd} = '';
2370 ## reconsume
2371
2372 $self->{ct}->{quirks} = 1;
2373 !!!emit ($self->{ct}); # DOCTYPE
2374
2375 redo A;
2376 } else {
2377 !!!cp (194);
2378 $self->{ct}->{pubid} # DOCTYPE
2379 .= chr $self->{nc};
## NOTE(review): read_until presumably appends the following run of
## characters up to (not including) one of the listed delimiters;
## its definition is outside this view -- confirm.
2380 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2381 length $self->{ct}->{pubid});
2382
2383 ## Stay in the state
2384 !!!next-input-character;
2385 redo A;
2386 }
2387 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
## Entered after the closing quote of the public identifier.  Skips
## white space and waits for the quotation mark that opens the system
## identifier, the end of the DOCTYPE, or (XML only) an internal subset.
2388 if ($is_space->{$self->{nc}}) {
2389 !!!cp (195);
2390 ## Stay in the state
2391 !!!next-input-character;
2392 redo A;
2393 } elsif ($self->{nc} == 0x0022) { # "
2394 !!!cp (196);
2395 $self->{ct}->{sysid} = ''; # DOCTYPE
2396 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2397 !!!next-input-character;
2398 redo A;
2399 } elsif ($self->{nc} == 0x0027) { # '
2400 !!!cp (197);
2401 $self->{ct}->{sysid} = ''; # DOCTYPE
2402 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2403 !!!next-input-character;
2404 redo A;
2405 } elsif ($self->{nc} == 0x003E) { # >
## A missing system identifier is reported only in XML mode; the
## quirks flag is not touched in either mode.
2406 if ($self->{is_xml}) {
2407 !!!cp (198.1);
2408 !!!parse-error (type => 'no SYSTEM literal');
2409 } else {
2410 !!!cp (198);
2411 }
2412 $self->{state} = DATA_STATE;
2413 $self->{s_kwd} = '';
2414 !!!next-input-character;
2415
2416 !!!emit ($self->{ct}); # DOCTYPE
2417
2418 redo A;
2419 } elsif ($self->{nc} == -1) {
2420 !!!cp (199);
2421 !!!parse-error (type => 'unclosed DOCTYPE');
2422
2423 $self->{state} = DATA_STATE;
2424 $self->{s_kwd} = '';
2425 ## reconsume
2426
2427 $self->{ct}->{quirks} = 1;
2428 !!!emit ($self->{ct}); # DOCTYPE
2429
2430 redo A;
2431 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2432 !!!cp (200.1);
2433 !!!parse-error (type => 'no SYSTEM literal');
2434 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2435 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2436 $self->{in_subset} = 1;
2437 !!!next-input-character;
2438 !!!emit ($self->{ct}); # DOCTYPE
2439 redo A;
2440 } else {
2441 !!!cp (200);
2442 !!!parse-error (type => 'string after PUBLIC literal');
2443 $self->{ct}->{quirks} = 1;
2444
2445 $self->{state} = BOGUS_DOCTYPE_STATE;
2446 !!!next-input-character;
2447 redo A;
2448 }
2449 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
## Entered after the SYSTEM keyword.  Skips white space and waits for
## the quotation mark that opens the system identifier.  A ">", nc == -1
## (treated as end of input), or (XML only) "[" at this point means the
## system identifier is missing.
2450 if ($is_space->{$self->{nc}}) {
2451 !!!cp (201);
2452 ## Stay in the state
2453 !!!next-input-character;
2454 redo A;
2455 } elsif ($self->{nc} == 0x0022) { # "
2456 !!!cp (202);
2457 $self->{ct}->{sysid} = ''; # DOCTYPE
2458 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2459 !!!next-input-character;
2460 redo A;
2461 } elsif ($self->{nc} == 0x0027) { # '
2462 !!!cp (203);
2463 $self->{ct}->{sysid} = ''; # DOCTYPE
2464 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2465 !!!next-input-character;
2466 redo A;
2467 } elsif ($self->{nc} == 0x003E) { # >
2468 !!!cp (204);
2469 !!!parse-error (type => 'no SYSTEM literal');
2470 $self->{state} = DATA_STATE;
2471 $self->{s_kwd} = '';
2472 !!!next-input-character;
2473
2474 $self->{ct}->{quirks} = 1;
2475 !!!emit ($self->{ct}); # DOCTYPE
2476
2477 redo A;
2478 } elsif ($self->{nc} == -1) {
2479 !!!cp (205);
2480 !!!parse-error (type => 'unclosed DOCTYPE');
2481
2482 $self->{state} = DATA_STATE;
2483 $self->{s_kwd} = '';
2484 ## reconsume
2485
2486 $self->{ct}->{quirks} = 1;
2487 !!!emit ($self->{ct}); # DOCTYPE
2488
2489 redo A;
2490 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
## XML only: the internal subset starts although no system
## identifier was given.  The DOCTYPE token is emitted right away
## with its has_internal_subset flag set.
2491 !!!cp (206.1);
2492 !!!parse-error (type => 'no SYSTEM literal');
2493
2494 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2495 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2496 $self->{in_subset} = 1;
2497 !!!next-input-character;
2498 !!!emit ($self->{ct}); # DOCTYPE
2499 redo A;
2500 } else {
2501 !!!cp (206);
2502 !!!parse-error (type => 'string after SYSTEM');
2503 $self->{ct}->{quirks} = 1;
2504
2505 $self->{state} = BOGUS_DOCTYPE_STATE;
2506 !!!next-input-character;
2507 redo A;
2508 }
2509 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
## Collects the system identifier into $self->{ct}->{sysid} until the
## closing quotation mark (").  In HTML mode a ">" ends the DOCTYPE
## early with the quirks flag set; in XML mode ">" is not special and
## is appended to the identifier by the final branch.
2510 if ($self->{nc} == 0x0022) { # "
2511 !!!cp (207);
2512 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2513 !!!next-input-character;
2514 redo A;
2515 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2516 !!!cp (208);
2517 !!!parse-error (type => 'unclosed SYSTEM literal');
2518
2519 $self->{state} = DATA_STATE;
2520 $self->{s_kwd} = '';
2521 !!!next-input-character;
2522
2523 $self->{ct}->{quirks} = 1;
2524 !!!emit ($self->{ct}); # DOCTYPE
2525
2526 redo A;
2527 } elsif ($self->{nc} == -1) {
2528 !!!cp (209);
2529 !!!parse-error (type => 'unclosed SYSTEM literal');
2530
2531 $self->{state} = DATA_STATE;
2532 $self->{s_kwd} = '';
2533 ## reconsume
2534
2535 $self->{ct}->{quirks} = 1;
2536 !!!emit ($self->{ct}); # DOCTYPE
2537
2538 redo A;
2539 } else {
2540 !!!cp (210);
2541 $self->{ct}->{sysid} # DOCTYPE
2542 .= chr $self->{nc};
## NOTE(review): read_until presumably appends the following run of
## characters up to (not including) one of the listed delimiters;
## its definition is outside this view -- confirm.
2543 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2544 length $self->{ct}->{sysid});
2545
2546 ## Stay in the state
2547 !!!next-input-character;
2548 redo A;
2549 }
2550 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
## Collects the system identifier into $self->{ct}->{sysid} until the
## closing apostrophe (').  In HTML mode a ">" ends the DOCTYPE early
## with the quirks flag set; in XML mode ">" is not special and is
## appended to the identifier by the final branch.
2551 if ($self->{nc} == 0x0027) { # '
2552 !!!cp (211);
2553 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2554 !!!next-input-character;
2555 redo A;
2556 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2557 !!!cp (212);
2558 !!!parse-error (type => 'unclosed SYSTEM literal');
2559
2560 $self->{state} = DATA_STATE;
2561 $self->{s_kwd} = '';
2562 !!!next-input-character;
2563
2564 $self->{ct}->{quirks} = 1;
2565 !!!emit ($self->{ct}); # DOCTYPE
2566
2567 redo A;
2568 } elsif ($self->{nc} == -1) {
2569 !!!cp (213);
2570 !!!parse-error (type => 'unclosed SYSTEM literal');
2571
2572 $self->{state} = DATA_STATE;
2573 $self->{s_kwd} = '';
2574 ## reconsume
2575
2576 $self->{ct}->{quirks} = 1;
2577 !!!emit ($self->{ct}); # DOCTYPE
2578
2579 redo A;
2580 } else {
2581 !!!cp (214);
2582 $self->{ct}->{sysid} # DOCTYPE
2583 .= chr $self->{nc};
## NOTE(review): read_until presumably appends the following run of
## characters up to (not including) one of the listed delimiters;
## its definition is outside this view -- confirm.
2584 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2585 length $self->{ct}->{sysid});
2586
2587 ## Stay in the state
2588 !!!next-input-character;
2589 redo A;
2590 }
2591 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
## Entered after the closing quote of the system identifier.  Skips
## white space and waits for ">" or (XML only) the "[" that opens the
## internal subset.
2592 if ($is_space->{$self->{nc}}) {
2593 !!!cp (215);
2594 ## Stay in the state
2595 !!!next-input-character;
2596 redo A;
2597 } elsif ($self->{nc} == 0x003E) { # >
2598 !!!cp (216);
2599 $self->{state} = DATA_STATE;
2600 $self->{s_kwd} = '';
2601 !!!next-input-character;
2602
2603 !!!emit ($self->{ct}); # DOCTYPE
2604
2605 redo A;
2606 } elsif ($self->{nc} == -1) {
2607 !!!cp (217);
2608 !!!parse-error (type => 'unclosed DOCTYPE');
2609 $self->{state} = DATA_STATE;
2610 $self->{s_kwd} = '';
2611 ## reconsume
2612
2613 $self->{ct}->{quirks} = 1;
2614 !!!emit ($self->{ct}); # DOCTYPE
2615
2616 redo A;
2617 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2618 !!!cp (218.1);
2619 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2620 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2621 $self->{in_subset} = 1;
2622 !!!next-input-character;
2623 !!!emit ($self->{ct}); # DOCTYPE
2624 redo A;
2625 } else {
2626 !!!cp (218);
2627 !!!parse-error (type => 'string after SYSTEM literal');
## Unlike the other "string after ..." branches, the quirks flag is
## deliberately left untouched here (the assignment is disabled).
2628 #$self->{ct}->{quirks} = 1;
2629
2630 $self->{state} = BOGUS_DOCTYPE_STATE;
2631 !!!next-input-character;
2632 redo A;
2633 }
2634 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
## Skips the rest of a malformed DOCTYPE.  No error is reported here;
## the states that switch to this one report the error themselves.
## The DOCTYPE token is emitted at ">", at nc == -1 (treated as end of
## input), or (XML only) at the "[" that opens the internal subset.
2635 if ($self->{nc} == 0x003E) { # >
2636 !!!cp (219);
2637 $self->{state} = DATA_STATE;
2638 $self->{s_kwd} = '';
2639 !!!next-input-character;
2640
2641 !!!emit ($self->{ct}); # DOCTYPE
2642
2643 redo A;
2644 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2645 !!!cp (220.1);
2646 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2647 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2648 $self->{in_subset} = 1;
2649 !!!next-input-character;
2650 !!!emit ($self->{ct}); # DOCTYPE
2651 redo A;
2652 } elsif ($self->{nc} == -1) {
2653 !!!cp (220);
2654 $self->{state} = DATA_STATE;
2655 $self->{s_kwd} = '';
2656 ## reconsume
2657
2658 !!!emit ($self->{ct}); # DOCTYPE
2659
2660 redo A;
2661 } else {
2662 !!!cp (221);
## The skipped characters are read into a throwaway buffer.
## NOTE(review): read_until presumably stops before ">" or "[";
## its definition is outside this view -- confirm.
2663 my $s = '';
2664 $self->{read_until}->($s, q{>[}, 0);
2665
2666 ## Stay in the state
2667 !!!next-input-character;
2668 redo A;
2669 }
2670 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2671 ## NOTE: "CDATA section state" in the spec is jointly implemented
2672 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2673 ## and |CDATA_SECTION_MSE2_STATE|.
2674
2675 ## XML5: "CDATA state".
2676
## Appends the content of the CDATA section to $self->{ct}->{data}.
## A "]" may start the closing "]]>" and is examined by the MSE states.
2677 if ($self->{nc} == 0x005D) { # ]
2678 !!!cp (221.1);
2679 $self->{state} = CDATA_SECTION_MSE1_STATE;
2680 !!!next-input-character;
2681 redo A;
2682 } elsif ($self->{nc} == -1) {
## An unterminated section is reported as an error only in XML mode.
2683 if ($self->{is_xml}) {
2684 !!!cp (221.11);
2685 !!!parse-error (type => 'no mse'); ## TODO: type
2686 } else {
2687 !!!cp (221.12);
2688 }
2689
2690 $self->{state} = DATA_STATE;
2691 $self->{s_kwd} = '';
2692 ## Reconsume.
## An empty section produces no character token at all.
2693 if (length $self->{ct}->{data}) { # character
2694 !!!cp (221.2);
2695 !!!emit ($self->{ct}); # character
2696 } else {
2697 !!!cp (221.3);
2698 ## No token to emit. $self->{ct} is discarded.
2699 }
2700 redo A;
2701 } else {
2702 !!!cp (221.4);
2703 $self->{ct}->{data} .= chr $self->{nc};
## NOTE(review): read_until presumably appends the following run of
## characters up to (not including) "]"; its definition is outside
## this view -- confirm.
2704 $self->{read_until}->($self->{ct}->{data},
2705 q<]>,
2706 length $self->{ct}->{data});
2707
2708 ## Stay in the state.
2709 !!!next-input-character;
2710 redo A;
2711 }
2712
2713 ## ISSUE: "text tokens" in spec.
2714 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
## One "]" has been seen inside a CDATA section.  A second "]" moves on
## to CDATA_SECTION_MSE2_STATE; anything else means the "]" was plain
## data, so it is appended and the current character is reprocessed in
## CDATA_SECTION_STATE.
2715 ## XML5: "CDATA bracket state".
2716
2717 if ($self->{nc} == 0x005D) { # ]
2718 !!!cp (221.5);
2719 $self->{state} = CDATA_SECTION_MSE2_STATE;
2720 !!!next-input-character;
2721 redo A;
2722 } else {
2723 !!!cp (221.6);
2724 ## XML5: If EOF, "]" is not appended and changed to the data state.
2725 $self->{ct}->{data} .= ']';
2726 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2727 ## Reconsume.
2728 redo A;
2729 }
2730 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
## "]]" has been seen inside a CDATA section.  A ">" completes the
## closing "]]>"; a further "]" keeps the last two brackets pending;
## anything else means both brackets were plain data.
2731 ## XML5: "CDATA end state".
2732
2733 if ($self->{nc} == 0x003E) { # >
2734 $self->{state} = DATA_STATE;
2735 $self->{s_kwd} = '';
2736 !!!next-input-character;
## An empty section produces no character token at all.
2737 if (length $self->{ct}->{data}) { # character
2738 !!!cp (221.7);
2739 !!!emit ($self->{ct}); # character
2740 } else {
2741 !!!cp (221.8);
2742 ## No token to emit. $self->{ct} is discarded.
2743 }
2744 redo A;
2745 } elsif ($self->{nc} == 0x005D) { # ]
2746 !!!cp (221.9); # character
2747 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2748 ## Stay in the state.
2749 !!!next-input-character;
2750 redo A;
2751 } else {
## NOTE(review): checkpoint id 221.11 is also used by the XML
## end-of-input branch of CDATA_SECTION_STATE -- confirm whether
## checkpoint ids are meant to be unique.
2752 !!!cp (221.11);
2753 $self->{ct}->{data} .= ']]'; # character
2754 $self->{state} = CDATA_SECTION_STATE;
2755 ## Reconsume. ## XML5: Emit.
2756 redo A;
2757 }
2758 } elsif ($self->{state} == ENTITY_STATE) {
## Examines the character that follows an "&".  "#" starts a numeric
## reference and an ASCII letter starts a named one; in every other
## case the "&" is literal text and is handed back to the state stored
## in $self->{prev_state}.
## NOTE(review): $self->{entity_add} is presumably an extra code point
## (set by the caller) after which "&" is silently treated as literal --
## confirm against the states that enter ENTITY_STATE.
2759 if ($is_space->{$self->{nc}} or
2760 {
2761 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2762 $self->{entity_add} => 1,
2763 }->{$self->{nc}}) {
2764 !!!cp (1001);
2765 ## Don't consume
2766 ## No error
2767 ## Return nothing.
2768 #
2769 } elsif ($self->{nc} == 0x0023) { # #
2770 !!!cp (999);
2771 $self->{state} = ENTITY_HASH_STATE;
2772 $self->{kwd} = '#';
2773 !!!next-input-character;
2774 redo A;
2775 } elsif ((0x0041 <= $self->{nc} and
2776 $self->{nc} <= 0x005A) or # A..Z
2777 (0x0061 <= $self->{nc} and
2778 $self->{nc} <= 0x007A)) { # a..z
2779 !!!cp (998);
## The named entity table is loaded on first use.
2780 require Whatpm::_NamedEntityList;
2781 $self->{state} = ENTITY_NAME_STATE;
2782 $self->{kwd} = chr $self->{nc};
2783 $self->{entity__value} = $self->{kwd};
2784 $self->{entity__match} = 0;
2785 !!!next-input-character;
2786 redo A;
2787 } else {
2788 !!!cp (1027);
2789 !!!parse-error (type => 'bare ero');
2790 ## Return nothing.
2791 #
2792 }
2793
2794 ## NOTE: No character is consumed by the "consume a character
2795 ## reference" algorithm. In other words, there is an "&" character
2796 ## that does not introduce a character reference, which would be
2797 ## appended to the parent element or the attribute value in later
2798 ## process of the tokenizer.
2799
2800 if ($self->{prev_state} == DATA_STATE) {
2801 !!!cp (997);
2802 $self->{state} = $self->{prev_state};
2803 $self->{s_kwd} = '';
2804 ## Reconsume.
2805 !!!emit ({type => CHARACTER_TOKEN, data => '&',
2806 line => $self->{line_prev},
2807 column => $self->{column_prev},
2808 });
2809 redo A;
2810 } else {
## Not in the data state: the "&" belongs to the value of the
## current attribute ($self->{ca}).
2811 !!!cp (996);
2812 $self->{ca}->{value} .= '&';
2813 $self->{state} = $self->{prev_state};
2814 $self->{s_kwd} = '';
2815 ## Reconsume.
2816 redo A;
2817 }
2818 } elsif ($self->{state} == ENTITY_HASH_STATE) {
## Examines the character that follows "&#".  "x"/"X" starts a
## hexadecimal reference and a decimal digit starts a decimal one;
## otherwise "&#" is literal text and is handed back to the state
## stored in $self->{prev_state}.
2819 if ($self->{nc} == 0x0078 or # x
2820 $self->{nc} == 0x0058) { # X
2821 !!!cp (995);
2822 $self->{state} = HEXREF_X_STATE;
## $self->{kwd} becomes "#x" or "#X", preserving the typed case.
2823 $self->{kwd} .= chr $self->{nc};
2824 !!!next-input-character;
2825 redo A;
2826 } elsif (0x0030 <= $self->{nc} and
2827 $self->{nc} <= 0x0039) { # 0..9
2828 !!!cp (994);
2829 $self->{state} = NCR_NUM_STATE;
## From here on $self->{kwd} is reused as the numeric accumulator,
## starting with the value of the first digit.
2830 $self->{kwd} = $self->{nc} - 0x0030;
2831 !!!next-input-character;
2832 redo A;
2833 } else {
2834 !!!parse-error (type => 'bare nero',
2835 line => $self->{line_prev},
2836 column => $self->{column_prev} - 1);
2837
2838 ## NOTE: According to the spec algorithm, nothing is returned,
2839 ## and then "&#" is appended to the parent element or the attribute
2840 ## value in the later processing.
2841
2842 if ($self->{prev_state} == DATA_STATE) {
2843 !!!cp (1019);
2844 $self->{state} = $self->{prev_state};
2845 $self->{s_kwd} = '';
2846 ## Reconsume.
2847 !!!emit ({type => CHARACTER_TOKEN,
2848 data => '&#',
2849 line => $self->{line_prev},
2850 column => $self->{column_prev} - 1,
2851 });
2852 redo A;
2853 } else {
2854 !!!cp (993);
2855 $self->{ca}->{value} .= '&#';
2856 $self->{state} = $self->{prev_state};
2857 $self->{s_kwd} = '';
2858 ## Reconsume.
2859 redo A;
2860 }
2861 }
2862 } elsif ($self->{state} == NCR_NUM_STATE) {
## Accumulates the decimal digits of a numeric character reference in
## $self->{kwd}.  The reference ends at ";" (consumed) or at any other
## character (reported as 'no refc' and left for reprocessing); the
## resulting code point is then delivered as a character token or
## appended to the current attribute value.
2863 if (0x0030 <= $self->{nc} and
2864 $self->{nc} <= 0x0039) { # 0..9
2865 !!!cp (1012);
2866 $self->{kwd} *= 10;
2867 $self->{kwd} += $self->{nc} - 0x0030;
2868
2869 ## Stay in the state.
2870 !!!next-input-character;
2871 redo A;
2872 } elsif ($self->{nc} == 0x003B) { # ;
2873 !!!cp (1013);
2874 !!!next-input-character;
2875 #
2876 } else {
2877 !!!cp (1014);
2878 !!!parse-error (type => 'no refc');
2879 ## Reconsume.
2880 #
2881 }
2882
2883 my $code = $self->{kwd};
2884 my $l = $self->{line_prev};
2885 my $c = $self->{column_prev};
## Code points listed in $charref_map are reported and replaced by the
## mapped value; values above U+10FFFF are replaced by U+FFFD.
2886 if ($charref_map->{$code}) {
2887 !!!cp (1015);
2888 !!!parse-error (type => 'invalid character reference',
2889 text => (sprintf 'U+%04X', $code),
2890 line => $l, column => $c);
2891 $code = $charref_map->{$code};
2892 } elsif ($code > 0x10FFFF) {
2893 !!!cp (1016);
2894 !!!parse-error (type => 'invalid character reference',
2895 text => (sprintf 'U-%08X', $code),
2896 line => $l, column => $c);
2897 $code = 0xFFFD;
2898 }
2899
2900 if ($self->{prev_state} == DATA_STATE) {
2901 !!!cp (992);
2902 $self->{state} = $self->{prev_state};
2903 $self->{s_kwd} = '';
2904 ## Reconsume.
2905 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2906 has_reference => 1,
2907 line => $l, column => $c,
2908 });
2909 redo A;
2910 } else {
2911 !!!cp (991);
2912 $self->{ca}->{value} .= chr $code;
2913 $self->{ca}->{has_reference} = 1;
2914 $self->{state} = $self->{prev_state};
2915 $self->{s_kwd} = '';
2916 ## Reconsume.
2917 redo A;
2918 }
2919 } elsif ($self->{state} == HEXREF_X_STATE) {
## Examines the character that follows "&#x" / "&#X".  A hexadecimal
## digit starts the number proper; otherwise the text seen so far
## ("&" plus $self->{kwd}) is literal and is handed back to the state
## stored in $self->{prev_state}.
2920 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2921 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2922 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2923 # 0..9, A..F, a..f
2924 !!!cp (990);
2925 $self->{state} = HEXREF_HEX_STATE;
## From here on $self->{kwd} is reused as the numeric accumulator;
## the digit itself is processed by HEXREF_HEX_STATE.
2926 $self->{kwd} = 0;
2927 ## Reconsume.
2928 redo A;
2929 } else {
2930 !!!parse-error (type => 'bare hcro',
2931 line => $self->{line_prev},
2932 column => $self->{column_prev} - 2);
2933
2934 ## NOTE: According to the spec algorithm, nothing is returned,
2935 ## and then "&#" followed by "X" or "x" is appended to the parent
2936 ## element or the attribute value in the later processing.
2937
2938 if ($self->{prev_state} == DATA_STATE) {
2939 !!!cp (1005);
2940 $self->{state} = $self->{prev_state};
2941 $self->{s_kwd} = '';
2942 ## Reconsume.
2943 !!!emit ({type => CHARACTER_TOKEN,
2944 data => '&' . $self->{kwd},
2945 line => $self->{line_prev},
2946 column => $self->{column_prev} - length $self->{kwd},
2947 });
2948 redo A;
2949 } else {
2950 !!!cp (989);
2951 $self->{ca}->{value} .= '&' . $self->{kwd};
2952 $self->{state} = $self->{prev_state};
2953 $self->{s_kwd} = '';
2954 ## Reconsume.
2955 redo A;
2956 }
2957 }
2958 } elsif ($self->{state} == HEXREF_HEX_STATE) {
## Accumulates the hexadecimal digits of a numeric character reference
## in $self->{kwd}.  The reference ends at ";" (consumed) or at any
## other character (reported as 'no refc' and left for reprocessing);
## the resulting code point is then delivered as a character token or
## appended to the current attribute value.
2959 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2960 # 0..9
2961 !!!cp (1002);
2962 $self->{kwd} *= 0x10;
2963 $self->{kwd} += $self->{nc} - 0x0030;
2964 ## Stay in the state.
2965 !!!next-input-character;
2966 redo A;
2967 } elsif (0x0061 <= $self->{nc} and
2968 $self->{nc} <= 0x0066) { # a..f
2969 !!!cp (1003);
2970 $self->{kwd} *= 0x10;
## "a" is 0x61, so nc - 0x60 + 9 gives 10 for "a" up to 15 for "f".
2971 $self->{kwd} += $self->{nc} - 0x0060 + 9;
2972 ## Stay in the state.
2973 !!!next-input-character;
2974 redo A;
2975 } elsif (0x0041 <= $self->{nc} and
2976 $self->{nc} <= 0x0046) { # A..F
2977 !!!cp (1004);
2978 $self->{kwd} *= 0x10;
## "A" is 0x41, so nc - 0x40 + 9 gives 10 for "A" up to 15 for "F".
2979 $self->{kwd} += $self->{nc} - 0x0040 + 9;
2980 ## Stay in the state.
2981 !!!next-input-character;
2982 redo A;
2983 } elsif ($self->{nc} == 0x003B) { # ;
2984 !!!cp (1006);
2985 !!!next-input-character;
2986 #
2987 } else {
2988 !!!cp (1007);
2989 !!!parse-error (type => 'no refc',
2990 line => $self->{line},
2991 column => $self->{column});
2992 ## Reconsume.
2993 #
2994 }
2995
2996 my $code = $self->{kwd};
2997 my $l = $self->{line_prev};
2998 my $c = $self->{column_prev};
## Code points listed in $charref_map are reported and replaced by the
## mapped value; values above U+10FFFF are replaced by U+FFFD.
2999 if ($charref_map->{$code}) {
3000 !!!cp (1008);
3001 !!!parse-error (type => 'invalid character reference',
3002 text => (sprintf 'U+%04X', $code),
3003 line => $l, column => $c);
3004 $code = $charref_map->{$code};
3005 } elsif ($code > 0x10FFFF) {
3006 !!!cp (1009);
3007 !!!parse-error (type => 'invalid character reference',
3008 text => (sprintf 'U-%08X', $code),
3009 line => $l, column => $c);
3010 $code = 0xFFFD;
3011 }
3012
3013 if ($self->{prev_state} == DATA_STATE) {
3014 !!!cp (988);
3015 $self->{state} = $self->{prev_state};
3016 $self->{s_kwd} = '';
3017 ## Reconsume.
3018 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3019 has_reference => 1,
3020 line => $l, column => $c,
3021 });
3022 redo A;
3023 } else {
3024 !!!cp (987);
3025 $self->{ca}->{value} .= chr $code;
3026 $self->{ca}->{has_reference} = 1;
3027 $self->{state} = $self->{prev_state};
3028 $self->{s_kwd} = '';
3029 ## Reconsume.
3030 redo A;
3031 }
3032 } elsif ($self->{state} == ENTITY_NAME_STATE) {
## Reads a named character reference one character at a time.
## $self->{kwd} holds the raw name typed so far, and
## $self->{entity__value} holds the text to output.
## $self->{entity__match} records the best match found:
##    0   no entity name has matched yet;
##    1   a name ending in ";" matched (reading stops at once);
##   -1   a name without ";" matched and was the last thing read;
##  < -1  a name without ";" matched, followed by further characters
##        (the value is doubled for every non-matching character).
3033 if (length $self->{kwd} < 30 and
3034 ## NOTE: Some number greater than the maximum length of entity name
3035 ((0x0041 <= $self->{nc} and # A
3036 $self->{nc} <= 0x005A) or # Z
3037 (0x0061 <= $self->{nc} and # a
3038 $self->{nc} <= 0x007A) or # z
3039 (0x0030 <= $self->{nc} and # 0
3040 $self->{nc} <= 0x0039) or # 9
3041 $self->{nc} == 0x003B)) { # ;
3042 our $EntityChar;
3043 $self->{kwd} .= chr $self->{nc};
3044 if (defined $EntityChar->{$self->{kwd}}) {
3045 if ($self->{nc} == 0x003B) { # ;
3046 !!!cp (1020);
3047 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3048 $self->{entity__match} = 1;
3049 !!!next-input-character;
3050 #
3051 } else {
3052 !!!cp (1021);
3053 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3054 $self->{entity__match} = -1;
3055 ## Stay in the state.
3056 !!!next-input-character;
3057 redo A;
3058 }
3059 } else {
3060 !!!cp (1022);
## No match for the longer name: keep the character as literal text
## after whatever value has been collected so far.
3061 $self->{entity__value} .= chr $self->{nc};
3062 $self->{entity__match} *= 2;
3063 ## Stay in the state.
3064 !!!next-input-character;
3065 redo A;
3066 }
3067 }
3068
3069 my $data;
3070 my $has_ref;
3071 if ($self->{entity__match} > 0) {
3072 !!!cp (1023);
3073 $data = $self->{entity__value};
3074 $has_ref = 1;
3075 #
3076 } elsif ($self->{entity__match} < 0) {
3077 !!!parse-error (type => 'no refc');
## In an attribute value, a semicolon-less match that is followed by
## more name characters is not expanded; the raw text is kept.
3078 if ($self->{prev_state} != DATA_STATE and # in attribute
3079 $self->{entity__match} < -1) {
3080 !!!cp (1024);
3081 $data = '&' . $self->{kwd};
3082 #
3083 } else {
3084 !!!cp (1025);
3085 $data = $self->{entity__value};
3086 $has_ref = 1;
3087 #
3088 }
3089 } else {
3090 !!!cp (1026);
3091 !!!parse-error (type => 'bare ero',
3092 line => $self->{line_prev},
3093 column => $self->{column_prev} - length $self->{kwd});
3094 $data = '&' . $self->{kwd};
3095 #
3096 }
3097
3098 ## NOTE: In these cases, when a character reference is found,
3099 ## it is consumed and a character token is returned, or, otherwise,
3100 ## nothing is consumed and returned, according to the spec algorithm.
3101 ## In this implementation, anything that has been examined by the
3102 ## tokenizer is appended to the parent element or the attribute value
3103 ## as string, either literal string when no character reference or
3104 ## entity-replaced string otherwise, in this stage, since any characters
3105 ## that would not be consumed are appended in the data state or in an
3106 ## appropriate attribute value state anyway.
3107
3108 if ($self->{prev_state} == DATA_STATE) {
3109 !!!cp (986);
3110 $self->{state} = $self->{prev_state};
3111 $self->{s_kwd} = '';
3112 ## Reconsume.
3113 !!!emit ({type => CHARACTER_TOKEN,
3114 data => $data,
3115 has_reference => $has_ref,
3116 line => $self->{line_prev},
3117 column => $self->{column_prev} + 1 - length $self->{kwd},
3118 });
3119 redo A;
3120 } else {
3121 !!!cp (985);
3122 $self->{ca}->{value} .= $data;
3123 $self->{ca}->{has_reference} = 1 if $has_ref;
3124 $self->{state} = $self->{prev_state};
3125 $self->{s_kwd} = '';
3126 ## Reconsume.
3127 redo A;
3128 }
3129
3130 ## XML-only states
3131
3132 } elsif ($self->{state} == PI_STATE) {
## Examines the first character of a processing instruction.
## NOTE(review): presumably entered right after "<?"; the entry point
## is outside this view -- confirm.
## White space, "?", or nc == -1 (treated as end of input) means there
## is no target: an error is reported and the construct is handled as a
## bogus comment whose data starts with "?".  Any other character
## becomes the first character of the target of a new PI token.
3133 if ($is_space->{$self->{nc}} or
3134 $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
3135 $self->{nc} == -1) {
## The column is moved back by one unless nc is -1.
3136 !!!parse-error (type => 'bare pio', ## TODO: type
3137 line => $self->{line_prev},
3138 column => $self->{column_prev}
3139 - 1 * ($self->{nc} != -1));
3140 $self->{state} = BOGUS_COMMENT_STATE;
3141 ## Reconsume.
3142 $self->{ct} = {type => COMMENT_TOKEN,
3143 data => '?',
3144 line => $self->{line_prev},
3145 column => $self->{column_prev}
3146 - 1 * ($self->{nc} != -1),
3147 };
3148 redo A;
3149 } else {
3150 $self->{ct} = {type => PI_TOKEN,
3151 target => chr $self->{nc},
3152 data => '',
3153 line => $self->{line_prev},
3154 column => $self->{column_prev} - 1,
3155 };
3156 $self->{state} = PI_TARGET_STATE;
3157 !!!next-input-character;
3158 redo A;
3159 }
3160 } elsif ($self->{state} == PI_TARGET_STATE) {
## Accumulates the processing instruction target in
## $self->{ct}->{target} until white space, "?", or nc == -1 (treated
## as end of input) is seen.
3161 if ($is_space->{$self->{nc}}) {
3162 $self->{state} = PI_TARGET_AFTER_STATE;
3163 !!!next-input-character;
3164 redo A;
3165 } elsif ($self->{nc} == -1) {
3166 !!!parse-error (type => 'no pic'); ## TODO: type
## Return to the state that matches where the PI appeared: inside
## the DOCTYPE internal subset or in ordinary content.
3167 if ($self->{in_subset}) {
3168 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3169 } else {
3170 $self->{state} = DATA_STATE;
3171 $self->{s_kwd} = '';
3172 }
3173 ## Reconsume.
3174 !!!emit ($self->{ct}); # pi
3175 redo A;
3176 } elsif ($self->{nc} == 0x003F) { # ?
3177 $self->{state} = PI_AFTER_STATE;
3178 !!!next-input-character;
3179 redo A;
3180 } else {
3181 ## XML5: typo ("tag name" -> "target")
3182 $self->{ct}->{target} .= chr $self->{nc}; # pi
3183 !!!next-input-character;
3184 redo A;
3185 }
3186 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
## Skips the white space between the target and the data of a
## processing instruction.  The first non-space character is
## reprocessed in PI_DATA_STATE.
3187 if ($is_space->{$self->{nc}}) {
3188 ## Stay in the state.
3189 !!!next-input-character;
3190 redo A;
3191 } else {
3192 $self->{state} = PI_DATA_STATE;
3193 ## Reprocess.
3194 redo A;
3195 }
3196 } elsif ($self->{state} == PI_DATA_STATE) {
3197 if ($self->{nc} == 0x003F) { # ?
3198 $self->{state} = PI_DATA_AFTER_STATE;
3199 !!!next-input-character;
3200 redo A;
3201 } elsif ($self->{nc} == -1) {
3202 !!!parse-error (type => 'no pic'); ## TODO: type
3203 if ($self->{in_subset}) {
3204 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3205 } else {
3206 $self->{state} = DATA_STATE;
3207 $self->{s_kwd} = '';
3208 }
3209 ## Reprocess.
3210 !!!emit ($self->{ct}); # pi
3211 redo A;
3212 } else {
3213 $self->{ct}->{data} .= chr $self->{nc}; # pi
3214 $self->{read_until}->($self->{ct}->{data}, q[?],
3215 length $self->{ct}->{data});
3216 ## Stay in the state.
3217 !!!next-input-character;
3218 ## The newly read character is handled in this same state.
3219 redo A;
3220 }
3221 } elsif ($self->{state} == PI_AFTER_STATE) {
3222 if ($self->{nc} == 0x003E) { # >
3223 if ($self->{in_subset}) {
3224 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3225 } else {
3226 $self->{state} = DATA_STATE;
3227 $self->{s_kwd} = '';
3228 }
3229 !!!next-input-character;
3230 !!!emit ($self->{ct}); # pi
3231 redo A;
3232 } elsif ($self->{nc} == 0x003F) { # ?
3233 !!!parse-error (type => 'no s after target', ## TODO: type
3234 line => $self->{line_prev},
3235 column => $self->{column_prev}); ## XML5: no error
3236 $self->{ct}->{data} .= '?';
3237 $self->{state} = PI_DATA_AFTER_STATE;
3238 !!!next-input-character;
3239 redo A;
3240 } else {
3241 !!!parse-error (type => 'no s after target', ## TODO: type
3242 line => $self->{line_prev},
3243 column => $self->{column_prev}
3244 + 1 * ($self->{nc} == -1)); ## XML5: no error
3245 $self->{ct}->{data} .= '?'; ## XML5: not appended
3246 $self->{state} = PI_DATA_STATE;
3247 ## Reprocess.
3248 redo A;
3249 }
3250 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3251 ## XML5: Same as "pi after state" in XML5
3252 if ($self->{nc} == 0x003E) { # >
3253 if ($self->{in_subset}) {
3254 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3255 } else {
3256 $self->{state} = DATA_STATE;
3257 $self->{s_kwd} = '';
3258 }
3259 !!!next-input-character;
3260 !!!emit ($self->{ct}); # pi
3261 redo A;
3262 } elsif ($self->{nc} == 0x003F) { # ?
3263 $self->{ct}->{data} .= '?';
3264 ## Stay in the state.
3265 !!!next-input-character;
3266 redo A;
3267 } else {
3268 $self->{ct}->{data} .= '?'; ## XML5: not appended
3269 $self->{state} = PI_DATA_STATE;
3270 ## Reprocess.
3271 redo A;
3272 }
3273
3274 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3275 if ($self->{nc} == 0x003C) { # <
3276 $self->{state} = DOCTYPE_TAG_STATE;
3277 !!!next-input-character;
3278 redo A;
3279 } elsif ($self->{nc} == 0x0025) { # %
3280 ## XML5: Not defined yet.
3281
3282 ## TODO:
3283 !!!next-input-character;
3284 redo A;
3285 } elsif ($self->{nc} == 0x005D) { # ]
3286 delete $self->{in_subset};
3287 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3288 !!!next-input-character;
3289 redo A;
3290 } elsif ($is_space->{$self->{nc}}) {
3291 ## Stay in the state.
3292 !!!next-input-character;
3293 redo A;
3294 } elsif ($self->{nc} == -1) {
3295 !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3296 delete $self->{in_subset};
3297 $self->{state} = DATA_STATE;
3298 $self->{s_kwd} = '';
3299 ## Reconsume.
3300 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3301 redo A;
3302 } else {
3303 unless ($self->{internal_subset_tainted}) {
3304 ## XML5: No parse error.
3305 !!!parse-error (type => 'string in internal subset');
3306 $self->{internal_subset_tainted} = 1;
3307 }
3308 ## Stay in the state.
3309 !!!next-input-character;
3310 redo A;
3311 }
3312 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3313 if ($self->{nc} == 0x003E) { # >
3314 $self->{state} = DATA_STATE;
3315 $self->{s_kwd} = '';
3316 !!!next-input-character;
3317 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3318 redo A;
3319 } elsif ($self->{nc} == -1) {
3320 !!!parse-error (type => 'unclosed DOCTYPE');
3321 $self->{state} = DATA_STATE;
3322 $self->{s_kwd} = '';
3323 ## Reconsume.
3324 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3325 redo A;
3326 } else {
3327 ## XML5: No parse error and stay in the state.
3328 !!!parse-error (type => 'string after internal subset'); ## TODO: type
3329
3330 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3331 !!!next-input-character;
3332 redo A;
3333 }
3334 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3335 if ($self->{nc} == 0x003E) { # >
3336 $self->{state} = DATA_STATE;
3337 $self->{s_kwd} = '';
3338 !!!next-input-character;
3339 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3340 redo A;
3341 } elsif ($self->{nc} == -1) {
3342 $self->{state} = DATA_STATE;
3343 $self->{s_kwd} = '';
3344 ## Reconsume.
3345 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3346 redo A;
3347 } else {
3348 ## Stay in the state.
3349 !!!next-input-character;
3350 redo A;
3351 }
3352 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3353 if ($self->{nc} == 0x0021) { # !
3354 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
3355 !!!next-input-character;
3356 redo A;
3357 } elsif ($self->{nc} == 0x003F) { # ?
3358 $self->{state} = PI_STATE;
3359 !!!next-input-character;
3360 redo A;
3361 } elsif ($self->{nc} == -1) {
3362 !!!parse-error (type => 'bare stago');
3363 $self->{state} = DATA_STATE;
3364 $self->{s_kwd} = '';
3365 ## Reconsume.
3366 redo A;
3367 } else {
3368 !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3369 line => $self->{line_prev},
3370 column => $self->{column_prev});
3371 $self->{state} = BOGUS_COMMENT_STATE;
3372 $self->{ct} = {type => COMMENT_TOKEN,
3373 data => '',
3374 }; ## NOTE: Will be discarded.
3375 !!!next-input-character;
3376 redo A;
3377 }
3378
3379 } else {
3380 die "$0: $self->{state}: Unknown state";
3381 }
3382 } # A
3383
3384 die "$0: _get_next_token: unexpected case";
3385 } # _get_next_token
3386
3387 1;
3388 ## $Date: 2008/10/15 12:49:49 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24