/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12 - (show annotations) (download) (as text)
Wed Oct 15 12:49:49 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.11: +249 -82 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	15 Oct 2008 12:49:07 -0000
	* XML-Parser.t: "xml/doctypes-2.dat" added.

	* tokenizer-test-1.test: Keyword case-sensitivity tests added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	15 Oct 2008 12:49:41 -0000
	* doctypes-1.dat: A keyword case-sensitivity test added.

	* doctypes-2.dat: New test data file.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 12:46:53 -0000
	* Tokenizer.pm.src: $self->{s_kwd} for non-DATA_STATE states are
	renamed as $self->{kwd} to avoid conflicts.  Don't raise
	case-sensitivity error for the keyword "DOCTYPE" in HTML mode.
	Support for internal subsets (internal subset itself only; no
	declaration in them is supported yet).  Raise a parse error for
	non-uppercase keywords "PUBLIC" and "SYSTEM" in XML mode.  Raise a
	parse error if no system identifier is specified for a DOCTYPE
	declaration with a public identifier.  Don't close the DOCTYPE
	declaration by a ">" character in the system declaration in XML
	mode.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 12:48:30 -0000
	* Parser.pm.src: Typo fixed.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.11 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN { # Runs at compile time, so the export lists exist before |import| is called on this package.
6 require Exporter;
7 push our @ISA, 'Exporter'; # Inherit |import| from Exporter.
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 ); # Token type constants that can be imported by name.
19
20 our %EXPORT_TAGS = ( # The |:token| tag imports every token type constant at once.
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 }
33
34 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
35
36 ## Token types: the possible values of a token's |type| field.
37
38 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
39 sub COMMENT_TOKEN () { 2 }
40 sub START_TAG_TOKEN () { 3 }
41 sub END_TAG_TOKEN () { 4 }
42 sub END_OF_FILE_TOKEN () { 5 } ## Emitted when the next input character is -1.
43 sub CHARACTER_TOKEN () { 6 }
44 sub PI_TOKEN () { 7 } ## NOTE: XML only.
45 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
46
47 ## XML5: XML5 has "empty tag token". In this implementation, it is
48 ## represented as a start tag token with $self->{self_closing} flag
49 ## set to true.
50
51 ## XML5: XML5 has "short end tag token". In this implementation, it
52 ## is represented as an end tag token with |$token->{tag_name}| set
53 ## to an empty string.
54
55 package Whatpm::HTML;
56
57 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
58
59 ## Content model flags: bits combined into the |*_CONTENT_MODEL| values below and tested against |$self->{content_model}|.
60
61 sub CM_ENTITY () { 0b001 } # "&" markup is recognized in data
62 sub CM_LIMITED_MARKUP () { 0b010 } # "<" markup is recognized in data (limited)
63 sub CM_FULL_MARKUP () { 0b100 } # "<" markup is recognized in data (any)
64
65 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set: neither "&" nor "<" markup
66 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited "<" markup only
67 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # "&" and limited "<" markup
68 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # "&" and any "<" markup
69
70 ## Tokenizer states: the possible values of |$self->{state}|. Numbers 1 and 12 are unused because their states are commented out.
71
72 sub DATA_STATE () { 0 }
73 #sub ENTITY_DATA_STATE () { 1 }
74 sub TAG_OPEN_STATE () { 2 }
75 sub CLOSE_TAG_OPEN_STATE () { 3 }
76 sub TAG_NAME_STATE () { 4 }
77 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
78 sub ATTRIBUTE_NAME_STATE () { 6 }
79 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
80 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
81 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
82 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
83 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
84 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
85 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
86 sub COMMENT_START_STATE () { 14 }
87 sub COMMENT_START_DASH_STATE () { 15 }
88 sub COMMENT_STATE () { 16 }
89 sub COMMENT_END_STATE () { 17 }
90 sub COMMENT_END_DASH_STATE () { 18 }
91 sub BOGUS_COMMENT_STATE () { 19 }
92 sub DOCTYPE_STATE () { 20 }
93 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
94 sub DOCTYPE_NAME_STATE () { 22 }
95 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
96 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
97 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
98 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
99 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
100 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
101 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
102 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
103 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
104 sub BOGUS_DOCTYPE_STATE () { 32 }
105 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
106 sub SELF_CLOSING_START_TAG_STATE () { 34 }
107 sub CDATA_SECTION_STATE () { 35 }
108 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
109 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
110 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
111 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
112 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
113 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
114 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
115 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
116 ## NOTE: "Entity data state", "entity in attribute value state", and
117 ## "consume a character reference" algorithm are jointly implemented
118 ## using the following six states:
119 sub ENTITY_STATE () { 44 }
120 sub ENTITY_HASH_STATE () { 45 }
121 sub NCR_NUM_STATE () { 46 }
122 sub HEXREF_X_STATE () { 47 }
123 sub HEXREF_HEX_STATE () { 48 }
124 sub ENTITY_NAME_STATE () { 49 }
125 sub PCDATA_STATE () { 50 } # "data state" in the spec
126
127 ## XML-only states
128 sub PI_STATE () { 51 }
129 sub PI_TARGET_STATE () { 52 }
130 sub PI_TARGET_AFTER_STATE () { 53 }
131 sub PI_DATA_STATE () { 54 }
132 sub PI_AFTER_STATE () { 55 }
133 sub PI_DATA_AFTER_STATE () { 56 }
134 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
135 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
136
137 ## Tree constructor state constants (see Whatpm::HTML for the full
138 ## list and descriptions)
139
140 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # bit 11 (0x800)
141 sub FOREIGN_EL () { 0b1_00000000000 } # also 0x800. NOTE(review): numerically equal to IN_FOREIGN_CONTENT_IM; presumably the two belong to different bit fields -- confirm against Whatpm::HTML.
142
143 ## Character reference mappings: code point => replacement code point.
144
145 my $charref_map = { # 0x0D becomes LF; 0x80..0x9F follow Windows-1252, with its unassigned bytes mapped to U+FFFD.
146 0x0D => 0x000A,
147 0x80 => 0x20AC,
148 0x81 => 0xFFFD,
149 0x82 => 0x201A,
150 0x83 => 0x0192,
151 0x84 => 0x201E,
152 0x85 => 0x2026,
153 0x86 => 0x2020,
154 0x87 => 0x2021,
155 0x88 => 0x02C6,
156 0x89 => 0x2030,
157 0x8A => 0x0160,
158 0x8B => 0x2039,
159 0x8C => 0x0152,
160 0x8D => 0xFFFD,
161 0x8E => 0x017D,
162 0x8F => 0xFFFD,
163 0x90 => 0xFFFD,
164 0x91 => 0x2018,
165 0x92 => 0x2019,
166 0x93 => 0x201C,
167 0x94 => 0x201D,
168 0x95 => 0x2022,
169 0x96 => 0x2013,
170 0x97 => 0x2014,
171 0x98 => 0x02DC,
172 0x99 => 0x2122,
173 0x9A => 0x0161,
174 0x9B => 0x203A,
175 0x9C => 0x0153,
176 0x9D => 0xFFFD,
177 0x9E => 0x017E,
178 0x9F => 0x0178,
179 }; # $charref_map
180 $charref_map->{$_} = 0xFFFD # Every code point listed below is replaced by U+FFFD.
181 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
182 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: Unicode noncharacters run through 0xFDEF, but only up to 0xFDDF is mapped here.
183 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
184 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
185 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
186 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
187 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
188
189 ## Implementations MUST act as if they used the state machine described in the spec.
190
191 sub _initialize_tokenizer ($) { # Reset the tokenizer fields of $self to their initial values.
192 my $self = shift;
193
194 ## NOTE: Fields set by |new| constructor:
195 #$self->{level}
196 #$self->{set_nc}
197 #$self->{parse_error}
198 #$self->{is_xml} (if XML)
199
200 $self->{state} = DATA_STATE; # MUST
201 $self->{s_kwd} = ''; # Data state keyword (recently seen characters such as "<!-" or "]]")
202 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
203 #$self->{entity__value}; # initialized when used
204 #$self->{entity__match}; # initialized when used
205 $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
206 undef $self->{ct}; # current token
207 undef $self->{ca}; # current attribute
208 undef $self->{last_stag_name}; # last emitted start tag name
209 #$self->{prev_state}; # initialized when used
210 delete $self->{self_closing}; # no self-closing flag is pending
211 $self->{char_buffer} = '';
212 $self->{char_buffer_pos} = 0;
213 $self->{nc} = -1; # next input character (-1 is tested as EOF by the tokenizer)
214 #$self->{next_nc}
215 !!!next-input-character;
216 $self->{token} = []; # queue of already-built tokens; |_get_next_token| returns these first
217 # $self->{escape}: not initialized here
218 } # _initialize_tokenizer
219
220 ## A token has:
221 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
222 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
223 ## ->{name} (DOCTYPE_TOKEN)
224 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
225 ## ->{target} (PI_TOKEN)
226 ## ->{pubid} (DOCTYPE_TOKEN)
227 ## ->{sysid} (DOCTYPE_TOKEN)
228 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
229 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
230 ## ->{name}
231 ## ->{value}
232 ## ->{has_reference} == 1 or 0
233 ## ->{index}: Index of the attribute in a tag.
234 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
235 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
236 ## ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Next attribute's index - 1.
237 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
238
239 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
240 ## A token's own |->{self_closing}| is used to save the value of |$self->{self_closing}|
241 ## while the token is pushed back to the stack.
242
243 ## Emitted token MUST immediately be handled by the tree construction state.
244
245 ## Before each step, UA MAY check to see if either one of the scripts in
246 ## "list of scripts that will execute as soon as possible" or the first
247 ## script in the "list of scripts that will execute asynchronously",
248 ## has completed loading. If one has, then it MUST be executed
249 ## and removed from the list.
250
251 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
252 ## (This requirement was dropped from HTML5 spec, unfortunately.)
253
254 my $is_space = { # Code points the tokenizer treats as space characters; VT and CR are deliberately left out (commented out below).
255 0x0009 => 1, # CHARACTER TABULATION (HT)
256 0x000A => 1, # LINE FEED (LF)
257 #0x000B => 0, # LINE TABULATION (VT)
258 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
259 #0x000D => 1, # CARRIAGE RETURN (CR)
260 0x0020 => 1, # SPACE (SP)
261 };
262
263 sub _get_next_token ($) {
264 my $self = shift;
265
266 if ($self->{self_closing}) {
267 !!!parse-error (type => 'nestc', token => $self->{ct});
268 ## NOTE: The |self_closing| flag is only set by start tag token.
269 ## In addition, when a start tag token is emitted, it is always set to
270 ## |ct|.
271 delete $self->{self_closing};
272 }
273
274 if (@{$self->{token}}) {
275 $self->{self_closing} = $self->{token}->[0]->{self_closing};
276 return shift @{$self->{token}};
277 }
278
279 A: {
280 if ($self->{state} == PCDATA_STATE) {
281 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
282
283 if ($self->{nc} == 0x0026) { # &
284 !!!cp (0.1);
285 ## NOTE: In the spec, the tokenizer is switched to the
286 ## "entity data state". In this implementation, the tokenizer
287 ## is switched to the |ENTITY_STATE|, which is an implementation
288 ## of the "consume a character reference" algorithm.
289 $self->{entity_add} = -1;
290 $self->{prev_state} = DATA_STATE;
291 $self->{state} = ENTITY_STATE;
292 !!!next-input-character;
293 redo A;
294 } elsif ($self->{nc} == 0x003C) { # <
295 !!!cp (0.2);
296 $self->{state} = TAG_OPEN_STATE;
297 !!!next-input-character;
298 redo A;
299 } elsif ($self->{nc} == -1) {
300 !!!cp (0.3);
301 !!!emit ({type => END_OF_FILE_TOKEN,
302 line => $self->{line}, column => $self->{column}});
303 last A; ## TODO: ok?
304 } else {
305 !!!cp (0.4);
306 #
307 }
308
309 # Anything else
310 my $token = {type => CHARACTER_TOKEN,
311 data => chr $self->{nc},
312 line => $self->{line}, column => $self->{column},
313 };
314 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
315
316 ## Stay in the state.
317 !!!next-input-character;
318 !!!emit ($token);
319 redo A;
320 } elsif ($self->{state} == DATA_STATE) {
321 $self->{s_kwd} = '' unless defined $self->{s_kwd};
322 if ($self->{nc} == 0x0026) { # &
323 $self->{s_kwd} = '';
324 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
325 not $self->{escape}) {
326 !!!cp (1);
327 ## NOTE: In the spec, the tokenizer is switched to the
328 ## "entity data state". In this implementation, the tokenizer
329 ## is switched to the |ENTITY_STATE|, which is an implementation
330 ## of the "consume a character reference" algorithm.
331 $self->{entity_add} = -1;
332 $self->{prev_state} = DATA_STATE;
333 $self->{state} = ENTITY_STATE;
334 !!!next-input-character;
335 redo A;
336 } else {
337 !!!cp (2);
338 #
339 }
340 } elsif ($self->{nc} == 0x002D) { # -
341 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
342 if ($self->{s_kwd} eq '<!-') {
343 !!!cp (3);
344 $self->{escape} = 1; # unless $self->{escape};
345 $self->{s_kwd} = '--';
346 #
347 } elsif ($self->{s_kwd} eq '-') {
348 !!!cp (4);
349 $self->{s_kwd} = '--';
350 #
351 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
352 !!!cp (4.1);
353 $self->{s_kwd} .= '-';
354 #
355 } else {
356 !!!cp (5);
357 $self->{s_kwd} = '-';
358 #
359 }
360 }
361
362 #
363 } elsif ($self->{nc} == 0x0021) { # !
364 if (length $self->{s_kwd}) {
365 !!!cp (5.1);
366 $self->{s_kwd} .= '!';
367 #
368 } else {
369 !!!cp (5.2);
370 #$self->{s_kwd} = '';
371 #
372 }
373 #
374 } elsif ($self->{nc} == 0x003C) { # <
375 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
376 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
377 not $self->{escape})) {
378 !!!cp (6);
379 $self->{state} = TAG_OPEN_STATE;
380 !!!next-input-character;
381 redo A;
382 } else {
383 !!!cp (7);
384 $self->{s_kwd} = '';
385 #
386 }
387 } elsif ($self->{nc} == 0x003E) { # >
388 if ($self->{escape} and
389 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
390 if ($self->{s_kwd} eq '--') {
391 !!!cp (8);
392 delete $self->{escape};
393 #
394 } else {
395 !!!cp (9);
396 #
397 }
398 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
399 !!!cp (9.1);
400 !!!parse-error (type => 'unmatched mse', ## TODO: type
401 line => $self->{line_prev},
402 column => $self->{column_prev} - 1);
403 #
404 } else {
405 !!!cp (10);
406 #
407 }
408
409 $self->{s_kwd} = '';
410 #
411 } elsif ($self->{nc} == 0x005D) { # ]
412 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
413 !!!cp (10.1);
414 $self->{s_kwd} .= ']';
415 } elsif ($self->{s_kwd} eq ']]') {
416 !!!cp (10.2);
417 #
418 } else {
419 !!!cp (10.3);
420 $self->{s_kwd} = '';
421 }
422 #
423 } elsif ($self->{nc} == -1) {
424 !!!cp (11);
425 $self->{s_kwd} = '';
426 !!!emit ({type => END_OF_FILE_TOKEN,
427 line => $self->{line}, column => $self->{column}});
428 last A; ## TODO: ok?
429 } else {
430 !!!cp (12);
431 $self->{s_kwd} = '';
432 #
433 }
434
435 # Anything else
436 my $token = {type => CHARACTER_TOKEN,
437 data => chr $self->{nc},
438 line => $self->{line}, column => $self->{column},
439 };
440 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
441 length $token->{data})) {
442 $self->{s_kwd} = '';
443 }
444
445 ## Stay in the data state.
446 if (not $self->{is_xml} and
447 $self->{content_model} == PCDATA_CONTENT_MODEL) {
448 !!!cp (13);
449 $self->{state} = PCDATA_STATE;
450 } else {
451 !!!cp (14);
452 ## Stay in the state.
453 }
454 !!!next-input-character;
455 !!!emit ($token);
456 redo A;
457 } elsif ($self->{state} == TAG_OPEN_STATE) {
458 ## XML5: "tag state".
459
460 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
461 if ($self->{nc} == 0x002F) { # /
462 !!!cp (15);
463 !!!next-input-character;
464 $self->{state} = CLOSE_TAG_OPEN_STATE;
465 redo A;
466 } elsif ($self->{nc} == 0x0021) { # !
467 !!!cp (15.1);
468 $self->{s_kwd} = $self->{escaped} ? '' : '<';
469 #
470 } else {
471 !!!cp (16);
472 $self->{s_kwd} = '';
473 #
474 }
475
476 ## reconsume
477 $self->{state} = DATA_STATE;
478 !!!emit ({type => CHARACTER_TOKEN, data => '<',
479 line => $self->{line_prev},
480 column => $self->{column_prev},
481 });
482 redo A;
483 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
484 if ($self->{nc} == 0x0021) { # !
485 !!!cp (17);
486 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
487 !!!next-input-character;
488 redo A;
489 } elsif ($self->{nc} == 0x002F) { # /
490 !!!cp (18);
491 $self->{state} = CLOSE_TAG_OPEN_STATE;
492 !!!next-input-character;
493 redo A;
494 } elsif (0x0041 <= $self->{nc} and
495 $self->{nc} <= 0x005A) { # A..Z
496 !!!cp (19);
497 $self->{ct}
498 = {type => START_TAG_TOKEN,
499 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
500 line => $self->{line_prev},
501 column => $self->{column_prev}};
502 $self->{state} = TAG_NAME_STATE;
503 !!!next-input-character;
504 redo A;
505 } elsif (0x0061 <= $self->{nc} and
506 $self->{nc} <= 0x007A) { # a..z
507 !!!cp (20);
508 $self->{ct} = {type => START_TAG_TOKEN,
509 tag_name => chr ($self->{nc}),
510 line => $self->{line_prev},
511 column => $self->{column_prev}};
512 $self->{state} = TAG_NAME_STATE;
513 !!!next-input-character;
514 redo A;
515 } elsif ($self->{nc} == 0x003E) { # >
516 !!!cp (21);
517 !!!parse-error (type => 'empty start tag',
518 line => $self->{line_prev},
519 column => $self->{column_prev});
520 $self->{state} = DATA_STATE;
521 $self->{s_kwd} = '';
522 !!!next-input-character;
523
524 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
525 line => $self->{line_prev},
526 column => $self->{column_prev},
527 });
528
529 redo A;
530 } elsif ($self->{nc} == 0x003F) { # ?
531 if ($self->{is_xml}) {
532 !!!cp (22.1);
533 $self->{state} = PI_STATE;
534 !!!next-input-character;
535 redo A;
536 } else {
537 !!!cp (22);
538 !!!parse-error (type => 'pio',
539 line => $self->{line_prev},
540 column => $self->{column_prev});
541 $self->{state} = BOGUS_COMMENT_STATE;
542 $self->{ct} = {type => COMMENT_TOKEN, data => '',
543 line => $self->{line_prev},
544 column => $self->{column_prev},
545 };
546 ## $self->{nc} is intentionally left as is
547 redo A;
548 }
549 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
550 !!!cp (23);
551 !!!parse-error (type => 'bare stago',
552 line => $self->{line_prev},
553 column => $self->{column_prev});
554 $self->{state} = DATA_STATE;
555 $self->{s_kwd} = '';
556 ## reconsume
557
558 !!!emit ({type => CHARACTER_TOKEN, data => '<',
559 line => $self->{line_prev},
560 column => $self->{column_prev},
561 });
562
563 redo A;
564 } else {
565 ## XML5: "<:" is a parse error.
566 !!!cp (23.1);
567 $self->{ct} = {type => START_TAG_TOKEN,
568 tag_name => chr ($self->{nc}),
569 line => $self->{line_prev},
570 column => $self->{column_prev}};
571 $self->{state} = TAG_NAME_STATE;
572 !!!next-input-character;
573 redo A;
574 }
575 } else {
576 die "$0: $self->{content_model} in tag open";
577 }
578 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
579 ## NOTE: The "close tag open state" in the spec is implemented as
580 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
581
582 ## XML5: "end tag state".
583
584 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
585 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
586 if (defined $self->{last_stag_name}) {
587 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
588 $self->{kwd} = '';
589 ## Reconsume.
590 redo A;
591 } else {
592 ## No start tag token has ever been emitted
593 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
594 !!!cp (28);
595 $self->{state} = DATA_STATE;
596 $self->{s_kwd} = '';
597 ## Reconsume.
598 !!!emit ({type => CHARACTER_TOKEN, data => '</',
599 line => $l, column => $c,
600 });
601 redo A;
602 }
603 }
604
605 if (0x0041 <= $self->{nc} and
606 $self->{nc} <= 0x005A) { # A..Z
607 !!!cp (29);
608 $self->{ct}
609 = {type => END_TAG_TOKEN,
610 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
611 line => $l, column => $c};
612 $self->{state} = TAG_NAME_STATE;
613 !!!next-input-character;
614 redo A;
615 } elsif (0x0061 <= $self->{nc} and
616 $self->{nc} <= 0x007A) { # a..z
617 !!!cp (30);
618 $self->{ct} = {type => END_TAG_TOKEN,
619 tag_name => chr ($self->{nc}),
620 line => $l, column => $c};
621 $self->{state} = TAG_NAME_STATE;
622 !!!next-input-character;
623 redo A;
624 } elsif ($self->{nc} == 0x003E) { # >
625 !!!parse-error (type => 'empty end tag',
626 line => $self->{line_prev}, ## "<" in "</>"
627 column => $self->{column_prev} - 1);
628 $self->{state} = DATA_STATE;
629 $self->{s_kwd} = '';
630 if ($self->{is_xml}) {
631 !!!cp (31);
632 ## XML5: No parse error.
633
634 ## NOTE: This parser raises a parse error, since it supports
635 ## XML1, not XML5.
636
637 ## NOTE: A short end tag token.
638 my $ct = {type => END_TAG_TOKEN,
639 tag_name => '',
640 line => $self->{line_prev},
641 column => $self->{column_prev} - 1,
642 };
643 !!!next-input-character;
644 !!!emit ($ct);
645 } else {
646 !!!cp (31.1);
647 !!!next-input-character;
648 }
649 redo A;
650 } elsif ($self->{nc} == -1) {
651 !!!cp (32);
652 !!!parse-error (type => 'bare etago');
653 $self->{s_kwd} = '';
654 $self->{state} = DATA_STATE;
655 # reconsume
656
657 !!!emit ({type => CHARACTER_TOKEN, data => '</',
658 line => $l, column => $c,
659 });
660
661 redo A;
662 } elsif (not $self->{is_xml} or
663 $is_space->{$self->{nc}}) {
664 !!!cp (33);
665 !!!parse-error (type => 'bogus end tag',
666 line => $self->{line_prev}, # "<" of "</"
667 column => $self->{column_prev} - 1);
668 $self->{state} = BOGUS_COMMENT_STATE;
669 $self->{ct} = {type => COMMENT_TOKEN, data => '',
670 line => $self->{line_prev}, # "<" of "</"
671 column => $self->{column_prev} - 1,
672 };
673 ## NOTE: $self->{nc} is intentionally left as is.
674 ## Although the "anything else" case of the spec not explicitly
675 ## states that the next input character is to be reconsumed,
676 ## it will be included to the |data| of the comment token
677 ## generated from the bogus end tag, as defined in the
678 ## "bogus comment state" entry.
679 redo A;
680 } else {
681 ## XML5: "</:" is a parse error.
682 !!!cp (30.1);
683 $self->{ct} = {type => END_TAG_TOKEN,
684 tag_name => chr ($self->{nc}),
685 line => $l, column => $c};
686 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
687 !!!next-input-character;
688 redo A;
689 }
690 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
691 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
692 if (length $ch) {
693 my $CH = $ch;
694 $ch =~ tr/a-z/A-Z/;
695 my $nch = chr $self->{nc};
696 if ($nch eq $ch or $nch eq $CH) {
697 !!!cp (24);
698 ## Stay in the state.
699 $self->{kwd} .= $nch;
700 !!!next-input-character;
701 redo A;
702 } else {
703 !!!cp (25);
704 $self->{state} = DATA_STATE;
705 $self->{s_kwd} = '';
706 ## Reconsume.
707 !!!emit ({type => CHARACTER_TOKEN,
708 data => '</' . $self->{kwd},
709 line => $self->{line_prev},
710 column => $self->{column_prev} - 1 - length $self->{kwd},
711 });
712 redo A;
713 }
714 } else { # after "<{tag-name}"
715 unless ($is_space->{$self->{nc}} or
716 {
717 0x003E => 1, # >
718 0x002F => 1, # /
719 -1 => 1, # EOF
720 }->{$self->{nc}}) {
721 !!!cp (26);
722 ## Reconsume.
723 $self->{state} = DATA_STATE;
724 $self->{s_kwd} = '';
725 !!!emit ({type => CHARACTER_TOKEN,
726 data => '</' . $self->{kwd},
727 line => $self->{line_prev},
728 column => $self->{column_prev} - 1 - length $self->{kwd},
729 });
730 redo A;
731 } else {
732 !!!cp (27);
733 $self->{ct}
734 = {type => END_TAG_TOKEN,
735 tag_name => $self->{last_stag_name},
736 line => $self->{line_prev},
737 column => $self->{column_prev} - 1 - length $self->{kwd}};
738 $self->{state} = TAG_NAME_STATE;
739 ## Reconsume.
740 redo A;
741 }
742 }
743 } elsif ($self->{state} == TAG_NAME_STATE) {
744 if ($is_space->{$self->{nc}}) {
745 !!!cp (34);
746 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
747 !!!next-input-character;
748 redo A;
749 } elsif ($self->{nc} == 0x003E) { # >
750 if ($self->{ct}->{type} == START_TAG_TOKEN) {
751 !!!cp (35);
752 $self->{last_stag_name} = $self->{ct}->{tag_name};
753 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
754 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
755 #if ($self->{ct}->{attributes}) {
756 # ## NOTE: This should never be reached.
757 # !!! cp (36);
758 # !!! parse-error (type => 'end tag attribute');
759 #} else {
760 !!!cp (37);
761 #}
762 } else {
763 die "$0: $self->{ct}->{type}: Unknown token type";
764 }
765 $self->{state} = DATA_STATE;
766 $self->{s_kwd} = '';
767 !!!next-input-character;
768
769 !!!emit ($self->{ct}); # start tag or end tag
770
771 redo A;
772 } elsif (0x0041 <= $self->{nc} and
773 $self->{nc} <= 0x005A) { # A..Z
774 !!!cp (38);
775 $self->{ct}->{tag_name}
776 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
777 # start tag or end tag
778 ## Stay in this state
779 !!!next-input-character;
780 redo A;
781 } elsif ($self->{nc} == -1) {
782 !!!parse-error (type => 'unclosed tag');
783 if ($self->{ct}->{type} == START_TAG_TOKEN) {
784 !!!cp (39);
785 $self->{last_stag_name} = $self->{ct}->{tag_name};
786 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
787 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
788 #if ($self->{ct}->{attributes}) {
789 # ## NOTE: This state should never be reached.
790 # !!! cp (40);
791 # !!! parse-error (type => 'end tag attribute');
792 #} else {
793 !!!cp (41);
794 #}
795 } else {
796 die "$0: $self->{ct}->{type}: Unknown token type";
797 }
798 $self->{state} = DATA_STATE;
799 $self->{s_kwd} = '';
800 # reconsume
801
802 !!!emit ($self->{ct}); # start tag or end tag
803
804 redo A;
805 } elsif ($self->{nc} == 0x002F) { # /
806 !!!cp (42);
807 $self->{state} = SELF_CLOSING_START_TAG_STATE;
808 !!!next-input-character;
809 redo A;
810 } else {
811 !!!cp (44);
812 $self->{ct}->{tag_name} .= chr $self->{nc};
813 # start tag or end tag
814 ## Stay in the state
815 !!!next-input-character;
816 redo A;
817 }
818 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
819 ## XML5: "Tag attribute name before state".
820
821 if ($is_space->{$self->{nc}}) {
822 !!!cp (45);
823 ## Stay in the state
824 !!!next-input-character;
825 redo A;
826 } elsif ($self->{nc} == 0x003E) { # >
827 if ($self->{ct}->{type} == START_TAG_TOKEN) {
828 !!!cp (46);
829 $self->{last_stag_name} = $self->{ct}->{tag_name};
830 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
831 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
832 if ($self->{ct}->{attributes}) {
833 !!!cp (47);
834 !!!parse-error (type => 'end tag attribute');
835 } else {
836 !!!cp (48);
837 }
838 } else {
839 die "$0: $self->{ct}->{type}: Unknown token type";
840 }
841 $self->{state} = DATA_STATE;
842 $self->{s_kwd} = '';
843 !!!next-input-character;
844
845 !!!emit ($self->{ct}); # start tag or end tag
846
847 redo A;
848 } elsif (0x0041 <= $self->{nc} and
849 $self->{nc} <= 0x005A) { # A..Z
850 !!!cp (49);
851 $self->{ca}
852 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
853 value => '',
854 line => $self->{line}, column => $self->{column}};
855 $self->{state} = ATTRIBUTE_NAME_STATE;
856 !!!next-input-character;
857 redo A;
858 } elsif ($self->{nc} == 0x002F) { # /
859 !!!cp (50);
860 $self->{state} = SELF_CLOSING_START_TAG_STATE;
861 !!!next-input-character;
862 redo A;
863 } elsif ($self->{nc} == -1) {
864 !!!parse-error (type => 'unclosed tag');
865 if ($self->{ct}->{type} == START_TAG_TOKEN) {
866 !!!cp (52);
867 $self->{last_stag_name} = $self->{ct}->{tag_name};
868 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
869 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
870 if ($self->{ct}->{attributes}) {
871 !!!cp (53);
872 !!!parse-error (type => 'end tag attribute');
873 } else {
874 !!!cp (54);
875 }
876 } else {
877 die "$0: $self->{ct}->{type}: Unknown token type";
878 }
879 $self->{state} = DATA_STATE;
880 $self->{s_kwd} = '';
881 # reconsume
882
883 !!!emit ($self->{ct}); # start tag or end tag
884
885 redo A;
886 } else {
887 if ({
888 0x0022 => 1, # "
889 0x0027 => 1, # '
890 0x003D => 1, # =
891 }->{$self->{nc}}) {
892 !!!cp (55);
893 ## XML5: Not a parse error.
894 !!!parse-error (type => 'bad attribute name');
895 } else {
896 !!!cp (56);
897 ## XML5: ":" raises a parse error and is ignored.
898 }
899 $self->{ca}
900 = {name => chr ($self->{nc}),
901 value => '',
902 line => $self->{line}, column => $self->{column}};
903 $self->{state} = ATTRIBUTE_NAME_STATE;
904 !!!next-input-character;
905 redo A;
906 }
907 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
908 ## XML5: "Tag attribute name state".
909
910 my $before_leave = sub {
911 if (exists $self->{ct}->{attributes} # start tag or end tag
912 ->{$self->{ca}->{name}}) { # MUST
913 !!!cp (57);
914 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
915 ## Discard $self->{ca} # MUST
916 } else {
917 !!!cp (58);
918 $self->{ct}->{attributes}->{$self->{ca}->{name}}
919 = $self->{ca};
920 $self->{ca}->{index} = ++$self->{ct}->{last_index};
921 }
922 }; # $before_leave
923
924 if ($is_space->{$self->{nc}}) {
925 !!!cp (59);
926 $before_leave->();
927 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
928 !!!next-input-character;
929 redo A;
930 } elsif ($self->{nc} == 0x003D) { # =
931 !!!cp (60);
932 $before_leave->();
933 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
934 !!!next-input-character;
935 redo A;
936 } elsif ($self->{nc} == 0x003E) { # >
937 if ($self->{is_xml}) {
938 !!!cp (60.1);
939 ## XML5: Not a parse error.
940 !!!parse-error (type => 'no attr value'); ## TODO: type
941 } else {
942 !!!cp (60.2);
943 }
944
945 $before_leave->();
946 if ($self->{ct}->{type} == START_TAG_TOKEN) {
947 !!!cp (61);
948 $self->{last_stag_name} = $self->{ct}->{tag_name};
949 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
950 !!!cp (62);
951 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
952 if ($self->{ct}->{attributes}) {
953 !!!parse-error (type => 'end tag attribute');
954 }
955 } else {
956 die "$0: $self->{ct}->{type}: Unknown token type";
957 }
958 $self->{state} = DATA_STATE;
959 $self->{s_kwd} = '';
960 !!!next-input-character;
961
962 !!!emit ($self->{ct}); # start tag or end tag
963
964 redo A;
965 } elsif (0x0041 <= $self->{nc} and
966 $self->{nc} <= 0x005A) { # A..Z
967 !!!cp (63);
968 $self->{ca}->{name}
969 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
970 ## Stay in the state
971 !!!next-input-character;
972 redo A;
973 } elsif ($self->{nc} == 0x002F) { # /
974 if ($self->{is_xml}) {
975 !!!cp (64);
976 ## XML5: Not a parse error.
977 !!!parse-error (type => 'no attr value'); ## TODO: type
978 } else {
979 !!!cp (64.1);
980 }
981
982 $before_leave->();
983 $self->{state} = SELF_CLOSING_START_TAG_STATE;
984 !!!next-input-character;
985 redo A;
986 } elsif ($self->{nc} == -1) {
987 !!!parse-error (type => 'unclosed tag');
988 $before_leave->();
989 if ($self->{ct}->{type} == START_TAG_TOKEN) {
990 !!!cp (66);
991 $self->{last_stag_name} = $self->{ct}->{tag_name};
992 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
993 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
994 if ($self->{ct}->{attributes}) {
995 !!!cp (67);
996 !!!parse-error (type => 'end tag attribute');
997 } else {
998 ## NOTE: This state should never be reached.
999 !!!cp (68);
1000 }
1001 } else {
1002 die "$0: $self->{ct}->{type}: Unknown token type";
1003 }
1004 $self->{state} = DATA_STATE;
1005 $self->{s_kwd} = '';
1006 # reconsume
1007
1008 !!!emit ($self->{ct}); # start tag or end tag
1009
1010 redo A;
1011 } else {
1012 if ($self->{nc} == 0x0022 or # "
1013 $self->{nc} == 0x0027) { # '
1014 !!!cp (69);
1015 ## XML5: Not a parse error.
1016 !!!parse-error (type => 'bad attribute name');
1017 } else {
1018 !!!cp (70);
1019 }
1020 $self->{ca}->{name} .= chr ($self->{nc});
1021 ## Stay in the state
1022 !!!next-input-character;
1023 redo A;
1024 }
1025 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1026 ## XML5: "Tag attribute name after state".
## Dispatches on the next input character ($self->{nc}) once an
## attribute name has ended:
##   white space -> stay in this state
##   "="         -> BEFORE_ATTRIBUTE_VALUE_STATE
##   ">"         -> emit the current tag token ($self->{ct}), DATA_STATE
##   A..Z        -> start a new attribute (lowercased unless is_xml)
##   "/"         -> SELF_CLOSING_START_TAG_STATE
##   EOF (-1)    -> 'unclosed tag' error, emit the tag, DATA_STATE
##   other       -> start a new attribute with that character
## In XML mode every branch that leaves the attribute without a value
## reports 'no attr value'.
1027
1028 if ($is_space->{$self->{nc}}) {
1029 !!!cp (71);
1030 ## Stay in the state
1031 !!!next-input-character;
1032 redo A;
1033 } elsif ($self->{nc} == 0x003D) { # =
1034 !!!cp (72);
1035 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1036 !!!next-input-character;
1037 redo A;
1038 } elsif ($self->{nc} == 0x003E) { # >
1039 if ($self->{is_xml}) {
1040 !!!cp (72.1);
1041 ## XML5: Not a parse error.
1042 !!!parse-error (type => 'no attr value'); ## TODO: type
1043 } else {
1044 !!!cp (72.2);
1045 }
1046
## Start tags record their name in last_stag_name; end tags reset the
## content model to PCDATA and must not carry attributes.
1047 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1048 !!!cp (73);
1049 $self->{last_stag_name} = $self->{ct}->{tag_name};
1050 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1051 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1052 if ($self->{ct}->{attributes}) {
1053 !!!cp (74);
1054 !!!parse-error (type => 'end tag attribute');
1055 } else {
1056 ## NOTE: This state should never be reached.
1057 !!!cp (75);
1058 }
1059 } else {
1060 die "$0: $self->{ct}->{type}: Unknown token type";
1061 }
1062 $self->{state} = DATA_STATE;
1063 $self->{s_kwd} = '';
1064 !!!next-input-character;
1065
1066 !!!emit ($self->{ct}); # start tag or end tag
1067
1068 redo A;
1069 } elsif (0x0041 <= $self->{nc} and
1070 $self->{nc} <= 0x005A) { # A..Z
1071 !!!cp (76);
## Adding 0x0020 maps A..Z to a..z; XML mode keeps the original case.
## The attribute records the current line/column as its position.
1072 $self->{ca}
1073 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1074 value => '',
1075 line => $self->{line}, column => $self->{column}};
1076 $self->{state} = ATTRIBUTE_NAME_STATE;
1077 !!!next-input-character;
1078 redo A;
1079 } elsif ($self->{nc} == 0x002F) { # /
1080 if ($self->{is_xml}) {
1081 !!!cp (77);
1082 ## XML5: Not a parse error.
1083 !!!parse-error (type => 'no attr value'); ## TODO: type
1084 } else {
1085 !!!cp (77.1);
1086 }
1087
1088 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1089 !!!next-input-character;
1090 redo A;
1091 } elsif ($self->{nc} == -1) {
## End of input inside a tag: the tag is still emitted, and the EOF is
## left in $self->{nc} to be handled again in DATA_STATE.
1092 !!!parse-error (type => 'unclosed tag');
1093 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1094 !!!cp (79);
1095 $self->{last_stag_name} = $self->{ct}->{tag_name};
1096 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1097 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1098 if ($self->{ct}->{attributes}) {
1099 !!!cp (80);
1100 !!!parse-error (type => 'end tag attribute');
1101 } else {
1102 ## NOTE: This state should never be reached.
1103 !!!cp (81);
1104 }
1105 } else {
1106 die "$0: $self->{ct}->{type}: Unknown token type";
1107 }
1108 $self->{s_kwd} = '';
1109 $self->{state} = DATA_STATE;
1110 # reconsume
1111
1112 !!!emit ($self->{ct}); # start tag or end tag
1113
1114 redo A;
1115 } else {
1116 if ($self->{is_xml}) {
1117 !!!cp (78.1);
1118 ## XML5: Not a parse error.
1119 !!!parse-error (type => 'no attr value'); ## TODO: type
1120 } else {
1121 !!!cp (78.2);
1122 }
1123
## A quotation mark is accepted as the first character of the next
## attribute name, but it is reported as an error.
1124 if ($self->{nc} == 0x0022 or # "
1125 $self->{nc} == 0x0027) { # '
1126 !!!cp (78);
1127 ## XML5: Not a parse error.
1128 !!!parse-error (type => 'bad attribute name');
1129 } else {
1130 !!!cp (82);
1131 }
1132 $self->{ca}
1133 = {name => chr ($self->{nc}),
1134 value => '',
1135 line => $self->{line}, column => $self->{column}};
1136 $self->{state} = ATTRIBUTE_NAME_STATE;
1137 !!!next-input-character;
1138 redo A;
1139 }
1140 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1141 ## XML5: "Tag attribute value before state".
## Entered after the "=" of an attribute.  Dispatches on $self->{nc}:
##   white space -> stay in this state
##   '"' / "'"   -> the matching quoted attribute value state
##   "&"         -> ATTRIBUTE_VALUE_UNQUOTED_STATE, without consuming
##   ">"         -> 'empty unquoted attribute value' error, emit the tag
##   EOF (-1)    -> 'unclosed tag' error, emit the tag
##   other       -> first character of an unquoted value
1142
1143 if ($is_space->{$self->{nc}}) {
1144 !!!cp (83);
1145 ## Stay in the state
1146 !!!next-input-character;
1147 redo A;
1148 } elsif ($self->{nc} == 0x0022) { # "
1149 !!!cp (84);
1150 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1151 !!!next-input-character;
1152 redo A;
1153 } elsif ($self->{nc} == 0x0026) { # &
1154 !!!cp (85);
## The "&" is not consumed here; the unquoted value state sees it next.
1155 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1156 ## reconsume
1157 redo A;
1158 } elsif ($self->{nc} == 0x0027) { # '
1159 !!!cp (86);
1160 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1161 !!!next-input-character;
1162 redo A;
1163 } elsif ($self->{nc} == 0x003E) { # >
1164 !!!parse-error (type => 'empty unquoted attribute value');
1165 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1166 !!!cp (87);
1167 $self->{last_stag_name} = $self->{ct}->{tag_name};
1168 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1169 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1170 if ($self->{ct}->{attributes}) {
1171 !!!cp (88);
1172 !!!parse-error (type => 'end tag attribute');
1173 } else {
1174 ## NOTE: This state should never be reached.
1175 !!!cp (89);
1176 }
1177 } else {
1178 die "$0: $self->{ct}->{type}: Unknown token type";
1179 }
1180 $self->{state} = DATA_STATE;
1181 $self->{s_kwd} = '';
1182 !!!next-input-character;
1183
1184 !!!emit ($self->{ct}); # start tag or end tag
1185
1186 redo A;
1187 } elsif ($self->{nc} == -1) {
1188 !!!parse-error (type => 'unclosed tag');
1189 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1190 !!!cp (90);
1191 $self->{last_stag_name} = $self->{ct}->{tag_name};
1192 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1193 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1194 if ($self->{ct}->{attributes}) {
1195 !!!cp (91);
1196 !!!parse-error (type => 'end tag attribute');
1197 } else {
1198 ## NOTE: This state should never be reached.
1199 !!!cp (92);
1200 }
1201 } else {
1202 die "$0: $self->{ct}->{type}: Unknown token type";
1203 }
1204 $self->{state} = DATA_STATE;
1205 $self->{s_kwd} = '';
1206 ## reconsume
1207
1208 !!!emit ($self->{ct}); # start tag or end tag
1209
1210 redo A;
1211 } else {
## A second "=" is an error in both modes; any other unquoted first
## character is an error only in XML mode.  Either way the character
## becomes the first character of the attribute value.
1212 if ($self->{nc} == 0x003D) { # =
1213 !!!cp (93);
1214 ## XML5: Not a parse error.
1215 !!!parse-error (type => 'bad attribute value');
1216 } elsif ($self->{is_xml}) {
1217 !!!cp (93.1);
1218 ## XML5: No parse error.
1219 !!!parse-error (type => 'unquoted attr value'); ## TODO
1220 } else {
1221 !!!cp (94);
1222 }
1223 $self->{ca}->{value} .= chr ($self->{nc});
1224 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1225 !!!next-input-character;
1226 redo A;
1227 }
1228 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1229 ## XML5: "Tag attribute value double quoted state".
## Accumulates a double-quoted attribute value in $self->{ca}->{value}:
##   '"'      -> AFTER_ATTRIBUTE_VALUE_QUOTED_STATE
##   "&"      -> ENTITY_STATE, remembering this state in prev_state
##   EOF (-1) -> 'unclosed attribute value' error, emit the tag
##   other    -> append to the value ("<" is an error in XML mode)
1230
1231 if ($self->{nc} == 0x0022) { # "
1232 !!!cp (95);
1233 ## XML5: "Tag attribute name before state".
1234 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1235 !!!next-input-character;
1236 redo A;
1237 } elsif ($self->{nc} == 0x0026) { # &
1238 !!!cp (96);
1239 ## XML5: Not defined yet.
1240
1241 ## NOTE: In the spec, the tokenizer is switched to the
1242 ## "entity in attribute value state". In this implementation, the
1243 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1244 ## implementation of the "consume a character reference" algorithm.
## entity_add is set to the code point of the closing quote.
1245 $self->{prev_state} = $self->{state};
1246 $self->{entity_add} = 0x0022; # "
1247 $self->{state} = ENTITY_STATE;
1248 !!!next-input-character;
1249 redo A;
1250 } elsif ($self->{nc} == -1) {
1251 !!!parse-error (type => 'unclosed attribute value');
1252 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1253 !!!cp (97);
1254 $self->{last_stag_name} = $self->{ct}->{tag_name};
1255 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1256 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1257 if ($self->{ct}->{attributes}) {
1258 !!!cp (98);
1259 !!!parse-error (type => 'end tag attribute');
1260 } else {
1261 ## NOTE: This state should never be reached.
1262 !!!cp (99);
1263 }
1264 } else {
1265 die "$0: $self->{ct}->{type}: Unknown token type";
1266 }
1267 $self->{state} = DATA_STATE;
1268 $self->{s_kwd} = '';
1269 ## reconsume
1270
1271 !!!emit ($self->{ct}); # start tag or end tag
1272
1273 redo A;
1274 } else {
1275 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1276 !!!cp (100);
1277 ## XML5: Not a parse error.
1278 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1279 } else {
1280 !!!cp (100.1);
1281 }
1282 $self->{ca}->{value} .= chr ($self->{nc});
## NOTE(review): read_until is defined outside this chunk; presumably
## it appends following characters to the value up to one of the
## listed characters ('"', "&", "<") -- confirm against its definition.
1283 $self->{read_until}->($self->{ca}->{value},
1284 q["&<],
1285 length $self->{ca}->{value});
1286
1287 ## Stay in the state
1288 !!!next-input-character;
1289 redo A;
1290 }
1291 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1292 ## XML5: "Tag attribute value single quoted state".
## Accumulates a single-quoted attribute value in $self->{ca}->{value}:
##   "'"      -> AFTER_ATTRIBUTE_VALUE_QUOTED_STATE
##   "&"      -> ENTITY_STATE, remembering this state in prev_state
##   EOF (-1) -> 'unclosed attribute value' error, emit the tag
##   other    -> append to the value ("<" is an error in XML mode)
1293
1294 if ($self->{nc} == 0x0027) { # '
1295 !!!cp (101);
1296 ## XML5: "Before attribute name state" (sic).
1297 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1298 !!!next-input-character;
1299 redo A;
1300 } elsif ($self->{nc} == 0x0026) { # &
1301 !!!cp (102);
1302 ## XML5: Not defined yet.
1303
1304 ## NOTE: In the spec, the tokenizer is switched to the
1305 ## "entity in attribute value state". In this implementation, the
1306 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1307 ## implementation of the "consume a character reference" algorithm.
## entity_add is set to the code point of the closing quote.
1308 $self->{entity_add} = 0x0027; # '
1309 $self->{prev_state} = $self->{state};
1310 $self->{state} = ENTITY_STATE;
1311 !!!next-input-character;
1312 redo A;
1313 } elsif ($self->{nc} == -1) {
1314 !!!parse-error (type => 'unclosed attribute value');
1315 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1316 !!!cp (103);
1317 $self->{last_stag_name} = $self->{ct}->{tag_name};
1318 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1319 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1320 if ($self->{ct}->{attributes}) {
1321 !!!cp (104);
1322 !!!parse-error (type => 'end tag attribute');
1323 } else {
1324 ## NOTE: This state should never be reached.
1325 !!!cp (105);
1326 }
1327 } else {
1328 die "$0: $self->{ct}->{type}: Unknown token type";
1329 }
1330 $self->{state} = DATA_STATE;
1331 $self->{s_kwd} = '';
1332 ## reconsume
1333
1334 !!!emit ($self->{ct}); # start tag or end tag
1335
1336 redo A;
1337 } else {
1338 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1339 !!!cp (106);
1340 ## XML5: Not a parse error.
1341 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1342 } else {
1343 !!!cp (106.1);
1344 }
1345 $self->{ca}->{value} .= chr ($self->{nc});
## NOTE(review): read_until is defined outside this chunk; presumably
## it appends following characters to the value up to one of the
## listed characters ("'", "&", "<") -- confirm against its definition.
1346 $self->{read_until}->($self->{ca}->{value},
1347 q['&<],
1348 length $self->{ca}->{value});
1349
1350 ## Stay in the state
1351 !!!next-input-character;
1352 redo A;
1353 }
1354 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1355 ## XML5: "Tag attribute value unquoted state".
## Accumulates an unquoted attribute value in $self->{ca}->{value}:
##   white space -> BEFORE_ATTRIBUTE_NAME_STATE
##   "&"         -> ENTITY_STATE, remembering this state in prev_state
##   ">"         -> emit the current tag token, DATA_STATE
##   EOF (-1)    -> 'unclosed tag' error, emit the tag
##   other       -> append to the value ('"', "'" and "=" are errors)
1356
1357 if ($is_space->{$self->{nc}}) {
1358 !!!cp (107);
1359 ## XML5: "Tag attribute name before state".
1360 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1361 !!!next-input-character;
1362 redo A;
1363 } elsif ($self->{nc} == 0x0026) { # &
1364 !!!cp (108);
1365
1366 ## XML5: Not defined yet.
1367
1368 ## NOTE: In the spec, the tokenizer is switched to the
1369 ## "entity in attribute value state". In this implementation, the
1370 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1371 ## implementation of the "consume a character reference" algorithm.
## An unquoted value has no closing quote, so entity_add is -1 here.
1372 $self->{entity_add} = -1;
1373 $self->{prev_state} = $self->{state};
1374 $self->{state} = ENTITY_STATE;
1375 !!!next-input-character;
1376 redo A;
1377 } elsif ($self->{nc} == 0x003E) { # >
1378 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1379 !!!cp (109);
1380 $self->{last_stag_name} = $self->{ct}->{tag_name};
1381 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1382 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1383 if ($self->{ct}->{attributes}) {
1384 !!!cp (110);
1385 !!!parse-error (type => 'end tag attribute');
1386 } else {
1387 ## NOTE: This state should never be reached.
1388 !!!cp (111);
1389 }
1390 } else {
1391 die "$0: $self->{ct}->{type}: Unknown token type";
1392 }
1393 $self->{state} = DATA_STATE;
1394 $self->{s_kwd} = '';
1395 !!!next-input-character;
1396
1397 !!!emit ($self->{ct}); # start tag or end tag
1398
1399 redo A;
1400 } elsif ($self->{nc} == -1) {
1401 !!!parse-error (type => 'unclosed tag');
1402 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1403 !!!cp (112);
1404 $self->{last_stag_name} = $self->{ct}->{tag_name};
1405 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1406 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1407 if ($self->{ct}->{attributes}) {
1408 !!!cp (113);
1409 !!!parse-error (type => 'end tag attribute');
1410 } else {
1411 ## NOTE: This state should never be reached.
1412 !!!cp (114);
1413 }
1414 } else {
1415 die "$0: $self->{ct}->{type}: Unknown token type";
1416 }
1417 $self->{state} = DATA_STATE;
1418 $self->{s_kwd} = '';
1419 ## reconsume
1420
1421 !!!emit ($self->{ct}); # start tag or end tag
1422
1423 redo A;
1424 } else {
## The anonymous hash is a membership test for the three characters
## that are reported as errors inside an unquoted value.
1425 if ({
1426 0x0022 => 1, # "
1427 0x0027 => 1, # '
1428 0x003D => 1, # =
1429 }->{$self->{nc}}) {
1430 !!!cp (115);
1431 ## XML5: Not a parse error.
1432 !!!parse-error (type => 'bad attribute value');
1433 } else {
1434 !!!cp (116);
1435 }
1436 $self->{ca}->{value} .= chr ($self->{nc});
## NOTE(review): the character list passed to read_until contains
## U+0020 but no other white space character.  If $is_space accepts
## more than U+0020, confirm that read_until (defined outside this
## chunk) cannot read such a character into the value.
1437 $self->{read_until}->($self->{ca}->{value},
1438 q["'=& >],
1439 length $self->{ca}->{value});
1440
1441 ## Stay in the state
1442 !!!next-input-character;
1443 redo A;
1444 }
1445 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
## Entered after the closing quote of a quoted attribute value.
## Dispatches on $self->{nc}:
##   white space -> BEFORE_ATTRIBUTE_NAME_STATE
##   ">"         -> emit the current tag token ($self->{ct}), DATA_STATE
##   "/"         -> SELF_CLOSING_START_TAG_STATE
##   EOF (-1)    -> 'unclosed tag' error, emit the tag
##   other       -> 'no space between attributes' error, then handle the
##                  same character in BEFORE_ATTRIBUTE_NAME_STATE
1446 if ($is_space->{$self->{nc}}) {
1447 !!!cp (118);
1448 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1449 !!!next-input-character;
1450 redo A;
1451 } elsif ($self->{nc} == 0x003E) { # >
1452 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1453 !!!cp (119);
1454 $self->{last_stag_name} = $self->{ct}->{tag_name};
1455 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1456 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1457 if ($self->{ct}->{attributes}) {
1458 !!!cp (120);
1459 !!!parse-error (type => 'end tag attribute');
1460 } else {
1461 ## NOTE: This state should never be reached.
1462 !!!cp (121);
1463 }
1464 } else {
1465 die "$0: $self->{ct}->{type}: Unknown token type";
1466 }
1467 $self->{state} = DATA_STATE;
1468 $self->{s_kwd} = '';
1469 !!!next-input-character;
1470
1471 !!!emit ($self->{ct}); # start tag or end tag
1472
1473 redo A;
1474 } elsif ($self->{nc} == 0x002F) { # /
1475 !!!cp (122);
1476 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1477 !!!next-input-character;
1478 redo A;
1479 } elsif ($self->{nc} == -1) {
1480 !!!parse-error (type => 'unclosed tag');
1481 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1482 !!!cp (122.3);
1483 $self->{last_stag_name} = $self->{ct}->{tag_name};
1484 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
## Reset the content model here as well, as every other place in this
## tokenizer that emits an end tag token does.
$self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1485 if ($self->{ct}->{attributes}) {
1486 !!!cp (122.1);
1487 !!!parse-error (type => 'end tag attribute');
1488 } else {
1489 ## NOTE: This state should never be reached.
1490 !!!cp (122.2);
1491 }
1492 } else {
1493 die "$0: $self->{ct}->{type}: Unknown token type";
1494 }
1495 $self->{state} = DATA_STATE;
1496 $self->{s_kwd} = '';
1497 ## Reconsume.
1498 !!!emit ($self->{ct}); # start tag or end tag
1499 redo A;
1500 } else {
1501 !!!cp ('124.1');
1502 !!!parse-error (type => 'no space between attributes');
1503 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1504 ## reconsume
1505 redo A;
1506 }
1507 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1508 ## XML5: "Empty tag state".
## Entered after a "/" inside a tag.  Dispatches on $self->{nc}:
##   ">"      -> set self_closing (start tag) or report 'nestc' (end
##               tag), then emit the tag and go to DATA_STATE
##   EOF (-1) -> 'unclosed tag' error, emit the tag
##   other    -> 'nestc' error, then handle the same character in
##               BEFORE_ATTRIBUTE_NAME_STATE
## NOTE(review): the checkpoint ids '124.4' and 124.5 each appear twice
## in this state; they are left unchanged here because the users of the
## ids are not visible from this part of the file.
1509
1510 if ($self->{nc} == 0x003E) { # >
1511 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1512 !!!cp ('124.2');
1513 !!!parse-error (type => 'nestc', token => $self->{ct});
1514 ## TODO: Different type than slash in start tag
1515 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1516 if ($self->{ct}->{attributes}) {
1517 !!!cp ('124.4');
1518 !!!parse-error (type => 'end tag attribute');
1519 } else {
1520 !!!cp ('124.5');
1521 }
1522 ## TODO: Test |<title></title/>|
1523 } else {
1524 !!!cp ('124.3');
1525 $self->{self_closing} = 1;
1526 }
1527
1528 $self->{state} = DATA_STATE;
1529 $self->{s_kwd} = '';
1530 !!!next-input-character;
1531
1532 !!!emit ($self->{ct}); # start tag or end tag
1533
1534 redo A;
1535 } elsif ($self->{nc} == -1) {
1536 !!!parse-error (type => 'unclosed tag');
1537 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1538 !!!cp (124.7);
1539 $self->{last_stag_name} = $self->{ct}->{tag_name};
1540 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
## Reset the content model here as well, as the ">" branch of this
## state does before it emits an end tag token.
$self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1541 if ($self->{ct}->{attributes}) {
1542 !!!cp (124.5);
1543 !!!parse-error (type => 'end tag attribute');
1544 } else {
1545 ## NOTE: This state should never be reached.
1546 !!!cp (124.6);
1547 }
1548 } else {
1549 die "$0: $self->{ct}->{type}: Unknown token type";
1550 }
1551 ## XML5: "Tag attribute name before state".
1552 $self->{state} = DATA_STATE;
1553 $self->{s_kwd} = '';
1554 ## Reconsume.
1555 !!!emit ($self->{ct}); # start tag or end tag
1556 redo A;
1557 } else {
1558 !!!cp ('124.4');
1559 !!!parse-error (type => 'nestc');
1560 ## TODO: This error type is wrong.
1561 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1562 ## Reconsume.
1563 redo A;
1564 }
1565 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1566 ## (only happen if PCDATA state)
1567
1568 ## NOTE: Unlike spec's "bogus comment state", this implementation
1569 ## consumes characters one-by-one basis.
1570
## The comment token in $self->{ct} is emitted at ">" or at EOF (-1);
## every other character is appended to its data.
1571 if ($self->{nc} == 0x003E) { # >
1572 !!!cp (124);
1573 $self->{state} = DATA_STATE;
1574 $self->{s_kwd} = '';
1575 !!!next-input-character;
1576
1577 !!!emit ($self->{ct}); # comment
1578 redo A;
1579 } elsif ($self->{nc} == -1) {
1580 !!!cp (125);
1581 $self->{state} = DATA_STATE;
1582 $self->{s_kwd} = '';
1583 ## reconsume
1584
1585 !!!emit ($self->{ct}); # comment
1586 redo A;
1587 } else {
1588 !!!cp (126);
1589 $self->{ct}->{data} .= chr ($self->{nc}); # comment
## NOTE(review): read_until is defined outside this chunk; presumably
## it appends following characters to the data up to ">" -- confirm.
1590 $self->{read_until}->($self->{ct}->{data},
1591 q[>],
1592 length $self->{ct}->{data});
1593
1594 ## Stay in the state.
1595 !!!next-input-character;
1596 redo A;
1597 }
1598 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1599 ## (only happen if PCDATA state)
## Decides what kind of markup declaration is starting:
##   "-"     -> MD_HYPHEN_STATE (possible comment)
##   "D"/"d" -> MD_DOCTYPE_STATE, keyword so far kept in $self->{kwd}
##   "["     -> MD_CDATA_STATE, only in XML mode or in foreign content
##   other   -> falls out of the chain to the bogus comment code below
1600
1601 if ($self->{nc} == 0x002D) { # -
1602 !!!cp (133);
1603 $self->{state} = MD_HYPHEN_STATE;
1604 !!!next-input-character;
1605 redo A;
1606 } elsif ($self->{nc} == 0x0044 or # D
1607 $self->{nc} == 0x0064) { # d
1608 ## ASCII case-insensitive.
1609 !!!cp (130);
1610 $self->{state} = MD_DOCTYPE_STATE;
1611 $self->{kwd} = chr $self->{nc};
1612 !!!next-input-character;
1613 redo A;
1614 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1615 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1616 $self->{is_xml}) and
1617 $self->{nc} == 0x005B) { # [
1618 !!!cp (135.4);
1619 $self->{state} = MD_CDATA_STATE;
1620 $self->{kwd} = '[';
1621 !!!next-input-character;
1622 redo A;
1623 } else {
1624 !!!cp (136);
1625 }
1626
## Only the final else above reaches this point (the other branches
## end in "redo A").  The error and the new comment token are both
## positioned one column before the previous character.
1627 !!!parse-error (type => 'bogus comment',
1628 line => $self->{line_prev},
1629 column => $self->{column_prev} - 1);
1630 ## Reconsume.
1631 $self->{state} = BOGUS_COMMENT_STATE;
1632 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1633 line => $self->{line_prev},
1634 column => $self->{column_prev} - 1,
1635 };
1636 redo A;
1637 } elsif ($self->{state} == MD_HYPHEN_STATE) {
## Entered after a "-" in MARKUP_DECLARATION_OPEN_STATE.  A second "-"
## starts a real comment; anything else starts a bogus comment whose
## data begins with the "-" already consumed.
1638 if ($self->{nc} == 0x002D) { # -
1639 !!!cp (127);
1640 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1641 line => $self->{line_prev},
1642 column => $self->{column_prev} - 2,
1643 };
1644 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1645 !!!next-input-character;
1646 redo A;
1647 } else {
1648 !!!cp (128);
1649 !!!parse-error (type => 'bogus comment',
1650 line => $self->{line_prev},
1651 column => $self->{column_prev} - 2);
1652 $self->{state} = BOGUS_COMMENT_STATE;
1653 ## Reconsume.
1654 $self->{ct} = {type => COMMENT_TOKEN,
1655 data => '-',
1656 line => $self->{line_prev},
1657 column => $self->{column_prev} - 2,
1658 };
1659 redo A;
1660 }
1661 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1662 ## ASCII case-insensitive.
## Matches the rest of the keyword "DOCTYPE".  $self->{kwd} holds the
## characters matched so far (it starts as the "D" or "d" stored by
## MARKUP_DECLARATION_OPEN_STATE), so its length indexes the expected
## next letter in the two tables below.  Index 0 is never used, and a
## length of 6 falls past the end of both tables.
1663 if ($self->{nc} == [
1664 undef,
1665 0x004F, # O
1666 0x0043, # C
1667 0x0054, # T
1668 0x0059, # Y
1669 0x0050, # P
1670 ]->[length $self->{kwd}] or
1671 $self->{nc} == [
1672 undef,
1673 0x006F, # o
1674 0x0063, # c
1675 0x0074, # t
1676 0x0079, # y
1677 0x0070, # p
1678 ]->[length $self->{kwd}]) {
1679 !!!cp (131);
1680 ## Stay in the state.
1681 $self->{kwd} .= chr $self->{nc};
1682 !!!next-input-character;
1683 redo A;
1684 } elsif ((length $self->{kwd}) == 6 and
1685 ($self->{nc} == 0x0045 or # E
1686 $self->{nc} == 0x0065)) { # e
## Six letters are matched and the seventh is "E" or "e".  In XML mode
## anything other than an exact uppercase "DOCTYPE" is reported.
1687 if ($self->{is_xml} and
1688 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1689 !!!cp (129);
1690 ## XML5: case-sensitive.
1691 !!!parse-error (type => 'lowercase keyword', ## TODO
1692 text => 'DOCTYPE',
1693 line => $self->{line_prev},
1694 column => $self->{column_prev} - 5);
1695 } else {
1696 !!!cp (129.1);
1697 }
## The new DOCTYPE token starts with the quirks flag set.
1698 $self->{state} = DOCTYPE_STATE;
1699 $self->{ct} = {type => DOCTYPE_TOKEN,
1700 quirks => 1,
1701 line => $self->{line_prev},
1702 column => $self->{column_prev} - 7,
1703 };
1704 !!!next-input-character;
1705 redo A;
1706 } else {
## Keyword mismatch: the letters matched so far become the initial
## data of a bogus comment, and the current character is not consumed.
1707 !!!cp (132);
1708 !!!parse-error (type => 'bogus comment',
1709 line => $self->{line_prev},
1710 column => $self->{column_prev} - 1 - length $self->{kwd});
1711 $self->{state} = BOGUS_COMMENT_STATE;
1712 ## Reconsume.
1713 $self->{ct} = {type => COMMENT_TOKEN,
1714 data => $self->{kwd},
1715 line => $self->{line_prev},
1716 column => $self->{column_prev} - 1 - length $self->{kwd},
1717 };
1718 redo A;
1719 }
1720 } elsif ($self->{state} == MD_CDATA_STATE) {
## Matches the rest of "[CDATA[" case-sensitively.  The hash maps the
## prefix matched so far ($self->{kwd}) to the code point expected
## next.
1721 if ($self->{nc} == {
1722 '[' => 0x0043, # C
1723 '[C' => 0x0044, # D
1724 '[CD' => 0x0041, # A
1725 '[CDA' => 0x0054, # T
1726 '[CDAT' => 0x0041, # A
1727 }->{$self->{kwd}}) {
1728 !!!cp (135.1);
1729 ## Stay in the state.
1730 $self->{kwd} .= chr $self->{nc};
1731 !!!next-input-character;
1732 redo A;
1733 } elsif ($self->{kwd} eq '[CDATA' and
1734 $self->{nc} == 0x005B) { # [
## In XML mode with no open element, the section is reported once as
## lying outside the root element; the tainted flag set here stops the
## error from being reported again.
1735 if ($self->{is_xml} and
1736 not $self->{tainted} and
1737 @{$self->{open_elements} or []} == 0) {
1738 !!!cp (135.2);
1739 !!!parse-error (type => 'cdata outside of root element',
1740 line => $self->{line_prev},
1741 column => $self->{column_prev} - 7);
1742 $self->{tainted} = 1;
1743 } else {
1744 !!!cp (135.21);
1745 }
1746
## The section content is collected in a character token.
1747 $self->{ct} = {type => CHARACTER_TOKEN,
1748 data => '',
1749 line => $self->{line_prev},
1750 column => $self->{column_prev} - 7};
1751 $self->{state} = CDATA_SECTION_STATE;
1752 !!!next-input-character;
1753 redo A;
1754 } else {
## Keyword mismatch: the characters matched so far become the initial
## data of a bogus comment, and the current character is not consumed.
1755 !!!cp (135.3);
1756 !!!parse-error (type => 'bogus comment',
1757 line => $self->{line_prev},
1758 column => $self->{column_prev} - 1 - length $self->{kwd});
1759 $self->{state} = BOGUS_COMMENT_STATE;
1760 ## Reconsume.
1761 $self->{ct} = {type => COMMENT_TOKEN,
1762 data => $self->{kwd},
1763 line => $self->{line_prev},
1764 column => $self->{column_prev} - 1 - length $self->{kwd},
1765 };
1766 redo A;
1767 }
1768 } elsif ($self->{state} == COMMENT_START_STATE) {
## First character after the opening two hyphens of a comment:
##   "-"      -> COMMENT_START_DASH_STATE
##   ">"      -> 'bogus comment' error, emit the (empty) comment
##   EOF (-1) -> 'unclosed comment' error, emit the comment
##   other    -> append to the comment data, COMMENT_STATE
1769 if ($self->{nc} == 0x002D) { # -
1770 !!!cp (137);
1771 $self->{state} = COMMENT_START_DASH_STATE;
1772 !!!next-input-character;
1773 redo A;
1774 } elsif ($self->{nc} == 0x003E) { # >
1775 !!!cp (138);
1776 !!!parse-error (type => 'bogus comment');
1777 $self->{state} = DATA_STATE;
1778 $self->{s_kwd} = '';
1779 !!!next-input-character;
1780
1781 !!!emit ($self->{ct}); # comment
1782
1783 redo A;
1784 } elsif ($self->{nc} == -1) {
1785 !!!cp (139);
1786 !!!parse-error (type => 'unclosed comment');
1787 $self->{state} = DATA_STATE;
1788 $self->{s_kwd} = '';
1789 ## reconsume
1790
1791 !!!emit ($self->{ct}); # comment
1792
1793 redo A;
1794 } else {
1795 !!!cp (140);
1796 $self->{ct}->{data} # comment
1797 .= chr ($self->{nc});
1798 $self->{state} = COMMENT_STATE;
1799 !!!next-input-character;
1800 redo A;
1801 }
1802 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
## Entered after a "-" in COMMENT_START_STATE:
##   "-"      -> COMMENT_END_STATE
##   ">"      -> 'bogus comment' error, emit the comment
##   EOF (-1) -> 'unclosed comment' error, emit the comment
##   other    -> append the pending "-" and the character, COMMENT_STATE
1803 if ($self->{nc} == 0x002D) { # -
1804 !!!cp (141);
1805 $self->{state} = COMMENT_END_STATE;
1806 !!!next-input-character;
1807 redo A;
1808 } elsif ($self->{nc} == 0x003E) { # >
1809 !!!cp (142);
1810 !!!parse-error (type => 'bogus comment');
1811 $self->{state} = DATA_STATE;
1812 $self->{s_kwd} = '';
1813 !!!next-input-character;
1814
1815 !!!emit ($self->{ct}); # comment
1816
1817 redo A;
1818 } elsif ($self->{nc} == -1) {
1819 !!!cp (143);
1820 !!!parse-error (type => 'unclosed comment');
1821 $self->{state} = DATA_STATE;
1822 $self->{s_kwd} = '';
1823 ## reconsume
1824
1825 !!!emit ($self->{ct}); # comment
1826
1827 redo A;
1828 } else {
1829 !!!cp (144);
1830 $self->{ct}->{data} # comment
1831 .= '-' . chr ($self->{nc});
1832 $self->{state} = COMMENT_STATE;
1833 !!!next-input-character;
1834 redo A;
1835 }
1836 } elsif ($self->{state} == COMMENT_STATE) {
## Inside comment data: "-" may start the closing sequence, EOF (-1)
## emits the unfinished comment with an error, and any other character
## is appended to the comment data.
1837 if ($self->{nc} == 0x002D) { # -
1838 !!!cp (145);
1839 $self->{state} = COMMENT_END_DASH_STATE;
1840 !!!next-input-character;
1841 redo A;
1842 } elsif ($self->{nc} == -1) {
1843 !!!cp (146);
1844 !!!parse-error (type => 'unclosed comment');
1845 $self->{state} = DATA_STATE;
1846 $self->{s_kwd} = '';
1847 ## reconsume
1848
1849 !!!emit ($self->{ct}); # comment
1850
1851 redo A;
1852 } else {
1853 !!!cp (147);
1854 $self->{ct}->{data} .= chr ($self->{nc}); # comment
## NOTE(review): read_until is defined outside this chunk; presumably
## it appends following characters to the data up to "-" -- confirm.
1855 $self->{read_until}->($self->{ct}->{data},
1856 q[-],
1857 length $self->{ct}->{data});
1858
1859 ## Stay in the state
1860 !!!next-input-character;
1861 redo A;
1862 }
1863 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1864 ## XML5: "comment dash state".
## Entered after a single "-" inside comment data.  A second "-" moves
## to COMMENT_END_STATE; EOF (-1) emits the unfinished comment with an
## error; any other character is appended together with the pending
## "-" and the tokenizer returns to COMMENT_STATE.
1865
1866 if ($self->{nc} == 0x002D) { # -
1867 !!!cp (148);
1868 $self->{state} = COMMENT_END_STATE;
1869 !!!next-input-character;
1870 redo A;
1871 } elsif ($self->{nc} == -1) {
1872 !!!cp (149);
1873 !!!parse-error (type => 'unclosed comment');
1874 $self->{state} = DATA_STATE;
1875 $self->{s_kwd} = '';
1876 ## reconsume
1877
1878 !!!emit ($self->{ct}); # comment
1879
1880 redo A;
1881 } else {
1882 !!!cp (150);
1883 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1884 $self->{state} = COMMENT_STATE;
1885 !!!next-input-character;
1886 redo A;
1887 }
1888 } elsif ($self->{state} == COMMENT_END_STATE) {
## Entered after two consecutive hyphens inside a comment:
##   ">"      -> emit the comment, DATA_STATE
##   "-"      -> 'dash in comment' error, append one "-", stay here
##   EOF (-1) -> 'unclosed comment' error, emit the comment
##   other    -> 'dash in comment' error, append "--" and the
##               character, back to COMMENT_STATE
1889 if ($self->{nc} == 0x003E) { # >
1890 !!!cp (151);
1891 $self->{state} = DATA_STATE;
1892 $self->{s_kwd} = '';
1893 !!!next-input-character;
1894
1895 !!!emit ($self->{ct}); # comment
1896
1897 redo A;
1898 } elsif ($self->{nc} == 0x002D) { # -
1899 !!!cp (152);
1900 ## XML5: Not a parse error.
1901 !!!parse-error (type => 'dash in comment',
1902 line => $self->{line_prev},
1903 column => $self->{column_prev});
1904 $self->{ct}->{data} .= '-'; # comment
1905 ## Stay in the state
1906 !!!next-input-character;
1907 redo A;
1908 } elsif ($self->{nc} == -1) {
1909 !!!cp (153);
1910 !!!parse-error (type => 'unclosed comment');
1911 $self->{state} = DATA_STATE;
1912 $self->{s_kwd} = '';
1913 ## reconsume
1914
1915 !!!emit ($self->{ct}); # comment
1916
1917 redo A;
1918 } else {
1919 !!!cp (154);
1920 ## XML5: Not a parse error.
1921 !!!parse-error (type => 'dash in comment',
1922 line => $self->{line_prev},
1923 column => $self->{column_prev});
1924 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1925 $self->{state} = COMMENT_STATE;
1926 !!!next-input-character;
1927 redo A;
1928 }
1929 } elsif ($self->{state} == DOCTYPE_STATE) {
## Entered right after the keyword "DOCTYPE".  White space is consumed;
## any other character is reported as an error and left unconsumed.
## Both cases continue in BEFORE_DOCTYPE_NAME_STATE.
1930 if ($is_space->{$self->{nc}}) {
1931 !!!cp (155);
1932 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1933 !!!next-input-character;
1934 redo A;
1935 } else {
1936 !!!cp (156);
1937 ## XML5: Unless EOF, switch to the bogus comment state.
1938 !!!parse-error (type => 'no space before DOCTYPE name');
1939 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1940 ## reconsume
1941 redo A;
1942 }
1943 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1944 ## XML5: "DOCTYPE root name before state".
## Skips white space before the DOCTYPE name.  Dispatches on
## $self->{nc}:
##   white space -> stay in this state
##   ">"         -> 'no DOCTYPE name' error, emit the DOCTYPE token
##   EOF (-1)    -> 'no DOCTYPE name' error, emit the DOCTYPE token
##   "[" (XML)   -> 'no DOCTYPE name' error, internal subset
##   other       -> first character of the name; the quirks flag is
##                  removed from the token
1945
1946 if ($is_space->{$self->{nc}}) {
1947 !!!cp (157);
1948 ## Stay in the state
1949 !!!next-input-character;
1950 redo A;
1951 } elsif ($self->{nc} == 0x003E) { # >
1952 !!!cp (158);
1953 ## XML5: No parse error.
1954 !!!parse-error (type => 'no DOCTYPE name');
1955 $self->{state} = DATA_STATE;
1956 $self->{s_kwd} = '';
1957 !!!next-input-character;
1958
1959 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1960
1961 redo A;
1962 } elsif ($self->{nc} == -1) {
1963 !!!cp (159);
1964 !!!parse-error (type => 'no DOCTYPE name');
1965 $self->{state} = DATA_STATE;
1966 $self->{s_kwd} = '';
1967 ## reconsume
1968
1969 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1970
1971 redo A;
1972 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
1973 !!!cp (159.1);
1974 !!!parse-error (type => 'no DOCTYPE name');
1975 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
## Record the internal subset on the token, as the "[" branch of
## AFTER_DOCTYPE_NAME_STATE does.
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE
1976 !!!next-input-character;
1977 redo A;
1978 } else {
1979 !!!cp (160);
1980 $self->{ct}->{name} = chr $self->{nc};
1981 delete $self->{ct}->{quirks};
1982 $self->{state} = DOCTYPE_NAME_STATE;
1983 !!!next-input-character;
1984 redo A;
1985 }
1986 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1987 ## XML5: "DOCTYPE root name state".
## Accumulates the DOCTYPE name in $self->{ct}->{name}.  Dispatches on
## $self->{nc}:
##   white space -> AFTER_DOCTYPE_NAME_STATE
##   ">"         -> emit the DOCTYPE token, DATA_STATE
##   EOF (-1)    -> 'unclosed DOCTYPE' error, set quirks, emit
##   "[" (XML)   -> internal subset
##   other       -> append to the name
1988
1989 ## ISSUE: Redundant "First," in the spec.
1990
1991 if ($is_space->{$self->{nc}}) {
1992 !!!cp (161);
1993 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1994 !!!next-input-character;
1995 redo A;
1996 } elsif ($self->{nc} == 0x003E) { # >
1997 !!!cp (162);
1998 $self->{state} = DATA_STATE;
1999 $self->{s_kwd} = '';
2000 !!!next-input-character;
2001
2002 !!!emit ($self->{ct}); # DOCTYPE
2003
2004 redo A;
2005 } elsif ($self->{nc} == -1) {
2006 !!!cp (163);
2007 !!!parse-error (type => 'unclosed DOCTYPE');
2008 $self->{state} = DATA_STATE;
2009 $self->{s_kwd} = '';
2010 ## reconsume
2011
2012 $self->{ct}->{quirks} = 1;
2013 !!!emit ($self->{ct}); # DOCTYPE
2014
2015 redo A;
2016 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2017 !!!cp (163.1);
2018 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
## Record the internal subset on the token, as the "[" branch of
## AFTER_DOCTYPE_NAME_STATE does.
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2019 !!!next-input-character;
2020 redo A;
2021 } else {
2022 !!!cp (164);
2023 $self->{ct}->{name}
2024 .= chr ($self->{nc}); # DOCTYPE
2025 ## Stay in the state
2026 !!!next-input-character;
2027 redo A;
2028 }
2029 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2030 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2031 ## state", but implemented differently.
## Dispatches on $self->{nc} after the DOCTYPE name:
##   white space -> stay in this state
##   ">"         -> emit the DOCTYPE token, DATA_STATE
##   EOF (-1)    -> 'unclosed DOCTYPE' error, set quirks, emit
##   "P"/"p"     -> PUBLIC_STATE, keyword so far kept in $self->{kwd}
##   "S"/"s"     -> SYSTEM_STATE, keyword so far kept in $self->{kwd}
##   "[" (XML)   -> internal subset, flagged on the token
##   other       -> error, set quirks, BOGUS_DOCTYPE_STATE
2032
2033 if ($is_space->{$self->{nc}}) {
2034 !!!cp (165);
2035 ## Stay in the state
2036 !!!next-input-character;
2037 redo A;
2038 } elsif ($self->{nc} == 0x003E) { # >
2039 !!!cp (166);
2040 $self->{state} = DATA_STATE;
2041 $self->{s_kwd} = '';
2042 !!!next-input-character;
2043
2044 !!!emit ($self->{ct}); # DOCTYPE
2045
2046 redo A;
2047 } elsif ($self->{nc} == -1) {
2048 !!!cp (167);
2049 !!!parse-error (type => 'unclosed DOCTYPE');
2050 $self->{state} = DATA_STATE;
2051 $self->{s_kwd} = '';
2052 ## reconsume
2053
2054 $self->{ct}->{quirks} = 1;
2055 !!!emit ($self->{ct}); # DOCTYPE
2056
2057 redo A;
2058 } elsif ($self->{nc} == 0x0050 or # P
2059 $self->{nc} == 0x0070) { # p
2060 !!!cp (167.1);
2061 $self->{state} = PUBLIC_STATE;
2062 $self->{kwd} = chr $self->{nc};
2063 !!!next-input-character;
2064 redo A;
2065 } elsif ($self->{nc} == 0x0053 or # S
2066 $self->{nc} == 0x0073) { # s
2067 !!!cp (167.2);
2068 $self->{state} = SYSTEM_STATE;
2069 $self->{kwd} = chr $self->{nc};
2070 !!!next-input-character;
2071 redo A;
2072 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2073 !!!cp (167.3);
2074 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2075 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2076 !!!next-input-character;
2077 redo A;
2078 } else {
2079 !!!cp (180);
2080 !!!parse-error (type => 'string after DOCTYPE name');
2081 $self->{ct}->{quirks} = 1;
2082
2083 $self->{state} = BOGUS_DOCTYPE_STATE;
2084 !!!next-input-character;
2085 redo A;
2086 }
2087 } elsif ($self->{state} == PUBLIC_STATE) {
2088 ## ASCII case-insensitive
## Matches the rest of the keyword "PUBLIC".  $self->{kwd} holds the
## characters matched so far (it starts as the "P" or "p" stored by
## AFTER_DOCTYPE_NAME_STATE), so its length indexes the expected next
## letter in the two tables below.  Index 0 is never used, and a
## length of 5 falls past the end of both tables.
2089 if ($self->{nc} == [
2090 undef,
2091 0x0055, # U
2092 0x0042, # B
2093 0x004C, # L
2094 0x0049, # I
2095 ]->[length $self->{kwd}] or
2096 $self->{nc} == [
2097 undef,
2098 0x0075, # u
2099 0x0062, # b
2100 0x006C, # l
2101 0x0069, # i
2102 ]->[length $self->{kwd}]) {
2103 !!!cp (175);
2104 ## Stay in the state.
2105 $self->{kwd} .= chr $self->{nc};
2106 !!!next-input-character;
2107 redo A;
2108 } elsif ((length $self->{kwd}) == 5 and
2109 ($self->{nc} == 0x0043 or # C
2110 $self->{nc} == 0x0063)) { # c
## Five letters are matched and the sixth is "C" or "c".  In XML mode
## anything other than an exact uppercase "PUBLIC" is reported.
2111 if ($self->{is_xml} and
2112 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2113 !!!cp (168.1);
2114 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2115 text => 'PUBLIC',
2116 line => $self->{line_prev},
2117 column => $self->{column_prev} - 4);
2118 } else {
2119 !!!cp (168);
2120 }
2121 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2122 !!!next-input-character;
2123 redo A;
2124 } else {
## Keyword mismatch: report the error, set the quirks flag and handle
## the same character again in BOGUS_DOCTYPE_STATE.
2125 !!!cp (169);
2126 !!!parse-error (type => 'string after DOCTYPE name',
2127 line => $self->{line_prev},
2128 column => $self->{column_prev} + 1 - length $self->{kwd});
2129 $self->{ct}->{quirks} = 1;
2130
2131 $self->{state} = BOGUS_DOCTYPE_STATE;
2132 ## Reconsume.
2133 redo A;
2134 }
2135 } elsif ($self->{state} == SYSTEM_STATE) {
2136 ## ASCII case-insensitive
2137 if ($self->{nc} == [
2138 undef,
2139 0x0059, # Y
2140 0x0053, # S
2141 0x0054, # T
2142 0x0045, # E
2143 ]->[length $self->{kwd}] or
2144 $self->{nc} == [
2145 undef,
2146 0x0079, # y
2147 0x0073, # s
2148 0x0074, # t
2149 0x0065, # e
2150 ]->[length $self->{kwd}]) {
2151 !!!cp (170);
2152 ## Stay in the state.
2153 $self->{kwd} .= chr $self->{nc};
2154 !!!next-input-character;
2155 redo A;
2156 } elsif ((length $self->{kwd}) == 5 and
2157 ($self->{nc} == 0x004D or # M
2158 $self->{nc} == 0x006D)) { # m
2159 if ($self->{is_xml} and
2160 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2161 !!!cp (171.1);
2162 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2163 text => 'SYSTEM',
2164 line => $self->{line_prev},
2165 column => $self->{column_prev} - 4);
2166 } else {
2167 !!!cp (171);
2168 }
2169 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2170 !!!next-input-character;
2171 redo A;
2172 } else {
2173 !!!cp (172);
2174 !!!parse-error (type => 'string after DOCTYPE name',
2175 line => $self->{line_prev},
2176 column => $self->{column_prev} + 1 - length $self->{kwd});
2177 $self->{ct}->{quirks} = 1;
2178
2179 $self->{state} = BOGUS_DOCTYPE_STATE;
2180 ## Reconsume.
2181 redo A;
2182 }
2183 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2184 if ($is_space->{$self->{nc}}) {
2185 !!!cp (181);
2186 ## Stay in the state
2187 !!!next-input-character;
2188 redo A;
2189 } elsif ($self->{nc} eq 0x0022) { # "
2190 !!!cp (182);
2191 $self->{ct}->{pubid} = ''; # DOCTYPE
2192 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2193 !!!next-input-character;
2194 redo A;
2195 } elsif ($self->{nc} eq 0x0027) { # '
2196 !!!cp (183);
2197 $self->{ct}->{pubid} = ''; # DOCTYPE
2198 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2199 !!!next-input-character;
2200 redo A;
2201 } elsif ($self->{nc} eq 0x003E) { # >
2202 !!!cp (184);
2203 !!!parse-error (type => 'no PUBLIC literal');
2204
2205 $self->{state} = DATA_STATE;
2206 $self->{s_kwd} = '';
2207 !!!next-input-character;
2208
2209 $self->{ct}->{quirks} = 1;
2210 !!!emit ($self->{ct}); # DOCTYPE
2211
2212 redo A;
2213 } elsif ($self->{nc} == -1) {
2214 !!!cp (185);
2215 !!!parse-error (type => 'unclosed DOCTYPE');
2216
2217 $self->{state} = DATA_STATE;
2218 $self->{s_kwd} = '';
2219 ## reconsume
2220
2221 $self->{ct}->{quirks} = 1;
2222 !!!emit ($self->{ct}); # DOCTYPE
2223
2224 redo A;
2225 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2226 !!!cp (186.1);
2227 !!!parse-error (type => 'no PUBLIC literal');
2228 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2229 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2230 !!!next-input-character;
2231 redo A;
2232 } else {
2233 !!!cp (186);
2234 !!!parse-error (type => 'string after PUBLIC');
2235 $self->{ct}->{quirks} = 1;
2236
2237 $self->{state} = BOGUS_DOCTYPE_STATE;
2238 !!!next-input-character;
2239 redo A;
2240 }
2241 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2242 if ($self->{nc} == 0x0022) { # "
2243 !!!cp (187);
2244 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2245 !!!next-input-character;
2246 redo A;
2247 } elsif ($self->{nc} == 0x003E) { # >
2248 !!!cp (188);
2249 !!!parse-error (type => 'unclosed PUBLIC literal');
2250
2251 $self->{state} = DATA_STATE;
2252 $self->{s_kwd} = '';
2253 !!!next-input-character;
2254
2255 $self->{ct}->{quirks} = 1;
2256 !!!emit ($self->{ct}); # DOCTYPE
2257
2258 redo A;
2259 } elsif ($self->{nc} == -1) {
2260 !!!cp (189);
2261 !!!parse-error (type => 'unclosed PUBLIC literal');
2262
2263 $self->{state} = DATA_STATE;
2264 $self->{s_kwd} = '';
2265 ## reconsume
2266
2267 $self->{ct}->{quirks} = 1;
2268 !!!emit ($self->{ct}); # DOCTYPE
2269
2270 redo A;
2271 } else {
2272 !!!cp (190);
2273 $self->{ct}->{pubid} # DOCTYPE
2274 .= chr $self->{nc};
2275 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2276 length $self->{ct}->{pubid});
2277
2278 ## Stay in the state
2279 !!!next-input-character;
2280 redo A;
2281 }
2282 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2283 if ($self->{nc} == 0x0027) { # '
2284 !!!cp (191);
2285 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2286 !!!next-input-character;
2287 redo A;
2288 } elsif ($self->{nc} == 0x003E) { # >
2289 !!!cp (192);
2290 !!!parse-error (type => 'unclosed PUBLIC literal');
2291
2292 $self->{state} = DATA_STATE;
2293 $self->{s_kwd} = '';
2294 !!!next-input-character;
2295
2296 $self->{ct}->{quirks} = 1;
2297 !!!emit ($self->{ct}); # DOCTYPE
2298
2299 redo A;
2300 } elsif ($self->{nc} == -1) {
2301 !!!cp (193);
2302 !!!parse-error (type => 'unclosed PUBLIC literal');
2303
2304 $self->{state} = DATA_STATE;
2305 $self->{s_kwd} = '';
2306 ## reconsume
2307
2308 $self->{ct}->{quirks} = 1;
2309 !!!emit ($self->{ct}); # DOCTYPE
2310
2311 redo A;
2312 } else {
2313 !!!cp (194);
2314 $self->{ct}->{pubid} # DOCTYPE
2315 .= chr $self->{nc};
2316 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2317 length $self->{ct}->{pubid});
2318
2319 ## Stay in the state
2320 !!!next-input-character;
2321 redo A;
2322 }
2323 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2324 if ($is_space->{$self->{nc}}) {
2325 !!!cp (195);
2326 ## Stay in the state
2327 !!!next-input-character;
2328 redo A;
2329 } elsif ($self->{nc} == 0x0022) { # "
2330 !!!cp (196);
2331 $self->{ct}->{sysid} = ''; # DOCTYPE
2332 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2333 !!!next-input-character;
2334 redo A;
2335 } elsif ($self->{nc} == 0x0027) { # '
2336 !!!cp (197);
2337 $self->{ct}->{sysid} = ''; # DOCTYPE
2338 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2339 !!!next-input-character;
2340 redo A;
2341 } elsif ($self->{nc} == 0x003E) { # >
2342 if ($self->{is_xml}) {
2343 !!!cp (198.1);
2344 !!!parse-error (type => 'no SYSTEM literal');
2345 } else {
2346 !!!cp (198);
2347 }
2348 $self->{state} = DATA_STATE;
2349 $self->{s_kwd} = '';
2350 !!!next-input-character;
2351
2352 !!!emit ($self->{ct}); # DOCTYPE
2353
2354 redo A;
2355 } elsif ($self->{nc} == -1) {
2356 !!!cp (199);
2357 !!!parse-error (type => 'unclosed DOCTYPE');
2358
2359 $self->{state} = DATA_STATE;
2360 $self->{s_kwd} = '';
2361 ## reconsume
2362
2363 $self->{ct}->{quirks} = 1;
2364 !!!emit ($self->{ct}); # DOCTYPE
2365
2366 redo A;
2367 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2368 !!!cp (200.1);
2369 !!!parse-error (type => 'no SYSTEM literal');
2370 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2371 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2372 !!!next-input-character;
2373 redo A;
2374 } else {
2375 !!!cp (200);
2376 !!!parse-error (type => 'string after PUBLIC literal');
2377 $self->{ct}->{quirks} = 1;
2378
2379 $self->{state} = BOGUS_DOCTYPE_STATE;
2380 !!!next-input-character;
2381 redo A;
2382 }
2383 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2384 if ($is_space->{$self->{nc}}) {
2385 !!!cp (201);
2386 ## Stay in the state
2387 !!!next-input-character;
2388 redo A;
2389 } elsif ($self->{nc} == 0x0022) { # "
2390 !!!cp (202);
2391 $self->{ct}->{sysid} = ''; # DOCTYPE
2392 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2393 !!!next-input-character;
2394 redo A;
2395 } elsif ($self->{nc} == 0x0027) { # '
2396 !!!cp (203);
2397 $self->{ct}->{sysid} = ''; # DOCTYPE
2398 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2399 !!!next-input-character;
2400 redo A;
2401 } elsif ($self->{nc} == 0x003E) { # >
2402 !!!cp (204);
2403 !!!parse-error (type => 'no SYSTEM literal');
2404 $self->{state} = DATA_STATE;
2405 $self->{s_kwd} = '';
2406 !!!next-input-character;
2407
2408 $self->{ct}->{quirks} = 1;
2409 !!!emit ($self->{ct}); # DOCTYPE
2410
2411 redo A;
2412 } elsif ($self->{nc} == -1) {
2413 !!!cp (205);
2414 !!!parse-error (type => 'unclosed DOCTYPE');
2415
2416 $self->{state} = DATA_STATE;
2417 $self->{s_kwd} = '';
2418 ## reconsume
2419
2420 $self->{ct}->{quirks} = 1;
2421 !!!emit ($self->{ct}); # DOCTYPE
2422
2423 redo A;
2424 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2425 !!!cp (206.1);
2426 !!!parse-error (type => 'no SYSTEM literal');
2427
2428 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2429 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2430 !!!next-input-character;
2431 redo A;
2432 } else {
2433 !!!cp (206);
2434 !!!parse-error (type => 'string after SYSTEM');
2435 $self->{ct}->{quirks} = 1;
2436
2437 $self->{state} = BOGUS_DOCTYPE_STATE;
2438 !!!next-input-character;
2439 redo A;
2440 }
2441 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2442 if ($self->{nc} == 0x0022) { # "
2443 !!!cp (207);
2444 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2445 !!!next-input-character;
2446 redo A;
2447 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2448 !!!cp (208);
2449 !!!parse-error (type => 'unclosed SYSTEM literal');
2450
2451 $self->{state} = DATA_STATE;
2452 $self->{s_kwd} = '';
2453 !!!next-input-character;
2454
2455 $self->{ct}->{quirks} = 1;
2456 !!!emit ($self->{ct}); # DOCTYPE
2457
2458 redo A;
2459 } elsif ($self->{nc} == -1) {
2460 !!!cp (209);
2461 !!!parse-error (type => 'unclosed SYSTEM literal');
2462
2463 $self->{state} = DATA_STATE;
2464 $self->{s_kwd} = '';
2465 ## reconsume
2466
2467 $self->{ct}->{quirks} = 1;
2468 !!!emit ($self->{ct}); # DOCTYPE
2469
2470 redo A;
2471 } else {
2472 !!!cp (210);
2473 $self->{ct}->{sysid} # DOCTYPE
2474 .= chr $self->{nc};
2475 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2476 length $self->{ct}->{sysid});
2477
2478 ## Stay in the state
2479 !!!next-input-character;
2480 redo A;
2481 }
2482 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2483 if ($self->{nc} == 0x0027) { # '
2484 !!!cp (211);
2485 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2486 !!!next-input-character;
2487 redo A;
2488 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2489 !!!cp (212);
2490 !!!parse-error (type => 'unclosed SYSTEM literal');
2491
2492 $self->{state} = DATA_STATE;
2493 $self->{s_kwd} = '';
2494 !!!next-input-character;
2495
2496 $self->{ct}->{quirks} = 1;
2497 !!!emit ($self->{ct}); # DOCTYPE
2498
2499 redo A;
2500 } elsif ($self->{nc} == -1) {
2501 !!!cp (213);
2502 !!!parse-error (type => 'unclosed SYSTEM literal');
2503
2504 $self->{state} = DATA_STATE;
2505 $self->{s_kwd} = '';
2506 ## reconsume
2507
2508 $self->{ct}->{quirks} = 1;
2509 !!!emit ($self->{ct}); # DOCTYPE
2510
2511 redo A;
2512 } else {
2513 !!!cp (214);
2514 $self->{ct}->{sysid} # DOCTYPE
2515 .= chr $self->{nc};
2516 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2517 length $self->{ct}->{sysid});
2518
2519 ## Stay in the state
2520 !!!next-input-character;
2521 redo A;
2522 }
2523 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2524 if ($is_space->{$self->{nc}}) {
2525 !!!cp (215);
2526 ## Stay in the state
2527 !!!next-input-character;
2528 redo A;
2529 } elsif ($self->{nc} == 0x003E) { # >
2530 !!!cp (216);
2531 $self->{state} = DATA_STATE;
2532 $self->{s_kwd} = '';
2533 !!!next-input-character;
2534
2535 !!!emit ($self->{ct}); # DOCTYPE
2536
2537 redo A;
2538 } elsif ($self->{nc} == -1) {
2539 !!!cp (217);
2540 !!!parse-error (type => 'unclosed DOCTYPE');
2541 $self->{state} = DATA_STATE;
2542 $self->{s_kwd} = '';
2543 ## reconsume
2544
2545 $self->{ct}->{quirks} = 1;
2546 !!!emit ($self->{ct}); # DOCTYPE
2547
2548 redo A;
2549 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2550 !!!cp (218.1);
2551 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2552 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2553 !!!next-input-character;
2554 redo A;
2555 } else {
2556 !!!cp (218);
2557 !!!parse-error (type => 'string after SYSTEM literal');
2558 #$self->{ct}->{quirks} = 1;
2559
2560 $self->{state} = BOGUS_DOCTYPE_STATE;
2561 !!!next-input-character;
2562 redo A;
2563 }
2564 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2565 if ($self->{nc} == 0x003E) { # >
2566 !!!cp (219);
2567 $self->{state} = DATA_STATE;
2568 $self->{s_kwd} = '';
2569 !!!next-input-character;
2570
2571 !!!emit ($self->{ct}); # DOCTYPE
2572
2573 redo A;
2574 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2575 if ($self->{ct}->{has_internal_subset}) { # DOCTYPE
2576 !!!cp (220.2);
2577 ## Stay in the state.
2578 !!!next-input-character;
2579 redo A;
2580 } else {
2581 !!!cp (220.1);
2582 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2583 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2584 !!!next-input-character;
2585 redo A;
2586 }
2587 } elsif ($self->{nc} == -1) {
2588 !!!cp (220);
2589 $self->{state} = DATA_STATE;
2590 $self->{s_kwd} = '';
2591 ## reconsume
2592
2593 !!!emit ($self->{ct}); # DOCTYPE
2594
2595 redo A;
2596 } else {
2597 !!!cp (221);
2598 my $s = '';
2599 $self->{read_until}->($s, q{>[}, 0);
2600
2601 ## Stay in the state
2602 !!!next-input-character;
2603 redo A;
2604 }
2605 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2606 ## NOTE: "CDATA section state" in the spec is jointly implemented
2607 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2608 ## and |CDATA_SECTION_MSE2_STATE|.
2609
2610 ## XML5: "CDATA state".
2611
2612 if ($self->{nc} == 0x005D) { # ]
2613 !!!cp (221.1);
2614 $self->{state} = CDATA_SECTION_MSE1_STATE;
2615 !!!next-input-character;
2616 redo A;
2617 } elsif ($self->{nc} == -1) {
2618 if ($self->{is_xml}) {
2619 !!!cp (221.11);
2620 !!!parse-error (type => 'no mse'); ## TODO: type
2621 } else {
2622 !!!cp (221.12);
2623 }
2624
2625 $self->{state} = DATA_STATE;
2626 $self->{s_kwd} = '';
2627 ## Reconsume.
2628 if (length $self->{ct}->{data}) { # character
2629 !!!cp (221.2);
2630 !!!emit ($self->{ct}); # character
2631 } else {
2632 !!!cp (221.3);
2633 ## No token to emit. $self->{ct} is discarded.
2634 }
2635 redo A;
2636 } else {
2637 !!!cp (221.4);
2638 $self->{ct}->{data} .= chr $self->{nc};
2639 $self->{read_until}->($self->{ct}->{data},
2640 q<]>,
2641 length $self->{ct}->{data});
2642
2643 ## Stay in the state.
2644 !!!next-input-character;
2645 redo A;
2646 }
2647
2648 ## ISSUE: "text tokens" in spec.
2649 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2650 ## XML5: "CDATA bracket state".
2651
2652 if ($self->{nc} == 0x005D) { # ]
2653 !!!cp (221.5);
2654 $self->{state} = CDATA_SECTION_MSE2_STATE;
2655 !!!next-input-character;
2656 redo A;
2657 } else {
2658 !!!cp (221.6);
2659 ## XML5: If EOF, "]" is not appended and changed to the data state.
2660 $self->{ct}->{data} .= ']';
2661 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2662 ## Reconsume.
2663 redo A;
2664 }
2665 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2666 ## XML5: "CDATA end state".
2667
2668 if ($self->{nc} == 0x003E) { # >
2669 $self->{state} = DATA_STATE;
2670 $self->{s_kwd} = '';
2671 !!!next-input-character;
2672 if (length $self->{ct}->{data}) { # character
2673 !!!cp (221.7);
2674 !!!emit ($self->{ct}); # character
2675 } else {
2676 !!!cp (221.8);
2677 ## No token to emit. $self->{ct} is discarded.
2678 }
2679 redo A;
2680 } elsif ($self->{nc} == 0x005D) { # ]
2681 !!!cp (221.9); # character
2682 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2683 ## Stay in the state.
2684 !!!next-input-character;
2685 redo A;
2686 } else {
2687 !!!cp (221.11);
2688 $self->{ct}->{data} .= ']]'; # character
2689 $self->{state} = CDATA_SECTION_STATE;
2690 ## Reconsume. ## XML5: Emit.
2691 redo A;
2692 }
2693 } elsif ($self->{state} == ENTITY_STATE) {
2694 if ($is_space->{$self->{nc}} or
2695 {
2696 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2697 $self->{entity_add} => 1,
2698 }->{$self->{nc}}) {
2699 !!!cp (1001);
2700 ## Don't consume
2701 ## No error
2702 ## Return nothing.
2703 #
2704 } elsif ($self->{nc} == 0x0023) { # #
2705 !!!cp (999);
2706 $self->{state} = ENTITY_HASH_STATE;
2707 $self->{kwd} = '#';
2708 !!!next-input-character;
2709 redo A;
2710 } elsif ((0x0041 <= $self->{nc} and
2711 $self->{nc} <= 0x005A) or # A..Z
2712 (0x0061 <= $self->{nc} and
2713 $self->{nc} <= 0x007A)) { # a..z
2714 !!!cp (998);
2715 require Whatpm::_NamedEntityList;
2716 $self->{state} = ENTITY_NAME_STATE;
2717 $self->{kwd} = chr $self->{nc};
2718 $self->{entity__value} = $self->{kwd};
2719 $self->{entity__match} = 0;
2720 !!!next-input-character;
2721 redo A;
2722 } else {
2723 !!!cp (1027);
2724 !!!parse-error (type => 'bare ero');
2725 ## Return nothing.
2726 #
2727 }
2728
2729 ## NOTE: No character is consumed by the "consume a character
2730 ## reference" algorithm. In other words, there is an "&" character
2731 ## that does not introduce a character reference, which would be
2732 ## appended to the parent element or the attribute value in later
2733 ## process of the tokenizer.
2734
2735 if ($self->{prev_state} == DATA_STATE) {
2736 !!!cp (997);
2737 $self->{state} = $self->{prev_state};
2738 $self->{s_kwd} = '';
2739 ## Reconsume.
2740 !!!emit ({type => CHARACTER_TOKEN, data => '&',
2741 line => $self->{line_prev},
2742 column => $self->{column_prev},
2743 });
2744 redo A;
2745 } else {
2746 !!!cp (996);
2747 $self->{ca}->{value} .= '&';
2748 $self->{state} = $self->{prev_state};
2749 $self->{s_kwd} = '';
2750 ## Reconsume.
2751 redo A;
2752 }
2753 } elsif ($self->{state} == ENTITY_HASH_STATE) {
2754 if ($self->{nc} == 0x0078 or # x
2755 $self->{nc} == 0x0058) { # X
2756 !!!cp (995);
2757 $self->{state} = HEXREF_X_STATE;
2758 $self->{kwd} .= chr $self->{nc};
2759 !!!next-input-character;
2760 redo A;
2761 } elsif (0x0030 <= $self->{nc} and
2762 $self->{nc} <= 0x0039) { # 0..9
2763 !!!cp (994);
2764 $self->{state} = NCR_NUM_STATE;
2765 $self->{kwd} = $self->{nc} - 0x0030;
2766 !!!next-input-character;
2767 redo A;
2768 } else {
2769 !!!parse-error (type => 'bare nero',
2770 line => $self->{line_prev},
2771 column => $self->{column_prev} - 1);
2772
2773 ## NOTE: According to the spec algorithm, nothing is returned,
2774 ## and then "&#" is appended to the parent element or the attribute
2775 ## value in the later processing.
2776
2777 if ($self->{prev_state} == DATA_STATE) {
2778 !!!cp (1019);
2779 $self->{state} = $self->{prev_state};
2780 $self->{s_kwd} = '';
2781 ## Reconsume.
2782 !!!emit ({type => CHARACTER_TOKEN,
2783 data => '&#',
2784 line => $self->{line_prev},
2785 column => $self->{column_prev} - 1,
2786 });
2787 redo A;
2788 } else {
2789 !!!cp (993);
2790 $self->{ca}->{value} .= '&#';
2791 $self->{state} = $self->{prev_state};
2792 $self->{s_kwd} = '';
2793 ## Reconsume.
2794 redo A;
2795 }
2796 }
2797 } elsif ($self->{state} == NCR_NUM_STATE) {
2798 if (0x0030 <= $self->{nc} and
2799 $self->{nc} <= 0x0039) { # 0..9
2800 !!!cp (1012);
2801 $self->{kwd} *= 10;
2802 $self->{kwd} += $self->{nc} - 0x0030;
2803
2804 ## Stay in the state.
2805 !!!next-input-character;
2806 redo A;
2807 } elsif ($self->{nc} == 0x003B) { # ;
2808 !!!cp (1013);
2809 !!!next-input-character;
2810 #
2811 } else {
2812 !!!cp (1014);
2813 !!!parse-error (type => 'no refc');
2814 ## Reconsume.
2815 #
2816 }
2817
2818 my $code = $self->{kwd};
2819 my $l = $self->{line_prev};
2820 my $c = $self->{column_prev};
2821 if ($charref_map->{$code}) {
2822 !!!cp (1015);
2823 !!!parse-error (type => 'invalid character reference',
2824 text => (sprintf 'U+%04X', $code),
2825 line => $l, column => $c);
2826 $code = $charref_map->{$code};
2827 } elsif ($code > 0x10FFFF) {
2828 !!!cp (1016);
2829 !!!parse-error (type => 'invalid character reference',
2830 text => (sprintf 'U-%08X', $code),
2831 line => $l, column => $c);
2832 $code = 0xFFFD;
2833 }
2834
2835 if ($self->{prev_state} == DATA_STATE) {
2836 !!!cp (992);
2837 $self->{state} = $self->{prev_state};
2838 $self->{s_kwd} = '';
2839 ## Reconsume.
2840 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2841 has_reference => 1,
2842 line => $l, column => $c,
2843 });
2844 redo A;
2845 } else {
2846 !!!cp (991);
2847 $self->{ca}->{value} .= chr $code;
2848 $self->{ca}->{has_reference} = 1;
2849 $self->{state} = $self->{prev_state};
2850 $self->{s_kwd} = '';
2851 ## Reconsume.
2852 redo A;
2853 }
2854 } elsif ($self->{state} == HEXREF_X_STATE) {
2855 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2856 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2857 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2858 # 0..9, A..F, a..f
2859 !!!cp (990);
2860 $self->{state} = HEXREF_HEX_STATE;
2861 $self->{kwd} = 0;
2862 ## Reconsume.
2863 redo A;
2864 } else {
2865 !!!parse-error (type => 'bare hcro',
2866 line => $self->{line_prev},
2867 column => $self->{column_prev} - 2);
2868
2869 ## NOTE: According to the spec algorithm, nothing is returned,
2870 ## and then "&#" followed by "X" or "x" is appended to the parent
2871 ## element or the attribute value in the later processing.
2872
2873 if ($self->{prev_state} == DATA_STATE) {
2874 !!!cp (1005);
2875 $self->{state} = $self->{prev_state};
2876 $self->{s_kwd} = '';
2877 ## Reconsume.
2878 !!!emit ({type => CHARACTER_TOKEN,
2879 data => '&' . $self->{kwd},
2880 line => $self->{line_prev},
2881 column => $self->{column_prev} - length $self->{kwd},
2882 });
2883 redo A;
2884 } else {
2885 !!!cp (989);
2886 $self->{ca}->{value} .= '&' . $self->{kwd};
2887 $self->{state} = $self->{prev_state};
2888 $self->{s_kwd} = '';
2889 ## Reconsume.
2890 redo A;
2891 }
2892 }
2893 } elsif ($self->{state} == HEXREF_HEX_STATE) {
2894 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2895 # 0..9
2896 !!!cp (1002);
2897 $self->{kwd} *= 0x10;
2898 $self->{kwd} += $self->{nc} - 0x0030;
2899 ## Stay in the state.
2900 !!!next-input-character;
2901 redo A;
2902 } elsif (0x0061 <= $self->{nc} and
2903 $self->{nc} <= 0x0066) { # a..f
2904 !!!cp (1003);
2905 $self->{kwd} *= 0x10;
2906 $self->{kwd} += $self->{nc} - 0x0060 + 9;
2907 ## Stay in the state.
2908 !!!next-input-character;
2909 redo A;
2910 } elsif (0x0041 <= $self->{nc} and
2911 $self->{nc} <= 0x0046) { # A..F
2912 !!!cp (1004);
2913 $self->{kwd} *= 0x10;
2914 $self->{kwd} += $self->{nc} - 0x0040 + 9;
2915 ## Stay in the state.
2916 !!!next-input-character;
2917 redo A;
2918 } elsif ($self->{nc} == 0x003B) { # ;
2919 !!!cp (1006);
2920 !!!next-input-character;
2921 #
2922 } else {
2923 !!!cp (1007);
2924 !!!parse-error (type => 'no refc',
2925 line => $self->{line},
2926 column => $self->{column});
2927 ## Reconsume.
2928 #
2929 }
2930
2931 my $code = $self->{kwd};
2932 my $l = $self->{line_prev};
2933 my $c = $self->{column_prev};
2934 if ($charref_map->{$code}) {
2935 !!!cp (1008);
2936 !!!parse-error (type => 'invalid character reference',
2937 text => (sprintf 'U+%04X', $code),
2938 line => $l, column => $c);
2939 $code = $charref_map->{$code};
2940 } elsif ($code > 0x10FFFF) {
2941 !!!cp (1009);
2942 !!!parse-error (type => 'invalid character reference',
2943 text => (sprintf 'U-%08X', $code),
2944 line => $l, column => $c);
2945 $code = 0xFFFD;
2946 }
2947
2948 if ($self->{prev_state} == DATA_STATE) {
2949 !!!cp (988);
2950 $self->{state} = $self->{prev_state};
2951 $self->{s_kwd} = '';
2952 ## Reconsume.
2953 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2954 has_reference => 1,
2955 line => $l, column => $c,
2956 });
2957 redo A;
2958 } else {
2959 !!!cp (987);
2960 $self->{ca}->{value} .= chr $code;
2961 $self->{ca}->{has_reference} = 1;
2962 $self->{state} = $self->{prev_state};
2963 $self->{s_kwd} = '';
2964 ## Reconsume.
2965 redo A;
2966 }
2967 } elsif ($self->{state} == ENTITY_NAME_STATE) {
2968 if (length $self->{kwd} < 30 and
2969 ## NOTE: Some number greater than the maximum length of entity name
2970 ((0x0041 <= $self->{nc} and # a
2971 $self->{nc} <= 0x005A) or # x
2972 (0x0061 <= $self->{nc} and # a
2973 $self->{nc} <= 0x007A) or # z
2974 (0x0030 <= $self->{nc} and # 0
2975 $self->{nc} <= 0x0039) or # 9
2976 $self->{nc} == 0x003B)) { # ;
2977 our $EntityChar;
2978 $self->{kwd} .= chr $self->{nc};
2979 if (defined $EntityChar->{$self->{kwd}}) {
2980 if ($self->{nc} == 0x003B) { # ;
2981 !!!cp (1020);
2982 $self->{entity__value} = $EntityChar->{$self->{kwd}};
2983 $self->{entity__match} = 1;
2984 !!!next-input-character;
2985 #
2986 } else {
2987 !!!cp (1021);
2988 $self->{entity__value} = $EntityChar->{$self->{kwd}};
2989 $self->{entity__match} = -1;
2990 ## Stay in the state.
2991 !!!next-input-character;
2992 redo A;
2993 }
2994 } else {
2995 !!!cp (1022);
2996 $self->{entity__value} .= chr $self->{nc};
2997 $self->{entity__match} *= 2;
2998 ## Stay in the state.
2999 !!!next-input-character;
3000 redo A;
3001 }
3002 }
3003
3004 my $data;
3005 my $has_ref;
3006 if ($self->{entity__match} > 0) {
3007 !!!cp (1023);
3008 $data = $self->{entity__value};
3009 $has_ref = 1;
3010 #
3011 } elsif ($self->{entity__match} < 0) {
3012 !!!parse-error (type => 'no refc');
3013 if ($self->{prev_state} != DATA_STATE and # in attribute
3014 $self->{entity__match} < -1) {
3015 !!!cp (1024);
3016 $data = '&' . $self->{kwd};
3017 #
3018 } else {
3019 !!!cp (1025);
3020 $data = $self->{entity__value};
3021 $has_ref = 1;
3022 #
3023 }
3024 } else {
3025 !!!cp (1026);
3026 !!!parse-error (type => 'bare ero',
3027 line => $self->{line_prev},
3028 column => $self->{column_prev} - length $self->{kwd});
3029 $data = '&' . $self->{kwd};
3030 #
3031 }
3032
3033 ## NOTE: In these cases, when a character reference is found,
3034 ## it is consumed and a character token is returned, or, otherwise,
3035 ## nothing is consumed and returned, according to the spec algorithm.
3036 ## In this implementation, anything that has been examined by the
3037 ## tokenizer is appended to the parent element or the attribute value
3038 ## as string, either literal string when no character reference or
3039 ## entity-replaced string otherwise, in this stage, since any characters
3040 ## that would not be consumed are appended in the data state or in an
3041 ## appropriate attribute value state anyway.
3042
3043 if ($self->{prev_state} == DATA_STATE) {
3044 !!!cp (986);
3045 $self->{state} = $self->{prev_state};
3046 $self->{s_kwd} = '';
3047 ## Reconsume.
3048 !!!emit ({type => CHARACTER_TOKEN,
3049 data => $data,
3050 has_reference => $has_ref,
3051 line => $self->{line_prev},
3052 column => $self->{column_prev} + 1 - length $self->{kwd},
3053 });
3054 redo A;
3055 } else {
3056 !!!cp (985);
3057 $self->{ca}->{value} .= $data;
3058 $self->{ca}->{has_reference} = 1 if $has_ref;
3059 $self->{state} = $self->{prev_state};
3060 $self->{s_kwd} = '';
3061 ## Reconsume.
3062 redo A;
3063 }
3064
3065 ## XML-only states
3066
3067 } elsif ($self->{state} == PI_STATE) {
3068 if ($is_space->{$self->{nc}} or
3069 $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
3070 $self->{nc} == -1) {
3071 !!!parse-error (type => 'bare pio', ## TODO: type
3072 line => $self->{line_prev},
3073 column => $self->{column_prev}
3074 - 1 * ($self->{nc} != -1));
3075 $self->{state} = BOGUS_COMMENT_STATE;
3076 ## Reconsume.
3077 $self->{ct} = {type => COMMENT_TOKEN,
3078 data => '?',
3079 line => $self->{line_prev},
3080 column => $self->{column_prev}
3081 - 1 * ($self->{nc} != -1),
3082 };
3083 redo A;
3084 } else {
3085 $self->{ct} = {type => PI_TOKEN,
3086 target => chr $self->{nc},
3087 data => '',
3088 line => $self->{line_prev},
3089 column => $self->{column_prev} - 1,
3090 };
3091 $self->{state} = PI_TARGET_STATE;
3092 !!!next-input-character;
3093 redo A;
3094 }
3095 } elsif ($self->{state} == PI_TARGET_STATE) {
3096 if ($is_space->{$self->{nc}}) {
3097 $self->{state} = PI_TARGET_AFTER_STATE;
3098 !!!next-input-character;
3099 redo A;
3100 } elsif ($self->{nc} == -1) {
3101 !!!parse-error (type => 'no pic'); ## TODO: type
3102 $self->{state} = DATA_STATE;
3103 $self->{s_kwd} = '';
3104 ## Reconsume.
3105 !!!emit ($self->{ct}); # pi
3106 redo A;
3107 } elsif ($self->{nc} == 0x003F) { # ?
3108 $self->{state} = PI_AFTER_STATE;
3109 !!!next-input-character;
3110 redo A;
3111 } else {
3112 ## XML5: typo ("tag name" -> "target")
3113 $self->{ct}->{target} .= chr $self->{nc}; # pi
3114 !!!next-input-character;
3115 redo A;
3116 }
3117 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3118 if ($is_space->{$self->{nc}}) {
3119 ## Stay in the state.
3120 !!!next-input-character;
3121 redo A;
3122 } else {
3123 $self->{state} = PI_DATA_STATE;
3124 ## Reprocess.
3125 redo A;
3126 }
3127 } elsif ($self->{state} == PI_DATA_STATE) {
3128 if ($self->{nc} == 0x003F) { # ?
3129 $self->{state} = PI_DATA_AFTER_STATE;
3130 !!!next-input-character;
3131 redo A;
3132 } elsif ($self->{nc} == -1) {
3133 !!!parse-error (type => 'no pic'); ## TODO: type
3134 $self->{state} = DATA_STATE;
3135 $self->{s_kwd} = '';
3136 ## Reprocess.
3137 !!!emit ($self->{ct}); # pi
3138 redo A;
3139 } else {
3140 $self->{ct}->{data} .= chr $self->{nc}; # pi
3141 $self->{read_until}->($self->{ct}->{data}, q[?],
3142 length $self->{ct}->{data});
3143 ## Stay in the state.
3144 !!!next-input-character;
3145 ## Reprocess.
3146 redo A;
3147 }
3148 } elsif ($self->{state} == PI_AFTER_STATE) {
3149 if ($self->{nc} == 0x003E) { # >
3150 $self->{state} = DATA_STATE;
3151 $self->{s_kwd} = '';
3152 !!!next-input-character;
3153 !!!emit ($self->{ct}); # pi
3154 redo A;
3155 } elsif ($self->{nc} == 0x003F) { # ?
3156 !!!parse-error (type => 'no s after target', ## TODO: type
3157 line => $self->{line_prev},
3158 column => $self->{column_prev}); ## XML5: no error
3159 $self->{ct}->{data} .= '?';
3160 $self->{state} = PI_DATA_AFTER_STATE;
3161 !!!next-input-character;
3162 redo A;
3163 } else {
3164 !!!parse-error (type => 'no s after target', ## TODO: type
3165 line => $self->{line_prev},
3166 column => $self->{column_prev}
3167 + 1 * ($self->{nc} == -1)); ## XML5: no error
3168 $self->{ct}->{data} .= '?'; ## XML5: not appended
3169 $self->{state} = PI_DATA_STATE;
3170 ## Reprocess.
3171 redo A;
3172 }
3173 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3174 ## XML5: Same as "pi after state" in XML5
3175 if ($self->{nc} == 0x003E) { # >
3176 $self->{state} = DATA_STATE;
3177 $self->{s_kwd} = '';
3178 !!!next-input-character;
3179 !!!emit ($self->{ct}); # pi
3180 redo A;
3181 } elsif ($self->{nc} == 0x003F) { # ?
3182 $self->{ct}->{data} .= '?';
3183 ## Stay in the state.
3184 !!!next-input-character;
3185 redo A;
3186 } else {
3187 $self->{ct}->{data} .= '?'; ## XML5: not appended
3188 $self->{state} = PI_DATA_STATE;
3189 ## Reprocess.
3190 redo A;
3191 }
3192
3193 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3194 if ($self->{nc} == 0x003C) { # <
3195 ## TODO:
3196 !!!next-input-character;
3197 redo A;
3198 } elsif ($self->{nc} == 0x0025) { # %
3199 ## XML5: Not defined yet.
3200
3201 ## TODO:
3202 !!!next-input-character;
3203 redo A;
3204 } elsif ($self->{nc} == 0x005D) { # ]
3205 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3206 !!!next-input-character;
3207 redo A;
3208 } elsif ($is_space->{$self->{nc}}) {
3209 ## Stay in the state.
3210 !!!next-input-character;
3211 redo A;
3212 } elsif ($self->{nc} == -1) {
3213 !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3214 $self->{state} = DATA_STATE;
3215 $self->{s_kwd} = '';
3216 ## Reconsume.
3217 !!!emit ($self->{ct}); # DOCTYPE
3218 redo A;
3219 } else {
3220 unless ($self->{internal_subset_tainted}) {
3221 ## XML5: No parse error.
3222 !!!parse-error (type => 'string in internal subset');
3223 $self->{internal_subset_tainted} = 1;
3224 }
3225 ## Stay in the state.
3226 !!!next-input-character;
3227 redo A;
3228 }
3229 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3230 if ($self->{nc} == 0x003E) { # >
3231 $self->{state} = DATA_STATE;
3232 $self->{s_kwd} = '';
3233 !!!next-input-character;
3234 !!!emit ($self->{ct}); # DOCTYPE
3235 redo A;
3236 } elsif ($self->{nc} == -1) {
3237 !!!parse-error (type => 'unclosed DOCTYPE');
3238 $self->{state} = DATA_STATE;
3239 $self->{s_kwd} = '';
3240 ## Reconsume.
3241 !!!emit ($self->{ct}); # DOCTYPE
3242 redo A;
3243 } else {
3244 ## XML5: No parse error and stay in the state.
3245 !!!parse-error (type => 'string after internal subset'); ## TODO: type
3246
3247 $self->{state} = BOGUS_DOCTYPE_STATE;
3248 !!!next-input-character;
3249 redo A;
3250 }
3251
3252 } else {
3253 die "$0: $self->{state}: Unknown state";
3254 }
3255 } # A
3256
3257 die "$0: _get_next_token: unexpected case";
3258 } # _get_next_token
3259
3260 1;
3261 ## $Date: 2008/10/15 10:50:38 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24