/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (show annotations) (download) (as text)
Wed Oct 15 04:38:22 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.7: +163 -15 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	15 Oct 2008 04:37:36 -0000
	* XML-Parser.t: "xml/pis-1.dat" and "xml/xmldecls-1.dat" added.
	Test directives "#xml-version", "#xml-encoding", and
	"#xml-standalone" are added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	15 Oct 2008 04:37:54 -0000
	* pis-1.dat, xmldecls-1.dat: New test data files.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	15 Oct 2008 04:33:34 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (create_processing_instruction): New method.
	(xml_version, xml_encoding, xml_standalone): New attributes.
	(ProcessingInstruction): New class.

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 04:34:03 -0000
	* Tokenizer.pm.src: Support for XML processing instructions.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 04:34:57 -0000
	* Parser.pm.src: Support for XML declarations.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

## Module preamble: declares the tokenizer package, derives $VERSION from
## the CVS "Revision" keyword, and makes the token type constants
## importable through Exporter.
1 package Whatpm::HTML::Tokenizer;
2 use strict;
## The digit groups of the revision keyword are formatted as "%d." for the
## first group followed by "%02d" for each remaining group.
3 our $VERSION=do{my @r=(q$Revision: 1.7 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN { # runs at compile time, so the export lists exist before a later BEGIN block calls import
6 require Exporter;
7 push our @ISA, 'Exporter'; # inherit Exporter's |import| method
8
## Names that can be imported individually on request.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 );
19
## The ":token" tag imports the same set of token type constants at once.
20 our %EXPORT_TAGS = (
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 }
33
34 ## Token types
## Values of a token's |type| field.  The empty prototype "()" lets perl
## inline each of these subs as a constant.  The names are exportable from
## Whatpm::HTML::Tokenizer, individually or via the ":token" tag.
35
36 sub DOCTYPE_TOKEN () { 1 } # carries name, pubid, sysid, quirks
37 sub COMMENT_TOKEN () { 2 } # carries data
38 sub START_TAG_TOKEN () { 3 } # carries tag_name, attributes
39 sub END_TAG_TOKEN () { 4 } # carries tag_name, attributes
40 sub END_OF_FILE_TOKEN () { 5 } # emitted when the next input character is -1
41 sub CHARACTER_TOKEN () { 6 } # carries data
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
## Everything below is defined in the Whatpm::HTML package, not in
## Whatpm::HTML::Tokenizer.
45 package Whatpm::HTML;
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') } # bring the *_TOKEN constants into this package at compile time
48
49 ## Content model flags
## Bit flags combined into |$self->{content_model}|; the tokenizer tests
## individual flags with "&" to decide whether "&" and "<" start markup.
50
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54
## The four content models, expressed as combinations of the flags above.
55 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited "<" markup only
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # "&" plus limited "<" markup
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # "&" plus any "<" markup
59
60 ## Tokenizer states
## Values of |$self->{state}|; |_get_next_token| dispatches on them with
## numeric "==" comparisons.  The values 1 and 12 are unused because the
## corresponding subs are commented out below.
61
62 sub DATA_STATE () { 0 }
63 #sub ENTITY_DATA_STATE () { 1 }
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
## The states below split a single spec state into several implementation
## states; the trailing comment names the spec state each one belongs to.
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## XML states
## |PI_STATE| is entered from the tag open state on "?" when
## |$self->{is_xml}| is true.
118 sub PI_STATE () { 51 }
119 sub PI_TARGET_STATE () { 52 }
120 sub PI_TARGET_AFTER_STATE () { 53 }
121 sub PI_DATA_STATE () { 54 }
122 sub PI_AFTER_STATE () { 55 }
123 sub PI_DATA_AFTER_STATE () { 56 }
124
125 ## Tree constructor state constants (see Whatpm::HTML for the full
126 ## list and descriptions)
## NOTE(review): both literals below have the same value (bit 11, i.e.
## 2048).  Presumably they belong to different value spaces (an insertion
## mode vs. an element category) -- confirm against Whatpm::HTML.
127
128 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
129 sub FOREIGN_EL () { 0b1_00000000000 }
130
131 ## Character reference mappings
## Hash reference keyed by a code point, giving the code point that
## replaces it.  Presumably consulted when numeric character references
## are resolved; the code that reads it is outside this excerpt.
132
133 my $charref_map = {
134 0x0D => 0x000A, # CR is replaced by LF
## 0x80..0x9F: the targets match the Windows-1252 code page assignments;
## positions with no assignment there map to U+FFFD.
135 0x80 => 0x20AC,
136 0x81 => 0xFFFD,
137 0x82 => 0x201A,
138 0x83 => 0x0192,
139 0x84 => 0x201E,
140 0x85 => 0x2026,
141 0x86 => 0x2020,
142 0x87 => 0x2021,
143 0x88 => 0x02C6,
144 0x89 => 0x2030,
145 0x8A => 0x0160,
146 0x8B => 0x2039,
147 0x8C => 0x0152,
148 0x8D => 0xFFFD,
149 0x8E => 0x017D,
150 0x8F => 0xFFFD,
151 0x90 => 0xFFFD,
152 0x91 => 0x2018,
153 0x92 => 0x2019,
154 0x93 => 0x201C,
155 0x94 => 0x201D,
156 0x95 => 0x2022,
157 0x96 => 0x2013,
158 0x97 => 0x2014,
159 0x98 => 0x02DC,
160 0x99 => 0x2122,
161 0x9A => 0x0161,
162 0x9B => 0x203A,
163 0x9C => 0x0153,
164 0x9D => 0xFFFD,
165 0x9E => 0x017E,
166 0x9F => 0x0178,
167 }; # $charref_map
## Every code point in the list below is additionally mapped to U+FFFD:
## C0 controls other than HT, LF, FF and CR, then DEL, the surrogate
## range, and the xxFFFE/xxFFFF pair of each plane.
## NOTE(review): Unicode's noncharacter block is U+FDD0..U+FDEF, while the
## range here stops at 0xFDDF; the existing "ISSUE" marker appears to flag
## exactly that -- confirm before changing.
168 $charref_map->{$_} = 0xFFFD
169 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
170 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
171 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
172 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
173 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
174 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
175 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
176
177 ## Implementations MUST act as if they used the state machine in the spec.
178
## Resets the tokenizer-related fields of the parser object to their
## initial values: data state, PCDATA content model, no current token or
## attribute, empty character buffer and empty pending-token queue.
## Takes the object as its only argument; no explicit return value is
## set.  The "($)" prototype has no effect when this is called as a method.
179 sub _initialize_tokenizer ($) {
180 my $self = shift;
181
182 ## NOTE: Fields set by |new| constructor:
183 #$self->{level}
184 #$self->{set_nc}
185 #$self->{parse_error}
186 #$self->{is_xml} (if XML)
187
188 $self->{state} = DATA_STATE; # MUST
189 $self->{s_kwd} = ''; # state keyword
190 #$self->{entity__value}; # initialized when used
191 #$self->{entity__match}; # initialized when used
192 $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
193 undef $self->{ct}; # current token
194 undef $self->{ca}; # current attribute
195 undef $self->{last_stag_name}; # last emitted start tag name
196 #$self->{prev_state}; # initialized when used
197 delete $self->{self_closing};
198 $self->{char_buffer} = '';
199 $self->{char_buffer_pos} = 0;
200 $self->{nc} = -1; # next input character
201 #$self->{next_nc}
202 !!!next-input-character; # "!!!" macro, not plain Perl; presumably expanded at build time to load the first character into |nc| -- confirm
203 $self->{token} = []; # queue of pending tokens; |_get_next_token| shifts from it first
204 # $self->{escape}
205 } # _initialize_tokenizer
206
207 ## A token has:
208 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
209 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
210 ## ->{name} (DOCTYPE_TOKEN)
211 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
212 ## ->{pubid} (DOCTYPE_TOKEN)
213 ## ->{sysid} (DOCTYPE_TOKEN)
214 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
215 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
216 ## ->{name}
217 ## ->{value}
218 ## ->{has_reference} == 1 or 0
219 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
220 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
221 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
222 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
223 ## while the token is pushed back to the stack.
224
225 ## Emitted token MUST immediately be handled by the tree construction state.
226
227 ## Before each step, UA MAY check to see if either one of the scripts in
228 ## "list of scripts that will execute as soon as possible" or the first
229 ## script in the "list of scripts that will execute asynchronously",
230 ## has completed loading. If one has, then it MUST be executed
231 ## and removed from the list.
232
233 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
234 ## (This requirement was dropped from HTML5 spec, unfortunately.)
235
## Code points treated as white space by the tag-parsing states, looked
## up as |$is_space->{$self->{nc}}|.  VT and CR appear only as
## commented-out entries, so they do not count as space here.
236 my $is_space = {
237 0x0009 => 1, # CHARACTER TABULATION (HT)
238 0x000A => 1, # LINE FEED (LF)
239 #0x000B => 0, # LINE TABULATION (VT)
240 0x000C => 1, # FORM FEED (FF)
241 #0x000D => 1, # CARRIAGE RETURN (CR)
242 0x0020 => 1, # SPACE (SP)
243 };
244
245 sub _get_next_token ($) {
246 my $self = shift;
247
248 if ($self->{self_closing}) {
249 !!!parse-error (type => 'nestc', token => $self->{ct});
250 ## NOTE: The |self_closing| flag is only set by start tag token.
251 ## In addition, when a start tag token is emitted, it is always set to
252 ## |ct|.
253 delete $self->{self_closing};
254 }
255
256 if (@{$self->{token}}) {
257 $self->{self_closing} = $self->{token}->[0]->{self_closing};
258 return shift @{$self->{token}};
259 }
260
261 A: {
262 if ($self->{state} == PCDATA_STATE) {
263 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
264
265 if ($self->{nc} == 0x0026) { # &
266 !!!cp (0.1);
267 ## NOTE: In the spec, the tokenizer is switched to the
268 ## "entity data state". In this implementation, the tokenizer
269 ## is switched to the |ENTITY_STATE|, which is an implementation
270 ## of the "consume a character reference" algorithm.
271 $self->{entity_add} = -1;
272 $self->{prev_state} = DATA_STATE;
273 $self->{state} = ENTITY_STATE;
274 !!!next-input-character;
275 redo A;
276 } elsif ($self->{nc} == 0x003C) { # <
277 !!!cp (0.2);
278 $self->{state} = TAG_OPEN_STATE;
279 !!!next-input-character;
280 redo A;
281 } elsif ($self->{nc} == -1) {
282 !!!cp (0.3);
283 !!!emit ({type => END_OF_FILE_TOKEN,
284 line => $self->{line}, column => $self->{column}});
285 last A; ## TODO: ok?
286 } else {
287 !!!cp (0.4);
288 #
289 }
290
291 # Anything else
292 my $token = {type => CHARACTER_TOKEN,
293 data => chr $self->{nc},
294 line => $self->{line}, column => $self->{column},
295 };
296 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
297
298 ## Stay in the state.
299 !!!next-input-character;
300 !!!emit ($token);
301 redo A;
302 } elsif ($self->{state} == DATA_STATE) {
303 $self->{s_kwd} = '' unless defined $self->{s_kwd};
304 if ($self->{nc} == 0x0026) { # &
305 $self->{s_kwd} = '';
306 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
307 not $self->{escape}) {
308 !!!cp (1);
309 ## NOTE: In the spec, the tokenizer is switched to the
310 ## "entity data state". In this implementation, the tokenizer
311 ## is switched to the |ENTITY_STATE|, which is an implementation
312 ## of the "consume a character reference" algorithm.
313 $self->{entity_add} = -1;
314 $self->{prev_state} = DATA_STATE;
315 $self->{state} = ENTITY_STATE;
316 !!!next-input-character;
317 redo A;
318 } else {
319 !!!cp (2);
320 #
321 }
322 } elsif ($self->{nc} == 0x002D) { # -
323 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
324 if ($self->{s_kwd} eq '<!-') {
325 !!!cp (3);
326 $self->{escape} = 1; # unless $self->{escape};
327 $self->{s_kwd} = '--';
328 #
329 } elsif ($self->{s_kwd} eq '-') {
330 !!!cp (4);
331 $self->{s_kwd} = '--';
332 #
333 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
334 !!!cp (4.1);
335 $self->{s_kwd} .= '-';
336 #
337 } else {
338 !!!cp (5);
339 $self->{s_kwd} = '-';
340 #
341 }
342 }
343
344 #
345 } elsif ($self->{nc} == 0x0021) { # !
346 if (length $self->{s_kwd}) {
347 !!!cp (5.1);
348 $self->{s_kwd} .= '!';
349 #
350 } else {
351 !!!cp (5.2);
352 #$self->{s_kwd} = '';
353 #
354 }
355 #
356 } elsif ($self->{nc} == 0x003C) { # <
357 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
358 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
359 not $self->{escape})) {
360 !!!cp (6);
361 $self->{state} = TAG_OPEN_STATE;
362 !!!next-input-character;
363 redo A;
364 } else {
365 !!!cp (7);
366 $self->{s_kwd} = '';
367 #
368 }
369 } elsif ($self->{nc} == 0x003E) { # >
370 if ($self->{escape} and
371 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
372 if ($self->{s_kwd} eq '--') {
373 !!!cp (8);
374 delete $self->{escape};
375 #
376 } else {
377 !!!cp (9);
378 #
379 }
380 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
381 !!!cp (9.1);
382 !!!parse-error (type => 'unmatched mse', ## TODO: type
383 line => $self->{line_prev},
384 column => $self->{column_prev} - 1);
385 #
386 } else {
387 !!!cp (10);
388 #
389 }
390
391 $self->{s_kwd} = '';
392 #
393 } elsif ($self->{nc} == 0x005D) { # ]
394 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
395 !!!cp (10.1);
396 $self->{s_kwd} .= ']';
397 } elsif ($self->{s_kwd} eq ']]') {
398 !!!cp (10.2);
399 #
400 } else {
401 !!!cp (10.3);
402 $self->{s_kwd} = '';
403 }
404 #
405 } elsif ($self->{nc} == -1) {
406 !!!cp (11);
407 $self->{s_kwd} = '';
408 !!!emit ({type => END_OF_FILE_TOKEN,
409 line => $self->{line}, column => $self->{column}});
410 last A; ## TODO: ok?
411 } else {
412 !!!cp (12);
413 $self->{s_kwd} = '';
414 #
415 }
416
417 # Anything else
418 my $token = {type => CHARACTER_TOKEN,
419 data => chr $self->{nc},
420 line => $self->{line}, column => $self->{column},
421 };
422 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
423 length $token->{data})) {
424 $self->{s_kwd} = '';
425 }
426
427 ## Stay in the data state.
428 if (not $self->{is_xml} and
429 $self->{content_model} == PCDATA_CONTENT_MODEL) {
430 !!!cp (13);
431 $self->{state} = PCDATA_STATE;
432 } else {
433 !!!cp (14);
434 ## Stay in the state.
435 }
436 !!!next-input-character;
437 !!!emit ($token);
438 redo A;
439 } elsif ($self->{state} == TAG_OPEN_STATE) {
440 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
441 if ($self->{nc} == 0x002F) { # /
442 !!!cp (15);
443 !!!next-input-character;
444 $self->{state} = CLOSE_TAG_OPEN_STATE;
445 redo A;
446 } elsif ($self->{nc} == 0x0021) { # !
447 !!!cp (15.1);
448 $self->{s_kwd} = '<' unless $self->{escape};
449 #
450 } else {
451 !!!cp (16);
452 #
453 }
454
455 ## reconsume
456 $self->{state} = DATA_STATE;
457 $self->{s_kwd} = '';
458 !!!emit ({type => CHARACTER_TOKEN, data => '<',
459 line => $self->{line_prev},
460 column => $self->{column_prev},
461 });
462 redo A;
463 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
464 if ($self->{nc} == 0x0021) { # !
465 !!!cp (17);
466 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
467 !!!next-input-character;
468 redo A;
469 } elsif ($self->{nc} == 0x002F) { # /
470 !!!cp (18);
471 $self->{state} = CLOSE_TAG_OPEN_STATE;
472 !!!next-input-character;
473 redo A;
474 } elsif (0x0041 <= $self->{nc} and
475 $self->{nc} <= 0x005A) { # A..Z
476 !!!cp (19);
477 $self->{ct}
478 = {type => START_TAG_TOKEN,
479 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
480 line => $self->{line_prev},
481 column => $self->{column_prev}};
482 $self->{state} = TAG_NAME_STATE;
483 !!!next-input-character;
484 redo A;
485 } elsif (0x0061 <= $self->{nc} and
486 $self->{nc} <= 0x007A) { # a..z
487 !!!cp (20);
488 $self->{ct} = {type => START_TAG_TOKEN,
489 tag_name => chr ($self->{nc}),
490 line => $self->{line_prev},
491 column => $self->{column_prev}};
492 $self->{state} = TAG_NAME_STATE;
493 !!!next-input-character;
494 redo A;
495 } elsif ($self->{nc} == 0x003E) { # >
496 !!!cp (21);
497 !!!parse-error (type => 'empty start tag',
498 line => $self->{line_prev},
499 column => $self->{column_prev});
500 $self->{state} = DATA_STATE;
501 $self->{s_kwd} = '';
502 !!!next-input-character;
503
504 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
505 line => $self->{line_prev},
506 column => $self->{column_prev},
507 });
508
509 redo A;
510 } elsif ($self->{nc} == 0x003F) { # ?
511 if ($self->{is_xml}) {
512 !!!cp (22.1);
513 $self->{state} = PI_STATE;
514 !!!next-input-character;
515 redo A;
516 } else {
517 !!!cp (22);
518 !!!parse-error (type => 'pio',
519 line => $self->{line_prev},
520 column => $self->{column_prev});
521 $self->{state} = BOGUS_COMMENT_STATE;
522 $self->{ct} = {type => COMMENT_TOKEN, data => '',
523 line => $self->{line_prev},
524 column => $self->{column_prev},
525 };
526 ## $self->{nc} is intentionally left as is
527 redo A;
528 }
529 } else {
530 !!!cp (23);
531 !!!parse-error (type => 'bare stago',
532 line => $self->{line_prev},
533 column => $self->{column_prev});
534 $self->{state} = DATA_STATE;
535 $self->{s_kwd} = '';
536 ## reconsume
537
538 !!!emit ({type => CHARACTER_TOKEN, data => '<',
539 line => $self->{line_prev},
540 column => $self->{column_prev},
541 });
542
543 redo A;
544 }
545 } else {
546 die "$0: $self->{content_model} in tag open";
547 }
548 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
549 ## NOTE: The "close tag open state" in the spec is implemented as
550 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
551
552 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
553 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
554 if (defined $self->{last_stag_name}) {
555 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
556 $self->{s_kwd} = '';
557 ## Reconsume.
558 redo A;
559 } else {
560 ## No start tag token has ever been emitted
561 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
562 !!!cp (28);
563 $self->{state} = DATA_STATE;
564 $self->{s_kwd} = '';
565 ## Reconsume.
566 !!!emit ({type => CHARACTER_TOKEN, data => '</',
567 line => $l, column => $c,
568 });
569 redo A;
570 }
571 }
572
573 if (0x0041 <= $self->{nc} and
574 $self->{nc} <= 0x005A) { # A..Z
575 !!!cp (29);
576 $self->{ct}
577 = {type => END_TAG_TOKEN,
578 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
579 line => $l, column => $c};
580 $self->{state} = TAG_NAME_STATE;
581 !!!next-input-character;
582 redo A;
583 } elsif (0x0061 <= $self->{nc} and
584 $self->{nc} <= 0x007A) { # a..z
585 !!!cp (30);
586 $self->{ct} = {type => END_TAG_TOKEN,
587 tag_name => chr ($self->{nc}),
588 line => $l, column => $c};
589 $self->{state} = TAG_NAME_STATE;
590 !!!next-input-character;
591 redo A;
592 } elsif ($self->{nc} == 0x003E) { # >
593 !!!cp (31);
594 !!!parse-error (type => 'empty end tag',
595 line => $self->{line_prev}, ## "<" in "</>"
596 column => $self->{column_prev} - 1);
597 $self->{state} = DATA_STATE;
598 $self->{s_kwd} = '';
599 !!!next-input-character;
600 redo A;
601 } elsif ($self->{nc} == -1) {
602 !!!cp (32);
603 !!!parse-error (type => 'bare etago');
604 $self->{s_kwd} = '';
605 $self->{state} = DATA_STATE;
606 # reconsume
607
608 !!!emit ({type => CHARACTER_TOKEN, data => '</',
609 line => $l, column => $c,
610 });
611
612 redo A;
613 } else {
614 !!!cp (33);
615 !!!parse-error (type => 'bogus end tag');
616 $self->{state} = BOGUS_COMMENT_STATE;
617 $self->{ct} = {type => COMMENT_TOKEN, data => '',
618 line => $self->{line_prev}, # "<" of "</"
619 column => $self->{column_prev} - 1,
620 };
621 ## NOTE: $self->{nc} is intentionally left as is.
622 ## Although the "anything else" case of the spec not explicitly
623 ## states that the next input character is to be reconsumed,
624 ## it will be included to the |data| of the comment token
625 ## generated from the bogus end tag, as defined in the
626 ## "bogus comment state" entry.
627 redo A;
628 }
629 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
630 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
631 if (length $ch) {
632 my $CH = $ch;
633 $ch =~ tr/a-z/A-Z/;
634 my $nch = chr $self->{nc};
635 if ($nch eq $ch or $nch eq $CH) {
636 !!!cp (24);
637 ## Stay in the state.
638 $self->{s_kwd} .= $nch;
639 !!!next-input-character;
640 redo A;
641 } else {
642 !!!cp (25);
643 $self->{state} = DATA_STATE;
644 $self->{s_kwd} = '';
645 ## Reconsume.
646 !!!emit ({type => CHARACTER_TOKEN,
647 data => '</' . $self->{s_kwd},
648 line => $self->{line_prev},
649 column => $self->{column_prev} - 1 - length $self->{s_kwd},
650 });
651 redo A;
652 }
653 } else { # after "<{tag-name}"
654 unless ($is_space->{$self->{nc}} or
655 {
656 0x003E => 1, # >
657 0x002F => 1, # /
658 -1 => 1, # EOF
659 }->{$self->{nc}}) {
660 !!!cp (26);
661 ## Reconsume.
662 $self->{state} = DATA_STATE;
663 $self->{s_kwd} = '';
664 !!!emit ({type => CHARACTER_TOKEN,
665 data => '</' . $self->{s_kwd},
666 line => $self->{line_prev},
667 column => $self->{column_prev} - 1 - length $self->{s_kwd},
668 });
669 redo A;
670 } else {
671 !!!cp (27);
672 $self->{ct}
673 = {type => END_TAG_TOKEN,
674 tag_name => $self->{last_stag_name},
675 line => $self->{line_prev},
676 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
677 $self->{state} = TAG_NAME_STATE;
678 ## Reconsume.
679 redo A;
680 }
681 }
682 } elsif ($self->{state} == TAG_NAME_STATE) {
683 if ($is_space->{$self->{nc}}) {
684 !!!cp (34);
685 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
686 !!!next-input-character;
687 redo A;
688 } elsif ($self->{nc} == 0x003E) { # >
689 if ($self->{ct}->{type} == START_TAG_TOKEN) {
690 !!!cp (35);
691 $self->{last_stag_name} = $self->{ct}->{tag_name};
692 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
693 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
694 #if ($self->{ct}->{attributes}) {
695 # ## NOTE: This should never be reached.
696 # !!! cp (36);
697 # !!! parse-error (type => 'end tag attribute');
698 #} else {
699 !!!cp (37);
700 #}
701 } else {
702 die "$0: $self->{ct}->{type}: Unknown token type";
703 }
704 $self->{state} = DATA_STATE;
705 $self->{s_kwd} = '';
706 !!!next-input-character;
707
708 !!!emit ($self->{ct}); # start tag or end tag
709
710 redo A;
711 } elsif (0x0041 <= $self->{nc} and
712 $self->{nc} <= 0x005A) { # A..Z
713 !!!cp (38);
714 $self->{ct}->{tag_name}
715 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
716 # start tag or end tag
717 ## Stay in this state
718 !!!next-input-character;
719 redo A;
720 } elsif ($self->{nc} == -1) {
721 !!!parse-error (type => 'unclosed tag');
722 if ($self->{ct}->{type} == START_TAG_TOKEN) {
723 !!!cp (39);
724 $self->{last_stag_name} = $self->{ct}->{tag_name};
725 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
726 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
727 #if ($self->{ct}->{attributes}) {
728 # ## NOTE: This state should never be reached.
729 # !!! cp (40);
730 # !!! parse-error (type => 'end tag attribute');
731 #} else {
732 !!!cp (41);
733 #}
734 } else {
735 die "$0: $self->{ct}->{type}: Unknown token type";
736 }
737 $self->{state} = DATA_STATE;
738 $self->{s_kwd} = '';
739 # reconsume
740
741 !!!emit ($self->{ct}); # start tag or end tag
742
743 redo A;
744 } elsif ($self->{nc} == 0x002F) { # /
745 !!!cp (42);
746 $self->{state} = SELF_CLOSING_START_TAG_STATE;
747 !!!next-input-character;
748 redo A;
749 } else {
750 !!!cp (44);
751 $self->{ct}->{tag_name} .= chr $self->{nc};
752 # start tag or end tag
753 ## Stay in the state
754 !!!next-input-character;
755 redo A;
756 }
757 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
758 if ($is_space->{$self->{nc}}) {
759 !!!cp (45);
760 ## Stay in the state
761 !!!next-input-character;
762 redo A;
763 } elsif ($self->{nc} == 0x003E) { # >
764 if ($self->{ct}->{type} == START_TAG_TOKEN) {
765 !!!cp (46);
766 $self->{last_stag_name} = $self->{ct}->{tag_name};
767 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
768 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
769 if ($self->{ct}->{attributes}) {
770 !!!cp (47);
771 !!!parse-error (type => 'end tag attribute');
772 } else {
773 !!!cp (48);
774 }
775 } else {
776 die "$0: $self->{ct}->{type}: Unknown token type";
777 }
778 $self->{state} = DATA_STATE;
779 $self->{s_kwd} = '';
780 !!!next-input-character;
781
782 !!!emit ($self->{ct}); # start tag or end tag
783
784 redo A;
785 } elsif (0x0041 <= $self->{nc} and
786 $self->{nc} <= 0x005A) { # A..Z
787 !!!cp (49);
788 $self->{ca}
789 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
790 value => '',
791 line => $self->{line}, column => $self->{column}};
792 $self->{state} = ATTRIBUTE_NAME_STATE;
793 !!!next-input-character;
794 redo A;
795 } elsif ($self->{nc} == 0x002F) { # /
796 !!!cp (50);
797 $self->{state} = SELF_CLOSING_START_TAG_STATE;
798 !!!next-input-character;
799 redo A;
800 } elsif ($self->{nc} == -1) {
801 !!!parse-error (type => 'unclosed tag');
802 if ($self->{ct}->{type} == START_TAG_TOKEN) {
803 !!!cp (52);
804 $self->{last_stag_name} = $self->{ct}->{tag_name};
805 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
806 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
807 if ($self->{ct}->{attributes}) {
808 !!!cp (53);
809 !!!parse-error (type => 'end tag attribute');
810 } else {
811 !!!cp (54);
812 }
813 } else {
814 die "$0: $self->{ct}->{type}: Unknown token type";
815 }
816 $self->{state} = DATA_STATE;
817 $self->{s_kwd} = '';
818 # reconsume
819
820 !!!emit ($self->{ct}); # start tag or end tag
821
822 redo A;
823 } else {
824 if ({
825 0x0022 => 1, # "
826 0x0027 => 1, # '
827 0x003D => 1, # =
828 }->{$self->{nc}}) {
829 !!!cp (55);
830 !!!parse-error (type => 'bad attribute name');
831 } else {
832 !!!cp (56);
833 }
834 $self->{ca}
835 = {name => chr ($self->{nc}),
836 value => '',
837 line => $self->{line}, column => $self->{column}};
838 $self->{state} = ATTRIBUTE_NAME_STATE;
839 !!!next-input-character;
840 redo A;
841 }
842 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
843 my $before_leave = sub {
844 if (exists $self->{ct}->{attributes} # start tag or end tag
845 ->{$self->{ca}->{name}}) { # MUST
846 !!!cp (57);
847 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
848 ## Discard $self->{ca} # MUST
849 } else {
850 !!!cp (58);
851 $self->{ct}->{attributes}->{$self->{ca}->{name}}
852 = $self->{ca};
853 }
854 }; # $before_leave
855
856 if ($is_space->{$self->{nc}}) {
857 !!!cp (59);
858 $before_leave->();
859 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
860 !!!next-input-character;
861 redo A;
862 } elsif ($self->{nc} == 0x003D) { # =
863 !!!cp (60);
864 $before_leave->();
865 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
866 !!!next-input-character;
867 redo A;
868 } elsif ($self->{nc} == 0x003E) { # >
869 $before_leave->();
870 if ($self->{ct}->{type} == START_TAG_TOKEN) {
871 !!!cp (61);
872 $self->{last_stag_name} = $self->{ct}->{tag_name};
873 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
874 !!!cp (62);
875 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
876 if ($self->{ct}->{attributes}) {
877 !!!parse-error (type => 'end tag attribute');
878 }
879 } else {
880 die "$0: $self->{ct}->{type}: Unknown token type";
881 }
882 $self->{state} = DATA_STATE;
883 $self->{s_kwd} = '';
884 !!!next-input-character;
885
886 !!!emit ($self->{ct}); # start tag or end tag
887
888 redo A;
889 } elsif (0x0041 <= $self->{nc} and
890 $self->{nc} <= 0x005A) { # A..Z
891 !!!cp (63);
892 $self->{ca}->{name}
893 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
894 ## Stay in the state
895 !!!next-input-character;
896 redo A;
897 } elsif ($self->{nc} == 0x002F) { # /
898 !!!cp (64);
899 $before_leave->();
900 $self->{state} = SELF_CLOSING_START_TAG_STATE;
901 !!!next-input-character;
902 redo A;
903 } elsif ($self->{nc} == -1) {
904 !!!parse-error (type => 'unclosed tag');
905 $before_leave->();
906 if ($self->{ct}->{type} == START_TAG_TOKEN) {
907 !!!cp (66);
908 $self->{last_stag_name} = $self->{ct}->{tag_name};
909 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
910 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
911 if ($self->{ct}->{attributes}) {
912 !!!cp (67);
913 !!!parse-error (type => 'end tag attribute');
914 } else {
915 ## NOTE: This state should never be reached.
916 !!!cp (68);
917 }
918 } else {
919 die "$0: $self->{ct}->{type}: Unknown token type";
920 }
921 $self->{state} = DATA_STATE;
922 $self->{s_kwd} = '';
923 # reconsume
924
925 !!!emit ($self->{ct}); # start tag or end tag
926
927 redo A;
928 } else {
929 if ($self->{nc} == 0x0022 or # "
930 $self->{nc} == 0x0027) { # '
931 !!!cp (69);
932 !!!parse-error (type => 'bad attribute name');
933 } else {
934 !!!cp (70);
935 }
936 $self->{ca}->{name} .= chr ($self->{nc});
937 ## Stay in the state
938 !!!next-input-character;
939 redo A;
940 }
941 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
942 if ($is_space->{$self->{nc}}) {
943 !!!cp (71);
944 ## Stay in the state
945 !!!next-input-character;
946 redo A;
947 } elsif ($self->{nc} == 0x003D) { # =
948 !!!cp (72);
949 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
950 !!!next-input-character;
951 redo A;
952 } elsif ($self->{nc} == 0x003E) { # >
953 if ($self->{ct}->{type} == START_TAG_TOKEN) {
954 !!!cp (73);
955 $self->{last_stag_name} = $self->{ct}->{tag_name};
956 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
957 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
958 if ($self->{ct}->{attributes}) {
959 !!!cp (74);
960 !!!parse-error (type => 'end tag attribute');
961 } else {
962 ## NOTE: This state should never be reached.
963 !!!cp (75);
964 }
965 } else {
966 die "$0: $self->{ct}->{type}: Unknown token type";
967 }
968 $self->{state} = DATA_STATE;
969 $self->{s_kwd} = '';
970 !!!next-input-character;
971
972 !!!emit ($self->{ct}); # start tag or end tag
973
974 redo A;
975 } elsif (0x0041 <= $self->{nc} and
976 $self->{nc} <= 0x005A) { # A..Z
977 !!!cp (76);
978 $self->{ca}
979 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
980 value => '',
981 line => $self->{line}, column => $self->{column}};
982 $self->{state} = ATTRIBUTE_NAME_STATE;
983 !!!next-input-character;
984 redo A;
985 } elsif ($self->{nc} == 0x002F) { # /
986 !!!cp (77);
987 $self->{state} = SELF_CLOSING_START_TAG_STATE;
988 !!!next-input-character;
989 redo A;
990 } elsif ($self->{nc} == -1) {
991 !!!parse-error (type => 'unclosed tag');
992 if ($self->{ct}->{type} == START_TAG_TOKEN) {
993 !!!cp (79);
994 $self->{last_stag_name} = $self->{ct}->{tag_name};
995 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
996 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
997 if ($self->{ct}->{attributes}) {
998 !!!cp (80);
999 !!!parse-error (type => 'end tag attribute');
1000 } else {
1001 ## NOTE: This state should never be reached.
1002 !!!cp (81);
1003 }
1004 } else {
1005 die "$0: $self->{ct}->{type}: Unknown token type";
1006 }
1007 $self->{s_kwd} = '';
1008 $self->{state} = DATA_STATE;
1009 # reconsume
1010
1011 !!!emit ($self->{ct}); # start tag or end tag
1012
1013 redo A;
1014 } else {
1015 if ($self->{nc} == 0x0022 or # "
1016 $self->{nc} == 0x0027) { # '
1017 !!!cp (78);
1018 !!!parse-error (type => 'bad attribute name');
1019 } else {
1020 !!!cp (82);
1021 }
1022 $self->{ca}
1023 = {name => chr ($self->{nc}),
1024 value => '',
1025 line => $self->{line}, column => $self->{column}};
1026 $self->{state} = ATTRIBUTE_NAME_STATE;
1027 !!!next-input-character;
1028 redo A;
1029 }
1030 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1031 if ($is_space->{$self->{nc}}) {
1032 !!!cp (83);
1033 ## Stay in the state
1034 !!!next-input-character;
1035 redo A;
1036 } elsif ($self->{nc} == 0x0022) { # "
1037 !!!cp (84);
1038 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1039 !!!next-input-character;
1040 redo A;
1041 } elsif ($self->{nc} == 0x0026) { # &
1042 !!!cp (85);
1043 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1044 ## reconsume
1045 redo A;
1046 } elsif ($self->{nc} == 0x0027) { # '
1047 !!!cp (86);
1048 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1049 !!!next-input-character;
1050 redo A;
1051 } elsif ($self->{nc} == 0x003E) { # >
1052 !!!parse-error (type => 'empty unquoted attribute value');
1053 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1054 !!!cp (87);
1055 $self->{last_stag_name} = $self->{ct}->{tag_name};
1056 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1057 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1058 if ($self->{ct}->{attributes}) {
1059 !!!cp (88);
1060 !!!parse-error (type => 'end tag attribute');
1061 } else {
1062 ## NOTE: This state should never be reached.
1063 !!!cp (89);
1064 }
1065 } else {
1066 die "$0: $self->{ct}->{type}: Unknown token type";
1067 }
1068 $self->{state} = DATA_STATE;
1069 $self->{s_kwd} = '';
1070 !!!next-input-character;
1071
1072 !!!emit ($self->{ct}); # start tag or end tag
1073
1074 redo A;
1075 } elsif ($self->{nc} == -1) {
1076 !!!parse-error (type => 'unclosed tag');
1077 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1078 !!!cp (90);
1079 $self->{last_stag_name} = $self->{ct}->{tag_name};
1080 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1081 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1082 if ($self->{ct}->{attributes}) {
1083 !!!cp (91);
1084 !!!parse-error (type => 'end tag attribute');
1085 } else {
1086 ## NOTE: This state should never be reached.
1087 !!!cp (92);
1088 }
1089 } else {
1090 die "$0: $self->{ct}->{type}: Unknown token type";
1091 }
1092 $self->{state} = DATA_STATE;
1093 $self->{s_kwd} = '';
1094 ## reconsume
1095
1096 !!!emit ($self->{ct}); # start tag or end tag
1097
1098 redo A;
1099 } else {
1100 if ($self->{nc} == 0x003D) { # =
1101 !!!cp (93);
1102 !!!parse-error (type => 'bad attribute value');
1103 } else {
1104 !!!cp (94);
1105 }
1106 $self->{ca}->{value} .= chr ($self->{nc});
1107 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1108 !!!next-input-character;
1109 redo A;
1110 }
1111 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1112 if ($self->{nc} == 0x0022) { # "
1113 !!!cp (95);
1114 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1115 !!!next-input-character;
1116 redo A;
1117 } elsif ($self->{nc} == 0x0026) { # &
1118 !!!cp (96);
1119 ## NOTE: In the spec, the tokenizer is switched to the
1120 ## "entity in attribute value state". In this implementation, the
1121 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1122 ## implementation of the "consume a character reference" algorithm.
1123 $self->{prev_state} = $self->{state};
1124 $self->{entity_add} = 0x0022; # "
1125 $self->{state} = ENTITY_STATE;
1126 !!!next-input-character;
1127 redo A;
1128 } elsif ($self->{nc} == -1) {
1129 !!!parse-error (type => 'unclosed attribute value');
1130 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1131 !!!cp (97);
1132 $self->{last_stag_name} = $self->{ct}->{tag_name};
1133 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1134 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1135 if ($self->{ct}->{attributes}) {
1136 !!!cp (98);
1137 !!!parse-error (type => 'end tag attribute');
1138 } else {
1139 ## NOTE: This state should never be reached.
1140 !!!cp (99);
1141 }
1142 } else {
1143 die "$0: $self->{ct}->{type}: Unknown token type";
1144 }
1145 $self->{state} = DATA_STATE;
1146 $self->{s_kwd} = '';
1147 ## reconsume
1148
1149 !!!emit ($self->{ct}); # start tag or end tag
1150
1151 redo A;
1152 } else {
1153 !!!cp (100);
1154 $self->{ca}->{value} .= chr ($self->{nc});
1155 $self->{read_until}->($self->{ca}->{value},
1156 q["&],
1157 length $self->{ca}->{value});
1158
1159 ## Stay in the state
1160 !!!next-input-character;
1161 redo A;
1162 }
1163 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1164 if ($self->{nc} == 0x0027) { # '
1165 !!!cp (101);
1166 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1167 !!!next-input-character;
1168 redo A;
1169 } elsif ($self->{nc} == 0x0026) { # &
1170 !!!cp (102);
1171 ## NOTE: In the spec, the tokenizer is switched to the
1172 ## "entity in attribute value state". In this implementation, the
1173 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1174 ## implementation of the "consume a character reference" algorithm.
1175 $self->{entity_add} = 0x0027; # '
1176 $self->{prev_state} = $self->{state};
1177 $self->{state} = ENTITY_STATE;
1178 !!!next-input-character;
1179 redo A;
1180 } elsif ($self->{nc} == -1) {
1181 !!!parse-error (type => 'unclosed attribute value');
1182 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1183 !!!cp (103);
1184 $self->{last_stag_name} = $self->{ct}->{tag_name};
1185 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1186 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1187 if ($self->{ct}->{attributes}) {
1188 !!!cp (104);
1189 !!!parse-error (type => 'end tag attribute');
1190 } else {
1191 ## NOTE: This state should never be reached.
1192 !!!cp (105);
1193 }
1194 } else {
1195 die "$0: $self->{ct}->{type}: Unknown token type";
1196 }
1197 $self->{state} = DATA_STATE;
1198 $self->{s_kwd} = '';
1199 ## reconsume
1200
1201 !!!emit ($self->{ct}); # start tag or end tag
1202
1203 redo A;
1204 } else {
1205 !!!cp (106);
1206 $self->{ca}->{value} .= chr ($self->{nc});
1207 $self->{read_until}->($self->{ca}->{value},
1208 q['&],
1209 length $self->{ca}->{value});
1210
1211 ## Stay in the state
1212 !!!next-input-character;
1213 redo A;
1214 }
1215 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1216 if ($is_space->{$self->{nc}}) {
1217 !!!cp (107);
1218 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1219 !!!next-input-character;
1220 redo A;
1221 } elsif ($self->{nc} == 0x0026) { # &
1222 !!!cp (108);
1223 ## NOTE: In the spec, the tokenizer is switched to the
1224 ## "entity in attribute value state". In this implementation, the
1225 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1226 ## implementation of the "consume a character reference" algorithm.
1227 $self->{entity_add} = -1;
1228 $self->{prev_state} = $self->{state};
1229 $self->{state} = ENTITY_STATE;
1230 !!!next-input-character;
1231 redo A;
1232 } elsif ($self->{nc} == 0x003E) { # >
1233 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1234 !!!cp (109);
1235 $self->{last_stag_name} = $self->{ct}->{tag_name};
1236 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1237 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1238 if ($self->{ct}->{attributes}) {
1239 !!!cp (110);
1240 !!!parse-error (type => 'end tag attribute');
1241 } else {
1242 ## NOTE: This state should never be reached.
1243 !!!cp (111);
1244 }
1245 } else {
1246 die "$0: $self->{ct}->{type}: Unknown token type";
1247 }
1248 $self->{state} = DATA_STATE;
1249 $self->{s_kwd} = '';
1250 !!!next-input-character;
1251
1252 !!!emit ($self->{ct}); # start tag or end tag
1253
1254 redo A;
1255 } elsif ($self->{nc} == -1) {
1256 !!!parse-error (type => 'unclosed tag');
1257 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1258 !!!cp (112);
1259 $self->{last_stag_name} = $self->{ct}->{tag_name};
1260 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1261 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1262 if ($self->{ct}->{attributes}) {
1263 !!!cp (113);
1264 !!!parse-error (type => 'end tag attribute');
1265 } else {
1266 ## NOTE: This state should never be reached.
1267 !!!cp (114);
1268 }
1269 } else {
1270 die "$0: $self->{ct}->{type}: Unknown token type";
1271 }
1272 $self->{state} = DATA_STATE;
1273 $self->{s_kwd} = '';
1274 ## reconsume
1275
1276 !!!emit ($self->{ct}); # start tag or end tag
1277
1278 redo A;
1279 } else {
1280 if ({
1281 0x0022 => 1, # "
1282 0x0027 => 1, # '
1283 0x003D => 1, # =
1284 }->{$self->{nc}}) {
1285 !!!cp (115);
1286 !!!parse-error (type => 'bad attribute value');
1287 } else {
1288 !!!cp (116);
1289 }
1290 $self->{ca}->{value} .= chr ($self->{nc});
1291 $self->{read_until}->($self->{ca}->{value},
1292 q["'=& >],
1293 length $self->{ca}->{value});
1294
1295 ## Stay in the state
1296 !!!next-input-character;
1297 redo A;
1298 }
1299 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1300 if ($is_space->{$self->{nc}}) {
1301 !!!cp (118);
1302 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1303 !!!next-input-character;
1304 redo A;
1305 } elsif ($self->{nc} == 0x003E) { # >
1306 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1307 !!!cp (119);
1308 $self->{last_stag_name} = $self->{ct}->{tag_name};
1309 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1310 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1311 if ($self->{ct}->{attributes}) {
1312 !!!cp (120);
1313 !!!parse-error (type => 'end tag attribute');
1314 } else {
1315 ## NOTE: This state should never be reached.
1316 !!!cp (121);
1317 }
1318 } else {
1319 die "$0: $self->{ct}->{type}: Unknown token type";
1320 }
1321 $self->{state} = DATA_STATE;
1322 $self->{s_kwd} = '';
1323 !!!next-input-character;
1324
1325 !!!emit ($self->{ct}); # start tag or end tag
1326
1327 redo A;
1328 } elsif ($self->{nc} == 0x002F) { # /
1329 !!!cp (122);
1330 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1331 !!!next-input-character;
1332 redo A;
1333 } elsif ($self->{nc} == -1) {
1334 !!!parse-error (type => 'unclosed tag');
1335 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1336 !!!cp (122.3);
1337 $self->{last_stag_name} = $self->{ct}->{tag_name};
1338 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1339 if ($self->{ct}->{attributes}) {
1340 !!!cp (122.1);
1341 !!!parse-error (type => 'end tag attribute');
1342 } else {
1343 ## NOTE: This state should never be reached.
1344 !!!cp (122.2);
1345 }
1346 } else {
1347 die "$0: $self->{ct}->{type}: Unknown token type";
1348 }
1349 $self->{state} = DATA_STATE;
1350 $self->{s_kwd} = '';
1351 ## Reconsume.
1352 !!!emit ($self->{ct}); # start tag or end tag
1353 redo A;
1354 } else {
1355 !!!cp ('124.1');
1356 !!!parse-error (type => 'no space between attributes');
1357 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1358 ## reconsume
1359 redo A;
1360 }
1361 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1362 if ($self->{nc} == 0x003E) { # >
1363 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1364 !!!cp ('124.2');
1365 !!!parse-error (type => 'nestc', token => $self->{ct});
1366 ## TODO: Different type than slash in start tag
1367 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1368 if ($self->{ct}->{attributes}) {
1369 !!!cp ('124.4');
1370 !!!parse-error (type => 'end tag attribute');
1371 } else {
1372 !!!cp ('124.5');
1373 }
1374 ## TODO: Test |<title></title/>|
1375 } else {
1376 !!!cp ('124.3');
1377 $self->{self_closing} = 1;
1378 }
1379
1380 $self->{state} = DATA_STATE;
1381 $self->{s_kwd} = '';
1382 !!!next-input-character;
1383
1384 !!!emit ($self->{ct}); # start tag or end tag
1385
1386 redo A;
1387 } elsif ($self->{nc} == -1) {
1388 !!!parse-error (type => 'unclosed tag');
1389 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1390 !!!cp (124.7);
1391 $self->{last_stag_name} = $self->{ct}->{tag_name};
1392 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1393 if ($self->{ct}->{attributes}) {
1394 !!!cp (124.5);
1395 !!!parse-error (type => 'end tag attribute');
1396 } else {
1397 ## NOTE: This state should never be reached.
1398 !!!cp (124.6);
1399 }
1400 } else {
1401 die "$0: $self->{ct}->{type}: Unknown token type";
1402 }
1403 $self->{state} = DATA_STATE;
1404 $self->{s_kwd} = '';
1405 ## Reconsume.
1406 !!!emit ($self->{ct}); # start tag or end tag
1407 redo A;
1408 } else {
1409 !!!cp ('124.4');
1410 !!!parse-error (type => 'nestc');
1411 ## TODO: This error type is wrong.
1412 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1413 ## Reconsume.
1414 redo A;
1415 }
1416 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1417 ## (only happens in the PCDATA state)
1418
1419 ## NOTE: Unlike spec's "bogus comment state", this implementation
1420 ## consumes characters on a one-by-one basis.
1421
1422 if ($self->{nc} == 0x003E) { # >
1423 !!!cp (124);
1424 $self->{state} = DATA_STATE;
1425 $self->{s_kwd} = '';
1426 !!!next-input-character;
1427
1428 !!!emit ($self->{ct}); # comment
1429 redo A;
1430 } elsif ($self->{nc} == -1) {
1431 !!!cp (125);
1432 $self->{state} = DATA_STATE;
1433 $self->{s_kwd} = '';
1434 ## reconsume
1435
1436 !!!emit ($self->{ct}); # comment
1437 redo A;
1438 } else {
1439 !!!cp (126);
1440 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1441 $self->{read_until}->($self->{ct}->{data},
1442 q[>],
1443 length $self->{ct}->{data});
1444
1445 ## Stay in the state.
1446 !!!next-input-character;
1447 redo A;
1448 }
1449 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1450 ## (only happens in the PCDATA state)
1451
1452 if ($self->{nc} == 0x002D) { # -
1453 !!!cp (133);
1454 $self->{state} = MD_HYPHEN_STATE;
1455 !!!next-input-character;
1456 redo A;
1457 } elsif ($self->{nc} == 0x0044 or # D
1458 $self->{nc} == 0x0064) { # d
1459 ## ASCII case-insensitive.
1460 !!!cp (130);
1461 $self->{state} = MD_DOCTYPE_STATE;
1462 $self->{s_kwd} = chr $self->{nc};
1463 !!!next-input-character;
1464 redo A;
1465 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1466 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1467 $self->{is_xml}) and
1468 $self->{nc} == 0x005B) { # [
1469 !!!cp (135.4);
1470 $self->{state} = MD_CDATA_STATE;
1471 $self->{s_kwd} = '[';
1472 !!!next-input-character;
1473 redo A;
1474 } else {
1475 !!!cp (136);
1476 }
1477
1478 !!!parse-error (type => 'bogus comment',
1479 line => $self->{line_prev},
1480 column => $self->{column_prev} - 1);
1481 ## Reconsume.
1482 $self->{state} = BOGUS_COMMENT_STATE;
1483 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1484 line => $self->{line_prev},
1485 column => $self->{column_prev} - 1,
1486 };
1487 redo A;
1488 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1489 if ($self->{nc} == 0x002D) { # -
1490 !!!cp (127);
1491 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1492 line => $self->{line_prev},
1493 column => $self->{column_prev} - 2,
1494 };
1495 $self->{state} = COMMENT_START_STATE;
1496 !!!next-input-character;
1497 redo A;
1498 } else {
1499 !!!cp (128);
1500 !!!parse-error (type => 'bogus comment',
1501 line => $self->{line_prev},
1502 column => $self->{column_prev} - 2);
1503 $self->{state} = BOGUS_COMMENT_STATE;
1504 ## Reconsume.
1505 $self->{ct} = {type => COMMENT_TOKEN,
1506 data => '-',
1507 line => $self->{line_prev},
1508 column => $self->{column_prev} - 2,
1509 };
1510 redo A;
1511 }
1512 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1513 ## ASCII case-insensitive.
1514 if ($self->{nc} == [
1515 undef,
1516 0x004F, # O
1517 0x0043, # C
1518 0x0054, # T
1519 0x0059, # Y
1520 0x0050, # P
1521 ]->[length $self->{s_kwd}] or
1522 $self->{nc} == [
1523 undef,
1524 0x006F, # o
1525 0x0063, # c
1526 0x0074, # t
1527 0x0079, # y
1528 0x0070, # p
1529 ]->[length $self->{s_kwd}]) {
1530 !!!cp (131);
1531 ## Stay in the state.
1532 $self->{s_kwd} .= chr $self->{nc};
1533 !!!next-input-character;
1534 redo A;
1535 } elsif ((length $self->{s_kwd}) == 6 and
1536 ($self->{nc} == 0x0045 or # E
1537 $self->{nc} == 0x0065)) { # e
1538 !!!cp (129);
1539 $self->{state} = DOCTYPE_STATE;
1540 $self->{ct} = {type => DOCTYPE_TOKEN,
1541 quirks => 1,
1542 line => $self->{line_prev},
1543 column => $self->{column_prev} - 7,
1544 };
1545 !!!next-input-character;
1546 redo A;
1547 } else {
1548 !!!cp (132);
1549 !!!parse-error (type => 'bogus comment',
1550 line => $self->{line_prev},
1551 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1552 $self->{state} = BOGUS_COMMENT_STATE;
1553 ## Reconsume.
1554 $self->{ct} = {type => COMMENT_TOKEN,
1555 data => $self->{s_kwd},
1556 line => $self->{line_prev},
1557 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1558 };
1559 redo A;
1560 }
1561 } elsif ($self->{state} == MD_CDATA_STATE) {
1562 if ($self->{nc} == {
1563 '[' => 0x0043, # C
1564 '[C' => 0x0044, # D
1565 '[CD' => 0x0041, # A
1566 '[CDA' => 0x0054, # T
1567 '[CDAT' => 0x0041, # A
1568 }->{$self->{s_kwd}}) {
1569 !!!cp (135.1);
1570 ## Stay in the state.
1571 $self->{s_kwd} .= chr $self->{nc};
1572 !!!next-input-character;
1573 redo A;
1574 } elsif ($self->{s_kwd} eq '[CDATA' and
1575 $self->{nc} == 0x005B) { # [
1576 if ($self->{is_xml} and
1577 not $self->{tainted} and
1578 @{$self->{open_elements} or []} == 0) {
1579 !!!cp (135.2);
1580 !!!parse-error (type => 'cdata outside of root element',
1581 line => $self->{line_prev},
1582 column => $self->{column_prev} - 7);
1583 $self->{tainted} = 1;
1584 } else {
1585 !!!cp (135.21);
1586 }
1587
1588 $self->{ct} = {type => CHARACTER_TOKEN,
1589 data => '',
1590 line => $self->{line_prev},
1591 column => $self->{column_prev} - 7};
1592 $self->{state} = CDATA_SECTION_STATE;
1593 !!!next-input-character;
1594 redo A;
1595 } else {
1596 !!!cp (135.3);
1597 !!!parse-error (type => 'bogus comment',
1598 line => $self->{line_prev},
1599 column => $self->{column_prev} - 1 - length $self->{s_kwd});
1600 $self->{state} = BOGUS_COMMENT_STATE;
1601 ## Reconsume.
1602 $self->{ct} = {type => COMMENT_TOKEN,
1603 data => $self->{s_kwd},
1604 line => $self->{line_prev},
1605 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1606 };
1607 redo A;
1608 }
1609 } elsif ($self->{state} == COMMENT_START_STATE) {
1610 if ($self->{nc} == 0x002D) { # -
1611 !!!cp (137);
1612 $self->{state} = COMMENT_START_DASH_STATE;
1613 !!!next-input-character;
1614 redo A;
1615 } elsif ($self->{nc} == 0x003E) { # >
1616 !!!cp (138);
1617 !!!parse-error (type => 'bogus comment');
1618 $self->{state} = DATA_STATE;
1619 $self->{s_kwd} = '';
1620 !!!next-input-character;
1621
1622 !!!emit ($self->{ct}); # comment
1623
1624 redo A;
1625 } elsif ($self->{nc} == -1) {
1626 !!!cp (139);
1627 !!!parse-error (type => 'unclosed comment');
1628 $self->{state} = DATA_STATE;
1629 $self->{s_kwd} = '';
1630 ## reconsume
1631
1632 !!!emit ($self->{ct}); # comment
1633
1634 redo A;
1635 } else {
1636 !!!cp (140);
1637 $self->{ct}->{data} # comment
1638 .= chr ($self->{nc});
1639 $self->{state} = COMMENT_STATE;
1640 !!!next-input-character;
1641 redo A;
1642 }
1643 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1644 if ($self->{nc} == 0x002D) { # -
1645 !!!cp (141);
1646 $self->{state} = COMMENT_END_STATE;
1647 !!!next-input-character;
1648 redo A;
1649 } elsif ($self->{nc} == 0x003E) { # >
1650 !!!cp (142);
1651 !!!parse-error (type => 'bogus comment');
1652 $self->{state} = DATA_STATE;
1653 $self->{s_kwd} = '';
1654 !!!next-input-character;
1655
1656 !!!emit ($self->{ct}); # comment
1657
1658 redo A;
1659 } elsif ($self->{nc} == -1) {
1660 !!!cp (143);
1661 !!!parse-error (type => 'unclosed comment');
1662 $self->{state} = DATA_STATE;
1663 $self->{s_kwd} = '';
1664 ## reconsume
1665
1666 !!!emit ($self->{ct}); # comment
1667
1668 redo A;
1669 } else {
1670 !!!cp (144);
1671 $self->{ct}->{data} # comment
1672 .= '-' . chr ($self->{nc});
1673 $self->{state} = COMMENT_STATE;
1674 !!!next-input-character;
1675 redo A;
1676 }
1677 } elsif ($self->{state} == COMMENT_STATE) {
1678 if ($self->{nc} == 0x002D) { # -
1679 !!!cp (145);
1680 $self->{state} = COMMENT_END_DASH_STATE;
1681 !!!next-input-character;
1682 redo A;
1683 } elsif ($self->{nc} == -1) {
1684 !!!cp (146);
1685 !!!parse-error (type => 'unclosed comment');
1686 $self->{state} = DATA_STATE;
1687 $self->{s_kwd} = '';
1688 ## reconsume
1689
1690 !!!emit ($self->{ct}); # comment
1691
1692 redo A;
1693 } else {
1694 !!!cp (147);
1695 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1696 $self->{read_until}->($self->{ct}->{data},
1697 q[-],
1698 length $self->{ct}->{data});
1699
1700 ## Stay in the state
1701 !!!next-input-character;
1702 redo A;
1703 }
1704 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1705 if ($self->{nc} == 0x002D) { # -
1706 !!!cp (148);
1707 $self->{state} = COMMENT_END_STATE;
1708 !!!next-input-character;
1709 redo A;
1710 } elsif ($self->{nc} == -1) {
1711 !!!cp (149);
1712 !!!parse-error (type => 'unclosed comment');
1713 $self->{s_kwd} = '';
1714 $self->{state} = DATA_STATE;
1715 $self->{s_kwd} = '';
1716 ## reconsume
1717
1718 !!!emit ($self->{ct}); # comment
1719
1720 redo A;
1721 } else {
1722 !!!cp (150);
1723 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1724 $self->{state} = COMMENT_STATE;
1725 !!!next-input-character;
1726 redo A;
1727 }
1728 } elsif ($self->{state} == COMMENT_END_STATE) {
1729 if ($self->{nc} == 0x003E) { # >
1730 !!!cp (151);
1731 $self->{state} = DATA_STATE;
1732 $self->{s_kwd} = '';
1733 !!!next-input-character;
1734
1735 !!!emit ($self->{ct}); # comment
1736
1737 redo A;
1738 } elsif ($self->{nc} == 0x002D) { # -
1739 !!!cp (152);
1740 !!!parse-error (type => 'dash in comment',
1741 line => $self->{line_prev},
1742 column => $self->{column_prev});
1743 $self->{ct}->{data} .= '-'; # comment
1744 ## Stay in the state
1745 !!!next-input-character;
1746 redo A;
1747 } elsif ($self->{nc} == -1) {
1748 !!!cp (153);
1749 !!!parse-error (type => 'unclosed comment');
1750 $self->{state} = DATA_STATE;
1751 $self->{s_kwd} = '';
1752 ## reconsume
1753
1754 !!!emit ($self->{ct}); # comment
1755
1756 redo A;
1757 } else {
1758 !!!cp (154);
1759 !!!parse-error (type => 'dash in comment',
1760 line => $self->{line_prev},
1761 column => $self->{column_prev});
1762 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1763 $self->{state} = COMMENT_STATE;
1764 !!!next-input-character;
1765 redo A;
1766 }
1767 } elsif ($self->{state} == DOCTYPE_STATE) {
1768 if ($is_space->{$self->{nc}}) {
1769 !!!cp (155);
1770 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1771 !!!next-input-character;
1772 redo A;
1773 } else {
1774 !!!cp (156);
1775 !!!parse-error (type => 'no space before DOCTYPE name');
1776 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1777 ## reconsume
1778 redo A;
1779 }
1780 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1781 if ($is_space->{$self->{nc}}) {
1782 !!!cp (157);
1783 ## Stay in the state
1784 !!!next-input-character;
1785 redo A;
1786 } elsif ($self->{nc} == 0x003E) { # >
1787 !!!cp (158);
1788 !!!parse-error (type => 'no DOCTYPE name');
1789 $self->{state} = DATA_STATE;
1790 $self->{s_kwd} = '';
1791 !!!next-input-character;
1792
1793 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1794
1795 redo A;
1796 } elsif ($self->{nc} == -1) {
1797 !!!cp (159);
1798 !!!parse-error (type => 'no DOCTYPE name');
1799 $self->{state} = DATA_STATE;
1800 $self->{s_kwd} = '';
1801 ## reconsume
1802
1803 !!!emit ($self->{ct}); # DOCTYPE (quirks)
1804
1805 redo A;
1806 } else {
1807 !!!cp (160);
1808 $self->{ct}->{name} = chr $self->{nc};
1809 delete $self->{ct}->{quirks};
1810 $self->{state} = DOCTYPE_NAME_STATE;
1811 !!!next-input-character;
1812 redo A;
1813 }
1814 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1815 ## ISSUE: Redundant "First," in the spec.
1816 if ($is_space->{$self->{nc}}) {
1817 !!!cp (161);
1818 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1819 !!!next-input-character;
1820 redo A;
1821 } elsif ($self->{nc} == 0x003E) { # >
1822 !!!cp (162);
1823 $self->{state} = DATA_STATE;
1824 $self->{s_kwd} = '';
1825 !!!next-input-character;
1826
1827 !!!emit ($self->{ct}); # DOCTYPE
1828
1829 redo A;
1830 } elsif ($self->{nc} == -1) {
1831 !!!cp (163);
1832 !!!parse-error (type => 'unclosed DOCTYPE');
1833 $self->{state} = DATA_STATE;
1834 $self->{s_kwd} = '';
1835 ## reconsume
1836
1837 $self->{ct}->{quirks} = 1;
1838 !!!emit ($self->{ct}); # DOCTYPE
1839
1840 redo A;
1841 } else {
1842 !!!cp (164);
1843 $self->{ct}->{name}
1844 .= chr ($self->{nc}); # DOCTYPE
1845 ## Stay in the state
1846 !!!next-input-character;
1847 redo A;
1848 }
1849 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1850 if ($is_space->{$self->{nc}}) {
1851 !!!cp (165);
1852 ## Stay in the state
1853 !!!next-input-character;
1854 redo A;
1855 } elsif ($self->{nc} == 0x003E) { # >
1856 !!!cp (166);
1857 $self->{state} = DATA_STATE;
1858 $self->{s_kwd} = '';
1859 !!!next-input-character;
1860
1861 !!!emit ($self->{ct}); # DOCTYPE
1862
1863 redo A;
1864 } elsif ($self->{nc} == -1) {
1865 !!!cp (167);
1866 !!!parse-error (type => 'unclosed DOCTYPE');
1867 $self->{state} = DATA_STATE;
1868 $self->{s_kwd} = '';
1869 ## reconsume
1870
1871 $self->{ct}->{quirks} = 1;
1872 !!!emit ($self->{ct}); # DOCTYPE
1873
1874 redo A;
1875 } elsif ($self->{nc} == 0x0050 or # P
1876 $self->{nc} == 0x0070) { # p
1877 $self->{state} = PUBLIC_STATE;
1878 $self->{s_kwd} = chr $self->{nc};
1879 !!!next-input-character;
1880 redo A;
1881 } elsif ($self->{nc} == 0x0053 or # S
1882 $self->{nc} == 0x0073) { # s
1883 $self->{state} = SYSTEM_STATE;
1884 $self->{s_kwd} = chr $self->{nc};
1885 !!!next-input-character;
1886 redo A;
1887 } else {
1888 !!!cp (180);
1889 !!!parse-error (type => 'string after DOCTYPE name');
1890 $self->{ct}->{quirks} = 1;
1891
1892 $self->{state} = BOGUS_DOCTYPE_STATE;
1893 !!!next-input-character;
1894 redo A;
1895 }
1896 } elsif ($self->{state} == PUBLIC_STATE) {
1897 ## ASCII case-insensitive
1898 if ($self->{nc} == [
1899 undef,
1900 0x0055, # U
1901 0x0042, # B
1902 0x004C, # L
1903 0x0049, # I
1904 ]->[length $self->{s_kwd}] or
1905 $self->{nc} == [
1906 undef,
1907 0x0075, # u
1908 0x0062, # b
1909 0x006C, # l
1910 0x0069, # i
1911 ]->[length $self->{s_kwd}]) {
1912 !!!cp (175);
1913 ## Stay in the state.
1914 $self->{s_kwd} .= chr $self->{nc};
1915 !!!next-input-character;
1916 redo A;
1917 } elsif ((length $self->{s_kwd}) == 5 and
1918 ($self->{nc} == 0x0043 or # C
1919 $self->{nc} == 0x0063)) { # c
1920 !!!cp (168);
1921 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1922 !!!next-input-character;
1923 redo A;
1924 } else {
1925 !!!cp (169);
1926 !!!parse-error (type => 'string after DOCTYPE name',
1927 line => $self->{line_prev},
1928 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1929 $self->{ct}->{quirks} = 1;
1930
1931 $self->{state} = BOGUS_DOCTYPE_STATE;
1932 ## Reconsume.
1933 redo A;
1934 }
1935 } elsif ($self->{state} == SYSTEM_STATE) {
1936 ## ASCII case-insensitive
1937 if ($self->{nc} == [
1938 undef,
1939 0x0059, # Y
1940 0x0053, # S
1941 0x0054, # T
1942 0x0045, # E
1943 ]->[length $self->{s_kwd}] or
1944 $self->{nc} == [
1945 undef,
1946 0x0079, # y
1947 0x0073, # s
1948 0x0074, # t
1949 0x0065, # e
1950 ]->[length $self->{s_kwd}]) {
1951 !!!cp (170);
1952 ## Stay in the state.
1953 $self->{s_kwd} .= chr $self->{nc};
1954 !!!next-input-character;
1955 redo A;
1956 } elsif ((length $self->{s_kwd}) == 5 and
1957 ($self->{nc} == 0x004D or # M
1958 $self->{nc} == 0x006D)) { # m
1959 !!!cp (171);
1960 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1961 !!!next-input-character;
1962 redo A;
1963 } else {
1964 !!!cp (172);
1965 !!!parse-error (type => 'string after DOCTYPE name',
1966 line => $self->{line_prev},
1967 column => $self->{column_prev} + 1 - length $self->{s_kwd});
1968 $self->{ct}->{quirks} = 1;
1969
1970 $self->{state} = BOGUS_DOCTYPE_STATE;
1971 ## Reconsume.
1972 redo A;
1973 }
1974 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1975 if ($is_space->{$self->{nc}}) {
1976 !!!cp (181);
1977 ## Stay in the state
1978 !!!next-input-character;
1979 redo A;
1980 } elsif ($self->{nc} eq 0x0022) { # "
1981 !!!cp (182);
1982 $self->{ct}->{pubid} = ''; # DOCTYPE
1983 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1984 !!!next-input-character;
1985 redo A;
1986 } elsif ($self->{nc} eq 0x0027) { # '
1987 !!!cp (183);
1988 $self->{ct}->{pubid} = ''; # DOCTYPE
1989 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1990 !!!next-input-character;
1991 redo A;
1992 } elsif ($self->{nc} eq 0x003E) { # >
1993 !!!cp (184);
1994 !!!parse-error (type => 'no PUBLIC literal');
1995
1996 $self->{state} = DATA_STATE;
1997 $self->{s_kwd} = '';
1998 !!!next-input-character;
1999
2000 $self->{ct}->{quirks} = 1;
2001 !!!emit ($self->{ct}); # DOCTYPE
2002
2003 redo A;
2004 } elsif ($self->{nc} == -1) {
2005 !!!cp (185);
2006 !!!parse-error (type => 'unclosed DOCTYPE');
2007
2008 $self->{state} = DATA_STATE;
2009 $self->{s_kwd} = '';
2010 ## reconsume
2011
2012 $self->{ct}->{quirks} = 1;
2013 !!!emit ($self->{ct}); # DOCTYPE
2014
2015 redo A;
2016 } else {
2017 !!!cp (186);
2018 !!!parse-error (type => 'string after PUBLIC');
2019 $self->{ct}->{quirks} = 1;
2020
2021 $self->{state} = BOGUS_DOCTYPE_STATE;
2022 !!!next-input-character;
2023 redo A;
2024 }
2025 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2026 if ($self->{nc} == 0x0022) { # "
2027 !!!cp (187);
2028 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2029 !!!next-input-character;
2030 redo A;
2031 } elsif ($self->{nc} == 0x003E) { # >
2032 !!!cp (188);
2033 !!!parse-error (type => 'unclosed PUBLIC literal');
2034
2035 $self->{state} = DATA_STATE;
2036 $self->{s_kwd} = '';
2037 !!!next-input-character;
2038
2039 $self->{ct}->{quirks} = 1;
2040 !!!emit ($self->{ct}); # DOCTYPE
2041
2042 redo A;
2043 } elsif ($self->{nc} == -1) {
2044 !!!cp (189);
2045 !!!parse-error (type => 'unclosed PUBLIC literal');
2046
2047 $self->{state} = DATA_STATE;
2048 $self->{s_kwd} = '';
2049 ## reconsume
2050
2051 $self->{ct}->{quirks} = 1;
2052 !!!emit ($self->{ct}); # DOCTYPE
2053
2054 redo A;
2055 } else {
2056 !!!cp (190);
2057 $self->{ct}->{pubid} # DOCTYPE
2058 .= chr $self->{nc};
2059 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2060 length $self->{ct}->{pubid});
2061
2062 ## Stay in the state
2063 !!!next-input-character;
2064 redo A;
2065 }
2066 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2067 if ($self->{nc} == 0x0027) { # '
2068 !!!cp (191);
2069 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2070 !!!next-input-character;
2071 redo A;
2072 } elsif ($self->{nc} == 0x003E) { # >
2073 !!!cp (192);
2074 !!!parse-error (type => 'unclosed PUBLIC literal');
2075
2076 $self->{state} = DATA_STATE;
2077 $self->{s_kwd} = '';
2078 !!!next-input-character;
2079
2080 $self->{ct}->{quirks} = 1;
2081 !!!emit ($self->{ct}); # DOCTYPE
2082
2083 redo A;
2084 } elsif ($self->{nc} == -1) {
2085 !!!cp (193);
2086 !!!parse-error (type => 'unclosed PUBLIC literal');
2087
2088 $self->{state} = DATA_STATE;
2089 $self->{s_kwd} = '';
2090 ## reconsume
2091
2092 $self->{ct}->{quirks} = 1;
2093 !!!emit ($self->{ct}); # DOCTYPE
2094
2095 redo A;
2096 } else {
2097 !!!cp (194);
2098 $self->{ct}->{pubid} # DOCTYPE
2099 .= chr $self->{nc};
2100 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2101 length $self->{ct}->{pubid});
2102
2103 ## Stay in the state
2104 !!!next-input-character;
2105 redo A;
2106 }
2107 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2108 if ($is_space->{$self->{nc}}) {
2109 !!!cp (195);
2110 ## Stay in the state
2111 !!!next-input-character;
2112 redo A;
2113 } elsif ($self->{nc} == 0x0022) { # "
2114 !!!cp (196);
2115 $self->{ct}->{sysid} = ''; # DOCTYPE
2116 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2117 !!!next-input-character;
2118 redo A;
2119 } elsif ($self->{nc} == 0x0027) { # '
2120 !!!cp (197);
2121 $self->{ct}->{sysid} = ''; # DOCTYPE
2122 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2123 !!!next-input-character;
2124 redo A;
2125 } elsif ($self->{nc} == 0x003E) { # >
2126 !!!cp (198);
2127 $self->{state} = DATA_STATE;
2128 $self->{s_kwd} = '';
2129 !!!next-input-character;
2130
2131 !!!emit ($self->{ct}); # DOCTYPE
2132
2133 redo A;
2134 } elsif ($self->{nc} == -1) {
2135 !!!cp (199);
2136 !!!parse-error (type => 'unclosed DOCTYPE');
2137
2138 $self->{state} = DATA_STATE;
2139 $self->{s_kwd} = '';
2140 ## reconsume
2141
2142 $self->{ct}->{quirks} = 1;
2143 !!!emit ($self->{ct}); # DOCTYPE
2144
2145 redo A;
2146 } else {
2147 !!!cp (200);
2148 !!!parse-error (type => 'string after PUBLIC literal');
2149 $self->{ct}->{quirks} = 1;
2150
2151 $self->{state} = BOGUS_DOCTYPE_STATE;
2152 !!!next-input-character;
2153 redo A;
2154 }
2155 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2156 if ($is_space->{$self->{nc}}) {
2157 !!!cp (201);
2158 ## Stay in the state
2159 !!!next-input-character;
2160 redo A;
2161 } elsif ($self->{nc} == 0x0022) { # "
2162 !!!cp (202);
2163 $self->{ct}->{sysid} = ''; # DOCTYPE
2164 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2165 !!!next-input-character;
2166 redo A;
2167 } elsif ($self->{nc} == 0x0027) { # '
2168 !!!cp (203);
2169 $self->{ct}->{sysid} = ''; # DOCTYPE
2170 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2171 !!!next-input-character;
2172 redo A;
2173 } elsif ($self->{nc} == 0x003E) { # >
2174 !!!cp (204);
2175 !!!parse-error (type => 'no SYSTEM literal');
2176 $self->{state} = DATA_STATE;
2177 $self->{s_kwd} = '';
2178 !!!next-input-character;
2179
2180 $self->{ct}->{quirks} = 1;
2181 !!!emit ($self->{ct}); # DOCTYPE
2182
2183 redo A;
2184 } elsif ($self->{nc} == -1) {
2185 !!!cp (205);
2186 !!!parse-error (type => 'unclosed DOCTYPE');
2187
2188 $self->{state} = DATA_STATE;
2189 $self->{s_kwd} = '';
2190 ## reconsume
2191
2192 $self->{ct}->{quirks} = 1;
2193 !!!emit ($self->{ct}); # DOCTYPE
2194
2195 redo A;
2196 } else {
2197 !!!cp (206);
2198 !!!parse-error (type => 'string after SYSTEM');
2199 $self->{ct}->{quirks} = 1;
2200
2201 $self->{state} = BOGUS_DOCTYPE_STATE;
2202 !!!next-input-character;
2203 redo A;
2204 }
2205 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2206 if ($self->{nc} == 0x0022) { # "
2207 !!!cp (207);
2208 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2209 !!!next-input-character;
2210 redo A;
2211 } elsif ($self->{nc} == 0x003E) { # >
2212 !!!cp (208);
2213 !!!parse-error (type => 'unclosed SYSTEM literal');
2214
2215 $self->{state} = DATA_STATE;
2216 $self->{s_kwd} = '';
2217 !!!next-input-character;
2218
2219 $self->{ct}->{quirks} = 1;
2220 !!!emit ($self->{ct}); # DOCTYPE
2221
2222 redo A;
2223 } elsif ($self->{nc} == -1) {
2224 !!!cp (209);
2225 !!!parse-error (type => 'unclosed SYSTEM literal');
2226
2227 $self->{state} = DATA_STATE;
2228 $self->{s_kwd} = '';
2229 ## reconsume
2230
2231 $self->{ct}->{quirks} = 1;
2232 !!!emit ($self->{ct}); # DOCTYPE
2233
2234 redo A;
2235 } else {
2236 !!!cp (210);
2237 $self->{ct}->{sysid} # DOCTYPE
2238 .= chr $self->{nc};
2239 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2240 length $self->{ct}->{sysid});
2241
2242 ## Stay in the state
2243 !!!next-input-character;
2244 redo A;
2245 }
2246 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2247 if ($self->{nc} == 0x0027) { # '
2248 !!!cp (211);
2249 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2250 !!!next-input-character;
2251 redo A;
2252 } elsif ($self->{nc} == 0x003E) { # >
2253 !!!cp (212);
2254 !!!parse-error (type => 'unclosed SYSTEM literal');
2255
2256 $self->{state} = DATA_STATE;
2257 $self->{s_kwd} = '';
2258 !!!next-input-character;
2259
2260 $self->{ct}->{quirks} = 1;
2261 !!!emit ($self->{ct}); # DOCTYPE
2262
2263 redo A;
2264 } elsif ($self->{nc} == -1) {
2265 !!!cp (213);
2266 !!!parse-error (type => 'unclosed SYSTEM literal');
2267
2268 $self->{state} = DATA_STATE;
2269 $self->{s_kwd} = '';
2270 ## reconsume
2271
2272 $self->{ct}->{quirks} = 1;
2273 !!!emit ($self->{ct}); # DOCTYPE
2274
2275 redo A;
2276 } else {
2277 !!!cp (214);
2278 $self->{ct}->{sysid} # DOCTYPE
2279 .= chr $self->{nc};
2280 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2281 length $self->{ct}->{sysid});
2282
2283 ## Stay in the state
2284 !!!next-input-character;
2285 redo A;
2286 }
2287 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2288 if ($is_space->{$self->{nc}}) {
2289 !!!cp (215);
2290 ## Stay in the state
2291 !!!next-input-character;
2292 redo A;
2293 } elsif ($self->{nc} == 0x003E) { # >
2294 !!!cp (216);
2295 $self->{state} = DATA_STATE;
2296 $self->{s_kwd} = '';
2297 !!!next-input-character;
2298
2299 !!!emit ($self->{ct}); # DOCTYPE
2300
2301 redo A;
2302 } elsif ($self->{nc} == -1) {
2303 !!!cp (217);
2304 !!!parse-error (type => 'unclosed DOCTYPE');
2305 $self->{state} = DATA_STATE;
2306 $self->{s_kwd} = '';
2307 ## reconsume
2308
2309 $self->{ct}->{quirks} = 1;
2310 !!!emit ($self->{ct}); # DOCTYPE
2311
2312 redo A;
2313 } else {
2314 !!!cp (218);
2315 !!!parse-error (type => 'string after SYSTEM literal');
2316 #$self->{ct}->{quirks} = 1;
2317
2318 $self->{state} = BOGUS_DOCTYPE_STATE;
2319 !!!next-input-character;
2320 redo A;
2321 }
2322 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2323 if ($self->{nc} == 0x003E) { # >
2324 !!!cp (219);
2325 $self->{state} = DATA_STATE;
2326 $self->{s_kwd} = '';
2327 !!!next-input-character;
2328
2329 !!!emit ($self->{ct}); # DOCTYPE
2330
2331 redo A;
2332 } elsif ($self->{nc} == -1) {
2333 !!!cp (220);
2334 $self->{state} = DATA_STATE;
2335 $self->{s_kwd} = '';
2336 ## reconsume
2337
2338 !!!emit ($self->{ct}); # DOCTYPE
2339
2340 redo A;
2341 } else {
2342 !!!cp (221);
2343 my $s = '';
2344 $self->{read_until}->($s, q[>], 0);
2345
2346 ## Stay in the state
2347 !!!next-input-character;
2348 redo A;
2349 }
2350 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2351 ## NOTE: "CDATA section state" in the spec is jointly implemented
2352 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2353 ## and |CDATA_SECTION_MSE2_STATE|.
2354
2355 if ($self->{nc} == 0x005D) { # ]
2356 !!!cp (221.1);
2357 $self->{state} = CDATA_SECTION_MSE1_STATE;
2358 !!!next-input-character;
2359 redo A;
2360 } elsif ($self->{nc} == -1) {
2361 if ($self->{is_xml}) {
2362 !!!cp (221.11);
2363 !!!parse-error (type => 'no mse'); ## TODO: type
2364 } else {
2365 !!!cp (221.12);
2366 }
2367
2368 $self->{state} = DATA_STATE;
2369 $self->{s_kwd} = '';
2370 !!!next-input-character;
2371 if (length $self->{ct}->{data}) { # character
2372 !!!cp (221.2);
2373 !!!emit ($self->{ct}); # character
2374 } else {
2375 !!!cp (221.3);
2376 ## No token to emit. $self->{ct} is discarded.
2377 }
2378 redo A;
2379 } else {
2380 !!!cp (221.4);
2381 $self->{ct}->{data} .= chr $self->{nc};
2382 $self->{read_until}->($self->{ct}->{data},
2383 q<]>,
2384 length $self->{ct}->{data});
2385
2386 ## Stay in the state.
2387 !!!next-input-character;
2388 redo A;
2389 }
2390
2391 ## ISSUE: "text tokens" in spec.
2392 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2393 if ($self->{nc} == 0x005D) { # ]
2394 !!!cp (221.5);
2395 $self->{state} = CDATA_SECTION_MSE2_STATE;
2396 !!!next-input-character;
2397 redo A;
2398 } else {
2399 !!!cp (221.6);
2400 $self->{ct}->{data} .= ']';
2401 $self->{state} = CDATA_SECTION_STATE;
2402 ## Reconsume.
2403 redo A;
2404 }
2405 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2406 if ($self->{nc} == 0x003E) { # >
2407 $self->{state} = DATA_STATE;
2408 $self->{s_kwd} = '';
2409 !!!next-input-character;
2410 if (length $self->{ct}->{data}) { # character
2411 !!!cp (221.7);
2412 !!!emit ($self->{ct}); # character
2413 } else {
2414 !!!cp (221.8);
2415 ## No token to emit. $self->{ct} is discarded.
2416 }
2417 redo A;
2418 } elsif ($self->{nc} == 0x005D) { # ]
2419 !!!cp (221.9); # character
2420 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2421 ## Stay in the state.
2422 !!!next-input-character;
2423 redo A;
2424 } else {
2425 !!!cp (221.11);
2426 $self->{ct}->{data} .= ']]'; # character
2427 $self->{state} = CDATA_SECTION_STATE;
2428 ## Reconsume.
2429 redo A;
2430 }
2431 } elsif ($self->{state} == ENTITY_STATE) {
2432 if ($is_space->{$self->{nc}} or
2433 {
2434 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2435 $self->{entity_add} => 1,
2436 }->{$self->{nc}}) {
2437 !!!cp (1001);
2438 ## Don't consume
2439 ## No error
2440 ## Return nothing.
2441 #
2442 } elsif ($self->{nc} == 0x0023) { # #
2443 !!!cp (999);
2444 $self->{state} = ENTITY_HASH_STATE;
2445 $self->{s_kwd} = '#';
2446 !!!next-input-character;
2447 redo A;
2448 } elsif ((0x0041 <= $self->{nc} and
2449 $self->{nc} <= 0x005A) or # A..Z
2450 (0x0061 <= $self->{nc} and
2451 $self->{nc} <= 0x007A)) { # a..z
2452 !!!cp (998);
2453 require Whatpm::_NamedEntityList;
2454 $self->{state} = ENTITY_NAME_STATE;
2455 $self->{s_kwd} = chr $self->{nc};
2456 $self->{entity__value} = $self->{s_kwd};
2457 $self->{entity__match} = 0;
2458 !!!next-input-character;
2459 redo A;
2460 } else {
2461 !!!cp (1027);
2462 !!!parse-error (type => 'bare ero');
2463 ## Return nothing.
2464 #
2465 }
2466
2467 ## NOTE: No character is consumed by the "consume a character
2468 ## reference" algorithm. In other words, there is an "&" character
2469 ## that does not introduce a character reference, which would be
2470 ## appended to the parent element or the attribute value in later
2471 ## process of the tokenizer.
2472
2473 if ($self->{prev_state} == DATA_STATE) {
2474 !!!cp (997);
2475 $self->{state} = $self->{prev_state};
2476 $self->{s_kwd} = '';
2477 ## Reconsume.
2478 !!!emit ({type => CHARACTER_TOKEN, data => '&',
2479 line => $self->{line_prev},
2480 column => $self->{column_prev},
2481 });
2482 redo A;
2483 } else {
2484 !!!cp (996);
2485 $self->{ca}->{value} .= '&';
2486 $self->{state} = $self->{prev_state};
2487 $self->{s_kwd} = '';
2488 ## Reconsume.
2489 redo A;
2490 }
2491 } elsif ($self->{state} == ENTITY_HASH_STATE) {
2492 if ($self->{nc} == 0x0078 or # x
2493 $self->{nc} == 0x0058) { # X
2494 !!!cp (995);
2495 $self->{state} = HEXREF_X_STATE;
2496 $self->{s_kwd} .= chr $self->{nc};
2497 !!!next-input-character;
2498 redo A;
2499 } elsif (0x0030 <= $self->{nc} and
2500 $self->{nc} <= 0x0039) { # 0..9
2501 !!!cp (994);
2502 $self->{state} = NCR_NUM_STATE;
2503 $self->{s_kwd} = $self->{nc} - 0x0030;
2504 !!!next-input-character;
2505 redo A;
2506 } else {
2507 !!!parse-error (type => 'bare nero',
2508 line => $self->{line_prev},
2509 column => $self->{column_prev} - 1);
2510
2511 ## NOTE: According to the spec algorithm, nothing is returned,
2512 ## and then "&#" is appended to the parent element or the attribute
2513 ## value in the later processing.
2514
2515 if ($self->{prev_state} == DATA_STATE) {
2516 !!!cp (1019);
2517 $self->{state} = $self->{prev_state};
2518 $self->{s_kwd} = '';
2519 ## Reconsume.
2520 !!!emit ({type => CHARACTER_TOKEN,
2521 data => '&#',
2522 line => $self->{line_prev},
2523 column => $self->{column_prev} - 1,
2524 });
2525 redo A;
2526 } else {
2527 !!!cp (993);
2528 $self->{ca}->{value} .= '&#';
2529 $self->{state} = $self->{prev_state};
2530 $self->{s_kwd} = '';
2531 ## Reconsume.
2532 redo A;
2533 }
2534 }
2535 } elsif ($self->{state} == NCR_NUM_STATE) {
2536 if (0x0030 <= $self->{nc} and
2537 $self->{nc} <= 0x0039) { # 0..9
2538 !!!cp (1012);
2539 $self->{s_kwd} *= 10;
2540 $self->{s_kwd} += $self->{nc} - 0x0030;
2541
2542 ## Stay in the state.
2543 !!!next-input-character;
2544 redo A;
2545 } elsif ($self->{nc} == 0x003B) { # ;
2546 !!!cp (1013);
2547 !!!next-input-character;
2548 #
2549 } else {
2550 !!!cp (1014);
2551 !!!parse-error (type => 'no refc');
2552 ## Reconsume.
2553 #
2554 }
2555
2556 my $code = $self->{s_kwd};
2557 my $l = $self->{line_prev};
2558 my $c = $self->{column_prev};
2559 if ($charref_map->{$code}) {
2560 !!!cp (1015);
2561 !!!parse-error (type => 'invalid character reference',
2562 text => (sprintf 'U+%04X', $code),
2563 line => $l, column => $c);
2564 $code = $charref_map->{$code};
2565 } elsif ($code > 0x10FFFF) {
2566 !!!cp (1016);
2567 !!!parse-error (type => 'invalid character reference',
2568 text => (sprintf 'U-%08X', $code),
2569 line => $l, column => $c);
2570 $code = 0xFFFD;
2571 }
2572
2573 if ($self->{prev_state} == DATA_STATE) {
2574 !!!cp (992);
2575 $self->{state} = $self->{prev_state};
2576 $self->{s_kwd} = '';
2577 ## Reconsume.
2578 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2579 has_reference => 1,
2580 line => $l, column => $c,
2581 });
2582 redo A;
2583 } else {
2584 !!!cp (991);
2585 $self->{ca}->{value} .= chr $code;
2586 $self->{ca}->{has_reference} = 1;
2587 $self->{state} = $self->{prev_state};
2588 $self->{s_kwd} = '';
2589 ## Reconsume.
2590 redo A;
2591 }
2592 } elsif ($self->{state} == HEXREF_X_STATE) {
2593 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2594 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2595 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2596 # 0..9, A..F, a..f
2597 !!!cp (990);
2598 $self->{state} = HEXREF_HEX_STATE;
2599 $self->{s_kwd} = 0;
2600 ## Reconsume.
2601 redo A;
2602 } else {
2603 !!!parse-error (type => 'bare hcro',
2604 line => $self->{line_prev},
2605 column => $self->{column_prev} - 2);
2606
2607 ## NOTE: According to the spec algorithm, nothing is returned,
2608 ## and then "&#" followed by "X" or "x" is appended to the parent
2609 ## element or the attribute value in the later processing.
2610
2611 if ($self->{prev_state} == DATA_STATE) {
2612 !!!cp (1005);
2613 $self->{state} = $self->{prev_state};
2614 $self->{s_kwd} = '';
2615 ## Reconsume.
2616 !!!emit ({type => CHARACTER_TOKEN,
2617 data => '&' . $self->{s_kwd},
2618 line => $self->{line_prev},
2619 column => $self->{column_prev} - length $self->{s_kwd},
2620 });
2621 redo A;
2622 } else {
2623 !!!cp (989);
2624 $self->{ca}->{value} .= '&' . $self->{s_kwd};
2625 $self->{state} = $self->{prev_state};
2626 $self->{s_kwd} = '';
2627 ## Reconsume.
2628 redo A;
2629 }
2630 }
2631 } elsif ($self->{state} == HEXREF_HEX_STATE) {
2632 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2633 # 0..9
2634 !!!cp (1002);
2635 $self->{s_kwd} *= 0x10;
2636 $self->{s_kwd} += $self->{nc} - 0x0030;
2637 ## Stay in the state.
2638 !!!next-input-character;
2639 redo A;
2640 } elsif (0x0061 <= $self->{nc} and
2641 $self->{nc} <= 0x0066) { # a..f
2642 !!!cp (1003);
2643 $self->{s_kwd} *= 0x10;
2644 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2645 ## Stay in the state.
2646 !!!next-input-character;
2647 redo A;
2648 } elsif (0x0041 <= $self->{nc} and
2649 $self->{nc} <= 0x0046) { # A..F
2650 !!!cp (1004);
2651 $self->{s_kwd} *= 0x10;
2652 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2653 ## Stay in the state.
2654 !!!next-input-character;
2655 redo A;
2656 } elsif ($self->{nc} == 0x003B) { # ;
2657 !!!cp (1006);
2658 !!!next-input-character;
2659 #
2660 } else {
2661 !!!cp (1007);
2662 !!!parse-error (type => 'no refc',
2663 line => $self->{line},
2664 column => $self->{column});
2665 ## Reconsume.
2666 #
2667 }
2668
2669 my $code = $self->{s_kwd};
2670 my $l = $self->{line_prev};
2671 my $c = $self->{column_prev};
2672 if ($charref_map->{$code}) {
2673 !!!cp (1008);
2674 !!!parse-error (type => 'invalid character reference',
2675 text => (sprintf 'U+%04X', $code),
2676 line => $l, column => $c);
2677 $code = $charref_map->{$code};
2678 } elsif ($code > 0x10FFFF) {
2679 !!!cp (1009);
2680 !!!parse-error (type => 'invalid character reference',
2681 text => (sprintf 'U-%08X', $code),
2682 line => $l, column => $c);
2683 $code = 0xFFFD;
2684 }
2685
2686 if ($self->{prev_state} == DATA_STATE) {
2687 !!!cp (988);
2688 $self->{state} = $self->{prev_state};
2689 $self->{s_kwd} = '';
2690 ## Reconsume.
2691 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2692 has_reference => 1,
2693 line => $l, column => $c,
2694 });
2695 redo A;
2696 } else {
2697 !!!cp (987);
2698 $self->{ca}->{value} .= chr $code;
2699 $self->{ca}->{has_reference} = 1;
2700 $self->{state} = $self->{prev_state};
2701 $self->{s_kwd} = '';
2702 ## Reconsume.
2703 redo A;
2704 }
2705 } elsif ($self->{state} == ENTITY_NAME_STATE) {
2706 if (length $self->{s_kwd} < 30 and
2707 ## NOTE: Some number greater than the maximum length of entity name
2708 ((0x0041 <= $self->{nc} and # a
2709 $self->{nc} <= 0x005A) or # x
2710 (0x0061 <= $self->{nc} and # a
2711 $self->{nc} <= 0x007A) or # z
2712 (0x0030 <= $self->{nc} and # 0
2713 $self->{nc} <= 0x0039) or # 9
2714 $self->{nc} == 0x003B)) { # ;
2715 our $EntityChar;
2716 $self->{s_kwd} .= chr $self->{nc};
2717 if (defined $EntityChar->{$self->{s_kwd}}) {
2718 if ($self->{nc} == 0x003B) { # ;
2719 !!!cp (1020);
2720 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2721 $self->{entity__match} = 1;
2722 !!!next-input-character;
2723 #
2724 } else {
2725 !!!cp (1021);
2726 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2727 $self->{entity__match} = -1;
2728 ## Stay in the state.
2729 !!!next-input-character;
2730 redo A;
2731 }
2732 } else {
2733 !!!cp (1022);
2734 $self->{entity__value} .= chr $self->{nc};
2735 $self->{entity__match} *= 2;
2736 ## Stay in the state.
2737 !!!next-input-character;
2738 redo A;
2739 }
2740 }
2741
2742 my $data;
2743 my $has_ref;
2744 if ($self->{entity__match} > 0) {
2745 !!!cp (1023);
2746 $data = $self->{entity__value};
2747 $has_ref = 1;
2748 #
2749 } elsif ($self->{entity__match} < 0) {
2750 !!!parse-error (type => 'no refc');
2751 if ($self->{prev_state} != DATA_STATE and # in attribute
2752 $self->{entity__match} < -1) {
2753 !!!cp (1024);
2754 $data = '&' . $self->{s_kwd};
2755 #
2756 } else {
2757 !!!cp (1025);
2758 $data = $self->{entity__value};
2759 $has_ref = 1;
2760 #
2761 }
2762 } else {
2763 !!!cp (1026);
2764 !!!parse-error (type => 'bare ero',
2765 line => $self->{line_prev},
2766 column => $self->{column_prev} - length $self->{s_kwd});
2767 $data = '&' . $self->{s_kwd};
2768 #
2769 }
2770
2771 ## NOTE: In these cases, when a character reference is found,
2772 ## it is consumed and a character token is returned, or, otherwise,
2773 ## nothing is consumed and returned, according to the spec algorithm.
2774 ## In this implementation, anything that has been examined by the
2775 ## tokenizer is appended to the parent element or the attribute value
2776 ## as string, either literal string when no character reference or
2777 ## entity-replaced string otherwise, in this stage, since any characters
2778 ## that would not be consumed are appended in the data state or in an
2779 ## appropriate attribute value state anyway.
2780
2781 if ($self->{prev_state} == DATA_STATE) {
2782 !!!cp (986);
2783 $self->{state} = $self->{prev_state};
2784 $self->{s_kwd} = '';
2785 ## Reconsume.
2786 !!!emit ({type => CHARACTER_TOKEN,
2787 data => $data,
2788 has_reference => $has_ref,
2789 line => $self->{line_prev},
2790 column => $self->{column_prev} + 1 - length $self->{s_kwd},
2791 });
2792 redo A;
2793 } else {
2794 !!!cp (985);
2795 $self->{ca}->{value} .= $data;
2796 $self->{ca}->{has_reference} = 1 if $has_ref;
2797 $self->{state} = $self->{prev_state};
2798 $self->{s_kwd} = '';
2799 ## Reconsume.
2800 redo A;
2801 }
2802
2803 ## XML-only states
2804
2805 } elsif ($self->{state} == PI_STATE) {
2806 if ($is_space->{$self->{nc}} or
2807 $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
2808 $self->{nc} == -1) {
2809 !!!parse-error (type => 'bare pio', ## TODO: type
2810 line => $self->{line_prev},
2811 column => $self->{column_prev}
2812 - 1 * ($self->{nc} != -1));
2813 $self->{state} = BOGUS_COMMENT_STATE;
2814 ## Reconsume.
2815 $self->{ct} = {type => COMMENT_TOKEN,
2816 data => '?',
2817 line => $self->{line_prev},
2818 column => $self->{column_prev}
2819 - 1 * ($self->{nc} != -1),
2820 };
2821 redo A;
2822 } else {
2823 $self->{ct} = {type => PI_TOKEN,
2824 target => chr $self->{nc},
2825 data => '',
2826 line => $self->{line_prev},
2827 column => $self->{column_prev} - 1,
2828 };
2829 $self->{state} = PI_TARGET_STATE;
2830 !!!next-input-character;
2831 redo A;
2832 }
2833 } elsif ($self->{state} == PI_TARGET_STATE) {
2834 if ($is_space->{$self->{nc}}) {
2835 $self->{state} = PI_TARGET_AFTER_STATE;
2836 !!!next-input-character;
2837 redo A;
2838 } elsif ($self->{nc} == -1) {
2839 !!!parse-error (type => 'no pic'); ## TODO: type
2840 $self->{state} = DATA_STATE;
2841 $self->{s_kwd} = '';
2842 ## Reconsume.
2843 !!!emit ($self->{ct}); # pi
2844 redo A;
2845 } elsif ($self->{nc} == 0x003F) { # ?
2846 $self->{state} = PI_AFTER_STATE;
2847 !!!next-input-character;
2848 redo A;
2849 } else {
2850 ## XML5: typo ("tag name" -> "target")
2851 $self->{ct}->{target} .= chr $self->{nc}; # pi
2852 !!!next-input-character;
2853 redo A;
2854 }
2855 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
2856 if ($is_space->{$self->{nc}}) {
2857 ## Stay in the state.
2858 !!!next-input-character;
2859 redo A;
2860 } else {
2861 $self->{state} = PI_DATA_STATE;
2862 ## Reprocess.
2863 redo A;
2864 }
2865 } elsif ($self->{state} == PI_DATA_STATE) {
2866 if ($self->{nc} == 0x003F) { # ?
2867 $self->{state} = PI_DATA_AFTER_STATE;
2868 !!!next-input-character;
2869 redo A;
2870 } elsif ($self->{nc} == -1) {
2871 !!!parse-error (type => 'no pic'); ## TODO: type
2872 $self->{state} = DATA_STATE;
2873 $self->{s_kwd} = '';
2874 ## Reprocess.
2875 !!!emit ($self->{ct}); # pi
2876 redo A;
2877 } else {
2878 $self->{ct}->{data} .= chr $self->{nc}; # pi
2879 $self->{read_until}->($self->{ct}->{data}, q[?],
2880 length $self->{ct}->{data});
2881 ## Stay in the state.
2882 !!!next-input-character;
2883 ## Reprocess.
2884 redo A;
2885 }
2886 } elsif ($self->{state} == PI_AFTER_STATE) {
2887 if ($self->{nc} == 0x003E) { # >
2888 $self->{state} = DATA_STATE;
2889 $self->{s_kwd} = '';
2890 !!!next-input-character;
2891 !!!emit ($self->{ct}); # pi
2892 redo A;
2893 } elsif ($self->{nc} == 0x003F) { # ?
2894 !!!parse-error (type => 'no s after target', ## TODO: type
2895 line => $self->{line_prev},
2896 column => $self->{column_prev}); ## XML5: no error
2897 $self->{ct}->{data} .= '?';
2898 $self->{state} = PI_DATA_AFTER_STATE;
2899 !!!next-input-character;
2900 redo A;
2901 } else {
2902 !!!parse-error (type => 'no s after target', ## TODO: type
2903 line => $self->{line_prev},
2904 column => $self->{column_prev}
2905 + 1 * ($self->{nc} == -1)); ## XML5: no error
2906 $self->{ct}->{data} .= '?'; ## XML5: not appended
2907 $self->{state} = PI_DATA_STATE;
2908 ## Reprocess.
2909 redo A;
2910 }
2911 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
2912 ## XML5: Same as "pi after state" in XML5
2913 if ($self->{nc} == 0x003E) { # >
2914 $self->{state} = DATA_STATE;
2915 $self->{s_kwd} = '';
2916 !!!next-input-character;
2917 !!!emit ($self->{ct}); # pi
2918 redo A;
2919 } elsif ($self->{nc} == 0x003F) { # ?
2920 $self->{ct}->{data} .= '?';
2921 ## Stay in the state.
2922 !!!next-input-character;
2923 redo A;
2924 } else {
2925 $self->{ct}->{data} .= '?'; ## XML5: not appended
2926 $self->{state} = PI_DATA_STATE;
2927 ## Reprocess.
2928 redo A;
2929 }
2930
2931 } else {
2932 die "$0: $self->{state}: Unknown state";
2933 }
2934 } # A
2935
2936 die "$0: _get_next_token: unexpected case";
2937 } # _get_next_token
2938
2939 1;
2940 ## $Date: 2008/10/14 15:25:50 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24