/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.17 - (show annotations) (download) (as text)
Sun Oct 19 04:39:25 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.16: +113 -42 lines
File MIME type: application/x-wais-source
++ whatpm/t/xml/ChangeLog	19 Oct 2008 04:38:53 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* notations-1.dat, notations-1.dat: Tests on lowercase markup
	declaration keywords are added.

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 04:37:30 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src (_get_next_token): Make keywords 'ENTITY',
	'ELEMENT', 'ATTLIST', and 'NOTATION' ASCII case-insensitive.

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.16 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN { ## Compile-time Exporter setup for the token-type constants defined further down in this package.
6 require Exporter;
7 push our @ISA, 'Exporter'; ## Inherit |import| from Exporter.
8 ## Every token-type constant may be imported individually by name.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 );
25 ## The |:token| tag names the same fourteen constants as @EXPORT_OK; keep the two lists in sync.
26 our %EXPORT_TAGS = (
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 }
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types: integer codes stored in the |type| member of a token hash.
49 ## Each is a constant sub with an empty prototype; the names are the ones listed in @EXPORT_OK and the |:token| tag.
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 }
52 sub START_TAG_TOKEN () { 3 }
53 sub END_TAG_TOKEN () { 4 }
54 sub END_OF_FILE_TOKEN () { 5 }
55 sub CHARACTER_TOKEN () { 6 }
56 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74 ## From here on (until any later |package| statement) definitions go into |Whatpm::HTML|.
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76 ## The BEGIN block above imports the token-type constants into this package at compile time.
77 ## Content model flags
78 ## Bit flags; the four named content models below are built by OR-ing them together.
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82 ## Named content models, as assigned to and compared with |$self->{content_model}|.
83 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag bits set
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited "<" markup only
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # "&" plus limited "<" markup
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # "&" plus full "<" markup
87
88 ## Tokenizer states
89 ## Integer codes assigned to and compared with |$self->{state}| in |_get_next_token|.  Values 1 and 12 are unused (their subs are commented out below).
90 sub DATA_STATE () { 0 }
91 #sub ENTITY_DATA_STATE () { 1 }
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
108 sub COMMENT_END_DASH_STATE () { 18 }
109 sub BOGUS_COMMENT_STATE () { 19 }
110 sub DOCTYPE_STATE () { 20 }
111 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112 sub DOCTYPE_NAME_STATE () { 22 }
113 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122 sub BOGUS_DOCTYPE_STATE () { 32 }
123 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124 sub SELF_CLOSING_START_TAG_STATE () { 34 }
125 sub CDATA_SECTION_STATE () { 35 }
126 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134 ## NOTE: "Entity data state", "entity in attribute value state", and
135 ## "consume a character reference" algorithm are jointly implemented
136 ## using the following six states (44..49):
137 sub ENTITY_STATE () { 44 }
138 sub ENTITY_HASH_STATE () { 45 }
139 sub NCR_NUM_STATE () { 46 }
140 sub HEXREF_X_STATE () { 47 }
141 sub HEXREF_HEX_STATE () { 48 }
142 sub ENTITY_NAME_STATE () { 49 }
143 sub PCDATA_STATE () { 50 } # "data state" in the spec; entered from |DATA_STATE| for non-XML input in the PCDATA content model
144
145 ## XML-only states (numbering continues from the shared states, 51..85)
146 sub PI_STATE () { 51 } # entered from |TAG_OPEN_STATE| on "<?" when |$self->{is_xml}| is true
147 sub PI_TARGET_STATE () { 52 }
148 sub PI_TARGET_AFTER_STATE () { 53 }
149 sub PI_DATA_STATE () { 54 }
150 sub PI_AFTER_STATE () { 55 }
151 sub PI_DATA_AFTER_STATE () { 56 }
152 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155 sub DOCTYPE_TAG_STATE () { 60 }
156 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157 sub MD_ATTLIST_STATE () { 62 }
158 sub MD_E_STATE () { 63 }
159 sub MD_ELEMENT_STATE () { 64 }
160 sub MD_ENTITY_STATE () { 65 }
161 sub MD_NOTATION_STATE () { 66 }
162 sub DOCTYPE_MD_STATE () { 67 }
163 sub BEFORE_MD_NAME_STATE () { 68 }
164 sub MD_NAME_STATE () { 69 }
165 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172 sub ALLOWED_TOKEN_STATE () { 77 }
173 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 sub BOGUS_MD_STATE () { 85 }
181
182 ## Tree constructor state constants (see Whatpm::HTML for the full
183 ## list and descriptions)
184 ## NOTE(review): both literals below evaluate to the same integer (bit 11, 2048); the second only adds a "_" digit separator.  Presumably they belong to different bit fields -- confirm against Whatpm::HTML.
185 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
186 sub FOREIGN_EL () { 0b1_00000000000 }
187
188 ## Character reference mappings
189 ## Code point => replacement code point.  Lexical to this file; presumably consulted when a numeric character reference is resolved -- the lookup itself is not in this part of the file.
190 my $charref_map = {
191 0x0D => 0x000A, # CR is replaced by LF
192 0x80 => 0x20AC, # 0x80..0x9F: replacement code points (these look like the Windows-1252 assignments; slots without one give U+FFFD)
193 0x81 => 0xFFFD,
194 0x82 => 0x201A,
195 0x83 => 0x0192,
196 0x84 => 0x201E,
197 0x85 => 0x2026,
198 0x86 => 0x2020,
199 0x87 => 0x2021,
200 0x88 => 0x02C6,
201 0x89 => 0x2030,
202 0x8A => 0x0160,
203 0x8B => 0x2039,
204 0x8C => 0x0152,
205 0x8D => 0xFFFD,
206 0x8E => 0x017D,
207 0x8F => 0xFFFD,
208 0x90 => 0xFFFD,
209 0x91 => 0x2018,
210 0x92 => 0x2019,
211 0x93 => 0x201C,
212 0x94 => 0x201D,
213 0x95 => 0x2022,
214 0x96 => 0x2013,
215 0x97 => 0x2014,
216 0x98 => 0x02DC,
217 0x99 => 0x2122,
218 0x9A => 0x0161,
219 0x9B => 0x203A,
220 0x9C => 0x0153,
221 0x9D => 0xFFFD,
222 0x9E => 0x017E,
223 0x9F => 0x0178,
224 }; # $charref_map
225 $charref_map->{$_} = 0xFFFD # statement-modifier loop: every code point listed below is mapped to U+FFFD
226 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
227 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (the range stops at U+FDDF; U+FDE0..U+FDEF are not mapped)
228 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
229 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
230 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
231 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
232 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
233
234 ## Implementations MUST act as if they used the state machine described in the spec.
235
236 sub _initialize_tokenizer ($) {
237 my $self = shift;
238 ## Puts every tokenizer field on $self into its initial value, then invokes the |!!!next-input-character| macro once.  Takes only the invocant.
239 ## NOTE: Fields set by |new| constructor:
240 #$self->{level}
241 #$self->{set_nc}
242 #$self->{parse_error}
243 #$self->{is_xml} (if XML)
244
245 $self->{state} = DATA_STATE; # MUST: tokenizing starts in the data state
246 $self->{s_kwd} = ''; # Data state keyword
247 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
248 #$self->{entity__value}; # initialized when used
249 #$self->{entity__match}; # initialized when used
250 $self->{content_model} = PCDATA_CONTENT_MODEL; # start with the PCDATA content model
251 undef $self->{ct}; # current token
252 undef $self->{ca}; # current attribute
253 undef $self->{last_stag_name}; # last emitted start tag name
254 #$self->{prev_state}; # initialized when used
255 delete $self->{self_closing}; # clear any self-closing flag left over
256 $self->{char_buffer} = ''; # presumably the input buffer read by the next-input-character macro -- TODO confirm
257 $self->{char_buffer_pos} = 0; # presumably the read position within |char_buffer| -- TODO confirm
258 $self->{nc} = -1; # next input character; -1 is what the states test for end of file
259 #$self->{next_nc}
260 !!!next-input-character;
261 $self->{token} = []; # queue of already-built tokens; |_get_next_token| returns from it first when non-empty
262 # $self->{escape} is not initialized here; the data state sets and deletes it.
263 } # _initialize_tokenizer
264
265 ## A token has:
266 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
267 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
268 ## ->{name} (DOCTYPE_TOKEN)
269 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
270 ## ->{target} (PI_TOKEN)
271 ## ->{pubid} (DOCTYPE_TOKEN)
272 ## ->{sysid} (DOCTYPE_TOKEN)
273 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
274 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
275 ## ->{name}
276 ## ->{value}
277 ## ->{has_reference} == 1 or 0
278 ## ->{index}: Index of the attribute in a tag.
279 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
280 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
281 ## ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Index of the most recently added attribute (i.e. next attribute's index - 1).
282 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
283
284 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
285 ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
286 ## while the token is pushed back to the stack.
287
288 ## Emitted token MUST immediately be handled by the tree construction state.
289
290 ## Before each step, UA MAY check to see if either one of the scripts in
291 ## "list of scripts that will execute as soon as possible" or the first
292 ## script in the "list of scripts that will execute asynchronously",
293 ## has completed loading. If one has, then it MUST be executed
294 ## and removed from the list.
295
296 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
297 ## (This requirement was dropped from HTML5 spec, unfortunately.)
298
299 my $is_space = { ## Lookup table keyed by code point; a true value marks a character the tokenizer states treat as white space.
300 0x0009 => 1, # CHARACTER TABULATION (HT)
301 0x000A => 1, # LINE FEED (LF)
302 #0x000B => 0, # LINE TABULATION (VT) -- left out of the table
303 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
304 #0x000D => 1, # CARRIAGE RETURN (CR) -- commented out, so CR is not in the table
305 0x0020 => 1, # SPACE (SP)
306 };
307
308 sub _get_next_token ($) {
309 my $self = shift;
310
311 if ($self->{self_closing}) {
312 !!!parse-error (type => 'nestc', token => $self->{ct});
313 ## NOTE: The |self_closing| flag is only set by start tag token.
314 ## In addition, when a start tag token is emitted, it is always set to
315 ## |ct|.
316 delete $self->{self_closing};
317 }
318
319 if (@{$self->{token}}) {
320 $self->{self_closing} = $self->{token}->[0]->{self_closing};
321 return shift @{$self->{token}};
322 }
323
324 A: {
325 if ($self->{state} == PCDATA_STATE) {
326 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
327
328 if ($self->{nc} == 0x0026) { # &
329 !!!cp (0.1);
330 ## NOTE: In the spec, the tokenizer is switched to the
331 ## "entity data state". In this implementation, the tokenizer
332 ## is switched to the |ENTITY_STATE|, which is an implementation
333 ## of the "consume a character reference" algorithm.
334 $self->{entity_add} = -1;
335 $self->{prev_state} = DATA_STATE;
336 $self->{state} = ENTITY_STATE;
337 !!!next-input-character;
338 redo A;
339 } elsif ($self->{nc} == 0x003C) { # <
340 !!!cp (0.2);
341 $self->{state} = TAG_OPEN_STATE;
342 !!!next-input-character;
343 redo A;
344 } elsif ($self->{nc} == -1) {
345 !!!cp (0.3);
346 !!!emit ({type => END_OF_FILE_TOKEN,
347 line => $self->{line}, column => $self->{column}});
348 last A; ## TODO: ok?
349 } else {
350 !!!cp (0.4);
351 #
352 }
353
354 # Anything else
355 my $token = {type => CHARACTER_TOKEN,
356 data => chr $self->{nc},
357 line => $self->{line}, column => $self->{column},
358 };
359 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
360
361 ## Stay in the state.
362 !!!next-input-character;
363 !!!emit ($token);
364 redo A;
365 } elsif ($self->{state} == DATA_STATE) {
366 $self->{s_kwd} = '' unless defined $self->{s_kwd};
367 if ($self->{nc} == 0x0026) { # &
368 $self->{s_kwd} = '';
369 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
370 not $self->{escape}) {
371 !!!cp (1);
372 ## NOTE: In the spec, the tokenizer is switched to the
373 ## "entity data state". In this implementation, the tokenizer
374 ## is switched to the |ENTITY_STATE|, which is an implementation
375 ## of the "consume a character reference" algorithm.
376 $self->{entity_add} = -1;
377 $self->{prev_state} = DATA_STATE;
378 $self->{state} = ENTITY_STATE;
379 !!!next-input-character;
380 redo A;
381 } else {
382 !!!cp (2);
383 #
384 }
385 } elsif ($self->{nc} == 0x002D) { # -
386 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
387 if ($self->{s_kwd} eq '<!-') {
388 !!!cp (3);
389 $self->{escape} = 1; # unless $self->{escape};
390 $self->{s_kwd} = '--';
391 #
392 } elsif ($self->{s_kwd} eq '-') {
393 !!!cp (4);
394 $self->{s_kwd} = '--';
395 #
396 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
397 !!!cp (4.1);
398 $self->{s_kwd} .= '-';
399 #
400 } else {
401 !!!cp (5);
402 $self->{s_kwd} = '-';
403 #
404 }
405 }
406
407 #
408 } elsif ($self->{nc} == 0x0021) { # !
409 if (length $self->{s_kwd}) {
410 !!!cp (5.1);
411 $self->{s_kwd} .= '!';
412 #
413 } else {
414 !!!cp (5.2);
415 #$self->{s_kwd} = '';
416 #
417 }
418 #
419 } elsif ($self->{nc} == 0x003C) { # <
420 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
421 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
422 not $self->{escape})) {
423 !!!cp (6);
424 $self->{state} = TAG_OPEN_STATE;
425 !!!next-input-character;
426 redo A;
427 } else {
428 !!!cp (7);
429 $self->{s_kwd} = '';
430 #
431 }
432 } elsif ($self->{nc} == 0x003E) { # >
433 if ($self->{escape} and
434 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
435 if ($self->{s_kwd} eq '--') {
436 !!!cp (8);
437 delete $self->{escape};
438 #
439 } else {
440 !!!cp (9);
441 #
442 }
443 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
444 !!!cp (9.1);
445 !!!parse-error (type => 'unmatched mse', ## TODO: type
446 line => $self->{line_prev},
447 column => $self->{column_prev} - 1);
448 #
449 } else {
450 !!!cp (10);
451 #
452 }
453
454 $self->{s_kwd} = '';
455 #
456 } elsif ($self->{nc} == 0x005D) { # ]
457 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
458 !!!cp (10.1);
459 $self->{s_kwd} .= ']';
460 } elsif ($self->{s_kwd} eq ']]') {
461 !!!cp (10.2);
462 #
463 } else {
464 !!!cp (10.3);
465 $self->{s_kwd} = '';
466 }
467 #
468 } elsif ($self->{nc} == -1) {
469 !!!cp (11);
470 $self->{s_kwd} = '';
471 !!!emit ({type => END_OF_FILE_TOKEN,
472 line => $self->{line}, column => $self->{column}});
473 last A; ## TODO: ok?
474 } else {
475 !!!cp (12);
476 $self->{s_kwd} = '';
477 #
478 }
479
480 # Anything else
481 my $token = {type => CHARACTER_TOKEN,
482 data => chr $self->{nc},
483 line => $self->{line}, column => $self->{column},
484 };
485 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
486 length $token->{data})) {
487 $self->{s_kwd} = '';
488 }
489
490 ## Stay in the data state.
491 if (not $self->{is_xml} and
492 $self->{content_model} == PCDATA_CONTENT_MODEL) {
493 !!!cp (13);
494 $self->{state} = PCDATA_STATE;
495 } else {
496 !!!cp (14);
497 ## Stay in the state.
498 }
499 !!!next-input-character;
500 !!!emit ($token);
501 redo A;
502 } elsif ($self->{state} == TAG_OPEN_STATE) {
503 ## XML5: "tag state".
504
505 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
506 if ($self->{nc} == 0x002F) { # /
507 !!!cp (15);
508 !!!next-input-character;
509 $self->{state} = CLOSE_TAG_OPEN_STATE;
510 redo A;
511 } elsif ($self->{nc} == 0x0021) { # !
512 !!!cp (15.1);
513 $self->{s_kwd} = $self->{escaped} ? '' : '<';
514 #
515 } else {
516 !!!cp (16);
517 $self->{s_kwd} = '';
518 #
519 }
520
521 ## reconsume
522 $self->{state} = DATA_STATE;
523 !!!emit ({type => CHARACTER_TOKEN, data => '<',
524 line => $self->{line_prev},
525 column => $self->{column_prev},
526 });
527 redo A;
528 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
529 if ($self->{nc} == 0x0021) { # !
530 !!!cp (17);
531 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
532 !!!next-input-character;
533 redo A;
534 } elsif ($self->{nc} == 0x002F) { # /
535 !!!cp (18);
536 $self->{state} = CLOSE_TAG_OPEN_STATE;
537 !!!next-input-character;
538 redo A;
539 } elsif (0x0041 <= $self->{nc} and
540 $self->{nc} <= 0x005A) { # A..Z
541 !!!cp (19);
542 $self->{ct}
543 = {type => START_TAG_TOKEN,
544 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
545 line => $self->{line_prev},
546 column => $self->{column_prev}};
547 $self->{state} = TAG_NAME_STATE;
548 !!!next-input-character;
549 redo A;
550 } elsif (0x0061 <= $self->{nc} and
551 $self->{nc} <= 0x007A) { # a..z
552 !!!cp (20);
553 $self->{ct} = {type => START_TAG_TOKEN,
554 tag_name => chr ($self->{nc}),
555 line => $self->{line_prev},
556 column => $self->{column_prev}};
557 $self->{state} = TAG_NAME_STATE;
558 !!!next-input-character;
559 redo A;
560 } elsif ($self->{nc} == 0x003E) { # >
561 !!!cp (21);
562 !!!parse-error (type => 'empty start tag',
563 line => $self->{line_prev},
564 column => $self->{column_prev});
565 $self->{state} = DATA_STATE;
566 $self->{s_kwd} = '';
567 !!!next-input-character;
568
569 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
570 line => $self->{line_prev},
571 column => $self->{column_prev},
572 });
573
574 redo A;
575 } elsif ($self->{nc} == 0x003F) { # ?
576 if ($self->{is_xml}) {
577 !!!cp (22.1);
578 $self->{state} = PI_STATE;
579 !!!next-input-character;
580 redo A;
581 } else {
582 !!!cp (22);
583 !!!parse-error (type => 'pio',
584 line => $self->{line_prev},
585 column => $self->{column_prev});
586 $self->{state} = BOGUS_COMMENT_STATE;
587 $self->{ct} = {type => COMMENT_TOKEN, data => '',
588 line => $self->{line_prev},
589 column => $self->{column_prev},
590 };
591 ## $self->{nc} is intentionally left as is
592 redo A;
593 }
594 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
595 !!!cp (23);
596 !!!parse-error (type => 'bare stago',
597 line => $self->{line_prev},
598 column => $self->{column_prev});
599 $self->{state} = DATA_STATE;
600 $self->{s_kwd} = '';
601 ## reconsume
602
603 !!!emit ({type => CHARACTER_TOKEN, data => '<',
604 line => $self->{line_prev},
605 column => $self->{column_prev},
606 });
607
608 redo A;
609 } else {
610 ## XML5: "<:" is a parse error.
611 !!!cp (23.1);
612 $self->{ct} = {type => START_TAG_TOKEN,
613 tag_name => chr ($self->{nc}),
614 line => $self->{line_prev},
615 column => $self->{column_prev}};
616 $self->{state} = TAG_NAME_STATE;
617 !!!next-input-character;
618 redo A;
619 }
620 } else {
621 die "$0: $self->{content_model} in tag open";
622 }
623 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
624 ## NOTE: The "close tag open state" in the spec is implemented as
625 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
626
627 ## XML5: "end tag state".
628
629 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
630 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
631 if (defined $self->{last_stag_name}) {
632 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
633 $self->{kwd} = '';
634 ## Reconsume.
635 redo A;
636 } else {
637 ## No start tag token has ever been emitted
638 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
639 !!!cp (28);
640 $self->{state} = DATA_STATE;
641 $self->{s_kwd} = '';
642 ## Reconsume.
643 !!!emit ({type => CHARACTER_TOKEN, data => '</',
644 line => $l, column => $c,
645 });
646 redo A;
647 }
648 }
649
650 if (0x0041 <= $self->{nc} and
651 $self->{nc} <= 0x005A) { # A..Z
652 !!!cp (29);
653 $self->{ct}
654 = {type => END_TAG_TOKEN,
655 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
656 line => $l, column => $c};
657 $self->{state} = TAG_NAME_STATE;
658 !!!next-input-character;
659 redo A;
660 } elsif (0x0061 <= $self->{nc} and
661 $self->{nc} <= 0x007A) { # a..z
662 !!!cp (30);
663 $self->{ct} = {type => END_TAG_TOKEN,
664 tag_name => chr ($self->{nc}),
665 line => $l, column => $c};
666 $self->{state} = TAG_NAME_STATE;
667 !!!next-input-character;
668 redo A;
669 } elsif ($self->{nc} == 0x003E) { # >
670 !!!parse-error (type => 'empty end tag',
671 line => $self->{line_prev}, ## "<" in "</>"
672 column => $self->{column_prev} - 1);
673 $self->{state} = DATA_STATE;
674 $self->{s_kwd} = '';
675 if ($self->{is_xml}) {
676 !!!cp (31);
677 ## XML5: No parse error.
678
679 ## NOTE: This parser raises a parse error, since it supports
680 ## XML1, not XML5.
681
682 ## NOTE: A short end tag token.
683 my $ct = {type => END_TAG_TOKEN,
684 tag_name => '',
685 line => $self->{line_prev},
686 column => $self->{column_prev} - 1,
687 };
688 !!!next-input-character;
689 !!!emit ($ct);
690 } else {
691 !!!cp (31.1);
692 !!!next-input-character;
693 }
694 redo A;
695 } elsif ($self->{nc} == -1) {
696 !!!cp (32);
697 !!!parse-error (type => 'bare etago');
698 $self->{s_kwd} = '';
699 $self->{state} = DATA_STATE;
700 # reconsume
701
702 !!!emit ({type => CHARACTER_TOKEN, data => '</',
703 line => $l, column => $c,
704 });
705
706 redo A;
707 } elsif (not $self->{is_xml} or
708 $is_space->{$self->{nc}}) {
709 !!!cp (33);
710 !!!parse-error (type => 'bogus end tag',
711 line => $self->{line_prev}, # "<" of "</"
712 column => $self->{column_prev} - 1);
713 $self->{state} = BOGUS_COMMENT_STATE;
714 $self->{ct} = {type => COMMENT_TOKEN, data => '',
715 line => $self->{line_prev}, # "<" of "</"
716 column => $self->{column_prev} - 1,
717 };
718 ## NOTE: $self->{nc} is intentionally left as is.
719 ## Although the "anything else" case of the spec not explicitly
720 ## states that the next input character is to be reconsumed,
721 ## it will be included to the |data| of the comment token
722 ## generated from the bogus end tag, as defined in the
723 ## "bogus comment state" entry.
724 redo A;
725 } else {
726 ## XML5: "</:" is a parse error.
727 !!!cp (30.1);
728 $self->{ct} = {type => END_TAG_TOKEN,
729 tag_name => chr ($self->{nc}),
730 line => $l, column => $c};
731 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
732 !!!next-input-character;
733 redo A;
734 }
735 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
736 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
737 if (length $ch) {
738 my $CH = $ch;
739 $ch =~ tr/a-z/A-Z/;
740 my $nch = chr $self->{nc};
741 if ($nch eq $ch or $nch eq $CH) {
742 !!!cp (24);
743 ## Stay in the state.
744 $self->{kwd} .= $nch;
745 !!!next-input-character;
746 redo A;
747 } else {
748 !!!cp (25);
749 $self->{state} = DATA_STATE;
750 $self->{s_kwd} = '';
751 ## Reconsume.
752 !!!emit ({type => CHARACTER_TOKEN,
753 data => '</' . $self->{kwd},
754 line => $self->{line_prev},
755 column => $self->{column_prev} - 1 - length $self->{kwd},
756 });
757 redo A;
758 }
759 } else { # after "<{tag-name}"
760 unless ($is_space->{$self->{nc}} or
761 {
762 0x003E => 1, # >
763 0x002F => 1, # /
764 -1 => 1, # EOF
765 }->{$self->{nc}}) {
766 !!!cp (26);
767 ## Reconsume.
768 $self->{state} = DATA_STATE;
769 $self->{s_kwd} = '';
770 !!!emit ({type => CHARACTER_TOKEN,
771 data => '</' . $self->{kwd},
772 line => $self->{line_prev},
773 column => $self->{column_prev} - 1 - length $self->{kwd},
774 });
775 redo A;
776 } else {
777 !!!cp (27);
778 $self->{ct}
779 = {type => END_TAG_TOKEN,
780 tag_name => $self->{last_stag_name},
781 line => $self->{line_prev},
782 column => $self->{column_prev} - 1 - length $self->{kwd}};
783 $self->{state} = TAG_NAME_STATE;
784 ## Reconsume.
785 redo A;
786 }
787 }
788 } elsif ($self->{state} == TAG_NAME_STATE) {
789 if ($is_space->{$self->{nc}}) {
790 !!!cp (34);
791 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
792 !!!next-input-character;
793 redo A;
794 } elsif ($self->{nc} == 0x003E) { # >
795 if ($self->{ct}->{type} == START_TAG_TOKEN) {
796 !!!cp (35);
797 $self->{last_stag_name} = $self->{ct}->{tag_name};
798 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
799 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
800 #if ($self->{ct}->{attributes}) {
801 # ## NOTE: This should never be reached.
802 # !!! cp (36);
803 # !!! parse-error (type => 'end tag attribute');
804 #} else {
805 !!!cp (37);
806 #}
807 } else {
808 die "$0: $self->{ct}->{type}: Unknown token type";
809 }
810 $self->{state} = DATA_STATE;
811 $self->{s_kwd} = '';
812 !!!next-input-character;
813
814 !!!emit ($self->{ct}); # start tag or end tag
815
816 redo A;
817 } elsif (0x0041 <= $self->{nc} and
818 $self->{nc} <= 0x005A) { # A..Z
819 !!!cp (38);
820 $self->{ct}->{tag_name}
821 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
822 # start tag or end tag
823 ## Stay in this state
824 !!!next-input-character;
825 redo A;
826 } elsif ($self->{nc} == -1) {
827 !!!parse-error (type => 'unclosed tag');
828 if ($self->{ct}->{type} == START_TAG_TOKEN) {
829 !!!cp (39);
830 $self->{last_stag_name} = $self->{ct}->{tag_name};
831 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
832 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
833 #if ($self->{ct}->{attributes}) {
834 # ## NOTE: This state should never be reached.
835 # !!! cp (40);
836 # !!! parse-error (type => 'end tag attribute');
837 #} else {
838 !!!cp (41);
839 #}
840 } else {
841 die "$0: $self->{ct}->{type}: Unknown token type";
842 }
843 $self->{state} = DATA_STATE;
844 $self->{s_kwd} = '';
845 # reconsume
846
847 !!!emit ($self->{ct}); # start tag or end tag
848
849 redo A;
850 } elsif ($self->{nc} == 0x002F) { # /
851 !!!cp (42);
852 $self->{state} = SELF_CLOSING_START_TAG_STATE;
853 !!!next-input-character;
854 redo A;
855 } else {
856 !!!cp (44);
857 $self->{ct}->{tag_name} .= chr $self->{nc};
858 # start tag or end tag
859 ## Stay in the state
860 !!!next-input-character;
861 redo A;
862 }
863 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
864 ## XML5: "Tag attribute name before state".
865
866 if ($is_space->{$self->{nc}}) {
867 !!!cp (45);
868 ## Stay in the state
869 !!!next-input-character;
870 redo A;
871 } elsif ($self->{nc} == 0x003E) { # >
872 if ($self->{ct}->{type} == START_TAG_TOKEN) {
873 !!!cp (46);
874 $self->{last_stag_name} = $self->{ct}->{tag_name};
875 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
876 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
877 if ($self->{ct}->{attributes}) {
878 !!!cp (47);
879 !!!parse-error (type => 'end tag attribute');
880 } else {
881 !!!cp (48);
882 }
883 } else {
884 die "$0: $self->{ct}->{type}: Unknown token type";
885 }
886 $self->{state} = DATA_STATE;
887 $self->{s_kwd} = '';
888 !!!next-input-character;
889
890 !!!emit ($self->{ct}); # start tag or end tag
891
892 redo A;
893 } elsif (0x0041 <= $self->{nc} and
894 $self->{nc} <= 0x005A) { # A..Z
895 !!!cp (49);
896 $self->{ca}
897 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
898 value => '',
899 line => $self->{line}, column => $self->{column}};
900 $self->{state} = ATTRIBUTE_NAME_STATE;
901 !!!next-input-character;
902 redo A;
903 } elsif ($self->{nc} == 0x002F) { # /
904 !!!cp (50);
905 $self->{state} = SELF_CLOSING_START_TAG_STATE;
906 !!!next-input-character;
907 redo A;
908 } elsif ($self->{nc} == -1) {
909 !!!parse-error (type => 'unclosed tag');
910 if ($self->{ct}->{type} == START_TAG_TOKEN) {
911 !!!cp (52);
912 $self->{last_stag_name} = $self->{ct}->{tag_name};
913 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
914 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
915 if ($self->{ct}->{attributes}) {
916 !!!cp (53);
917 !!!parse-error (type => 'end tag attribute');
918 } else {
919 !!!cp (54);
920 }
921 } else {
922 die "$0: $self->{ct}->{type}: Unknown token type";
923 }
924 $self->{state} = DATA_STATE;
925 $self->{s_kwd} = '';
926 # reconsume
927
928 !!!emit ($self->{ct}); # start tag or end tag
929
930 redo A;
931 } else {
932 if ({
933 0x0022 => 1, # "
934 0x0027 => 1, # '
935 0x003D => 1, # =
936 }->{$self->{nc}}) {
937 !!!cp (55);
938 ## XML5: Not a parse error.
939 !!!parse-error (type => 'bad attribute name');
940 } else {
941 !!!cp (56);
942 ## XML5: ":" raises a parse error and is ignored.
943 }
944 $self->{ca}
945 = {name => chr ($self->{nc}),
946 value => '',
947 line => $self->{line}, column => $self->{column}};
948 $self->{state} = ATTRIBUTE_NAME_STATE;
949 !!!next-input-character;
950 redo A;
951 }
952 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
953 ## XML5: "Tag attribute name state".
954
955 my $before_leave = sub {
956 if (exists $self->{ct}->{attributes} # start tag or end tag
957 ->{$self->{ca}->{name}}) { # MUST
958 !!!cp (57);
959 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
960 ## Discard $self->{ca} # MUST
961 } else {
962 !!!cp (58);
963 $self->{ct}->{attributes}->{$self->{ca}->{name}}
964 = $self->{ca};
965 $self->{ca}->{index} = ++$self->{ct}->{last_index};
966 }
967 }; # $before_leave
968
969 if ($is_space->{$self->{nc}}) {
970 !!!cp (59);
971 $before_leave->();
972 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
973 !!!next-input-character;
974 redo A;
975 } elsif ($self->{nc} == 0x003D) { # =
976 !!!cp (60);
977 $before_leave->();
978 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
979 !!!next-input-character;
980 redo A;
981 } elsif ($self->{nc} == 0x003E) { # >
982 if ($self->{is_xml}) {
983 !!!cp (60.1);
984 ## XML5: Not a parse error.
985 !!!parse-error (type => 'no attr value'); ## TODO: type
986 } else {
987 !!!cp (60.2);
988 }
989
990 $before_leave->();
991 if ($self->{ct}->{type} == START_TAG_TOKEN) {
992 !!!cp (61);
993 $self->{last_stag_name} = $self->{ct}->{tag_name};
994 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
995 !!!cp (62);
996 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
997 if ($self->{ct}->{attributes}) {
998 !!!parse-error (type => 'end tag attribute');
999 }
1000 } else {
1001 die "$0: $self->{ct}->{type}: Unknown token type";
1002 }
1003 $self->{state} = DATA_STATE;
1004 $self->{s_kwd} = '';
1005 !!!next-input-character;
1006
1007 !!!emit ($self->{ct}); # start tag or end tag
1008
1009 redo A;
1010 } elsif (0x0041 <= $self->{nc} and
1011 $self->{nc} <= 0x005A) { # A..Z
1012 !!!cp (63);
1013 $self->{ca}->{name}
1014 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1015 ## Stay in the state
1016 !!!next-input-character;
1017 redo A;
1018 } elsif ($self->{nc} == 0x002F) { # /
1019 if ($self->{is_xml}) {
1020 !!!cp (64);
1021 ## XML5: Not a parse error.
1022 !!!parse-error (type => 'no attr value'); ## TODO: type
1023 } else {
1024 !!!cp (64.1);
1025 }
1026
1027 $before_leave->();
1028 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1029 !!!next-input-character;
1030 redo A;
1031 } elsif ($self->{nc} == -1) {
1032 !!!parse-error (type => 'unclosed tag');
1033 $before_leave->();
1034 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1035 !!!cp (66);
1036 $self->{last_stag_name} = $self->{ct}->{tag_name};
1037 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1038 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1039 if ($self->{ct}->{attributes}) {
1040 !!!cp (67);
1041 !!!parse-error (type => 'end tag attribute');
1042 } else {
1043 ## NOTE: This state should never be reached.
1044 !!!cp (68);
1045 }
1046 } else {
1047 die "$0: $self->{ct}->{type}: Unknown token type";
1048 }
1049 $self->{state} = DATA_STATE;
1050 $self->{s_kwd} = '';
1051 # reconsume
1052
1053 !!!emit ($self->{ct}); # start tag or end tag
1054
1055 redo A;
1056 } else {
1057 if ($self->{nc} == 0x0022 or # "
1058 $self->{nc} == 0x0027) { # '
1059 !!!cp (69);
1060 ## XML5: Not a parse error.
1061 !!!parse-error (type => 'bad attribute name');
1062 } else {
1063 !!!cp (70);
1064 }
1065 $self->{ca}->{name} .= chr ($self->{nc});
1066 ## Stay in the state
1067 !!!next-input-character;
1068 redo A;
1069 }
1070 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1071 ## XML5: "Tag attribute name after state".
1072
1073 if ($is_space->{$self->{nc}}) {
1074 !!!cp (71);
1075 ## Stay in the state
1076 !!!next-input-character;
1077 redo A;
1078 } elsif ($self->{nc} == 0x003D) { # =
1079 !!!cp (72);
1080 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1081 !!!next-input-character;
1082 redo A;
1083 } elsif ($self->{nc} == 0x003E) { # >
1084 if ($self->{is_xml}) {
1085 !!!cp (72.1);
1086 ## XML5: Not a parse error.
1087 !!!parse-error (type => 'no attr value'); ## TODO: type
1088 } else {
1089 !!!cp (72.2);
1090 }
1091
1092 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1093 !!!cp (73);
1094 $self->{last_stag_name} = $self->{ct}->{tag_name};
1095 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1096 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1097 if ($self->{ct}->{attributes}) {
1098 !!!cp (74);
1099 !!!parse-error (type => 'end tag attribute');
1100 } else {
1101 ## NOTE: This state should never be reached.
1102 !!!cp (75);
1103 }
1104 } else {
1105 die "$0: $self->{ct}->{type}: Unknown token type";
1106 }
1107 $self->{state} = DATA_STATE;
1108 $self->{s_kwd} = '';
1109 !!!next-input-character;
1110
1111 !!!emit ($self->{ct}); # start tag or end tag
1112
1113 redo A;
1114 } elsif (0x0041 <= $self->{nc} and
1115 $self->{nc} <= 0x005A) { # A..Z
1116 !!!cp (76);
1117 $self->{ca}
1118 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1119 value => '',
1120 line => $self->{line}, column => $self->{column}};
1121 $self->{state} = ATTRIBUTE_NAME_STATE;
1122 !!!next-input-character;
1123 redo A;
1124 } elsif ($self->{nc} == 0x002F) { # /
1125 if ($self->{is_xml}) {
1126 !!!cp (77);
1127 ## XML5: Not a parse error.
1128 !!!parse-error (type => 'no attr value'); ## TODO: type
1129 } else {
1130 !!!cp (77.1);
1131 }
1132
1133 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1134 !!!next-input-character;
1135 redo A;
1136 } elsif ($self->{nc} == -1) {
1137 !!!parse-error (type => 'unclosed tag');
1138 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1139 !!!cp (79);
1140 $self->{last_stag_name} = $self->{ct}->{tag_name};
1141 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1142 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1143 if ($self->{ct}->{attributes}) {
1144 !!!cp (80);
1145 !!!parse-error (type => 'end tag attribute');
1146 } else {
1147 ## NOTE: This state should never be reached.
1148 !!!cp (81);
1149 }
1150 } else {
1151 die "$0: $self->{ct}->{type}: Unknown token type";
1152 }
1153 $self->{s_kwd} = '';
1154 $self->{state} = DATA_STATE;
1155 # reconsume
1156
1157 !!!emit ($self->{ct}); # start tag or end tag
1158
1159 redo A;
1160 } else {
1161 if ($self->{is_xml}) {
1162 !!!cp (78.1);
1163 ## XML5: Not a parse error.
1164 !!!parse-error (type => 'no attr value'); ## TODO: type
1165 } else {
1166 !!!cp (78.2);
1167 }
1168
1169 if ($self->{nc} == 0x0022 or # "
1170 $self->{nc} == 0x0027) { # '
1171 !!!cp (78);
1172 ## XML5: Not a parse error.
1173 !!!parse-error (type => 'bad attribute name');
1174 } else {
1175 !!!cp (82);
1176 }
1177 $self->{ca}
1178 = {name => chr ($self->{nc}),
1179 value => '',
1180 line => $self->{line}, column => $self->{column}};
1181 $self->{state} = ATTRIBUTE_NAME_STATE;
1182 !!!next-input-character;
1183 redo A;
1184 }
1185 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1186 ## XML5: "Tag attribute value before state".
1187
1188 if ($is_space->{$self->{nc}}) {
1189 !!!cp (83);
1190 ## Stay in the state
1191 !!!next-input-character;
1192 redo A;
1193 } elsif ($self->{nc} == 0x0022) { # "
1194 !!!cp (84);
1195 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1196 !!!next-input-character;
1197 redo A;
1198 } elsif ($self->{nc} == 0x0026) { # &
1199 !!!cp (85);
1200 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1201 ## reconsume
1202 redo A;
1203 } elsif ($self->{nc} == 0x0027) { # '
1204 !!!cp (86);
1205 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1206 !!!next-input-character;
1207 redo A;
1208 } elsif ($self->{nc} == 0x003E) { # >
1209 !!!parse-error (type => 'empty unquoted attribute value');
1210 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1211 !!!cp (87);
1212 $self->{last_stag_name} = $self->{ct}->{tag_name};
1213 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1214 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1215 if ($self->{ct}->{attributes}) {
1216 !!!cp (88);
1217 !!!parse-error (type => 'end tag attribute');
1218 } else {
1219 ## NOTE: This state should never be reached.
1220 !!!cp (89);
1221 }
1222 } else {
1223 die "$0: $self->{ct}->{type}: Unknown token type";
1224 }
1225 $self->{state} = DATA_STATE;
1226 $self->{s_kwd} = '';
1227 !!!next-input-character;
1228
1229 !!!emit ($self->{ct}); # start tag or end tag
1230
1231 redo A;
1232 } elsif ($self->{nc} == -1) {
1233 !!!parse-error (type => 'unclosed tag');
1234 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1235 !!!cp (90);
1236 $self->{last_stag_name} = $self->{ct}->{tag_name};
1237 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1238 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1239 if ($self->{ct}->{attributes}) {
1240 !!!cp (91);
1241 !!!parse-error (type => 'end tag attribute');
1242 } else {
1243 ## NOTE: This state should never be reached.
1244 !!!cp (92);
1245 }
1246 } else {
1247 die "$0: $self->{ct}->{type}: Unknown token type";
1248 }
1249 $self->{state} = DATA_STATE;
1250 $self->{s_kwd} = '';
1251 ## reconsume
1252
1253 !!!emit ($self->{ct}); # start tag or end tag
1254
1255 redo A;
1256 } else {
1257 if ($self->{nc} == 0x003D) { # =
1258 !!!cp (93);
1259 ## XML5: Not a parse error.
1260 !!!parse-error (type => 'bad attribute value');
1261 } elsif ($self->{is_xml}) {
1262 !!!cp (93.1);
1263 ## XML5: No parse error.
1264 !!!parse-error (type => 'unquoted attr value'); ## TODO
1265 } else {
1266 !!!cp (94);
1267 }
1268 $self->{ca}->{value} .= chr ($self->{nc});
1269 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1270 !!!next-input-character;
1271 redo A;
1272 }
1273 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1274 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1275 ## ATTLIST attribute value double quoted state".
1276
1277 if ($self->{nc} == 0x0022) { # "
1278 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1279 !!!cp (95.1);
1280 ## XML5: "DOCTYPE ATTLIST name after state".
1281 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1282 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1283 } else {
1284 !!!cp (95);
1285 ## XML5: "Tag attribute name before state".
1286 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1287 }
1288 !!!next-input-character;
1289 redo A;
1290 } elsif ($self->{nc} == 0x0026) { # &
1291 !!!cp (96);
1292 ## XML5: Not defined yet.
1293
1294 ## NOTE: In the spec, the tokenizer is switched to the
1295 ## "entity in attribute value state". In this implementation, the
1296 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1297 ## implementation of the "consume a character reference" algorithm.
1298 $self->{prev_state} = $self->{state};
1299 $self->{entity_add} = 0x0022; # "
1300 $self->{state} = ENTITY_STATE;
1301 !!!next-input-character;
1302 redo A;
1303 } elsif ($self->{nc} == -1) {
1304 !!!parse-error (type => 'unclosed attribute value');
1305 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1306 !!!cp (97);
1307 $self->{last_stag_name} = $self->{ct}->{tag_name};
1308
1309 $self->{state} = DATA_STATE;
1310 $self->{s_kwd} = '';
1311 ## reconsume
1312 !!!emit ($self->{ct}); # start tag
1313 redo A;
1314 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1315 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1316 if ($self->{ct}->{attributes}) {
1317 !!!cp (98);
1318 !!!parse-error (type => 'end tag attribute');
1319 } else {
1320 ## NOTE: This state should never be reached.
1321 !!!cp (99);
1322 }
1323
1324 $self->{state} = DATA_STATE;
1325 $self->{s_kwd} = '';
1326 ## reconsume
1327 !!!emit ($self->{ct}); # end tag
1328 redo A;
1329 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1330 ## XML5: No parse error above; not defined yet.
1331 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1332 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1333 ## Reconsume.
1334 !!!emit ($self->{ct}); # ATTLIST
1335 redo A;
1336 } else {
1337 die "$0: $self->{ct}->{type}: Unknown token type";
1338 }
1339 } else {
1340 ## XML5 [ATTLIST]: Not defined yet.
1341 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1342 !!!cp (100);
1343 ## XML5: Not a parse error.
1344 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1345 } else {
1346 !!!cp (100.1);
1347 }
1348 $self->{ca}->{value} .= chr ($self->{nc});
1349 $self->{read_until}->($self->{ca}->{value},
1350 q["&<],
1351 length $self->{ca}->{value});
1352
1353 ## Stay in the state
1354 !!!next-input-character;
1355 redo A;
1356 }
1357 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1358 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1359 ## ATTLIST attribute value single quoted state".
1360
1361 if ($self->{nc} == 0x0027) { # '
1362 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1363 !!!cp (101.1);
1364 ## XML5: "DOCTYPE ATTLIST name after state".
1365 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1366 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1367 } else {
1368 !!!cp (101);
1369 ## XML5: "Before attribute name state" (sic).
1370 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1371 }
1372 !!!next-input-character;
1373 redo A;
1374 } elsif ($self->{nc} == 0x0026) { # &
1375 !!!cp (102);
1376 ## XML5: Not defined yet.
1377
1378 ## NOTE: In the spec, the tokenizer is switched to the
1379 ## "entity in attribute value state". In this implementation, the
1380 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1381 ## implementation of the "consume a character reference" algorithm.
1382 $self->{entity_add} = 0x0027; # '
1383 $self->{prev_state} = $self->{state};
1384 $self->{state} = ENTITY_STATE;
1385 !!!next-input-character;
1386 redo A;
1387 } elsif ($self->{nc} == -1) {
1388 !!!parse-error (type => 'unclosed attribute value');
1389 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1390 !!!cp (103);
1391 $self->{last_stag_name} = $self->{ct}->{tag_name};
1392
1393 $self->{state} = DATA_STATE;
1394 $self->{s_kwd} = '';
1395 ## reconsume
1396 !!!emit ($self->{ct}); # start tag
1397 redo A;
1398 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1399 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1400 if ($self->{ct}->{attributes}) {
1401 !!!cp (104);
1402 !!!parse-error (type => 'end tag attribute');
1403 } else {
1404 ## NOTE: This state should never be reached.
1405 !!!cp (105);
1406 }
1407
1408 $self->{state} = DATA_STATE;
1409 $self->{s_kwd} = '';
1410 ## reconsume
1411 !!!emit ($self->{ct}); # end tag
1412 redo A;
1413 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1414 ## XML5: No parse error above; not defined yet.
1415 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1416 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1417 ## Reconsume.
1418 !!!emit ($self->{ct}); # ATTLIST
1419 redo A;
1420 } else {
1421 die "$0: $self->{ct}->{type}: Unknown token type";
1422 }
1423 } else {
1424 ## XML5 [ATTLIST]: Not defined yet.
1425 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1426 !!!cp (106);
1427 ## XML5: Not a parse error.
1428 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1429 } else {
1430 !!!cp (106.1);
1431 }
1432 $self->{ca}->{value} .= chr ($self->{nc});
1433 $self->{read_until}->($self->{ca}->{value},
1434 q['&<],
1435 length $self->{ca}->{value});
1436
1437 ## Stay in the state
1438 !!!next-input-character;
1439 redo A;
1440 }
1441 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1442 ## XML5: "Tag attribute value unquoted state".
1443
1444 if ($is_space->{$self->{nc}}) {
1445 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1446 !!!cp (107.1);
1447 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1448 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1449 } else {
1450 !!!cp (107);
1451 ## XML5: "Tag attribute name before state".
1452 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1453 }
1454 !!!next-input-character;
1455 redo A;
1456 } elsif ($self->{nc} == 0x0026) { # &
1457 !!!cp (108);
1458
1459 ## XML5: Not defined yet.
1460
1461 ## NOTE: In the spec, the tokenizer is switched to the
1462 ## "entity in attribute value state". In this implementation, the
1463 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1464 ## implementation of the "consume a character reference" algorithm.
1465 $self->{entity_add} = -1;
1466 $self->{prev_state} = $self->{state};
1467 $self->{state} = ENTITY_STATE;
1468 !!!next-input-character;
1469 redo A;
1470 } elsif ($self->{nc} == 0x003E) { # >
1471 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1472 !!!cp (109);
1473 $self->{last_stag_name} = $self->{ct}->{tag_name};
1474
1475 $self->{state} = DATA_STATE;
1476 $self->{s_kwd} = '';
1477 !!!next-input-character;
1478 !!!emit ($self->{ct}); # start tag
1479 redo A;
1480 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1481 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1482 if ($self->{ct}->{attributes}) {
1483 !!!cp (110);
1484 !!!parse-error (type => 'end tag attribute');
1485 } else {
1486 ## NOTE: This state should never be reached.
1487 !!!cp (111);
1488 }
1489
1490 $self->{state} = DATA_STATE;
1491 $self->{s_kwd} = '';
1492 !!!next-input-character;
1493 !!!emit ($self->{ct}); # end tag
1494 redo A;
1495 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1496 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1497 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1498 !!!next-input-character;
1499 !!!emit ($self->{ct}); # ATTLIST
1500 redo A;
1501 } else {
1502 die "$0: $self->{ct}->{type}: Unknown token type";
1503 }
1504 } elsif ($self->{nc} == -1) {
1505 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1506 !!!cp (112);
1507 !!!parse-error (type => 'unclosed tag');
1508 $self->{last_stag_name} = $self->{ct}->{tag_name};
1509
1510 $self->{state} = DATA_STATE;
1511 $self->{s_kwd} = '';
1512 ## reconsume
1513 !!!emit ($self->{ct}); # start tag
1514 redo A;
1515 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1516 !!!parse-error (type => 'unclosed tag');
1517 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1518 if ($self->{ct}->{attributes}) {
1519 !!!cp (113);
1520 !!!parse-error (type => 'end tag attribute');
1521 } else {
1522 ## NOTE: This state should never be reached.
1523 !!!cp (114);
1524 }
1525
1526 $self->{state} = DATA_STATE;
1527 $self->{s_kwd} = '';
1528 ## reconsume
1529 !!!emit ($self->{ct}); # end tag
1530 redo A;
1531 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1532 !!!parse-error (type => 'unclosed md'); ## TODO: type
1533 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1534 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1535 ## Reconsume.
1536 !!!emit ($self->{ct}); # ATTLIST
1537 redo A;
1538 } else {
1539 die "$0: $self->{ct}->{type}: Unknown token type";
1540 }
1541 } else {
1542 if ({
1543 0x0022 => 1, # "
1544 0x0027 => 1, # '
1545 0x003D => 1, # =
1546 }->{$self->{nc}}) {
1547 !!!cp (115);
1548 ## XML5: Not a parse error.
1549 !!!parse-error (type => 'bad attribute value');
1550 } else {
1551 !!!cp (116);
1552 }
1553 $self->{ca}->{value} .= chr ($self->{nc});
1554 $self->{read_until}->($self->{ca}->{value},
1555 q["'=& >],
1556 length $self->{ca}->{value});
1557
1558 ## Stay in the state
1559 !!!next-input-character;
1560 redo A;
1561 }
1562 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1563 if ($is_space->{$self->{nc}}) {
1564 !!!cp (118);
1565 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1566 !!!next-input-character;
1567 redo A;
1568 } elsif ($self->{nc} == 0x003E) { # >
1569 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1570 !!!cp (119);
1571 $self->{last_stag_name} = $self->{ct}->{tag_name};
1572 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1573 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1574 if ($self->{ct}->{attributes}) {
1575 !!!cp (120);
1576 !!!parse-error (type => 'end tag attribute');
1577 } else {
1578 ## NOTE: This state should never be reached.
1579 !!!cp (121);
1580 }
1581 } else {
1582 die "$0: $self->{ct}->{type}: Unknown token type";
1583 }
1584 $self->{state} = DATA_STATE;
1585 $self->{s_kwd} = '';
1586 !!!next-input-character;
1587
1588 !!!emit ($self->{ct}); # start tag or end tag
1589
1590 redo A;
1591 } elsif ($self->{nc} == 0x002F) { # /
1592 !!!cp (122);
1593 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1594 !!!next-input-character;
1595 redo A;
1596 } elsif ($self->{nc} == -1) {
1597 !!!parse-error (type => 'unclosed tag');
1598 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1599 !!!cp (122.3);
1600 $self->{last_stag_name} = $self->{ct}->{tag_name};
1601 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1602 if ($self->{ct}->{attributes}) {
1603 !!!cp (122.1);
1604 !!!parse-error (type => 'end tag attribute');
1605 } else {
1606 ## NOTE: This state should never be reached.
1607 !!!cp (122.2);
1608 }
1609 } else {
1610 die "$0: $self->{ct}->{type}: Unknown token type";
1611 }
1612 $self->{state} = DATA_STATE;
1613 $self->{s_kwd} = '';
1614 ## Reconsume.
1615 !!!emit ($self->{ct}); # start tag or end tag
1616 redo A;
1617 } else {
1618 !!!cp ('124.1');
1619 !!!parse-error (type => 'no space between attributes');
1620 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1621 ## reconsume
1622 redo A;
1623 }
1624 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1625 ## XML5: "Empty tag state".
1626
1627 if ($self->{nc} == 0x003E) { # >
1628 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1629 !!!cp ('124.2');
1630 !!!parse-error (type => 'nestc', token => $self->{ct});
1631 ## TODO: Different type than slash in start tag
1632 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1633 if ($self->{ct}->{attributes}) {
1634 !!!cp ('124.4');
1635 !!!parse-error (type => 'end tag attribute');
1636 } else {
1637 !!!cp ('124.5');
1638 }
1639 ## TODO: Test |<title></title/>|
1640 } else {
1641 !!!cp ('124.3');
1642 $self->{self_closing} = 1;
1643 }
1644
1645 $self->{state} = DATA_STATE;
1646 $self->{s_kwd} = '';
1647 !!!next-input-character;
1648
1649 !!!emit ($self->{ct}); # start tag or end tag
1650
1651 redo A;
1652 } elsif ($self->{nc} == -1) {
1653 !!!parse-error (type => 'unclosed tag');
1654 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1655 !!!cp (124.7);
1656 $self->{last_stag_name} = $self->{ct}->{tag_name};
1657 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1658 if ($self->{ct}->{attributes}) {
1659 !!!cp (124.5);
1660 !!!parse-error (type => 'end tag attribute');
1661 } else {
1662 ## NOTE: This state should never be reached.
1663 !!!cp (124.6);
1664 }
1665 } else {
1666 die "$0: $self->{ct}->{type}: Unknown token type";
1667 }
1668 ## XML5: "Tag attribute name before state".
1669 $self->{state} = DATA_STATE;
1670 $self->{s_kwd} = '';
1671 ## Reconsume.
1672 !!!emit ($self->{ct}); # start tag or end tag
1673 redo A;
1674 } else {
1675 !!!cp ('124.4');
1676 !!!parse-error (type => 'nestc');
1677 ## TODO: This error type is wrong.
1678 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1679 ## Reconsume.
1680 redo A;
1681 }
1682 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1683 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1684
1685 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1686 ## consumes characters on a one-by-one basis.
1687
1688 if ($self->{nc} == 0x003E) { # >
1689 if ($self->{in_subset}) {
1690 !!!cp (123);
1691 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1692 } else {
1693 !!!cp (124);
1694 $self->{state} = DATA_STATE;
1695 $self->{s_kwd} = '';
1696 }
1697 !!!next-input-character;
1698
1699 !!!emit ($self->{ct}); # comment
1700 redo A;
1701 } elsif ($self->{nc} == -1) {
1702 if ($self->{in_subset}) {
1703 !!!cp (125.1);
1704 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1705 } else {
1706 !!!cp (125);
1707 $self->{state} = DATA_STATE;
1708 $self->{s_kwd} = '';
1709 }
1710 ## reconsume
1711
1712 !!!emit ($self->{ct}); # comment
1713 redo A;
1714 } else {
1715 !!!cp (126);
1716 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1717 $self->{read_until}->($self->{ct}->{data},
1718 q[>],
1719 length $self->{ct}->{data});
1720
1721 ## Stay in the state.
1722 !!!next-input-character;
1723 redo A;
1724 }
1725 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1726 ## XML5: "Markup declaration state".
1727
1728 if ($self->{nc} == 0x002D) { # -
1729 !!!cp (133);
1730 $self->{state} = MD_HYPHEN_STATE;
1731 !!!next-input-character;
1732 redo A;
1733 } elsif ($self->{nc} == 0x0044 or # D
1734 $self->{nc} == 0x0064) { # d
1735 ## ASCII case-insensitive.
1736 !!!cp (130);
1737 $self->{state} = MD_DOCTYPE_STATE;
1738 $self->{kwd} = chr $self->{nc};
1739 !!!next-input-character;
1740 redo A;
1741 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1742 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1743 $self->{is_xml}) and
1744 $self->{nc} == 0x005B) { # [
1745 !!!cp (135.4);
1746 $self->{state} = MD_CDATA_STATE;
1747 $self->{kwd} = '[';
1748 !!!next-input-character;
1749 redo A;
1750 } else {
1751 !!!cp (136);
1752 }
1753
1754 !!!parse-error (type => 'bogus comment',
1755 line => $self->{line_prev},
1756 column => $self->{column_prev} - 1);
1757 ## Reconsume.
1758 $self->{state} = BOGUS_COMMENT_STATE;
1759 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1760 line => $self->{line_prev},
1761 column => $self->{column_prev} - 1,
1762 };
1763 redo A;
1764 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1765 if ($self->{nc} == 0x002D) { # -
1766 !!!cp (127);
1767 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1768 line => $self->{line_prev},
1769 column => $self->{column_prev} - 2,
1770 };
1771 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1772 !!!next-input-character;
1773 redo A;
1774 } else {
1775 !!!cp (128);
1776 !!!parse-error (type => 'bogus comment',
1777 line => $self->{line_prev},
1778 column => $self->{column_prev} - 2);
1779 $self->{state} = BOGUS_COMMENT_STATE;
1780 ## Reconsume.
1781 $self->{ct} = {type => COMMENT_TOKEN,
1782 data => '-',
1783 line => $self->{line_prev},
1784 column => $self->{column_prev} - 2,
1785 };
1786 redo A;
1787 }
1788 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1789 ## ASCII case-insensitive.
1790 if ($self->{nc} == [
1791 undef,
1792 0x004F, # O
1793 0x0043, # C
1794 0x0054, # T
1795 0x0059, # Y
1796 0x0050, # P
1797 ]->[length $self->{kwd}] or
1798 $self->{nc} == [
1799 undef,
1800 0x006F, # o
1801 0x0063, # c
1802 0x0074, # t
1803 0x0079, # y
1804 0x0070, # p
1805 ]->[length $self->{kwd}]) {
1806 !!!cp (131);
1807 ## Stay in the state.
1808 $self->{kwd} .= chr $self->{nc};
1809 !!!next-input-character;
1810 redo A;
1811 } elsif ((length $self->{kwd}) == 6 and
1812 ($self->{nc} == 0x0045 or # E
1813 $self->{nc} == 0x0065)) { # e
1814 if ($self->{is_xml} and
1815 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1816 !!!cp (129);
1817 ## XML5: case-sensitive.
1818 !!!parse-error (type => 'lowercase keyword', ## TODO
1819 text => 'DOCTYPE',
1820 line => $self->{line_prev},
1821 column => $self->{column_prev} - 5);
1822 } else {
1823 !!!cp (129.1);
1824 }
1825 $self->{state} = DOCTYPE_STATE;
1826 $self->{ct} = {type => DOCTYPE_TOKEN,
1827 quirks => 1,
1828 line => $self->{line_prev},
1829 column => $self->{column_prev} - 7,
1830 };
1831 !!!next-input-character;
1832 redo A;
1833 } else {
1834 !!!cp (132);
1835 !!!parse-error (type => 'bogus comment',
1836 line => $self->{line_prev},
1837 column => $self->{column_prev} - 1 - length $self->{kwd});
1838 $self->{state} = BOGUS_COMMENT_STATE;
1839 ## Reconsume.
1840 $self->{ct} = {type => COMMENT_TOKEN,
1841 data => $self->{kwd},
1842 line => $self->{line_prev},
1843 column => $self->{column_prev} - 1 - length $self->{kwd},
1844 };
1845 redo A;
1846 }
1847 } elsif ($self->{state} == MD_CDATA_STATE) {
1848 if ($self->{nc} == {
1849 '[' => 0x0043, # C
1850 '[C' => 0x0044, # D
1851 '[CD' => 0x0041, # A
1852 '[CDA' => 0x0054, # T
1853 '[CDAT' => 0x0041, # A
1854 }->{$self->{kwd}}) {
1855 !!!cp (135.1);
1856 ## Stay in the state.
1857 $self->{kwd} .= chr $self->{nc};
1858 !!!next-input-character;
1859 redo A;
1860 } elsif ($self->{kwd} eq '[CDATA' and
1861 $self->{nc} == 0x005B) { # [
1862 if ($self->{is_xml} and
1863 not $self->{tainted} and
1864 @{$self->{open_elements} or []} == 0) {
1865 !!!cp (135.2);
1866 !!!parse-error (type => 'cdata outside of root element',
1867 line => $self->{line_prev},
1868 column => $self->{column_prev} - 7);
1869 $self->{tainted} = 1;
1870 } else {
1871 !!!cp (135.21);
1872 }
1873
1874 $self->{ct} = {type => CHARACTER_TOKEN,
1875 data => '',
1876 line => $self->{line_prev},
1877 column => $self->{column_prev} - 7};
1878 $self->{state} = CDATA_SECTION_STATE;
1879 !!!next-input-character;
1880 redo A;
1881 } else {
1882 !!!cp (135.3);
1883 !!!parse-error (type => 'bogus comment',
1884 line => $self->{line_prev},
1885 column => $self->{column_prev} - 1 - length $self->{kwd});
1886 $self->{state} = BOGUS_COMMENT_STATE;
1887 ## Reconsume.
1888 $self->{ct} = {type => COMMENT_TOKEN,
1889 data => $self->{kwd},
1890 line => $self->{line_prev},
1891 column => $self->{column_prev} - 1 - length $self->{kwd},
1892 };
1893 redo A;
1894 }
1895 } elsif ($self->{state} == COMMENT_START_STATE) {
1896 if ($self->{nc} == 0x002D) { # -
1897 !!!cp (137);
1898 $self->{state} = COMMENT_START_DASH_STATE;
1899 !!!next-input-character;
1900 redo A;
1901 } elsif ($self->{nc} == 0x003E) { # >
1902 !!!parse-error (type => 'bogus comment');
1903 if ($self->{in_subset}) {
1904 !!!cp (138.1);
1905 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1906 } else {
1907 !!!cp (138);
1908 $self->{state} = DATA_STATE;
1909 $self->{s_kwd} = '';
1910 }
1911 !!!next-input-character;
1912
1913 !!!emit ($self->{ct}); # comment
1914
1915 redo A;
1916 } elsif ($self->{nc} == -1) {
1917 !!!parse-error (type => 'unclosed comment');
1918 if ($self->{in_subset}) {
1919 !!!cp (139.1);
1920 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1921 } else {
1922 !!!cp (139);
1923 $self->{state} = DATA_STATE;
1924 $self->{s_kwd} = '';
1925 }
1926 ## reconsume
1927
1928 !!!emit ($self->{ct}); # comment
1929
1930 redo A;
1931 } else {
1932 !!!cp (140);
1933 $self->{ct}->{data} # comment
1934 .= chr ($self->{nc});
1935 $self->{state} = COMMENT_STATE;
1936 !!!next-input-character;
1937 redo A;
1938 }
1939 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1940 if ($self->{nc} == 0x002D) { # -
1941 !!!cp (141);
1942 $self->{state} = COMMENT_END_STATE;
1943 !!!next-input-character;
1944 redo A;
1945 } elsif ($self->{nc} == 0x003E) { # >
1946 !!!parse-error (type => 'bogus comment');
1947 if ($self->{in_subset}) {
1948 !!!cp (142.1);
1949 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1950 } else {
1951 !!!cp (142);
1952 $self->{state} = DATA_STATE;
1953 $self->{s_kwd} = '';
1954 }
1955 !!!next-input-character;
1956
1957 !!!emit ($self->{ct}); # comment
1958
1959 redo A;
1960 } elsif ($self->{nc} == -1) {
1961 !!!parse-error (type => 'unclosed comment');
1962 if ($self->{in_subset}) {
1963 !!!cp (143.1);
1964 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1965 } else {
1966 !!!cp (143);
1967 $self->{state} = DATA_STATE;
1968 $self->{s_kwd} = '';
1969 }
1970 ## reconsume
1971
1972 !!!emit ($self->{ct}); # comment
1973
1974 redo A;
1975 } else {
1976 !!!cp (144);
1977 $self->{ct}->{data} # comment
1978 .= '-' . chr ($self->{nc});
1979 $self->{state} = COMMENT_STATE;
1980 !!!next-input-character;
1981 redo A;
1982 }
1983 } elsif ($self->{state} == COMMENT_STATE) {
1984 ## XML5: "Comment state" and "DOCTYPE comment state".
1985
1986 if ($self->{nc} == 0x002D) { # -
1987 !!!cp (145);
1988 $self->{state} = COMMENT_END_DASH_STATE;
1989 !!!next-input-character;
1990 redo A;
1991 } elsif ($self->{nc} == -1) {
1992 !!!parse-error (type => 'unclosed comment');
1993 if ($self->{in_subset}) {
1994 !!!cp (146.1);
1995 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1996 } else {
1997 !!!cp (146);
1998 $self->{state} = DATA_STATE;
1999 $self->{s_kwd} = '';
2000 }
2001 ## reconsume
2002
2003 !!!emit ($self->{ct}); # comment
2004
2005 redo A;
2006 } else {
2007 !!!cp (147);
2008 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2009 $self->{read_until}->($self->{ct}->{data},
2010 q[-],
2011 length $self->{ct}->{data});
2012
2013 ## Stay in the state
2014 !!!next-input-character;
2015 redo A;
2016 }
2017 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2018 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2019
2020 if ($self->{nc} == 0x002D) { # -
2021 !!!cp (148);
2022 $self->{state} = COMMENT_END_STATE;
2023 !!!next-input-character;
2024 redo A;
2025 } elsif ($self->{nc} == -1) {
2026 !!!parse-error (type => 'unclosed comment');
2027 if ($self->{in_subset}) {
2028 !!!cp (149.1);
2029 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2030 } else {
2031 !!!cp (149);
2032 $self->{state} = DATA_STATE;
2033 $self->{s_kwd} = '';
2034 }
2035 ## reconsume
2036
2037 !!!emit ($self->{ct}); # comment
2038
2039 redo A;
2040 } else {
2041 !!!cp (150);
2042 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2043 $self->{state} = COMMENT_STATE;
2044 !!!next-input-character;
2045 redo A;
2046 }
2047 } elsif ($self->{state} == COMMENT_END_STATE) {
2048 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2049
2050 if ($self->{nc} == 0x003E) { # >
2051 if ($self->{in_subset}) {
2052 !!!cp (151.1);
2053 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2054 } else {
2055 !!!cp (151);
2056 $self->{state} = DATA_STATE;
2057 $self->{s_kwd} = '';
2058 }
2059 !!!next-input-character;
2060
2061 !!!emit ($self->{ct}); # comment
2062
2063 redo A;
2064 } elsif ($self->{nc} == 0x002D) { # -
2065 !!!cp (152);
2066 ## XML5: Not a parse error.
2067 !!!parse-error (type => 'dash in comment',
2068 line => $self->{line_prev},
2069 column => $self->{column_prev});
2070 $self->{ct}->{data} .= '-'; # comment
2071 ## Stay in the state
2072 !!!next-input-character;
2073 redo A;
2074 } elsif ($self->{nc} == -1) {
2075 !!!parse-error (type => 'unclosed comment');
2076 if ($self->{in_subset}) {
2077 !!!cp (153.1);
2078 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2079 } else {
2080 !!!cp (153);
2081 $self->{state} = DATA_STATE;
2082 $self->{s_kwd} = '';
2083 }
2084 ## reconsume
2085
2086 !!!emit ($self->{ct}); # comment
2087
2088 redo A;
2089 } else {
2090 !!!cp (154);
2091 ## XML5: Not a parse error.
2092 !!!parse-error (type => 'dash in comment',
2093 line => $self->{line_prev},
2094 column => $self->{column_prev});
2095 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2096 $self->{state} = COMMENT_STATE;
2097 !!!next-input-character;
2098 redo A;
2099 }
2100 } elsif ($self->{state} == DOCTYPE_STATE) {
2101 if ($is_space->{$self->{nc}}) {
2102 !!!cp (155);
2103 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2104 !!!next-input-character;
2105 redo A;
2106 } else {
2107 !!!cp (156);
2108 ## XML5: Unless EOF, switch to the bogus comment state.
2109 !!!parse-error (type => 'no space before DOCTYPE name');
2110 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2111 ## reconsume
2112 redo A;
2113 }
2114 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2115 ## XML5: "DOCTYPE root name before state".
2116
2117 if ($is_space->{$self->{nc}}) {
2118 !!!cp (157);
2119 ## Stay in the state
2120 !!!next-input-character;
2121 redo A;
2122 } elsif ($self->{nc} == 0x003E) { # >
2123 !!!cp (158);
2124 ## XML5: No parse error.
2125 !!!parse-error (type => 'no DOCTYPE name');
2126 $self->{state} = DATA_STATE;
2127 $self->{s_kwd} = '';
2128 !!!next-input-character;
2129
2130 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2131
2132 redo A;
2133 } elsif ($self->{nc} == -1) {
2134 !!!cp (159);
2135 !!!parse-error (type => 'no DOCTYPE name');
2136 $self->{state} = DATA_STATE;
2137 $self->{s_kwd} = '';
2138 ## reconsume
2139
2140 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2141
2142 redo A;
2143 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2144 !!!cp (159.1);
2145 !!!parse-error (type => 'no DOCTYPE name');
2146 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2147 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2148 $self->{in_subset} = 1;
2149 !!!next-input-character;
2150 !!!emit ($self->{ct}); # DOCTYPE
2151 redo A;
2152 } else {
2153 !!!cp (160);
2154 $self->{ct}->{name} = chr $self->{nc};
2155 delete $self->{ct}->{quirks};
2156 $self->{state} = DOCTYPE_NAME_STATE;
2157 !!!next-input-character;
2158 redo A;
2159 }
2160 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2161 ## XML5: "DOCTYPE root name state".
2162
2163 ## ISSUE: Redundant "First," in the spec.
2164
2165 if ($is_space->{$self->{nc}}) {
2166 !!!cp (161);
2167 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2168 !!!next-input-character;
2169 redo A;
2170 } elsif ($self->{nc} == 0x003E) { # >
2171 !!!cp (162);
2172 $self->{state} = DATA_STATE;
2173 $self->{s_kwd} = '';
2174 !!!next-input-character;
2175
2176 !!!emit ($self->{ct}); # DOCTYPE
2177
2178 redo A;
2179 } elsif ($self->{nc} == -1) {
2180 !!!cp (163);
2181 !!!parse-error (type => 'unclosed DOCTYPE');
2182 $self->{state} = DATA_STATE;
2183 $self->{s_kwd} = '';
2184 ## reconsume
2185
2186 $self->{ct}->{quirks} = 1;
2187 !!!emit ($self->{ct}); # DOCTYPE
2188
2189 redo A;
2190 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2191 !!!cp (163.1);
2192 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2193 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2194 $self->{in_subset} = 1;
2195 !!!next-input-character;
2196 !!!emit ($self->{ct}); # DOCTYPE
2197 redo A;
2198 } else {
2199 !!!cp (164);
2200 $self->{ct}->{name}
2201 .= chr ($self->{nc}); # DOCTYPE
2202 ## Stay in the state
2203 !!!next-input-character;
2204 redo A;
2205 }
2206 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2207 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2208 ## state", but implemented differently.
2209
2210 if ($is_space->{$self->{nc}}) {
2211 !!!cp (165);
2212 ## Stay in the state
2213 !!!next-input-character;
2214 redo A;
2215 } elsif ($self->{nc} == 0x003E) { # >
2216 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2217 !!!cp (166);
2218 $self->{state} = DATA_STATE;
2219 $self->{s_kwd} = '';
2220 } else {
2221 !!!cp (166.1);
2222 !!!parse-error (type => 'no md def'); ## TODO: type
2223 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2224 }
2225
2226 !!!next-input-character;
2227 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2228 redo A;
2229 } elsif ($self->{nc} == -1) {
2230 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2231 !!!cp (167);
2232 !!!parse-error (type => 'unclosed DOCTYPE');
2233 $self->{state} = DATA_STATE;
2234 $self->{s_kwd} = '';
2235 $self->{ct}->{quirks} = 1;
2236 } else {
2237 !!!cp (167.12);
2238 !!!parse-error (type => 'unclosed md'); ## TODO: type
2239 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2240 }
2241
2242 ## Reconsume.
2243 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2244 redo A;
2245 } elsif ($self->{nc} == 0x0050 or # P
2246 $self->{nc} == 0x0070) { # p
2247 !!!cp (167.1);
2248 $self->{state} = PUBLIC_STATE;
2249 $self->{kwd} = chr $self->{nc};
2250 !!!next-input-character;
2251 redo A;
2252 } elsif ($self->{nc} == 0x0053 or # S
2253 $self->{nc} == 0x0073) { # s
2254 !!!cp (167.2);
2255 $self->{state} = SYSTEM_STATE;
2256 $self->{kwd} = chr $self->{nc};
2257 !!!next-input-character;
2258 redo A;
2259 ## TODO: " and ' for ENTITY
2260 } elsif ($self->{is_xml} and
2261 $self->{ct}->{type} == DOCTYPE_TOKEN and
2262 $self->{nc} == 0x005B) { # [
2263 !!!cp (167.3);
2264 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2265 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2266 $self->{in_subset} = 1;
2267 !!!next-input-character;
2268 !!!emit ($self->{ct}); # DOCTYPE
2269 redo A;
2270 } else {
2271 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2272
2273 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2274 !!!cp (180);
2275 $self->{ct}->{quirks} = 1;
2276 $self->{state} = BOGUS_DOCTYPE_STATE;
2277 } else {
2278 !!!cp (180.1);
2279 $self->{state} = BOGUS_MD_STATE;
2280 }
2281
2282 !!!next-input-character;
2283 redo A;
2284 }
2285 } elsif ($self->{state} == PUBLIC_STATE) {
2286 ## ASCII case-insensitive
2287 if ($self->{nc} == [
2288 undef,
2289 0x0055, # U
2290 0x0042, # B
2291 0x004C, # L
2292 0x0049, # I
2293 ]->[length $self->{kwd}] or
2294 $self->{nc} == [
2295 undef,
2296 0x0075, # u
2297 0x0062, # b
2298 0x006C, # l
2299 0x0069, # i
2300 ]->[length $self->{kwd}]) {
2301 !!!cp (175);
2302 ## Stay in the state.
2303 $self->{kwd} .= chr $self->{nc};
2304 !!!next-input-character;
2305 redo A;
2306 } elsif ((length $self->{kwd}) == 5 and
2307 ($self->{nc} == 0x0043 or # C
2308 $self->{nc} == 0x0063)) { # c
2309 if ($self->{is_xml} and
2310 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2311 !!!cp (168.1);
2312 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2313 text => 'PUBLIC',
2314 line => $self->{line_prev},
2315 column => $self->{column_prev} - 4);
2316 } else {
2317 !!!cp (168);
2318 }
2319 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2320 !!!next-input-character;
2321 redo A;
2322 } else {
2323 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2324 line => $self->{line_prev},
2325 column => $self->{column_prev} + 1 - length $self->{kwd});
2326 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2327 !!!cp (169);
2328 $self->{ct}->{quirks} = 1;
2329 $self->{state} = BOGUS_DOCTYPE_STATE;
2330 } else {
2331 !!!cp (169.1);
2332 $self->{state} = BOGUS_MD_STATE;
2333 }
2334 ## Reconsume.
2335 redo A;
2336 }
2337 } elsif ($self->{state} == SYSTEM_STATE) {
2338 ## ASCII case-insensitive
2339 if ($self->{nc} == [
2340 undef,
2341 0x0059, # Y
2342 0x0053, # S
2343 0x0054, # T
2344 0x0045, # E
2345 ]->[length $self->{kwd}] or
2346 $self->{nc} == [
2347 undef,
2348 0x0079, # y
2349 0x0073, # s
2350 0x0074, # t
2351 0x0065, # e
2352 ]->[length $self->{kwd}]) {
2353 !!!cp (170);
2354 ## Stay in the state.
2355 $self->{kwd} .= chr $self->{nc};
2356 !!!next-input-character;
2357 redo A;
2358 } elsif ((length $self->{kwd}) == 5 and
2359 ($self->{nc} == 0x004D or # M
2360 $self->{nc} == 0x006D)) { # m
2361 if ($self->{is_xml} and
2362 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2363 !!!cp (171.1);
2364 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2365 text => 'SYSTEM',
2366 line => $self->{line_prev},
2367 column => $self->{column_prev} - 4);
2368 } else {
2369 !!!cp (171);
2370 }
2371 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2372 !!!next-input-character;
2373 redo A;
2374 } else {
2375 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2376 line => $self->{line_prev},
2377 column => $self->{column_prev} + 1 - length $self->{kwd});
2378 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2379 !!!cp (172);
2380 $self->{ct}->{quirks} = 1;
2381 $self->{state} = BOGUS_DOCTYPE_STATE;
2382 } else {
2383 !!!cp (172.1);
2384 $self->{state} = BOGUS_MD_STATE;
2385 }
2386 ## Reconsume.
2387 redo A;
2388 }
2389 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2390 if ($is_space->{$self->{nc}}) {
2391 !!!cp (181);
2392 ## Stay in the state
2393 !!!next-input-character;
2394 redo A;
2395 } elsif ($self->{nc} eq 0x0022) { # "
2396 !!!cp (182);
2397 $self->{ct}->{pubid} = ''; # DOCTYPE
2398 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2399 !!!next-input-character;
2400 redo A;
2401 } elsif ($self->{nc} eq 0x0027) { # '
2402 !!!cp (183);
2403 $self->{ct}->{pubid} = ''; # DOCTYPE
2404 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2405 !!!next-input-character;
2406 redo A;
2407 } elsif ($self->{nc} eq 0x003E) { # >
2408 !!!parse-error (type => 'no PUBLIC literal');
2409
2410 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2411 !!!cp (184);
2412 $self->{state} = DATA_STATE;
2413 $self->{s_kwd} = '';
2414 $self->{ct}->{quirks} = 1;
2415 } else {
2416 !!!cp (184.1);
2417 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2418 }
2419
2420 !!!next-input-character;
2421 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2422 redo A;
2423 } elsif ($self->{nc} == -1) {
2424 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2425 !!!cp (185);
2426 !!!parse-error (type => 'unclosed DOCTYPE');
2427 $self->{state} = DATA_STATE;
2428 $self->{s_kwd} = '';
2429 $self->{ct}->{quirks} = 1;
2430 } else {
2431 !!!cp (185.1);
2432 !!!parse-error (type => 'unclosed md'); ## TODO: type
2433 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2434 }
2435
2436 ## reconsume
2437 !!!emit ($self->{ct}); # DOCTYPE
2438 redo A;
2439 } elsif ($self->{is_xml} and
2440 $self->{ct}->{type} == DOCTYPE_TOKEN and
2441 $self->{nc} == 0x005B) { # [
2442 !!!cp (186.1);
2443 !!!parse-error (type => 'no PUBLIC literal');
2444 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2445 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2446 $self->{in_subset} = 1;
2447 !!!next-input-character;
2448 !!!emit ($self->{ct}); # DOCTYPE
2449 redo A;
2450 } else {
2451 !!!parse-error (type => 'string after PUBLIC');
2452
2453 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2454 !!!cp (186);
2455 $self->{ct}->{quirks} = 1;
2456 $self->{state} = BOGUS_DOCTYPE_STATE;
2457 } else {
2458 !!!cp (186.2);
2459 $self->{state} = BOGUS_MD_STATE;
2460 }
2461
2462 !!!next-input-character;
2463 redo A;
2464 }
2465 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2466 if ($self->{nc} == 0x0022) { # "
2467 !!!cp (187);
2468 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2469 !!!next-input-character;
2470 redo A;
2471 } elsif ($self->{nc} == 0x003E) { # >
2472 !!!parse-error (type => 'unclosed PUBLIC literal');
2473
2474 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2475 !!!cp (188);
2476 $self->{state} = DATA_STATE;
2477 $self->{s_kwd} = '';
2478 $self->{ct}->{quirks} = 1;
2479 } else {
2480 !!!cp (188.1);
2481 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2482 }
2483
2484 !!!next-input-character;
2485 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2486 redo A;
2487 } elsif ($self->{nc} == -1) {
2488 !!!parse-error (type => 'unclosed PUBLIC literal');
2489
2490 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2491 !!!cp (189);
2492 $self->{state} = DATA_STATE;
2493 $self->{s_kwd} = '';
2494 $self->{ct}->{quirks} = 1;
2495 } else {
2496 !!!cp (189.1);
2497 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2498 }
2499
2500 ## Reconsume.
2501 !!!emit ($self->{ct}); # DOCTYPE
2502 redo A;
2503 } else {
2504 !!!cp (190);
2505 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2506 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2507 length $self->{ct}->{pubid});
2508
2509 ## Stay in the state
2510 !!!next-input-character;
2511 redo A;
2512 }
2513 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2514 if ($self->{nc} == 0x0027) { # '
2515 !!!cp (191);
2516 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2517 !!!next-input-character;
2518 redo A;
2519 } elsif ($self->{nc} == 0x003E) { # >
2520 !!!parse-error (type => 'unclosed PUBLIC literal');
2521
2522 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2523 !!!cp (192);
2524 $self->{state} = DATA_STATE;
2525 $self->{s_kwd} = '';
2526 $self->{ct}->{quirks} = 1;
2527 } else {
2528 !!!cp (192.1);
2529 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2530 }
2531
2532 !!!next-input-character;
2533 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2534 redo A;
2535 } elsif ($self->{nc} == -1) {
2536 !!!parse-error (type => 'unclosed PUBLIC literal');
2537
2538 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2539 !!!cp (193);
2540 $self->{state} = DATA_STATE;
2541 $self->{s_kwd} = '';
2542 $self->{ct}->{quirks} = 1;
2543 } else {
2544 !!!cp (193.1);
2545 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2546 }
2547
2548 ## reconsume
2549 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2550 redo A;
2551 } else {
2552 !!!cp (194);
2553 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2554 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2555 length $self->{ct}->{pubid});
2556
2557 ## Stay in the state
2558 !!!next-input-character;
2559 redo A;
2560 }
2561 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2562 if ($is_space->{$self->{nc}}) {
2563 !!!cp (195);
2564 ## Stay in the state
2565 !!!next-input-character;
2566 redo A;
2567 } elsif ($self->{nc} == 0x0022) { # "
2568 !!!cp (196);
2569 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2570 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2571 !!!next-input-character;
2572 redo A;
2573 } elsif ($self->{nc} == 0x0027) { # '
2574 !!!cp (197);
2575 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2576 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2577 !!!next-input-character;
2578 redo A;
2579 } elsif ($self->{nc} == 0x003E) { # >
2580 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2581 if ($self->{is_xml}) {
2582 !!!cp (198.1);
2583 !!!parse-error (type => 'no SYSTEM literal');
2584 } else {
2585 !!!cp (198);
2586 }
2587 $self->{state} = DATA_STATE;
2588 $self->{s_kwd} = '';
2589 } else {
2590 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2591 !!!cp (198.2);
2592 } else {
2593 !!!cp (198.3);
2594 !!!parse-error (type => 'no SYSTEM literal');
2595 }
2596 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2597 }
2598
2599 !!!next-input-character;
2600 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2601 redo A;
2602 } elsif ($self->{nc} == -1) {
2603 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2604 !!!cp (199);
2605 !!!parse-error (type => 'unclosed DOCTYPE');
2606
2607 $self->{state} = DATA_STATE;
2608 $self->{s_kwd} = '';
2609 $self->{ct}->{quirks} = 1;
2610 } else {
2611 !!!parse-error (type => 'unclosed md'); ## TODO: type
2612 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2613 }
2614
2615 ## reconsume
2616 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2617 redo A;
2618 } elsif ($self->{is_xml} and
2619 $self->{ct}->{type} == DOCTYPE_TOKEN and
2620 $self->{nc} == 0x005B) { # [
2621 !!!cp (200.1);
2622 !!!parse-error (type => 'no SYSTEM literal');
2623 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2624 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2625 $self->{in_subset} = 1;
2626 !!!next-input-character;
2627 !!!emit ($self->{ct}); # DOCTYPE
2628 redo A;
2629 } else {
2630 !!!parse-error (type => 'string after PUBLIC literal');
2631
2632 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2633 !!!cp (200);
2634 $self->{ct}->{quirks} = 1;
2635 $self->{state} = BOGUS_DOCTYPE_STATE;
2636 } else {
2637 !!!cp (200.2);
2638 $self->{state} = BOGUS_MD_STATE;
2639 }
2640
2641 !!!next-input-character;
2642 redo A;
2643 }
2644 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2645 if ($is_space->{$self->{nc}}) {
2646 !!!cp (201);
2647 ## Stay in the state
2648 !!!next-input-character;
2649 redo A;
2650 } elsif ($self->{nc} == 0x0022) { # "
2651 !!!cp (202);
2652 $self->{ct}->{sysid} = ''; # DOCTYPE
2653 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2654 !!!next-input-character;
2655 redo A;
2656 } elsif ($self->{nc} == 0x0027) { # '
2657 !!!cp (203);
2658 $self->{ct}->{sysid} = ''; # DOCTYPE
2659 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2660 !!!next-input-character;
2661 redo A;
2662 } elsif ($self->{nc} == 0x003E) { # >
2663 !!!parse-error (type => 'no SYSTEM literal');
2664 !!!next-input-character;
2665
2666 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2667 !!!cp (204);
2668 $self->{state} = DATA_STATE;
2669 $self->{s_kwd} = '';
2670 $self->{ct}->{quirks} = 1;
2671 } else {
2672 !!!cp (204.1);
2673 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2674 }
2675
2676 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2677 redo A;
2678 } elsif ($self->{nc} == -1) {
2679 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2680 !!!cp (205);
2681 !!!parse-error (type => 'unclosed DOCTYPE');
2682 $self->{state} = DATA_STATE;
2683 $self->{s_kwd} = '';
2684 $self->{ct}->{quirks} = 1;
2685 } else {
2686 !!!cp (205.1);
2687 !!!parse-error (type => 'unclosed md'); ## TODO: type
2688 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2689 }
2690
2691 ## reconsume
2692 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2693 redo A;
2694 } elsif ($self->{is_xml} and
2695 $self->{ct}->{type} == DOCTYPE_TOKEN and
2696 $self->{nc} == 0x005B) { # [
2697 !!!cp (206.1);
2698 !!!parse-error (type => 'no SYSTEM literal');
2699
2700 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2701 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2702 $self->{in_subset} = 1;
2703 !!!next-input-character;
2704 !!!emit ($self->{ct}); # DOCTYPE
2705 redo A;
2706 } else {
2707 !!!parse-error (type => 'string after SYSTEM');
2708
2709 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2710 !!!cp (206);
2711 $self->{ct}->{quirks} = 1;
2712 $self->{state} = BOGUS_DOCTYPE_STATE;
2713 } else {
2714 !!!cp (206.2);
2715 $self->{state} = BOGUS_MD_STATE;
2716 }
2717
2718 !!!next-input-character;
2719 redo A;
2720 }
2721 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2722 if ($self->{nc} == 0x0022) { # "
2723 !!!cp (207);
2724 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2725 !!!next-input-character;
2726 redo A;
2727 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2728 !!!parse-error (type => 'unclosed SYSTEM literal');
2729
2730 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2731 !!!cp (208);
2732 $self->{state} = DATA_STATE;
2733 $self->{s_kwd} = '';
2734 $self->{ct}->{quirks} = 1;
2735 } else {
2736 !!!cp (208.1);
2737 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2738 }
2739
2740 !!!next-input-character;
2741 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2742 redo A;
2743 } elsif ($self->{nc} == -1) {
2744 !!!parse-error (type => 'unclosed SYSTEM literal');
2745
2746 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2747 !!!cp (209);
2748 $self->{state} = DATA_STATE;
2749 $self->{s_kwd} = '';
2750 $self->{ct}->{quirks} = 1;
2751 } else {
2752 !!!cp (209.1);
2753 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2754 }
2755
2756 ## reconsume
2757 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2758 redo A;
2759 } else {
2760 !!!cp (210);
2761 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2762 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2763 length $self->{ct}->{sysid});
2764
2765 ## Stay in the state
2766 !!!next-input-character;
2767 redo A;
2768 }
2769 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2770 if ($self->{nc} == 0x0027) { # '
2771 !!!cp (211);
2772 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2773 !!!next-input-character;
2774 redo A;
2775 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2776 !!!cp (212);
2777 !!!parse-error (type => 'unclosed SYSTEM literal');
2778
2779 $self->{state} = DATA_STATE;
2780 $self->{s_kwd} = '';
2781 !!!next-input-character;
2782
2783 $self->{ct}->{quirks} = 1;
2784 !!!emit ($self->{ct}); # DOCTYPE
2785
2786 redo A;
2787 } elsif ($self->{nc} == -1) {
2788 !!!parse-error (type => 'unclosed SYSTEM literal');
2789
2790 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2791 !!!cp (213);
2792 $self->{state} = DATA_STATE;
2793 $self->{s_kwd} = '';
2794 $self->{ct}->{quirks} = 1;
2795 } else {
2796 !!!cp (213.1);
2797 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2798 }
2799
2800 ## reconsume
2801 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2802 redo A;
2803 } else {
2804 !!!cp (214);
2805 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2806 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2807 length $self->{ct}->{sysid});
2808
2809 ## Stay in the state
2810 !!!next-input-character;
2811 redo A;
2812 }
2813 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2814 if ($is_space->{$self->{nc}}) {
2815 !!!cp (215);
2816 ## Stay in the state
2817 !!!next-input-character;
2818 redo A;
2819 } elsif ($self->{nc} == 0x003E) { # >
2820 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2821 !!!cp (216);
2822 $self->{state} = DATA_STATE;
2823 $self->{s_kwd} = '';
2824 } else {
2825 !!!cp (216.1);
2826 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2827 }
2828
2829 !!!next-input-character;
2830 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2831 redo A;
2832 ## TODO: "NDATA"
2833 } elsif ($self->{nc} == -1) {
2834 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2835 !!!cp (217);
2836 !!!parse-error (type => 'unclosed DOCTYPE');
2837 $self->{state} = DATA_STATE;
2838 $self->{s_kwd} = '';
2839 $self->{ct}->{quirks} = 1;
2840 } else {
2841 !!!cp (217.1);
2842 !!!parse-error (type => 'unclosed md'); ## TODO: type
2843 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2844 }
2845
2846 ## reconsume
2847 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2848 redo A;
2849 } elsif ($self->{is_xml} and
2850 $self->{ct}->{type} == DOCTYPE_TOKEN and
2851 $self->{nc} == 0x005B) { # [
2852 !!!cp (218.1);
2853 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2854 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2855 $self->{in_subset} = 1;
2856 !!!next-input-character;
2857 !!!emit ($self->{ct}); # DOCTYPE
2858 redo A;
2859 } else {
2860 !!!parse-error (type => 'string after SYSTEM literal');
2861
2862 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2863 !!!cp (218);
2864 #$self->{ct}->{quirks} = 1;
2865 $self->{state} = BOGUS_DOCTYPE_STATE;
2866 } else {
2867 !!!cp (218.2);
2868 $self->{state} = BOGUS_MD_STATE;
2869 }
2870
2871 !!!next-input-character;
2872 redo A;
2873 }
2874 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2875 if ($self->{nc} == 0x003E) { # >
2876 !!!cp (219);
2877 $self->{state} = DATA_STATE;
2878 $self->{s_kwd} = '';
2879 !!!next-input-character;
2880
2881 !!!emit ($self->{ct}); # DOCTYPE
2882
2883 redo A;
2884 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2885 !!!cp (220.1);
2886 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2887 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2888 $self->{in_subset} = 1;
2889 !!!next-input-character;
2890 !!!emit ($self->{ct}); # DOCTYPE
2891 redo A;
2892 } elsif ($self->{nc} == -1) {
2893 !!!cp (220);
2894 $self->{state} = DATA_STATE;
2895 $self->{s_kwd} = '';
2896 ## reconsume
2897
2898 !!!emit ($self->{ct}); # DOCTYPE
2899
2900 redo A;
2901 } else {
2902 !!!cp (221);
2903 my $s = '';
2904 $self->{read_until}->($s, q{>[}, 0);
2905
2906 ## Stay in the state
2907 !!!next-input-character;
2908 redo A;
2909 }
2910 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2911 ## NOTE: "CDATA section state" in the spec is jointly implemented
2912 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2913 ## and |CDATA_SECTION_MSE2_STATE|.
2914
2915 ## XML5: "CDATA state".
2916
2917 if ($self->{nc} == 0x005D) { # ]
2918 !!!cp (221.1);
2919 $self->{state} = CDATA_SECTION_MSE1_STATE;
2920 !!!next-input-character;
2921 redo A;
2922 } elsif ($self->{nc} == -1) {
2923 if ($self->{is_xml}) {
2924 !!!cp (221.11);
2925 !!!parse-error (type => 'no mse'); ## TODO: type
2926 } else {
2927 !!!cp (221.12);
2928 }
2929
2930 $self->{state} = DATA_STATE;
2931 $self->{s_kwd} = '';
2932 ## Reconsume.
2933 if (length $self->{ct}->{data}) { # character
2934 !!!cp (221.2);
2935 !!!emit ($self->{ct}); # character
2936 } else {
2937 !!!cp (221.3);
2938 ## No token to emit. $self->{ct} is discarded.
2939 }
2940 redo A;
2941 } else {
2942 !!!cp (221.4);
2943 $self->{ct}->{data} .= chr $self->{nc};
2944 $self->{read_until}->($self->{ct}->{data},
2945 q<]>,
2946 length $self->{ct}->{data});
2947
2948 ## Stay in the state.
2949 !!!next-input-character;
2950 redo A;
2951 }
2952
2953 ## ISSUE: "text tokens" in spec.
2954 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2955 ## XML5: "CDATA bracket state".
2956
2957 if ($self->{nc} == 0x005D) { # ]
2958 !!!cp (221.5);
2959 $self->{state} = CDATA_SECTION_MSE2_STATE;
2960 !!!next-input-character;
2961 redo A;
2962 } else {
2963 !!!cp (221.6);
2964 ## XML5: If EOF, "]" is not appended and changed to the data state.
2965 $self->{ct}->{data} .= ']';
2966 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2967 ## Reconsume.
2968 redo A;
2969 }
2970 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2971 ## XML5: "CDATA end state".
2972
2973 if ($self->{nc} == 0x003E) { # >
2974 $self->{state} = DATA_STATE;
2975 $self->{s_kwd} = '';
2976 !!!next-input-character;
2977 if (length $self->{ct}->{data}) { # character
2978 !!!cp (221.7);
2979 !!!emit ($self->{ct}); # character
2980 } else {
2981 !!!cp (221.8);
2982 ## No token to emit. $self->{ct} is discarded.
2983 }
2984 redo A;
2985 } elsif ($self->{nc} == 0x005D) { # ]
2986 !!!cp (221.9); # character
2987 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2988 ## Stay in the state.
2989 !!!next-input-character;
2990 redo A;
2991 } else {
2992 !!!cp (221.11);
2993 $self->{ct}->{data} .= ']]'; # character
2994 $self->{state} = CDATA_SECTION_STATE;
2995 ## Reconsume. ## XML5: Emit.
2996 redo A;
2997 }
2998 } elsif ($self->{state} == ENTITY_STATE) {
2999 if ($is_space->{$self->{nc}} or
3000 {
3001 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3002 $self->{entity_add} => 1,
3003 }->{$self->{nc}}) {
3004 !!!cp (1001);
3005 ## Don't consume
3006 ## No error
3007 ## Return nothing.
3008 #
3009 } elsif ($self->{nc} == 0x0023) { # #
3010 !!!cp (999);
3011 $self->{state} = ENTITY_HASH_STATE;
3012 $self->{kwd} = '#';
3013 !!!next-input-character;
3014 redo A;
3015 } elsif ((0x0041 <= $self->{nc} and
3016 $self->{nc} <= 0x005A) or # A..Z
3017 (0x0061 <= $self->{nc} and
3018 $self->{nc} <= 0x007A)) { # a..z
3019 !!!cp (998);
3020 require Whatpm::_NamedEntityList;
3021 $self->{state} = ENTITY_NAME_STATE;
3022 $self->{kwd} = chr $self->{nc};
3023 $self->{entity__value} = $self->{kwd};
3024 $self->{entity__match} = 0;
3025 !!!next-input-character;
3026 redo A;
3027 } else {
3028 !!!cp (1027);
3029 !!!parse-error (type => 'bare ero');
3030 ## Return nothing.
3031 #
3032 }
3033
3034 ## NOTE: No character is consumed by the "consume a character
2035 ## reference" algorithm. In other words, there is an "&" character
3036 ## that does not introduce a character reference, which would be
3037 ## appended to the parent element or the attribute value in later
3038 ## process of the tokenizer.
3039
3040 if ($self->{prev_state} == DATA_STATE) {
3041 !!!cp (997);
3042 $self->{state} = $self->{prev_state};
3043 $self->{s_kwd} = '';
3044 ## Reconsume.
3045 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3046 line => $self->{line_prev},
3047 column => $self->{column_prev},
3048 });
3049 redo A;
3050 } else {
3051 !!!cp (996);
3052 $self->{ca}->{value} .= '&';
3053 $self->{state} = $self->{prev_state};
3054 $self->{s_kwd} = '';
3055 ## Reconsume.
3056 redo A;
3057 }
3058 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3059 if ($self->{nc} == 0x0078 or # x
3060 $self->{nc} == 0x0058) { # X
3061 !!!cp (995);
3062 $self->{state} = HEXREF_X_STATE;
3063 $self->{kwd} .= chr $self->{nc};
3064 !!!next-input-character;
3065 redo A;
3066 } elsif (0x0030 <= $self->{nc} and
3067 $self->{nc} <= 0x0039) { # 0..9
3068 !!!cp (994);
3069 $self->{state} = NCR_NUM_STATE;
3070 $self->{kwd} = $self->{nc} - 0x0030;
3071 !!!next-input-character;
3072 redo A;
3073 } else {
3074 !!!parse-error (type => 'bare nero',
3075 line => $self->{line_prev},
3076 column => $self->{column_prev} - 1);
3077
3078 ## NOTE: According to the spec algorithm, nothing is returned,
3079 ## and then "&#" is appended to the parent element or the attribute
3080 ## value in the later processing.
3081
3082 if ($self->{prev_state} == DATA_STATE) {
3083 !!!cp (1019);
3084 $self->{state} = $self->{prev_state};
3085 $self->{s_kwd} = '';
3086 ## Reconsume.
3087 !!!emit ({type => CHARACTER_TOKEN,
3088 data => '&#',
3089 line => $self->{line_prev},
3090 column => $self->{column_prev} - 1,
3091 });
3092 redo A;
3093 } else {
3094 !!!cp (993);
3095 $self->{ca}->{value} .= '&#';
3096 $self->{state} = $self->{prev_state};
3097 $self->{s_kwd} = '';
3098 ## Reconsume.
3099 redo A;
3100 }
3101 }
3102 } elsif ($self->{state} == NCR_NUM_STATE) {
3103 if (0x0030 <= $self->{nc} and
3104 $self->{nc} <= 0x0039) { # 0..9
3105 !!!cp (1012);
3106 $self->{kwd} *= 10;
3107 $self->{kwd} += $self->{nc} - 0x0030;
3108
3109 ## Stay in the state.
3110 !!!next-input-character;
3111 redo A;
3112 } elsif ($self->{nc} == 0x003B) { # ;
3113 !!!cp (1013);
3114 !!!next-input-character;
3115 #
3116 } else {
3117 !!!cp (1014);
3118 !!!parse-error (type => 'no refc');
3119 ## Reconsume.
3120 #
3121 }
3122
3123 my $code = $self->{kwd};
3124 my $l = $self->{line_prev};
3125 my $c = $self->{column_prev};
3126 if ($charref_map->{$code}) {
3127 !!!cp (1015);
3128 !!!parse-error (type => 'invalid character reference',
3129 text => (sprintf 'U+%04X', $code),
3130 line => $l, column => $c);
3131 $code = $charref_map->{$code};
3132 } elsif ($code > 0x10FFFF) {
3133 !!!cp (1016);
3134 !!!parse-error (type => 'invalid character reference',
3135 text => (sprintf 'U-%08X', $code),
3136 line => $l, column => $c);
3137 $code = 0xFFFD;
3138 }
3139
3140 if ($self->{prev_state} == DATA_STATE) {
3141 !!!cp (992);
3142 $self->{state} = $self->{prev_state};
3143 $self->{s_kwd} = '';
3144 ## Reconsume.
3145 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3146 has_reference => 1,
3147 line => $l, column => $c,
3148 });
3149 redo A;
3150 } else {
3151 !!!cp (991);
3152 $self->{ca}->{value} .= chr $code;
3153 $self->{ca}->{has_reference} = 1;
3154 $self->{state} = $self->{prev_state};
3155 $self->{s_kwd} = '';
3156 ## Reconsume.
3157 redo A;
3158 }
3159 } elsif ($self->{state} == HEXREF_X_STATE) {
3160 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3161 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3162 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3163 # 0..9, A..F, a..f
3164 !!!cp (990);
3165 $self->{state} = HEXREF_HEX_STATE;
3166 $self->{kwd} = 0;
3167 ## Reconsume.
3168 redo A;
3169 } else {
3170 !!!parse-error (type => 'bare hcro',
3171 line => $self->{line_prev},
3172 column => $self->{column_prev} - 2);
3173
3174 ## NOTE: According to the spec algorithm, nothing is returned,
3175 ## and then "&#" followed by "X" or "x" is appended to the parent
3176 ## element or the attribute value in the later processing.
3177
3178 if ($self->{prev_state} == DATA_STATE) {
3179 !!!cp (1005);
3180 $self->{state} = $self->{prev_state};
3181 $self->{s_kwd} = '';
3182 ## Reconsume.
3183 !!!emit ({type => CHARACTER_TOKEN,
3184 data => '&' . $self->{kwd},
3185 line => $self->{line_prev},
3186 column => $self->{column_prev} - length $self->{kwd},
3187 });
3188 redo A;
3189 } else {
3190 !!!cp (989);
3191 $self->{ca}->{value} .= '&' . $self->{kwd};
3192 $self->{state} = $self->{prev_state};
3193 $self->{s_kwd} = '';
3194 ## Reconsume.
3195 redo A;
3196 }
3197 }
3198 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3199 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3200 # 0..9
3201 !!!cp (1002);
3202 $self->{kwd} *= 0x10;
3203 $self->{kwd} += $self->{nc} - 0x0030;
3204 ## Stay in the state.
3205 !!!next-input-character;
3206 redo A;
3207 } elsif (0x0061 <= $self->{nc} and
3208 $self->{nc} <= 0x0066) { # a..f
3209 !!!cp (1003);
3210 $self->{kwd} *= 0x10;
3211 $self->{kwd} += $self->{nc} - 0x0060 + 9;
3212 ## Stay in the state.
3213 !!!next-input-character;
3214 redo A;
3215 } elsif (0x0041 <= $self->{nc} and
3216 $self->{nc} <= 0x0046) { # A..F
3217 !!!cp (1004);
3218 $self->{kwd} *= 0x10;
3219 $self->{kwd} += $self->{nc} - 0x0040 + 9;
3220 ## Stay in the state.
3221 !!!next-input-character;
3222 redo A;
3223 } elsif ($self->{nc} == 0x003B) { # ;
3224 !!!cp (1006);
3225 !!!next-input-character;
3226 #
3227 } else {
3228 !!!cp (1007);
3229 !!!parse-error (type => 'no refc',
3230 line => $self->{line},
3231 column => $self->{column});
3232 ## Reconsume.
3233 #
3234 }
3235
3236 my $code = $self->{kwd};
3237 my $l = $self->{line_prev};
3238 my $c = $self->{column_prev};
3239 if ($charref_map->{$code}) {
3240 !!!cp (1008);
3241 !!!parse-error (type => 'invalid character reference',
3242 text => (sprintf 'U+%04X', $code),
3243 line => $l, column => $c);
3244 $code = $charref_map->{$code};
3245 } elsif ($code > 0x10FFFF) {
3246 !!!cp (1009);
3247 !!!parse-error (type => 'invalid character reference',
3248 text => (sprintf 'U-%08X', $code),
3249 line => $l, column => $c);
3250 $code = 0xFFFD;
3251 }
3252
3253 if ($self->{prev_state} == DATA_STATE) {
3254 !!!cp (988);
3255 $self->{state} = $self->{prev_state};
3256 $self->{s_kwd} = '';
3257 ## Reconsume.
3258 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3259 has_reference => 1,
3260 line => $l, column => $c,
3261 });
3262 redo A;
3263 } else {
3264 !!!cp (987);
3265 $self->{ca}->{value} .= chr $code;
3266 $self->{ca}->{has_reference} = 1;
3267 $self->{state} = $self->{prev_state};
3268 $self->{s_kwd} = '';
3269 ## Reconsume.
3270 redo A;
3271 }
3272 } elsif ($self->{state} == ENTITY_NAME_STATE) {
3273 if (length $self->{kwd} < 30 and
3274 ## NOTE: Some number greater than the maximum length of entity name
3275 ((0x0041 <= $self->{nc} and # A
3276 $self->{nc} <= 0x005A) or # Z
3277 (0x0061 <= $self->{nc} and # a
3278 $self->{nc} <= 0x007A) or # z
3279 (0x0030 <= $self->{nc} and # 0
3280 $self->{nc} <= 0x0039) or # 9
3281 $self->{nc} == 0x003B)) { # ;
3282 our $EntityChar;
3283 $self->{kwd} .= chr $self->{nc};
3284 if (defined $EntityChar->{$self->{kwd}}) {
3285 if ($self->{nc} == 0x003B) { # ;
3286 !!!cp (1020);
3287 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3288 $self->{entity__match} = 1;
3289 !!!next-input-character;
3290 #
3291 } else {
3292 !!!cp (1021);
3293 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3294 $self->{entity__match} = -1;
3295 ## Stay in the state.
3296 !!!next-input-character;
3297 redo A;
3298 }
3299 } else {
3300 !!!cp (1022);
3301 $self->{entity__value} .= chr $self->{nc};
3302 $self->{entity__match} *= 2;
3303 ## Stay in the state.
3304 !!!next-input-character;
3305 redo A;
3306 }
3307 }
3308
3309 my $data;
3310 my $has_ref;
3311 if ($self->{entity__match} > 0) {
3312 !!!cp (1023);
3313 $data = $self->{entity__value};
3314 $has_ref = 1;
3315 #
3316 } elsif ($self->{entity__match} < 0) {
3317 !!!parse-error (type => 'no refc');
3318 if ($self->{prev_state} != DATA_STATE and # in attribute
3319 $self->{entity__match} < -1) {
3320 !!!cp (1024);
3321 $data = '&' . $self->{kwd};
3322 #
3323 } else {
3324 !!!cp (1025);
3325 $data = $self->{entity__value};
3326 $has_ref = 1;
3327 #
3328 }
3329 } else {
3330 !!!cp (1026);
3331 !!!parse-error (type => 'bare ero',
3332 line => $self->{line_prev},
3333 column => $self->{column_prev} - length $self->{kwd});
3334 $data = '&' . $self->{kwd};
3335 #
3336 }
3337
3338 ## NOTE: In these cases, when a character reference is found,
3339 ## it is consumed and a character token is returned, or, otherwise,
3340 ## nothing is consumed and returned, according to the spec algorithm.
3341 ## In this implementation, anything that has been examined by the
3342 ## tokenizer is appended to the parent element or the attribute value
3343 ## as string, either literal string when no character reference or
3344 ## entity-replaced string otherwise, in this stage, since any characters
3345 ## that would not be consumed are appended in the data state or in an
3346 ## appropriate attribute value state anyway.
3347
3348 if ($self->{prev_state} == DATA_STATE) {
3349 !!!cp (986);
3350 $self->{state} = $self->{prev_state};
3351 $self->{s_kwd} = '';
3352 ## Reconsume.
3353 !!!emit ({type => CHARACTER_TOKEN,
3354 data => $data,
3355 has_reference => $has_ref,
3356 line => $self->{line_prev},
3357 column => $self->{column_prev} + 1 - length $self->{kwd},
3358 });
3359 redo A;
3360 } else {
3361 !!!cp (985);
3362 $self->{ca}->{value} .= $data;
3363 $self->{ca}->{has_reference} = 1 if $has_ref;
3364 $self->{state} = $self->{prev_state};
3365 $self->{s_kwd} = '';
3366 ## Reconsume.
3367 redo A;
3368 }
3369
3370 ## XML-only states
3371
3372 } elsif ($self->{state} == PI_STATE) {
3373 ## XML5: "Pi state" and "DOCTYPE pi state".
3374
3375 if ($is_space->{$self->{nc}} or
3376 $self->{nc} == 0x003F or # ?
3377 $self->{nc} == -1) {
3378 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3379 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3380 ## "DOCTYPE pi state": Parse error, switch to the "data
3381 ## state".
3382 !!!parse-error (type => 'bare pio', ## TODO: type
3383 line => $self->{line_prev},
3384 column => $self->{column_prev}
3385 - 1 * ($self->{nc} != -1));
3386 $self->{state} = BOGUS_COMMENT_STATE;
3387 ## Reconsume.
3388 $self->{ct} = {type => COMMENT_TOKEN,
3389 data => '?',
3390 line => $self->{line_prev},
3391 column => $self->{column_prev}
3392 - 1 * ($self->{nc} != -1),
3393 };
3394 redo A;
3395 } else {
3396 ## XML5: "DOCTYPE pi state": Stay in the state.
3397 $self->{ct} = {type => PI_TOKEN,
3398 target => chr $self->{nc},
3399 data => '',
3400 line => $self->{line_prev},
3401 column => $self->{column_prev} - 1,
3402 };
3403 $self->{state} = PI_TARGET_STATE;
3404 !!!next-input-character;
3405 redo A;
3406 }
3407 } elsif ($self->{state} == PI_TARGET_STATE) {
3408 if ($is_space->{$self->{nc}}) {
3409 $self->{state} = PI_TARGET_AFTER_STATE;
3410 !!!next-input-character;
3411 redo A;
3412 } elsif ($self->{nc} == -1) {
3413 !!!parse-error (type => 'no pic'); ## TODO: type
3414 if ($self->{in_subset}) {
3415 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3416 } else {
3417 $self->{state} = DATA_STATE;
3418 $self->{s_kwd} = '';
3419 }
3420 ## Reconsume.
3421 !!!emit ($self->{ct}); # pi
3422 redo A;
3423 } elsif ($self->{nc} == 0x003F) { # ?
3424 $self->{state} = PI_AFTER_STATE;
3425 !!!next-input-character;
3426 redo A;
3427 } else {
3428 ## XML5: typo ("tag name" -> "target")
3429 $self->{ct}->{target} .= chr $self->{nc}; # pi
3430 !!!next-input-character;
3431 redo A;
3432 }
3433 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3434 if ($is_space->{$self->{nc}}) {
3435 ## Stay in the state.
3436 !!!next-input-character;
3437 redo A;
3438 } else {
3439 $self->{state} = PI_DATA_STATE;
3440 ## Reprocess.
3441 redo A;
3442 }
3443 } elsif ($self->{state} == PI_DATA_STATE) {
3444 if ($self->{nc} == 0x003F) { # ?
3445 $self->{state} = PI_DATA_AFTER_STATE;
3446 !!!next-input-character;
3447 redo A;
3448 } elsif ($self->{nc} == -1) {
3449 !!!parse-error (type => 'no pic'); ## TODO: type
3450 if ($self->{in_subset}) {
3451 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3452 } else {
3453 $self->{state} = DATA_STATE;
3454 $self->{s_kwd} = '';
3455 }
3456 ## Reprocess.
3457 !!!emit ($self->{ct}); # pi
3458 redo A;
3459 } else {
3460 $self->{ct}->{data} .= chr $self->{nc}; # pi
3461 $self->{read_until}->($self->{ct}->{data}, q[?],
3462 length $self->{ct}->{data});
3463 ## Stay in the state.
3464 !!!next-input-character;
3465 ## Reprocess.
3466 redo A;
3467 }
3468 } elsif ($self->{state} == PI_AFTER_STATE) {
3469 ## XML5: Part of "Pi after state".
3470
3471 if ($self->{nc} == 0x003E) { # >
3472 if ($self->{in_subset}) {
3473 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3474 } else {
3475 $self->{state} = DATA_STATE;
3476 $self->{s_kwd} = '';
3477 }
3478 !!!next-input-character;
3479 !!!emit ($self->{ct}); # pi
3480 redo A;
3481 } elsif ($self->{nc} == 0x003F) { # ?
3482 !!!parse-error (type => 'no s after target', ## TODO: type
3483 line => $self->{line_prev},
3484 column => $self->{column_prev}); ## XML5: no error
3485 $self->{ct}->{data} .= '?';
3486 $self->{state} = PI_DATA_AFTER_STATE;
3487 !!!next-input-character;
3488 redo A;
3489 } else {
3490 !!!parse-error (type => 'no s after target', ## TODO: type
3491 line => $self->{line_prev},
3492 column => $self->{column_prev}
3493 + 1 * ($self->{nc} == -1)); ## XML5: no error
3494 $self->{ct}->{data} .= '?'; ## XML5: not appended
3495 $self->{state} = PI_DATA_STATE;
3496 ## Reprocess.
3497 redo A;
3498 }
3499 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3500 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3501
3502 if ($self->{nc} == 0x003E) { # >
3503 if ($self->{in_subset}) {
3504 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3505 } else {
3506 $self->{state} = DATA_STATE;
3507 $self->{s_kwd} = '';
3508 }
3509 !!!next-input-character;
3510 !!!emit ($self->{ct}); # pi
3511 redo A;
3512 } elsif ($self->{nc} == 0x003F) { # ?
3513 $self->{ct}->{data} .= '?';
3514 ## Stay in the state.
3515 !!!next-input-character;
3516 redo A;
3517 } else {
3518 $self->{ct}->{data} .= '?'; ## XML5: not appended
3519 $self->{state} = PI_DATA_STATE;
3520 ## Reprocess.
3521 redo A;
3522 }
3523
3524 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3525 if ($self->{nc} == 0x003C) { # <
3526 $self->{state} = DOCTYPE_TAG_STATE;
3527 !!!next-input-character;
3528 redo A;
3529 } elsif ($self->{nc} == 0x0025) { # %
3530 ## XML5: Not defined yet.
3531
3532 ## TODO:
3533 !!!next-input-character;
3534 redo A;
3535 } elsif ($self->{nc} == 0x005D) { # ]
3536 delete $self->{in_subset};
3537 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3538 !!!next-input-character;
3539 redo A;
3540 } elsif ($is_space->{$self->{nc}}) {
3541 ## Stay in the state.
3542 !!!next-input-character;
3543 redo A;
3544 } elsif ($self->{nc} == -1) {
3545 !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3546 delete $self->{in_subset};
3547 $self->{state} = DATA_STATE;
3548 $self->{s_kwd} = '';
3549 ## Reconsume.
3550 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3551 redo A;
3552 } else {
3553 unless ($self->{internal_subset_tainted}) {
3554 ## XML5: No parse error.
3555 !!!parse-error (type => 'string in internal subset');
3556 $self->{internal_subset_tainted} = 1;
3557 }
3558 ## Stay in the state.
3559 !!!next-input-character;
3560 redo A;
3561 }
3562 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3563 if ($self->{nc} == 0x003E) { # >
3564 $self->{state} = DATA_STATE;
3565 $self->{s_kwd} = '';
3566 !!!next-input-character;
3567 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3568 redo A;
3569 } elsif ($self->{nc} == -1) {
3570 !!!parse-error (type => 'unclosed DOCTYPE');
3571 $self->{state} = DATA_STATE;
3572 $self->{s_kwd} = '';
3573 ## Reconsume.
3574 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3575 redo A;
3576 } else {
3577 ## XML5: No parse error and stay in the state.
3578 !!!parse-error (type => 'string after internal subset'); ## TODO: type
3579
3580 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3581 !!!next-input-character;
3582 redo A;
3583 }
3584 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3585 if ($self->{nc} == 0x003E) { # >
3586 $self->{state} = DATA_STATE;
3587 $self->{s_kwd} = '';
3588 !!!next-input-character;
3589 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3590 redo A;
3591 } elsif ($self->{nc} == -1) {
3592 $self->{state} = DATA_STATE;
3593 $self->{s_kwd} = '';
3594 ## Reconsume.
3595 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3596 redo A;
3597 } else {
3598 ## Stay in the state.
3599 !!!next-input-character;
3600 redo A;
3601 }
3602 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3603 if ($self->{nc} == 0x0021) { # !
3604 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3605 !!!next-input-character;
3606 redo A;
3607 } elsif ($self->{nc} == 0x003F) { # ?
3608 $self->{state} = PI_STATE;
3609 !!!next-input-character;
3610 redo A;
3611 } elsif ($self->{nc} == -1) {
3612 !!!parse-error (type => 'bare stago');
3613 $self->{state} = DATA_STATE;
3614 $self->{s_kwd} = '';
3615 ## Reconsume.
3616 redo A;
3617 } else {
3618 !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3619 line => $self->{line_prev},
3620 column => $self->{column_prev});
3621 $self->{state} = BOGUS_COMMENT_STATE;
3622 $self->{ct} = {type => COMMENT_TOKEN,
3623 data => '',
3624 }; ## NOTE: Will be discarded.
3625 !!!next-input-character;
3626 redo A;
3627 }
3628 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3629 ## XML5: "DOCTYPE markup declaration state".
3630
3631 if ($self->{nc} == 0x002D) { # -
3632 $self->{state} = MD_HYPHEN_STATE;
3633 !!!next-input-character;
3634 redo A;
3635 } elsif ($self->{nc} == 0x0045 or # E
3636 $self->{nc} == 0x0065) { # e
3637 $self->{state} = MD_E_STATE;
3638 $self->{kwd} = chr $self->{nc};
3639 !!!next-input-character;
3640 redo A;
3641 } elsif ($self->{nc} == 0x0041 or # A
3642 $self->{nc} == 0x0061) { # a
3643 $self->{state} = MD_ATTLIST_STATE;
3644 $self->{kwd} = chr $self->{nc};
3645 !!!next-input-character;
3646 redo A;
3647 } elsif ($self->{nc} == 0x004E or # N
3648 $self->{nc} == 0x006E) { # n
3649 $self->{state} = MD_NOTATION_STATE;
3650 $self->{kwd} = chr $self->{nc};
3651 !!!next-input-character;
3652 redo A;
3653 } else {
3654 #
3655 }
3656
3657 ## XML5: No parse error.
3658 !!!parse-error (type => 'bogus comment',
3659 line => $self->{line_prev},
3660 column => $self->{column_prev} - 1);
3661 ## Reconsume.
3662 $self->{state} = BOGUS_COMMENT_STATE;
3663 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3664 redo A;
3665 } elsif ($self->{state} == MD_E_STATE) {
3666 if ($self->{nc} == 0x004E or # N
3667 $self->{nc} == 0x006E) { # n
3668 $self->{state} = MD_ENTITY_STATE;
3669 $self->{kwd} .= chr $self->{nc};
3670 !!!next-input-character;
3671 redo A;
3672 } elsif ($self->{nc} == 0x004C or # L
3673 $self->{nc} == 0x006C) { # l
3674 ## XML5: <!ELEMENT> not supported.
3675 $self->{state} = MD_ELEMENT_STATE;
3676 $self->{kwd} .= chr $self->{nc};
3677 !!!next-input-character;
3678 redo A;
3679 } else {
3680 ## XML5: No parse error.
3681 !!!parse-error (type => 'bogus comment',
3682 line => $self->{line_prev},
3683 column => $self->{column_prev} - 2
3684 + 1 * ($self->{nc} == -1));
3685 ## Reconsume.
3686 $self->{state} = BOGUS_COMMENT_STATE;
3687 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3688 redo A;
3689 }
3690 } elsif ($self->{state} == MD_ENTITY_STATE) {
3691 if ($self->{nc} == [
3692 undef,
3693 undef,
3694 0x0054, # T
3695 0x0049, # I
3696 0x0054, # T
3697 ]->[length $self->{kwd}] or
3698 $self->{nc} == [
3699 undef,
3700 undef,
3701 0x0074, # t
3702 0x0069, # i
3703 0x0074, # t
3704 ]->[length $self->{kwd}]) {
3705 ## Stay in the state.
3706 $self->{kwd} .= chr $self->{nc};
3707 !!!next-input-character;
3708 redo A;
3709 } elsif ((length $self->{kwd}) == 5 and
3710 ($self->{nc} == 0x0059 or # Y
3711 $self->{nc} == 0x0079)) { # y
3712 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3713 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3714 text => 'ENTITY',
3715 line => $self->{line_prev},
3716 column => $self->{column_prev} - 4);
3717 }
3718 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3719 line => $self->{line_prev},
3720 column => $self->{column_prev} - 6};
3721 $self->{state} = DOCTYPE_MD_STATE;
3722 !!!next-input-character;
3723 redo A;
3724 } else {
3725 !!!parse-error (type => 'bogus comment',
3726 line => $self->{line_prev},
3727 column => $self->{column_prev} - 1
3728 - (length $self->{kwd})
3729 + 1 * ($self->{nc} == -1));
3730 $self->{state} = BOGUS_COMMENT_STATE;
3731 ## Reconsume.
3732 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3733 redo A;
3734 }
3735 } elsif ($self->{state} == MD_ELEMENT_STATE) {
3736 if ($self->{nc} == [
3737 undef,
3738 undef,
3739 0x0045, # E
3740 0x004D, # M
3741 0x0045, # E
3742 0x004E, # N
3743 ]->[length $self->{kwd}] or
3744 $self->{nc} == [
3745 undef,
3746 undef,
3747 0x0065, # e
3748 0x006D, # m
3749 0x0065, # e
3750 0x006E, # n
3751 ]->[length $self->{kwd}]) {
3752 ## Stay in the state.
3753 $self->{kwd} .= chr $self->{nc};
3754 !!!next-input-character;
3755 redo A;
3756 } elsif ((length $self->{kwd}) == 6 and
3757 ($self->{nc} == 0x0054 or # T
3758 $self->{nc} == 0x0074)) { # t
3759 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3760 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3761 text => 'ELEMENT',
3762 line => $self->{line_prev},
3763 column => $self->{column_prev} - 5);
3764 }
3765 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3766 line => $self->{line_prev},
3767 column => $self->{column_prev} - 6};
3768 $self->{state} = DOCTYPE_MD_STATE;
3769 !!!next-input-character;
3770 redo A;
3771 } else {
3772 !!!parse-error (type => 'bogus comment',
3773 line => $self->{line_prev},
3774 column => $self->{column_prev} - 1
3775 - (length $self->{kwd})
3776 + 1 * ($self->{nc} == -1));
3777 $self->{state} = BOGUS_COMMENT_STATE;
3778 ## Reconsume.
3779 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3780 redo A;
3781 }
3782 } elsif ($self->{state} == MD_ATTLIST_STATE) {
3783 if ($self->{nc} == [
3784 undef,
3785 0x0054, # T
3786 0x0054, # T
3787 0x004C, # L
3788 0x0049, # I
3789 0x0053, # S
3790 ]->[length $self->{kwd}] or
3791 $self->{nc} == [
3792 undef,
3793 0x0074, # t
3794 0x0074, # t
3795 0x006C, # l
3796 0x0069, # i
3797 0x0073, # s
3798 ]->[length $self->{kwd}]) {
3799 ## Stay in the state.
3800 $self->{kwd} .= chr $self->{nc};
3801 !!!next-input-character;
3802 redo A;
3803 } elsif ((length $self->{kwd}) == 6 and
3804 ($self->{nc} == 0x0054 or # T
3805 $self->{nc} == 0x0074)) { # t
3806 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3807 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3808 text => 'ATTLIST',
3809 line => $self->{line_prev},
3810 column => $self->{column_prev} - 5);
3811 }
3812 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3813 attrdefs => [],
3814 line => $self->{line_prev},
3815 column => $self->{column_prev} - 6};
3816 $self->{state} = DOCTYPE_MD_STATE;
3817 !!!next-input-character;
3818 redo A;
3819 } else {
3820 !!!parse-error (type => 'bogus comment',
3821 line => $self->{line_prev},
3822 column => $self->{column_prev} - 1
3823 - (length $self->{kwd})
3824 + 1 * ($self->{nc} == -1));
3825 $self->{state} = BOGUS_COMMENT_STATE;
3826 ## Reconsume.
3827 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3828 redo A;
3829 }
3830 } elsif ($self->{state} == MD_NOTATION_STATE) {
3831 if ($self->{nc} == [
3832 undef,
3833 0x004F, # O
3834 0x0054, # T
3835 0x0041, # A
3836 0x0054, # T
3837 0x0049, # I
3838 0x004F, # O
3839 ]->[length $self->{kwd}] or
3840 $self->{nc} == [
3841 undef,
3842 0x006F, # o
3843 0x0074, # t
3844 0x0061, # a
3845 0x0074, # t
3846 0x0069, # i
3847 0x006F, # o
3848 ]->[length $self->{kwd}]) {
3849 ## Stay in the state.
3850 $self->{kwd} .= chr $self->{nc};
3851 !!!next-input-character;
3852 redo A;
3853 } elsif ((length $self->{kwd}) == 7 and
3854 ($self->{nc} == 0x004E or # N
3855 $self->{nc} == 0x006E)) { # n
3856 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3857 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3858 text => 'NOTATION',
3859 line => $self->{line_prev},
3860 column => $self->{column_prev} - 6);
3861 }
3862 $self->{ct} = {type => NOTATION_TOKEN, name => '',
3863 line => $self->{line_prev},
3864 column => $self->{column_prev} - 6};
3865 $self->{state} = DOCTYPE_MD_STATE;
3866 !!!next-input-character;
3867 redo A;
3868 } else {
3869 !!!parse-error (type => 'bogus comment',
3870 line => $self->{line_prev},
3871 column => $self->{column_prev} - 1
3872 - (length $self->{kwd})
3873 + 1 * ($self->{nc} == -1));
3874 $self->{state} = BOGUS_COMMENT_STATE;
3875 ## Reconsume.
3876 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3877 redo A;
3878 }
3879 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3880 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3881 ## "DOCTYPE NOTATION state".
3882
3883 if ($is_space->{$self->{nc}}) {
3884 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3885 $self->{state} = BEFORE_MD_NAME_STATE;
3886 !!!next-input-character;
3887 redo A;
3888 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3889 $self->{nc} == 0x0025) { # %
3890 ## XML5: Switch to the "DOCTYPE bogus comment state".
3891 !!!parse-error (type => 'no space before md name'); ## TODO: type
3892 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3893 !!!next-input-character;
3894 redo A;
3895 } elsif ($self->{nc} == -1) {
3896 !!!parse-error (type => 'unclosed md'); ## TODO: type
3897 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3898 ## Reconsume.
3899 redo A;
3900 } elsif ($self->{nc} == 0x003E) { # >
3901 ## XML5: Switch to the "DOCTYPE bogus comment state".
3902 !!!parse-error (type => 'no md name'); ## TODO: type
3903 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3904 !!!next-input-character;
3905 redo A;
3906 } else {
3907 ## XML5: Switch to the "DOCTYPE bogus comment state".
3908 !!!parse-error (type => 'no space before md name'); ## TODO: type
3909 $self->{state} = BEFORE_MD_NAME_STATE;
3910 redo A;
3911 }
3912 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3913 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3914 ## before state", "DOCTYPE ATTLIST name before state".
3915
3916 if ($is_space->{$self->{nc}}) {
3917 ## Stay in the state.
3918 !!!next-input-character;
3919 redo A;
3920 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3921 $self->{nc} == 0x0025) { # %
3922 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3923 !!!next-input-character;
3924 redo A;
3925 } elsif ($self->{nc} == 0x003E) { # >
3926 ## XML5: Same as "Anything else".
3927 !!!parse-error (type => 'no md name'); ## TODO: type
3928 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3929 !!!next-input-character;
3930 redo A;
3931 } elsif ($self->{nc} == -1) {
3932 !!!parse-error (type => 'unclosed md'); ## TODO: type
3933 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3934 ## Reconsume.
3935 redo A;
3936 } else {
3937 ## XML5: [ATTLIST] Not defined yet.
3938 $self->{ct}->{name} .= chr $self->{nc};
3939 $self->{state} = MD_NAME_STATE;
3940 !!!next-input-character;
3941 redo A;
3942 }
3943 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
3944 if ($is_space->{$self->{nc}}) {
3945 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3946 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3947 $self->{state} = BEFORE_MD_NAME_STATE;
3948 !!!next-input-character;
3949 redo A;
3950 } elsif ($self->{nc} == 0x003E) { # >
3951 ## XML5: Same as "Anything else".
3952 !!!parse-error (type => 'no md name'); ## TODO: type
3953 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3954 !!!next-input-character;
3955 redo A;
3956 } elsif ($self->{nc} == -1) {
3957 !!!parse-error (type => 'unclosed md');
3958 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3959 ## Reconsume.
3960 redo A;
3961 } else {
3962 ## XML5: No parse error.
3963 !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
3964 $self->{state} = BOGUS_COMMENT_STATE;
3965 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3966 ## Reconsume.
3967 redo A;
3968 }
3969 } elsif ($self->{state} == MD_NAME_STATE) {
3970 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
3971
3972 if ($is_space->{$self->{nc}}) {
3973 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3974 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3975 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
3976 ## TODO: ...
3977 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3978 } else { # ENTITY/NOTATION
3979 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3980 }
3981 !!!next-input-character;
3982 redo A;
3983 } elsif ($self->{nc} == 0x003E) { # >
3984 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3985 #
3986 } else {
3987 !!!parse-error (type => 'no md def'); ## TODO: type
3988 }
3989 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3990 !!!next-input-character;
3991 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3992 redo A;
3993 } elsif ($self->{nc} == -1) {
3994 ## XML5: [ATTLIST] No parse error.
3995 !!!parse-error (type => 'unclosed md');
3996 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3997 ## Reconsume.
3998 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3999 redo A;
4000 } else {
4001 ## XML5: [ATTLIST] Not defined yet.
4002 $self->{ct}->{name} .= chr $self->{nc};
4003 ## Stay in the state.
4004 !!!next-input-character;
4005 redo A;
4006 }
4007 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4008 if ($is_space->{$self->{nc}}) {
4009 ## Stay in the state.
4010 !!!next-input-character;
4011 redo A;
4012 } elsif ($self->{nc} == 0x003E) { # >
4013 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4014 !!!next-input-character;
4015 !!!emit ($self->{ct}); # ATTLIST
4016 redo A;
4017 } elsif ($self->{nc} == -1) {
4018 ## XML5: No parse error.
4019 !!!parse-error (type => 'unclosed md'); ## TODO: type
4020 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4021 !!!emit ($self->{ct});
4022 redo A;
4023 } else {
4024 ## XML5: Not defined yet.
4025 $self->{ca} = {name => chr ($self->{nc}), # attrdef
4026 tokens => [],
4027 line => $self->{line}, column => $self->{column}};
4028 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4029 !!!next-input-character;
4030 redo A;
4031 }
4032 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4033 if ($is_space->{$self->{nc}}) {
4034 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4035 !!!next-input-character;
4036 redo A;
4037 } elsif ($self->{nc} == 0x003E) { # >
4038 ## XML5: Same as "anything else".
4039 !!!parse-error (type => 'no attr type'); ## TODO: type
4040 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4041 !!!next-input-character;
4042 !!!emit ($self->{ct}); # ATTLIST
4043 redo A;
4044 } elsif ($self->{nc} == 0x0028) { # (
4045 ## XML5: Same as "anything else".
4046 !!!parse-error (type => 'no space before paren'); ## TODO: type
4047 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4048 !!!next-input-character;
4049 redo A;
4050 } elsif ($self->{nc} == -1) {
4051 ## XML5: No parse error.
4052 !!!parse-error (type => 'unclosed md'); ## TODO: type
4053 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4054 !!!next-input-character;
4055 !!!emit ($self->{ct}); # ATTLIST
4056 redo A;
4057 } else {
4058 ## XML5: Not defined yet.
4059 $self->{ca}->{name} .= chr $self->{nc};
4060 ## Stay in the state.
4061 !!!next-input-character;
4062 redo A;
4063 }
4064 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4065 if ($is_space->{$self->{nc}}) {
4066 ## Stay in the state.
4067 !!!next-input-character;
4068 redo A;
4069 } elsif ($self->{nc} == 0x003E) { # >
4070 ## XML5: Same as "anything else".
4071 !!!parse-error (type => 'no attr type'); ## TODO: type
4072 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4073 !!!next-input-character;
4074 !!!emit ($self->{ct}); # ATTLIST
4075 redo A;
4076 } elsif ($self->{nc} == 0x0028) { # (
4077 ## XML5: Same as "anything else".
4078 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4079 !!!next-input-character;
4080 redo A;
4081 } elsif ($self->{nc} == -1) {
4082 ## XML5: No parse error.
4083 !!!parse-error (type => 'unclosed md'); ## TODO: type
4084 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4085 !!!next-input-character;
4086 !!!emit ($self->{ct});
4087 redo A;
4088 } else {
4089 ## XML5: Not defined yet.
4090 $self->{ca}->{type} = chr $self->{nc};
4091 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4092 !!!next-input-character;
4093 redo A;
4094 }
4095 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4096 if ($is_space->{$self->{nc}}) {
4097 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4098 !!!next-input-character;
4099 redo A;
4100 } elsif ($self->{nc} == 0x0023) { # #
4101 ## XML5: Same as "anything else".
4102 !!!parse-error (type => 'no space before default value'); ## TODO: type
4103 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4104 !!!next-input-character;
4105 redo A;
4106 } elsif ($self->{nc} == 0x0022) { # "
4107 ## XML5: Same as "anything else".
4108 !!!parse-error (type => 'no space before default value'); ## TODO: type
4109 $self->{ca}->{value} = '';
4110 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4111 !!!next-input-character;
4112 redo A;
4113 } elsif ($self->{nc} == 0x0027) { # '
4114 ## XML5: Same as "anything else".
4115 !!!parse-error (type => 'no space before default value'); ## TODO: type
4116 $self->{ca}->{value} = '';
4117 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4118 !!!next-input-character;
4119 redo A;
4120 } elsif ($self->{nc} == 0x003E) { # >
4121 ## XML5: Same as "anything else".
4122 !!!parse-error (type => 'no attr default'); ## TODO: type
4123 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4124 !!!next-input-character;
4125 !!!emit ($self->{ct}); # ATTLIST
4126 redo A;
4127 } elsif ($self->{nc} == 0x0028) { # (
4128 ## XML5: Same as "anything else".
4129 !!!parse-error (type => 'no space before paren'); ## TODO: type
4130 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4131 !!!next-input-character;
4132 redo A;
4133 } elsif ($self->{nc} == -1) {
4134 ## XML5: No parse error.
4135 !!!parse-error (type => 'unclosed md'); ## TODO: type
4136 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4137 !!!next-input-character;
4138 !!!emit ($self->{ct});
4139 redo A;
4140 } else {
4141 ## XML5: Not defined yet.
4142 $self->{ca}->{type} .= chr $self->{nc};
4143 ## Stay in the state.
4144 !!!next-input-character;
4145 redo A;
4146 }
4147 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4148 if ($is_space->{$self->{nc}}) {
4149 ## Stay in the state.
4150 !!!next-input-character;
4151 redo A;
4152 } elsif ($self->{nc} == 0x0028) { # (
4153 ## XML5: Same as "anything else".
4154 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4155 !!!next-input-character;
4156 redo A;
4157 } elsif ($self->{nc} == 0x0023) { # #
4158 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4159 !!!next-input-character;
4160 redo A;
4161 } elsif ($self->{nc} == 0x0022) { # "
4162 ## XML5: Same as "anything else".
4163 $self->{ca}->{value} = '';
4164 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4165 !!!next-input-character;
4166 redo A;
4167 } elsif ($self->{nc} == 0x0027) { # '
4168 ## XML5: Same as "anything else".
4169 $self->{ca}->{value} = '';
4170 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4171 !!!next-input-character;
4172 redo A;
4173 } elsif ($self->{nc} == 0x003E) { # >
4174 ## XML5: Same as "anything else".
4175 !!!parse-error (type => 'no attr default'); ## TODO: type
4176 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4177 !!!next-input-character;
4178 !!!emit ($self->{ct}); # ATTLIST
4179 redo A;
4180 } elsif ($self->{nc} == -1) {
4181 ## XML5: No parse error.
4182 !!!parse-error (type => 'unclosed md'); ## TODO: type
4183 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4184 !!!next-input-character;
4185 !!!emit ($self->{ct});
4186 redo A;
4187 } else {
4188 ## XML5: Switch to the "DOCTYPE bogus comment state".
4189 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4190 $self->{ca}->{value} = '';
4191 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4192 ## Reconsume.
4193 redo A;
4194 }
4195 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4196 if ($is_space->{$self->{nc}}) {
4197 ## Stay in the state.
4198 !!!next-input-character;
4199 redo A;
4200 } elsif ($self->{nc} == 0x007C) { # |
4201 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4202 ## Stay in the state.
4203 !!!next-input-character;
4204 redo A;
4205 } elsif ($self->{nc} == 0x0029) { # )
4206 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4207 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4208 !!!next-input-character;
4209 redo A;
4210 } elsif ($self->{nc} == 0x003E) { # >
4211 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4212 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4213 !!!next-input-character;
4214 !!!emit ($self->{ct}); # ATTLIST
4215 redo A;
4216 } elsif ($self->{nc} == -1) {
4217 ## XML5: No parse error.
4218 !!!parse-error (type => 'unclosed md'); ## TODO: type
4219 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4220 !!!next-input-character;
4221 !!!emit ($self->{ct});
4222 redo A;
4223 } else {
4224 push @{$self->{ca}->{tokens}}, chr $self->{nc};
4225 $self->{state} = ALLOWED_TOKEN_STATE;
4226 !!!next-input-character;
4227 redo A;
4228 }
4229 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4230 if ($is_space->{$self->{nc}}) {
4231 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4232 !!!next-input-character;
4233 redo A;
4234 } elsif ($self->{nc} == 0x007C) { # |
4235 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4236 !!!next-input-character;
4237 redo A;
4238 } elsif ($self->{nc} == 0x0029) { # )
4239 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4240 !!!next-input-character;
4241 redo A;
4242 } elsif ($self->{nc} == 0x003E) { # >
4243 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4244 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4245 !!!next-input-character;
4246 !!!emit ($self->{ct}); # ATTLIST
4247 redo A;
4248 } elsif ($self->{nc} == -1) {
4249 ## XML5: No parse error.
4250 !!!parse-error (type => 'unclosed md'); ## TODO: type
4251 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4252 !!!next-input-character;
4253 !!!emit ($self->{ct});
4254 redo A;
4255 } else {
4256 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4257 ## Stay in the state.
4258 !!!next-input-character;
4259 redo A;
4260 }
4261 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4262 if ($is_space->{$self->{nc}}) {
4263 ## Stay in the state.
4264 !!!next-input-character;
4265 redo A;
4266 } elsif ($self->{nc} == 0x007C) { # |
4267 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4268 !!!next-input-character;
4269 redo A;
4270 } elsif ($self->{nc} == 0x0029) { # )
4271 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4272 !!!next-input-character;
4273 redo A;
4274 } elsif ($self->{nc} == 0x003E) { # >
4275 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4276 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4277 !!!next-input-character;
4278 !!!emit ($self->{ct}); # ATTLIST
4279 redo A;
4280 } elsif ($self->{nc} == -1) {
4281 ## XML5: No parse error.
4282 !!!parse-error (type => 'unclosed md'); ## TODO: type
4283 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4284 !!!next-input-character;
4285 !!!emit ($self->{ct});
4286 redo A;
4287 } else {
4288 !!!parse-error (type => 'space in allowed token', ## TODO: type
4289 line => $self->{line_prev},
4290 column => $self->{column_prev});
4291 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4292 $self->{state} = ALLOWED_TOKEN_STATE;
4293 !!!next-input-character;
4294 redo A;
4295 }
4296 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4297 if ($is_space->{$self->{nc}}) {
4298 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4299 !!!next-input-character;
4300 redo A;
4301 } elsif ($self->{nc} == 0x0023) { # #
4302 !!!parse-error (type => 'no space before default value'); ## TODO: type
4303 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4304 !!!next-input-character;
4305 redo A;
4306 } elsif ($self->{nc} == 0x0022) { # "
4307 !!!parse-error (type => 'no space before default value'); ## TODO: type
4308 $self->{ca}->{value} = '';
4309 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4310 !!!next-input-character;
4311 redo A;
4312 } elsif ($self->{nc} == 0x0027) { # '
4313 !!!parse-error (type => 'no space before default value'); ## TODO: type
4314 $self->{ca}->{value} = '';
4315 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4316 !!!next-input-character;
4317 redo A;
4318 } elsif ($self->{nc} == 0x003E) { # >
4319 !!!parse-error (type => 'no attr default'); ## TODO: type
4320 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4321 !!!next-input-character;
4322 !!!emit ($self->{ct}); # ATTLIST
4323 redo A;
4324 } elsif ($self->{nc} == -1) {
4325 !!!parse-error (type => 'unclosed md'); ## TODO: type
4326 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4327 !!!next-input-character;
4328 !!!emit ($self->{ct});
4329 redo A;
4330 } else {
4331 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4332 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4333 ## Reconsume.
4334 redo A;
4335 }
4336 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4337 if ($is_space->{$self->{nc}}) {
4338 ## Stay in the state.
4339 !!!next-input-character;
4340 redo A;
4341 } elsif ($self->{nc} == 0x0023) { # #
4342 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4343 !!!next-input-character;
4344 redo A;
4345 } elsif ($self->{nc} == 0x0022) { # "
4346 $self->{ca}->{value} = '';
4347 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4348 !!!next-input-character;
4349 redo A;
4350 } elsif ($self->{nc} == 0x0027) { # '
4351 $self->{ca}->{value} = '';
4352 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4353 !!!next-input-character;
4354 redo A;
4355 } elsif ($self->{nc} == 0x003E) { # >
4356 !!!parse-error (type => 'no attr default'); ## TODO: type
4357 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4358 !!!next-input-character;
4359 !!!emit ($self->{ct}); # ATTLIST
4360 redo A;
4361 } elsif ($self->{nc} == -1) {
4362 !!!parse-error (type => 'unclosed md'); ## TODO: type
4363 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4364 !!!next-input-character;
4365 !!!emit ($self->{ct});
4366 redo A;
4367 } else {
4368 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4369 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4370 ## Reconsume.
4371 redo A;
4372 }
4373 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4374 if ($is_space->{$self->{nc}}) {
4375 ## XML5: No parse error.
4376 !!!parse-error (type => 'no default type'); ## TODO: type
4377 $self->{state} = BOGUS_MD_STATE;
4378 ## Reconsume.
4379 redo A;
4380 } elsif ($self->{nc} == 0x0022) { # "
4381 ## XML5: Same as "anything else".
4382 $self->{ca}->{value} = '';
4383 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4384 !!!next-input-character;
4385 redo A;
4386 } elsif ($self->{nc} == 0x0027) { # '
4387 ## XML5: Same as "anything else".
4388 $self->{ca}->{value} = '';
4389 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4390 !!!next-input-character;
4391 redo A;
4392 } elsif ($self->{nc} == 0x003E) { # >
4393 ## XML5: Same as "anything else".
4394 !!!parse-error (type => 'no attr default'); ## TODO: type
4395 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4396 !!!next-input-character;
4397 !!!emit ($self->{ct}); # ATTLIST
4398 redo A;
4399 } elsif ($self->{nc} == -1) {
4400 ## XML5: No parse error.
4401 !!!parse-error (type => 'unclosed md'); ## TODO: type
4402 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4403 !!!next-input-character;
4404 !!!emit ($self->{ct});
4405 redo A;
4406 } else {
4407 $self->{ca}->{default} = chr $self->{nc};
4408 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4409 !!!next-input-character;
4410 redo A;
4411 }
4412 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4413 if ($is_space->{$self->{nc}}) {
4414 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4415 !!!next-input-character;
4416 redo A;
4417 } elsif ($self->{nc} == 0x0022) { # "
4418 ## XML5: Same as "anything else".
4419 !!!parse-error (type => 'no space before default value'); ## TODO: type
4420 $self->{ca}->{value} = '';
4421 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4422 !!!next-input-character;
4423 redo A;
4424 } elsif ($self->{nc} == 0x0027) { # '
4425 ## XML5: Same as "anything else".
4426 !!!parse-error (type => 'no space before default value'); ## TODO: type
4427 $self->{ca}->{value} = '';
4428 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4429 !!!next-input-character;
4430 redo A;
4431 } elsif ($self->{nc} == 0x003E) { # >
4432 ## XML5: Same as "anything else".
4433 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4434 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4435 !!!next-input-character;
4436 !!!emit ($self->{ct}); # ATTLIST
4437 redo A;
4438 } elsif ($self->{nc} == -1) {
4439 ## XML5: No parse error.
4440 !!!parse-error (type => 'unclosed md'); ## TODO: type
4441 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4442 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4443 !!!next-input-character;
4444 !!!emit ($self->{ct});
4445 redo A;
4446 } else {
4447 $self->{ca}->{default} .= chr $self->{nc};
4448 ## Stay in the state.
4449 !!!next-input-character;
4450 redo A;
4451 }
4452 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4453 if ($is_space->{$self->{nc}}) {
4454 ## Stay in the state.
4455 !!!next-input-character;
4456 redo A;
4457 } elsif ($self->{nc} == 0x0022) { # "
4458 $self->{ca}->{value} = '';
4459 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4460 !!!next-input-character;
4461 redo A;
4462 } elsif ($self->{nc} == 0x0027) { # '
4463 $self->{ca}->{value} = '';
4464 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4465 !!!next-input-character;
4466 redo A;
4467 } elsif ($self->{nc} == 0x003E) { # >
4468 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4469 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4470 !!!next-input-character;
4471 !!!emit ($self->{ct}); # ATTLIST
4472 redo A;
4473 } elsif ($self->{nc} == -1) {
4474 ## XML5: No parse error.
4475 !!!parse-error (type => 'unclosed md'); ## TODO: type
4476 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4477 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4478 !!!next-input-character;
4479 !!!emit ($self->{ct});
4480 redo A;
4481 } else {
4482 ## XML5: Not defined yet.
4483 if ($self->{ca}->{default} eq 'FIXED') {
4484 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4485 } else {
4486 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4487 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4488 }
4489 ## Reconsume.
4490 redo A;
4491 }
4492 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4493 if ($is_space->{$self->{nc}} or
4494 $self->{nc} == -1 or
4495 $self->{nc} == 0x003E) { # >
4496 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4497 ## Reconsume.
4498 redo A;
4499 } else {
4500 !!!parse-error (type => 'no space before attr name'); ## TODO: type
4501 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4502 ## Reconsume.
4503 redo A;
4504 }
4505
4506 } elsif ($self->{state} == BOGUS_MD_STATE) {
4507 if ($self->{nc} == 0x003E) { # >
4508 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4509 !!!next-input-character;
4510 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4511 redo A;
4512 } elsif ($self->{nc} == -1) {
4513 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4514 ## Reconsume.
4515 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4516 redo A;
4517 } else {
4518 ## Stay in the state.
4519 !!!next-input-character;
4520 redo A;
4521 }
4522 } else {
4523 die "$0: $self->{state}: Unknown state";
4524 }
4525 } # A
4526
4527 die "$0: _get_next_token: unexpected case";
4528 } # _get_next_token
4529
4530 1;
4531 ## $Date: 2008/10/18 11:34:49 $
4532

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24