/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.27 - (show annotations) (download) (as text)
Thu Jul 2 22:24:28 2009 UTC (15 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.26: +2 -6 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/HTML/ChangeLog	2 Jul 2009 22:24:21 -0000
	* Tokenizer.pm.src: Reduced a parse error (HTML5 revision 3194).

2009-07-03  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.26 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8 ## Token type constants that can be imported individually by name.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 );
25 ## The |:token| tag imports the same list of token type constants at once.
26 our %EXPORT_TAGS = (
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 }
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types: values of a token's |->{type}| field, exported through the |:token| tag.
49
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 }
52 sub START_TAG_TOKEN () { 3 }
53 sub END_TAG_TOKEN () { 4 }
54 sub END_OF_FILE_TOKEN () { 5 } ## Emitted when the next input character is -1 (EOF).
55 sub CHARACTER_TOKEN () { 6 }
56 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74 ## The subs that follow are defined in |Whatpm::HTML|, so the token type constants are imported into it here.
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags (bit flags tested against |$self->{content_model}|)
78
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82 ## Content models, each a combination of the flags above
83 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & and limited < markup
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & and any < markup
87
88 ## Tokenizer states (values of |$self->{state}|)
89
90 sub DATA_STATE () { 0 }
91 #sub ENTITY_DATA_STATE () { 1 } # disabled; see the NOTE before |ENTITY_STATE|
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } # disabled; see the NOTE before |ENTITY_STATE|
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
108 sub COMMENT_END_DASH_STATE () { 18 }
109 sub BOGUS_COMMENT_STATE () { 19 }
110 sub DOCTYPE_STATE () { 20 }
111 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112 sub DOCTYPE_NAME_STATE () { 22 }
113 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122 sub BOGUS_DOCTYPE_STATE () { 32 }
123 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124 sub SELF_CLOSING_START_TAG_STATE () { 34 }
125 sub CDATA_SECTION_STATE () { 35 }
126 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134 ## NOTE: "Entity data state", "entity in attribute value state", and
135 ## "consume a character reference" algorithm are jointly implemented
136 ## using the following six states:
137 sub ENTITY_STATE () { 44 }
138 sub ENTITY_HASH_STATE () { 45 }
139 sub NCR_NUM_STATE () { 46 }
140 sub HEXREF_X_STATE () { 47 }
141 sub HEXREF_HEX_STATE () { 48 }
142 sub ENTITY_NAME_STATE () { 49 }
143 sub PCDATA_STATE () { 50 } # "data state" in the spec; |DATA_STATE| variant used only for the PCDATA content model
144
145 ## XML-only states
146 sub PI_STATE () { 51 }
147 sub PI_TARGET_STATE () { 52 }
148 sub PI_TARGET_AFTER_STATE () { 53 }
149 sub PI_DATA_STATE () { 54 }
150 sub PI_AFTER_STATE () { 55 }
151 sub PI_DATA_AFTER_STATE () { 56 }
152 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155 sub DOCTYPE_TAG_STATE () { 60 }
156 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157 sub MD_ATTLIST_STATE () { 62 }
158 sub MD_E_STATE () { 63 }
159 sub MD_ELEMENT_STATE () { 64 }
160 sub MD_ENTITY_STATE () { 65 }
161 sub MD_NOTATION_STATE () { 66 }
162 sub DOCTYPE_MD_STATE () { 67 }
163 sub BEFORE_MD_NAME_STATE () { 68 }
164 sub MD_NAME_STATE () { 69 }
165 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172 sub ALLOWED_TOKEN_STATE () { 77 }
173 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 sub BEFORE_NDATA_STATE () { 85 }
181 sub NDATA_STATE () { 86 }
182 sub AFTER_NDATA_STATE () { 87 }
183 sub BEFORE_NOTATION_NAME_STATE () { 88 }
184 sub NOTATION_NAME_STATE () { 89 }
185 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186 sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187 sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188 sub AFTER_ELEMENT_NAME_STATE () { 93 }
189 sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190 sub CONTENT_KEYWORD_STATE () { 95 }
191 sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192 sub CM_ELEMENT_NAME_STATE () { 97 }
193 sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194 sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195 sub AFTER_MD_DEF_STATE () { 100 }
196 sub BOGUS_MD_STATE () { 101 }
197
198 ## Tree constructor state constants (see Whatpm::HTML for the full
199 ## list and descriptions)
200 ## NOTE(review): Both constants below evaluate to the same value (0x800); confirm against Whatpm::HTML that this is intended.
201 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202 sub FOREIGN_EL () { 0b1_00000000000 }
203
204 ## Character reference mappings
205 ## Maps a code point to its replacement; presumably consulted when a numeric character reference is resolved (the lookup is outside this excerpt).
206 my $charref_map = {
207 0x0D => 0x000A, # CR is replaced by LF
208 0x80 => 0x20AC, # 0x80..0x9F: the table matches the Windows-1252 assignments; unassigned bytes become U+FFFD
209 0x81 => 0xFFFD,
210 0x82 => 0x201A,
211 0x83 => 0x0192,
212 0x84 => 0x201E,
213 0x85 => 0x2026,
214 0x86 => 0x2020,
215 0x87 => 0x2021,
216 0x88 => 0x02C6,
217 0x89 => 0x2030,
218 0x8A => 0x0160,
219 0x8B => 0x2039,
220 0x8C => 0x0152,
221 0x8D => 0xFFFD,
222 0x8E => 0x017D,
223 0x8F => 0xFFFD,
224 0x90 => 0xFFFD,
225 0x91 => 0x2018,
226 0x92 => 0x2019,
227 0x93 => 0x201C,
228 0x94 => 0x201D,
229 0x95 => 0x2022,
230 0x96 => 0x2013,
231 0x97 => 0x2014,
232 0x98 => 0x02DC,
233 0x99 => 0x2122,
234 0x9A => 0x0161,
235 0x9B => 0x203A,
236 0x9C => 0x0153,
237 0x9D => 0xFFFD,
238 0x9E => 0x017E,
239 0x9F => 0x0178,
240 }; # $charref_map
241 $charref_map->{$_} = 0xFFFD # C0 controls other than HT/LF/FF/CR, DEL, surrogates and noncharacters all map to U+FFFD
242 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
243 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (Unicode noncharacters run through U+FDEF, but this range stops at U+FDDF)
244 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249
250 ## Implementations MUST act as if they used the state machine in the spec
251
252 sub _initialize_tokenizer ($) {
253 my $self = shift;
254 ## Resets the tokenizer fields of |$self| to their initial values; returns nothing meaningful.
255 ## NOTE: Fields set by |new| constructor:
256 #$self->{level}
257 #$self->{set_nc}
258 #$self->{parse_error}
259 #$self->{is_xml} (if XML)
260
261 $self->{state} = DATA_STATE; # MUST
262 $self->{s_kwd} = ''; # Data state keyword
263 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 #$self->{entity__value}; # initialized when used
265 #$self->{entity__match}; # initialized when used
266 $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model is PCDATA
267 undef $self->{ct}; # current token
268 undef $self->{ca}; # current attribute
269 undef $self->{last_stag_name}; # last emitted start tag name
270 #$self->{prev_state}; # initialized when used
271 delete $self->{self_closing}; # no self-closing flag is pending
272 $self->{char_buffer} = ''; # presumably the input buffer read by |!!!next-input-character| -- TODO confirm
273 $self->{char_buffer_pos} = 0; # presumably the read position within |char_buffer| -- TODO confirm
274 $self->{nc} = -1; # next input character
275 #$self->{next_nc}
276 !!!next-input-character; # macro expanded when this .src file is preprocessed; presumably loads the first character into |$self->{nc}| -- confirm against the macro definition
277 $self->{token} = []; # queue of tokens waiting to be returned by |_get_next_token|
278 # $self->{escape} # not touched here
279 } # _initialize_tokenizer
280
281 ## A token has:
282 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284 ## ->{name} (DOCTYPE_TOKEN)
285 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 ## ->{target} (PI_TOKEN)
287 ## ->{pubid} (DOCTYPE_TOKEN)
288 ## ->{sysid} (DOCTYPE_TOKEN)
289 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291 ## ->{name}
292 ## ->{value}
293 ## ->{has_reference} == 1 or 0
294 ## ->{index}: Index of the attribute in a tag.
295 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299
300 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301 ## A token's |->{self_closing}| is used to save the value of |$self->{self_closing}|
302 ## while the token is pushed back to the stack.
303
304 ## An emitted token MUST immediately be handled by the tree construction stage.
305
306 ## Before each step, UA MAY check to see if either one of the scripts in
307 ## "list of scripts that will execute as soon as possible" or the first
308 ## script in the "list of scripts that will execute asynchronously",
309 ## has completed loading. If one has, then it MUST be executed
310 ## and removed from the list.
311
312 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313 ## (This requirement was dropped from HTML5 spec, unfortunately.)
314
315 my $is_space = { # Code points the tokenizer treats as space characters; looked up with |$self->{nc}|.
316 0x0009 => 1, # CHARACTER TABULATION (HT)
317 0x000A => 1, # LINE FEED (LF)
318 #0x000B => 0, # LINE TABULATION (VT); not treated as a space character
319 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320 #0x000D => 1, # CARRIAGE RETURN (CR); disabled -- presumably CR is normalized before it reaches the tokenizer, TODO confirm
321 0x0020 => 1, # SPACE (SP)
322 };
323
324 sub _get_next_token ($) {
325 my $self = shift;
326
327 if ($self->{self_closing}) {
328 !!!parse-error (type => 'nestc', token => $self->{ct});
329 ## NOTE: The |self_closing| flag is only set by start tag token.
330 ## In addition, when a start tag token is emitted, it is always set to
331 ## |ct|.
332 delete $self->{self_closing};
333 }
334
335 if (@{$self->{token}}) {
336 $self->{self_closing} = $self->{token}->[0]->{self_closing};
337 return shift @{$self->{token}};
338 }
339
340 A: {
341 if ($self->{state} == PCDATA_STATE) {
342 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343
344 if ($self->{nc} == 0x0026) { # &
345 !!!cp (0.1);
346 ## NOTE: In the spec, the tokenizer is switched to the
347 ## "entity data state". In this implementation, the tokenizer
348 ## is switched to the |ENTITY_STATE|, which is an implementation
349 ## of the "consume a character reference" algorithm.
350 $self->{entity_add} = -1;
351 $self->{prev_state} = DATA_STATE;
352 $self->{state} = ENTITY_STATE;
353 !!!next-input-character;
354 redo A;
355 } elsif ($self->{nc} == 0x003C) { # <
356 !!!cp (0.2);
357 $self->{state} = TAG_OPEN_STATE;
358 !!!next-input-character;
359 redo A;
360 } elsif ($self->{nc} == -1) {
361 !!!cp (0.3);
362 !!!emit ({type => END_OF_FILE_TOKEN,
363 line => $self->{line}, column => $self->{column}});
364 last A; ## TODO: ok?
365 } else {
366 !!!cp (0.4);
367 #
368 }
369
370 # Anything else
371 my $token = {type => CHARACTER_TOKEN,
372 data => chr $self->{nc},
373 line => $self->{line}, column => $self->{column},
374 };
375 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
376
377 ## Stay in the state.
378 !!!next-input-character;
379 !!!emit ($token);
380 redo A;
381 } elsif ($self->{state} == DATA_STATE) {
382 $self->{s_kwd} = '' unless defined $self->{s_kwd};
383 if ($self->{nc} == 0x0026) { # &
384 $self->{s_kwd} = '';
385 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
386 not $self->{escape}) {
387 !!!cp (1);
388 ## NOTE: In the spec, the tokenizer is switched to the
389 ## "entity data state". In this implementation, the tokenizer
390 ## is switched to the |ENTITY_STATE|, which is an implementation
391 ## of the "consume a character reference" algorithm.
392 $self->{entity_add} = -1;
393 $self->{prev_state} = DATA_STATE;
394 $self->{state} = ENTITY_STATE;
395 !!!next-input-character;
396 redo A;
397 } else {
398 !!!cp (2);
399 #
400 }
401 } elsif ($self->{nc} == 0x002D) { # -
402 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
403 if ($self->{s_kwd} eq '<!-') {
404 !!!cp (3);
405 $self->{escape} = 1; # unless $self->{escape};
406 $self->{s_kwd} = '--';
407 #
408 } elsif ($self->{s_kwd} eq '-') {
409 !!!cp (4);
410 $self->{s_kwd} = '--';
411 #
412 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
413 !!!cp (4.1);
414 $self->{s_kwd} .= '-';
415 #
416 } else {
417 !!!cp (5);
418 $self->{s_kwd} = '-';
419 #
420 }
421 }
422
423 #
424 } elsif ($self->{nc} == 0x0021) { # !
425 if (length $self->{s_kwd}) {
426 !!!cp (5.1);
427 $self->{s_kwd} .= '!';
428 #
429 } else {
430 !!!cp (5.2);
431 #$self->{s_kwd} = '';
432 #
433 }
434 #
435 } elsif ($self->{nc} == 0x003C) { # <
436 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
437 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
438 not $self->{escape})) {
439 !!!cp (6);
440 $self->{state} = TAG_OPEN_STATE;
441 !!!next-input-character;
442 redo A;
443 } else {
444 !!!cp (7);
445 $self->{s_kwd} = '';
446 #
447 }
448 } elsif ($self->{nc} == 0x003E) { # >
449 if ($self->{escape} and
450 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
451 if ($self->{s_kwd} eq '--') {
452 !!!cp (8);
453 delete $self->{escape};
454 #
455 } else {
456 !!!cp (9);
457 #
458 }
459 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
460 !!!cp (9.1);
461 !!!parse-error (type => 'unmatched mse', ## TODO: type
462 line => $self->{line_prev},
463 column => $self->{column_prev} - 1);
464 #
465 } else {
466 !!!cp (10);
467 #
468 }
469
470 $self->{s_kwd} = '';
471 #
472 } elsif ($self->{nc} == 0x005D) { # ]
473 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
474 !!!cp (10.1);
475 $self->{s_kwd} .= ']';
476 } elsif ($self->{s_kwd} eq ']]') {
477 !!!cp (10.2);
478 #
479 } else {
480 !!!cp (10.3);
481 $self->{s_kwd} = '';
482 }
483 #
484 } elsif ($self->{nc} == -1) {
485 !!!cp (11);
486 $self->{s_kwd} = '';
487 !!!emit ({type => END_OF_FILE_TOKEN,
488 line => $self->{line}, column => $self->{column}});
489 last A; ## TODO: ok?
490 } else {
491 !!!cp (12);
492 $self->{s_kwd} = '';
493 #
494 }
495
496 # Anything else
497 my $token = {type => CHARACTER_TOKEN,
498 data => chr $self->{nc},
499 line => $self->{line}, column => $self->{column},
500 };
501 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
502 length $token->{data})) {
503 $self->{s_kwd} = '';
504 }
505
506 ## Stay in the data state.
507 if (not $self->{is_xml} and
508 $self->{content_model} == PCDATA_CONTENT_MODEL) {
509 !!!cp (13);
510 $self->{state} = PCDATA_STATE;
511 } else {
512 !!!cp (14);
513 ## Stay in the state.
514 }
515 !!!next-input-character;
516 !!!emit ($token);
517 redo A;
518 } elsif ($self->{state} == TAG_OPEN_STATE) {
519 ## XML5: "tag state".
520
521 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
522 if ($self->{nc} == 0x002F) { # /
523 !!!cp (15);
524 !!!next-input-character;
525 $self->{state} = CLOSE_TAG_OPEN_STATE;
526 redo A;
527 } elsif ($self->{nc} == 0x0021) { # !
528 !!!cp (15.1);
529 $self->{s_kwd} = $self->{escaped} ? '' : '<';
530 #
531 } else {
532 !!!cp (16);
533 $self->{s_kwd} = '';
534 #
535 }
536
537 ## reconsume
538 $self->{state} = DATA_STATE;
539 !!!emit ({type => CHARACTER_TOKEN, data => '<',
540 line => $self->{line_prev},
541 column => $self->{column_prev},
542 });
543 redo A;
544 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
545 if ($self->{nc} == 0x0021) { # !
546 !!!cp (17);
547 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
548 !!!next-input-character;
549 redo A;
550 } elsif ($self->{nc} == 0x002F) { # /
551 !!!cp (18);
552 $self->{state} = CLOSE_TAG_OPEN_STATE;
553 !!!next-input-character;
554 redo A;
555 } elsif (0x0041 <= $self->{nc} and
556 $self->{nc} <= 0x005A) { # A..Z
557 !!!cp (19);
558 $self->{ct}
559 = {type => START_TAG_TOKEN,
560 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
561 line => $self->{line_prev},
562 column => $self->{column_prev}};
563 $self->{state} = TAG_NAME_STATE;
564 !!!next-input-character;
565 redo A;
566 } elsif (0x0061 <= $self->{nc} and
567 $self->{nc} <= 0x007A) { # a..z
568 !!!cp (20);
569 $self->{ct} = {type => START_TAG_TOKEN,
570 tag_name => chr ($self->{nc}),
571 line => $self->{line_prev},
572 column => $self->{column_prev}};
573 $self->{state} = TAG_NAME_STATE;
574 !!!next-input-character;
575 redo A;
576 } elsif ($self->{nc} == 0x003E) { # >
577 !!!cp (21);
578 !!!parse-error (type => 'empty start tag',
579 line => $self->{line_prev},
580 column => $self->{column_prev});
581 $self->{state} = DATA_STATE;
582 $self->{s_kwd} = '';
583 !!!next-input-character;
584
585 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
586 line => $self->{line_prev},
587 column => $self->{column_prev},
588 });
589
590 redo A;
591 } elsif ($self->{nc} == 0x003F) { # ?
592 if ($self->{is_xml}) {
593 !!!cp (22.1);
594 $self->{state} = PI_STATE;
595 !!!next-input-character;
596 redo A;
597 } else {
598 !!!cp (22);
599 !!!parse-error (type => 'pio',
600 line => $self->{line_prev},
601 column => $self->{column_prev});
602 $self->{state} = BOGUS_COMMENT_STATE;
603 $self->{ct} = {type => COMMENT_TOKEN, data => '',
604 line => $self->{line_prev},
605 column => $self->{column_prev},
606 };
607 ## $self->{nc} is intentionally left as is
608 redo A;
609 }
610 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
611 !!!cp (23);
612 !!!parse-error (type => 'bare stago',
613 line => $self->{line_prev},
614 column => $self->{column_prev});
615 $self->{state} = DATA_STATE;
616 $self->{s_kwd} = '';
617 ## reconsume
618
619 !!!emit ({type => CHARACTER_TOKEN, data => '<',
620 line => $self->{line_prev},
621 column => $self->{column_prev},
622 });
623
624 redo A;
625 } else {
626 ## XML5: "<:" is a parse error.
627 !!!cp (23.1);
628 $self->{ct} = {type => START_TAG_TOKEN,
629 tag_name => chr ($self->{nc}),
630 line => $self->{line_prev},
631 column => $self->{column_prev}};
632 $self->{state} = TAG_NAME_STATE;
633 !!!next-input-character;
634 redo A;
635 }
636 } else {
637 die "$0: $self->{content_model} in tag open";
638 }
639 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
640 ## NOTE: The "close tag open state" in the spec is implemented as
641 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
642
643 ## XML5: "end tag state".
644
645 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
646 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
647 if (defined $self->{last_stag_name}) {
648 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
649 $self->{kwd} = '';
650 ## Reconsume.
651 redo A;
652 } else {
653 ## No start tag token has ever been emitted
654 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
655 !!!cp (28);
656 $self->{state} = DATA_STATE;
657 $self->{s_kwd} = '';
658 ## Reconsume.
659 !!!emit ({type => CHARACTER_TOKEN, data => '</',
660 line => $l, column => $c,
661 });
662 redo A;
663 }
664 }
665
666 if (0x0041 <= $self->{nc} and
667 $self->{nc} <= 0x005A) { # A..Z
668 !!!cp (29);
669 $self->{ct}
670 = {type => END_TAG_TOKEN,
671 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
672 line => $l, column => $c};
673 $self->{state} = TAG_NAME_STATE;
674 !!!next-input-character;
675 redo A;
676 } elsif (0x0061 <= $self->{nc} and
677 $self->{nc} <= 0x007A) { # a..z
678 !!!cp (30);
679 $self->{ct} = {type => END_TAG_TOKEN,
680 tag_name => chr ($self->{nc}),
681 line => $l, column => $c};
682 $self->{state} = TAG_NAME_STATE;
683 !!!next-input-character;
684 redo A;
685 } elsif ($self->{nc} == 0x003E) { # >
686 !!!parse-error (type => 'empty end tag',
687 line => $self->{line_prev}, ## "<" in "</>"
688 column => $self->{column_prev} - 1);
689 $self->{state} = DATA_STATE;
690 $self->{s_kwd} = '';
691 if ($self->{is_xml}) {
692 !!!cp (31);
693 ## XML5: No parse error.
694
695 ## NOTE: This parser raises a parse error, since it supports
696 ## XML1, not XML5.
697
698 ## NOTE: A short end tag token.
699 my $ct = {type => END_TAG_TOKEN,
700 tag_name => '',
701 line => $self->{line_prev},
702 column => $self->{column_prev} - 1,
703 };
704 !!!next-input-character;
705 !!!emit ($ct);
706 } else {
707 !!!cp (31.1);
708 !!!next-input-character;
709 }
710 redo A;
711 } elsif ($self->{nc} == -1) {
712 !!!cp (32);
713 !!!parse-error (type => 'bare etago');
714 $self->{s_kwd} = '';
715 $self->{state} = DATA_STATE;
716 # reconsume
717
718 !!!emit ({type => CHARACTER_TOKEN, data => '</',
719 line => $l, column => $c,
720 });
721
722 redo A;
723 } elsif (not $self->{is_xml} or
724 $is_space->{$self->{nc}}) {
725 !!!cp (33);
726 !!!parse-error (type => 'bogus end tag',
727 line => $self->{line_prev}, # "<" of "</"
728 column => $self->{column_prev} - 1);
729 $self->{state} = BOGUS_COMMENT_STATE;
730 $self->{ct} = {type => COMMENT_TOKEN, data => '',
731 line => $self->{line_prev}, # "<" of "</"
732 column => $self->{column_prev} - 1,
733 };
734 ## NOTE: $self->{nc} is intentionally left as is.
735 ## Although the "anything else" case of the spec not explicitly
736 ## states that the next input character is to be reconsumed,
737 ## it will be included to the |data| of the comment token
738 ## generated from the bogus end tag, as defined in the
739 ## "bogus comment state" entry.
740 redo A;
741 } else {
742 ## XML5: "</:" is a parse error.
743 !!!cp (30.1);
744 $self->{ct} = {type => END_TAG_TOKEN,
745 tag_name => chr ($self->{nc}),
746 line => $l, column => $c};
747 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
748 !!!next-input-character;
749 redo A;
750 }
751 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
752 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
753 if (length $ch) {
754 my $CH = $ch;
755 $ch =~ tr/a-z/A-Z/;
756 my $nch = chr $self->{nc};
757 if ($nch eq $ch or $nch eq $CH) {
758 !!!cp (24);
759 ## Stay in the state.
760 $self->{kwd} .= $nch;
761 !!!next-input-character;
762 redo A;
763 } else {
764 !!!cp (25);
765 $self->{state} = DATA_STATE;
766 $self->{s_kwd} = '';
767 ## Reconsume.
768 !!!emit ({type => CHARACTER_TOKEN,
769 data => '</' . $self->{kwd},
770 line => $self->{line_prev},
771 column => $self->{column_prev} - 1 - length $self->{kwd},
772 });
773 redo A;
774 }
775 } else { # after "<{tag-name}"
776 unless ($is_space->{$self->{nc}} or
777 {
778 0x003E => 1, # >
779 0x002F => 1, # /
780 -1 => 1, # EOF
781 }->{$self->{nc}}) {
782 !!!cp (26);
783 ## Reconsume.
784 $self->{state} = DATA_STATE;
785 $self->{s_kwd} = '';
786 !!!emit ({type => CHARACTER_TOKEN,
787 data => '</' . $self->{kwd},
788 line => $self->{line_prev},
789 column => $self->{column_prev} - 1 - length $self->{kwd},
790 });
791 redo A;
792 } else {
793 !!!cp (27);
794 $self->{ct}
795 = {type => END_TAG_TOKEN,
796 tag_name => $self->{last_stag_name},
797 line => $self->{line_prev},
798 column => $self->{column_prev} - 1 - length $self->{kwd}};
799 $self->{state} = TAG_NAME_STATE;
800 ## Reconsume.
801 redo A;
802 }
803 }
804 } elsif ($self->{state} == TAG_NAME_STATE) {
805 if ($is_space->{$self->{nc}}) {
806 !!!cp (34);
807 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
808 !!!next-input-character;
809 redo A;
810 } elsif ($self->{nc} == 0x003E) { # >
811 if ($self->{ct}->{type} == START_TAG_TOKEN) {
812 !!!cp (35);
813 $self->{last_stag_name} = $self->{ct}->{tag_name};
814 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
815 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
816 #if ($self->{ct}->{attributes}) {
817 # ## NOTE: This should never be reached.
818 # !!! cp (36);
819 # !!! parse-error (type => 'end tag attribute');
820 #} else {
821 !!!cp (37);
822 #}
823 } else {
824 die "$0: $self->{ct}->{type}: Unknown token type";
825 }
826 $self->{state} = DATA_STATE;
827 $self->{s_kwd} = '';
828 !!!next-input-character;
829
830 !!!emit ($self->{ct}); # start tag or end tag
831
832 redo A;
833 } elsif (0x0041 <= $self->{nc} and
834 $self->{nc} <= 0x005A) { # A..Z
835 !!!cp (38);
836 $self->{ct}->{tag_name}
837 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
838 # start tag or end tag
839 ## Stay in this state
840 !!!next-input-character;
841 redo A;
842 } elsif ($self->{nc} == -1) {
843 !!!parse-error (type => 'unclosed tag');
844 if ($self->{ct}->{type} == START_TAG_TOKEN) {
845 !!!cp (39);
846 $self->{last_stag_name} = $self->{ct}->{tag_name};
847 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
848 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
849 #if ($self->{ct}->{attributes}) {
850 # ## NOTE: This state should never be reached.
851 # !!! cp (40);
852 # !!! parse-error (type => 'end tag attribute');
853 #} else {
854 !!!cp (41);
855 #}
856 } else {
857 die "$0: $self->{ct}->{type}: Unknown token type";
858 }
859 $self->{state} = DATA_STATE;
860 $self->{s_kwd} = '';
861 # reconsume
862
863 !!!emit ($self->{ct}); # start tag or end tag
864
865 redo A;
866 } elsif ($self->{nc} == 0x002F) { # /
867 !!!cp (42);
868 $self->{state} = SELF_CLOSING_START_TAG_STATE;
869 !!!next-input-character;
870 redo A;
871 } else {
872 !!!cp (44);
873 $self->{ct}->{tag_name} .= chr $self->{nc};
874 # start tag or end tag
875 ## Stay in the state
876 !!!next-input-character;
877 redo A;
878 }
879 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
880 ## XML5: "Tag attribute name before state".
881
882 if ($is_space->{$self->{nc}}) {
883 !!!cp (45);
884 ## Stay in the state
885 !!!next-input-character;
886 redo A;
887 } elsif ($self->{nc} == 0x003E) { # >
888 if ($self->{ct}->{type} == START_TAG_TOKEN) {
889 !!!cp (46);
890 $self->{last_stag_name} = $self->{ct}->{tag_name};
891 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
892 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
893 if ($self->{ct}->{attributes}) {
894 !!!cp (47);
895 !!!parse-error (type => 'end tag attribute');
896 } else {
897 !!!cp (48);
898 }
899 } else {
900 die "$0: $self->{ct}->{type}: Unknown token type";
901 }
902 $self->{state} = DATA_STATE;
903 $self->{s_kwd} = '';
904 !!!next-input-character;
905
906 !!!emit ($self->{ct}); # start tag or end tag
907
908 redo A;
909 } elsif (0x0041 <= $self->{nc} and
910 $self->{nc} <= 0x005A) { # A..Z
911 !!!cp (49);
912 $self->{ca}
913 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
914 value => '',
915 line => $self->{line}, column => $self->{column}};
916 $self->{state} = ATTRIBUTE_NAME_STATE;
917 !!!next-input-character;
918 redo A;
919 } elsif ($self->{nc} == 0x002F) { # /
920 !!!cp (50);
921 $self->{state} = SELF_CLOSING_START_TAG_STATE;
922 !!!next-input-character;
923 redo A;
924 } elsif ($self->{nc} == -1) {
925 !!!parse-error (type => 'unclosed tag');
926 if ($self->{ct}->{type} == START_TAG_TOKEN) {
927 !!!cp (52);
928 $self->{last_stag_name} = $self->{ct}->{tag_name};
929 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
930 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
931 if ($self->{ct}->{attributes}) {
932 !!!cp (53);
933 !!!parse-error (type => 'end tag attribute');
934 } else {
935 !!!cp (54);
936 }
937 } else {
938 die "$0: $self->{ct}->{type}: Unknown token type";
939 }
940 $self->{state} = DATA_STATE;
941 $self->{s_kwd} = '';
942 # reconsume
943
944 !!!emit ($self->{ct}); # start tag or end tag
945
946 redo A;
947 } else {
948 if ({
949 0x0022 => 1, # "
950 0x0027 => 1, # '
951 0x003D => 1, # =
952 }->{$self->{nc}}) {
953 !!!cp (55);
954 ## XML5: Not a parse error.
955 !!!parse-error (type => 'bad attribute name');
956 } else {
957 !!!cp (56);
958 ## XML5: ":" raises a parse error and is ignored.
959 }
960 $self->{ca}
961 = {name => chr ($self->{nc}),
962 value => '',
963 line => $self->{line}, column => $self->{column}};
964 $self->{state} = ATTRIBUTE_NAME_STATE;
965 !!!next-input-character;
966 redo A;
967 }
968 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
969 ## XML5: "Tag attribute name state".
970
971 my $before_leave = sub {
972 if (exists $self->{ct}->{attributes} # start tag or end tag
973 ->{$self->{ca}->{name}}) { # MUST
974 !!!cp (57);
975 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
976 ## Discard $self->{ca} # MUST
977 } else {
978 !!!cp (58);
979 $self->{ct}->{attributes}->{$self->{ca}->{name}}
980 = $self->{ca};
981 $self->{ca}->{index} = ++$self->{ct}->{last_index};
982 }
983 }; # $before_leave
984
985 if ($is_space->{$self->{nc}}) {
986 !!!cp (59);
987 $before_leave->();
988 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
989 !!!next-input-character;
990 redo A;
991 } elsif ($self->{nc} == 0x003D) { # =
992 !!!cp (60);
993 $before_leave->();
994 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
995 !!!next-input-character;
996 redo A;
997 } elsif ($self->{nc} == 0x003E) { # >
998 if ($self->{is_xml}) {
999 !!!cp (60.1);
1000 ## XML5: Not a parse error.
1001 !!!parse-error (type => 'no attr value'); ## TODO: type
1002 } else {
1003 !!!cp (60.2);
1004 }
1005
1006 $before_leave->();
1007 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1008 !!!cp (61);
1009 $self->{last_stag_name} = $self->{ct}->{tag_name};
1010 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1011 !!!cp (62);
1012 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1013 if ($self->{ct}->{attributes}) {
1014 !!!parse-error (type => 'end tag attribute');
1015 }
1016 } else {
1017 die "$0: $self->{ct}->{type}: Unknown token type";
1018 }
1019 $self->{state} = DATA_STATE;
1020 $self->{s_kwd} = '';
1021 !!!next-input-character;
1022
1023 !!!emit ($self->{ct}); # start tag or end tag
1024
1025 redo A;
1026 } elsif (0x0041 <= $self->{nc} and
1027 $self->{nc} <= 0x005A) { # A..Z
1028 !!!cp (63);
1029 $self->{ca}->{name}
1030 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1031 ## Stay in the state
1032 !!!next-input-character;
1033 redo A;
1034 } elsif ($self->{nc} == 0x002F) { # /
1035 if ($self->{is_xml}) {
1036 !!!cp (64);
1037 ## XML5: Not a parse error.
1038 !!!parse-error (type => 'no attr value'); ## TODO: type
1039 } else {
1040 !!!cp (64.1);
1041 }
1042
1043 $before_leave->();
1044 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1045 !!!next-input-character;
1046 redo A;
1047 } elsif ($self->{nc} == -1) {
1048 !!!parse-error (type => 'unclosed tag');
1049 $before_leave->();
1050 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1051 !!!cp (66);
1052 $self->{last_stag_name} = $self->{ct}->{tag_name};
1053 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1054 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1055 if ($self->{ct}->{attributes}) {
1056 !!!cp (67);
1057 !!!parse-error (type => 'end tag attribute');
1058 } else {
1059 ## NOTE: This state should never be reached.
1060 !!!cp (68);
1061 }
1062 } else {
1063 die "$0: $self->{ct}->{type}: Unknown token type";
1064 }
1065 $self->{state} = DATA_STATE;
1066 $self->{s_kwd} = '';
1067 # reconsume
1068
1069 !!!emit ($self->{ct}); # start tag or end tag
1070
1071 redo A;
1072 } else {
1073 if ($self->{nc} == 0x0022 or # "
1074 $self->{nc} == 0x0027) { # '
1075 !!!cp (69);
1076 ## XML5: Not a parse error.
1077 !!!parse-error (type => 'bad attribute name');
1078 } else {
1079 !!!cp (70);
1080 }
1081 $self->{ca}->{name} .= chr ($self->{nc});
1082 ## Stay in the state
1083 !!!next-input-character;
1084 redo A;
1085 }
1086 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1087 ## XML5: "Tag attribute name after state".
1088
1089 if ($is_space->{$self->{nc}}) {
1090 !!!cp (71);
1091 ## Stay in the state
1092 !!!next-input-character;
1093 redo A;
1094 } elsif ($self->{nc} == 0x003D) { # =
1095 !!!cp (72);
1096 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1097 !!!next-input-character;
1098 redo A;
1099 } elsif ($self->{nc} == 0x003E) { # >
1100 if ($self->{is_xml}) {
1101 !!!cp (72.1);
1102 ## XML5: Not a parse error.
1103 !!!parse-error (type => 'no attr value'); ## TODO: type
1104 } else {
1105 !!!cp (72.2);
1106 }
1107
1108 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1109 !!!cp (73);
1110 $self->{last_stag_name} = $self->{ct}->{tag_name};
1111 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1112 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1113 if ($self->{ct}->{attributes}) {
1114 !!!cp (74);
1115 !!!parse-error (type => 'end tag attribute');
1116 } else {
1117 ## NOTE: This state should never be reached.
1118 !!!cp (75);
1119 }
1120 } else {
1121 die "$0: $self->{ct}->{type}: Unknown token type";
1122 }
1123 $self->{state} = DATA_STATE;
1124 $self->{s_kwd} = '';
1125 !!!next-input-character;
1126
1127 !!!emit ($self->{ct}); # start tag or end tag
1128
1129 redo A;
1130 } elsif (0x0041 <= $self->{nc} and
1131 $self->{nc} <= 0x005A) { # A..Z
1132 !!!cp (76);
1133 $self->{ca}
1134 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1135 value => '',
1136 line => $self->{line}, column => $self->{column}};
1137 $self->{state} = ATTRIBUTE_NAME_STATE;
1138 !!!next-input-character;
1139 redo A;
1140 } elsif ($self->{nc} == 0x002F) { # /
1141 if ($self->{is_xml}) {
1142 !!!cp (77);
1143 ## XML5: Not a parse error.
1144 !!!parse-error (type => 'no attr value'); ## TODO: type
1145 } else {
1146 !!!cp (77.1);
1147 }
1148
1149 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1150 !!!next-input-character;
1151 redo A;
1152 } elsif ($self->{nc} == -1) {
1153 !!!parse-error (type => 'unclosed tag');
1154 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1155 !!!cp (79);
1156 $self->{last_stag_name} = $self->{ct}->{tag_name};
1157 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1158 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1159 if ($self->{ct}->{attributes}) {
1160 !!!cp (80);
1161 !!!parse-error (type => 'end tag attribute');
1162 } else {
1163 ## NOTE: This state should never be reached.
1164 !!!cp (81);
1165 }
1166 } else {
1167 die "$0: $self->{ct}->{type}: Unknown token type";
1168 }
1169 $self->{s_kwd} = '';
1170 $self->{state} = DATA_STATE;
1171 # reconsume
1172
1173 !!!emit ($self->{ct}); # start tag or end tag
1174
1175 redo A;
1176 } else {
1177 if ($self->{is_xml}) {
1178 !!!cp (78.1);
1179 ## XML5: Not a parse error.
1180 !!!parse-error (type => 'no attr value'); ## TODO: type
1181 } else {
1182 !!!cp (78.2);
1183 }
1184
1185 if ($self->{nc} == 0x0022 or # "
1186 $self->{nc} == 0x0027) { # '
1187 !!!cp (78);
1188 ## XML5: Not a parse error.
1189 !!!parse-error (type => 'bad attribute name');
1190 } else {
1191 !!!cp (82);
1192 }
1193 $self->{ca}
1194 = {name => chr ($self->{nc}),
1195 value => '',
1196 line => $self->{line}, column => $self->{column}};
1197 $self->{state} = ATTRIBUTE_NAME_STATE;
1198 !!!next-input-character;
1199 redo A;
1200 }
1201 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1202 ## XML5: "Tag attribute value before state".
1203
1204 if ($is_space->{$self->{nc}}) {
1205 !!!cp (83);
1206 ## Stay in the state
1207 !!!next-input-character;
1208 redo A;
1209 } elsif ($self->{nc} == 0x0022) { # "
1210 !!!cp (84);
1211 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1212 !!!next-input-character;
1213 redo A;
1214 } elsif ($self->{nc} == 0x0026) { # &
1215 !!!cp (85);
1216 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1217 ## reconsume
1218 redo A;
1219 } elsif ($self->{nc} == 0x0027) { # '
1220 !!!cp (86);
1221 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1222 !!!next-input-character;
1223 redo A;
1224 } elsif ($self->{nc} == 0x003E) { # >
1225 !!!parse-error (type => 'empty unquoted attribute value');
1226 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227 !!!cp (87);
1228 $self->{last_stag_name} = $self->{ct}->{tag_name};
1229 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231 if ($self->{ct}->{attributes}) {
1232 !!!cp (88);
1233 !!!parse-error (type => 'end tag attribute');
1234 } else {
1235 ## NOTE: This state should never be reached.
1236 !!!cp (89);
1237 }
1238 } else {
1239 die "$0: $self->{ct}->{type}: Unknown token type";
1240 }
1241 $self->{state} = DATA_STATE;
1242 $self->{s_kwd} = '';
1243 !!!next-input-character;
1244
1245 !!!emit ($self->{ct}); # start tag or end tag
1246
1247 redo A;
1248 } elsif ($self->{nc} == -1) {
1249 !!!parse-error (type => 'unclosed tag');
1250 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1251 !!!cp (90);
1252 $self->{last_stag_name} = $self->{ct}->{tag_name};
1253 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1254 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1255 if ($self->{ct}->{attributes}) {
1256 !!!cp (91);
1257 !!!parse-error (type => 'end tag attribute');
1258 } else {
1259 ## NOTE: This state should never be reached.
1260 !!!cp (92);
1261 }
1262 } else {
1263 die "$0: $self->{ct}->{type}: Unknown token type";
1264 }
1265 $self->{state} = DATA_STATE;
1266 $self->{s_kwd} = '';
1267 ## reconsume
1268
1269 !!!emit ($self->{ct}); # start tag or end tag
1270
1271 redo A;
1272 } else {
1273 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1274 !!!cp (93);
1275 ## XML5: Not a parse error.
1276 !!!parse-error (type => 'bad attribute value');
1277 } elsif ($self->{is_xml}) {
1278 !!!cp (93.1);
1279 ## XML5: No parse error.
1280 !!!parse-error (type => 'unquoted attr value'); ## TODO
1281 } else {
1282 !!!cp (94);
1283 }
1284 $self->{ca}->{value} .= chr ($self->{nc});
1285 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1286 !!!next-input-character;
1287 redo A;
1288 }
1289 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1290 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1291 ## ATTLIST attribute value double quoted state".
1292
1293 if ($self->{nc} == 0x0022) { # "
1294 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1295 !!!cp (95.1);
1296 ## XML5: "DOCTYPE ATTLIST name after state".
1297 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1298 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1299 } else {
1300 !!!cp (95);
1301 ## XML5: "Tag attribute name before state".
1302 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303 }
1304 !!!next-input-character;
1305 redo A;
1306 } elsif ($self->{nc} == 0x0026) { # &
1307 !!!cp (96);
1308 ## XML5: Not defined yet.
1309
1310 ## NOTE: In the spec, the tokenizer is switched to the
1311 ## "entity in attribute value state". In this implementation, the
1312 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1313 ## implementation of the "consume a character reference" algorithm.
1314 $self->{prev_state} = $self->{state};
1315 $self->{entity_add} = 0x0022; # "
1316 $self->{state} = ENTITY_STATE;
1317 !!!next-input-character;
1318 redo A;
1319 } elsif ($self->{is_xml} and
1320 $is_space->{$self->{nc}}) {
1321 !!!cp (97.1);
1322 $self->{ca}->{value} .= ' ';
1323 ## Stay in the state.
1324 !!!next-input-character;
1325 redo A;
1326 } elsif ($self->{nc} == -1) {
1327 !!!parse-error (type => 'unclosed attribute value');
1328 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1329 !!!cp (97);
1330 $self->{last_stag_name} = $self->{ct}->{tag_name};
1331
1332 $self->{state} = DATA_STATE;
1333 $self->{s_kwd} = '';
1334 ## reconsume
1335 !!!emit ($self->{ct}); # start tag
1336 redo A;
1337 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1338 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1339 if ($self->{ct}->{attributes}) {
1340 !!!cp (98);
1341 !!!parse-error (type => 'end tag attribute');
1342 } else {
1343 ## NOTE: This state should never be reached.
1344 !!!cp (99);
1345 }
1346
1347 $self->{state} = DATA_STATE;
1348 $self->{s_kwd} = '';
1349 ## reconsume
1350 !!!emit ($self->{ct}); # end tag
1351 redo A;
1352 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1353 ## XML5: No parse error above; not defined yet.
1354 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1355 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1356 ## Reconsume.
1357 !!!emit ($self->{ct}); # ATTLIST
1358 redo A;
1359 } else {
1360 die "$0: $self->{ct}->{type}: Unknown token type";
1361 }
1362 } else {
1363 ## XML5 [ATTLIST]: Not defined yet.
1364 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1365 !!!cp (100);
1366 ## XML5: Not a parse error.
1367 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1368 } else {
1369 !!!cp (100.1);
1370 }
1371 $self->{ca}->{value} .= chr ($self->{nc});
1372 $self->{read_until}->($self->{ca}->{value},
1373 qq["&<\x09\x0C\x20],
1374 length $self->{ca}->{value});
1375
1376 ## Stay in the state
1377 !!!next-input-character;
1378 redo A;
1379 }
1380 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1381 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1382 ## ATTLIST attribute value single quoted state".
1383
1384 if ($self->{nc} == 0x0027) { # '
1385 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1386 !!!cp (101.1);
1387 ## XML5: "DOCTYPE ATTLIST name after state".
1388 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1389 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1390 } else {
1391 !!!cp (101);
1392 ## XML5: "Before attribute name state" (sic).
1393 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1394 }
1395 !!!next-input-character;
1396 redo A;
1397 } elsif ($self->{nc} == 0x0026) { # &
1398 !!!cp (102);
1399 ## XML5: Not defined yet.
1400
1401 ## NOTE: In the spec, the tokenizer is switched to the
1402 ## "entity in attribute value state". In this implementation, the
1403 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1404 ## implementation of the "consume a character reference" algorithm.
1405 $self->{entity_add} = 0x0027; # '
1406 $self->{prev_state} = $self->{state};
1407 $self->{state} = ENTITY_STATE;
1408 !!!next-input-character;
1409 redo A;
1410 } elsif ($self->{is_xml} and
1411 $is_space->{$self->{nc}}) {
1412 !!!cp (103.1);
1413 $self->{ca}->{value} .= ' ';
1414 ## Stay in the state.
1415 !!!next-input-character;
1416 redo A;
1417 } elsif ($self->{nc} == -1) {
1418 !!!parse-error (type => 'unclosed attribute value');
1419 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1420 !!!cp (103);
1421 $self->{last_stag_name} = $self->{ct}->{tag_name};
1422
1423 $self->{state} = DATA_STATE;
1424 $self->{s_kwd} = '';
1425 ## reconsume
1426 !!!emit ($self->{ct}); # start tag
1427 redo A;
1428 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1429 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1430 if ($self->{ct}->{attributes}) {
1431 !!!cp (104);
1432 !!!parse-error (type => 'end tag attribute');
1433 } else {
1434 ## NOTE: This state should never be reached.
1435 !!!cp (105);
1436 }
1437
1438 $self->{state} = DATA_STATE;
1439 $self->{s_kwd} = '';
1440 ## reconsume
1441 !!!emit ($self->{ct}); # end tag
1442 redo A;
1443 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1444 ## XML5: No parse error above; not defined yet.
1445 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1446 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1447 ## Reconsume.
1448 !!!emit ($self->{ct}); # ATTLIST
1449 redo A;
1450 } else {
1451 die "$0: $self->{ct}->{type}: Unknown token type";
1452 }
1453 } else {
1454 ## XML5 [ATTLIST]: Not defined yet.
1455 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1456 !!!cp (106);
1457 ## XML5: Not a parse error.
1458 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1459 } else {
1460 !!!cp (106.1);
1461 }
1462 $self->{ca}->{value} .= chr ($self->{nc});
1463 $self->{read_until}->($self->{ca}->{value},
1464 qq['&<\x09\x0C\x20],
1465 length $self->{ca}->{value});
1466
1467 ## Stay in the state
1468 !!!next-input-character;
1469 redo A;
1470 }
1471 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1472 ## XML5: "Tag attribute value unquoted state".
1473
1474 if ($is_space->{$self->{nc}}) {
1475 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1476 !!!cp (107.1);
1477 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1478 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1479 } else {
1480 !!!cp (107);
1481 ## XML5: "Tag attribute name before state".
1482 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1483 }
1484 !!!next-input-character;
1485 redo A;
1486 } elsif ($self->{nc} == 0x0026) { # &
1487 !!!cp (108);
1488
1489 ## XML5: Not defined yet.
1490
1491 ## NOTE: In the spec, the tokenizer is switched to the
1492 ## "entity in attribute value state". In this implementation, the
1493 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1494 ## implementation of the "consume a character reference" algorithm.
1495 $self->{entity_add} = -1;
1496 $self->{prev_state} = $self->{state};
1497 $self->{state} = ENTITY_STATE;
1498 !!!next-input-character;
1499 redo A;
1500 } elsif ($self->{nc} == 0x003E) { # >
1501 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1502 !!!cp (109);
1503 $self->{last_stag_name} = $self->{ct}->{tag_name};
1504
1505 $self->{state} = DATA_STATE;
1506 $self->{s_kwd} = '';
1507 !!!next-input-character;
1508 !!!emit ($self->{ct}); # start tag
1509 redo A;
1510 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1511 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1512 if ($self->{ct}->{attributes}) {
1513 !!!cp (110);
1514 !!!parse-error (type => 'end tag attribute');
1515 } else {
1516 ## NOTE: This state should never be reached.
1517 !!!cp (111);
1518 }
1519
1520 $self->{state} = DATA_STATE;
1521 $self->{s_kwd} = '';
1522 !!!next-input-character;
1523 !!!emit ($self->{ct}); # end tag
1524 redo A;
1525 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1526 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1527 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1528 !!!next-input-character;
1529 !!!emit ($self->{ct}); # ATTLIST
1530 redo A;
1531 } else {
1532 die "$0: $self->{ct}->{type}: Unknown token type";
1533 }
1534 } elsif ($self->{nc} == -1) {
1535 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1536 !!!cp (112);
1537 !!!parse-error (type => 'unclosed tag');
1538 $self->{last_stag_name} = $self->{ct}->{tag_name};
1539
1540 $self->{state} = DATA_STATE;
1541 $self->{s_kwd} = '';
1542 ## reconsume
1543 !!!emit ($self->{ct}); # start tag
1544 redo A;
1545 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1546 !!!parse-error (type => 'unclosed tag');
1547 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1548 if ($self->{ct}->{attributes}) {
1549 !!!cp (113);
1550 !!!parse-error (type => 'end tag attribute');
1551 } else {
1552 ## NOTE: This state should never be reached.
1553 !!!cp (114);
1554 }
1555
1556 $self->{state} = DATA_STATE;
1557 $self->{s_kwd} = '';
1558 ## reconsume
1559 !!!emit ($self->{ct}); # end tag
1560 redo A;
1561 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1562 !!!parse-error (type => 'unclosed md'); ## TODO: type
1563 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1564 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1565 ## Reconsume.
1566 !!!emit ($self->{ct}); # ATTLIST
1567 redo A;
1568 } else {
1569 die "$0: $self->{ct}->{type}: Unknown token type";
1570 }
1571 } else {
1572 if ({ ## Fallback branch: characters that are a parse error inside an unquoted value.
1573 0x0022 => 1, # "
1574 0x0027 => 1, # '
1575 0x003D => 1, # =
1576 0x003C => 1, # <
1577 }->{$self->{nc}}) {
1578 !!!cp (115);
1579 ## XML5: Not a parse error.
1580 !!!parse-error (type => 'bad attribute value');
1581 } else {
1582 !!!cp (116);
1583 }
1584 $self->{ca}->{value} .= chr ($self->{nc}); ## The character is kept in the value even when it was reported.
1585 $self->{read_until}->($self->{ca}->{value},
1586 qq["'=&< \x09\x0C>], ## NOTE(review): assumed to be the bulk-read stop set (as in sibling states); "<" added so the check above is not bypassed.
1587 length $self->{ca}->{value});
1588
1589 ## Stay in the state
1590 !!!next-input-character;
1591 redo A;
1592 }
1593 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1594 if ($is_space->{$self->{nc}}) {
1595 !!!cp (118);
1596 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1597 !!!next-input-character;
1598 redo A;
1599 } elsif ($self->{nc} == 0x003E) { # >
1600 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1601 !!!cp (119);
1602 $self->{last_stag_name} = $self->{ct}->{tag_name};
1603 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1604 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1605 if ($self->{ct}->{attributes}) {
1606 !!!cp (120);
1607 !!!parse-error (type => 'end tag attribute');
1608 } else {
1609 ## NOTE: This state should never be reached.
1610 !!!cp (121);
1611 }
1612 } else {
1613 die "$0: $self->{ct}->{type}: Unknown token type";
1614 }
1615 $self->{state} = DATA_STATE;
1616 $self->{s_kwd} = '';
1617 !!!next-input-character;
1618
1619 !!!emit ($self->{ct}); # start tag or end tag
1620
1621 redo A;
1622 } elsif ($self->{nc} == 0x002F) { # /
1623 !!!cp (122);
1624 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1625 !!!next-input-character;
1626 redo A;
1627 } elsif ($self->{nc} == -1) {
1628 !!!parse-error (type => 'unclosed tag');
1629 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1630 !!!cp (122.3);
1631 $self->{last_stag_name} = $self->{ct}->{tag_name};
1632 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1633 if ($self->{ct}->{attributes}) {
1634 !!!cp (122.1);
1635 !!!parse-error (type => 'end tag attribute');
1636 } else {
1637 ## NOTE: This state should never be reached.
1638 !!!cp (122.2);
1639 }
1640 } else {
1641 die "$0: $self->{ct}->{type}: Unknown token type";
1642 }
1643 $self->{state} = DATA_STATE;
1644 $self->{s_kwd} = '';
1645 ## Reconsume.
1646 !!!emit ($self->{ct}); # start tag or end tag
1647 redo A;
1648 } else {
1649 !!!cp ('124.1');
1650 !!!parse-error (type => 'no space between attributes');
1651 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1652 ## reconsume
1653 redo A;
1654 }
1655 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1656 ## XML5: "Empty tag state".
1657
1658 if ($self->{nc} == 0x003E) { # >
1659 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1660 !!!cp ('124.2');
1661 !!!parse-error (type => 'nestc', token => $self->{ct});
1662 ## TODO: Different type than slash in start tag
1663 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1664 if ($self->{ct}->{attributes}) {
1665 !!!cp ('124.4');
1666 !!!parse-error (type => 'end tag attribute');
1667 } else {
1668 !!!cp ('124.5');
1669 }
1670 ## TODO: Test |<title></title/>|
1671 } else {
1672 !!!cp ('124.3');
1673 $self->{self_closing} = 1;
1674 }
1675
1676 $self->{state} = DATA_STATE;
1677 $self->{s_kwd} = '';
1678 !!!next-input-character;
1679
1680 !!!emit ($self->{ct}); # start tag or end tag
1681
1682 redo A;
1683 } elsif ($self->{nc} == -1) {
1684 !!!parse-error (type => 'unclosed tag');
1685 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1686 !!!cp (124.7);
1687 $self->{last_stag_name} = $self->{ct}->{tag_name};
1688 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1689 if ($self->{ct}->{attributes}) {
1690 !!!cp (124.5);
1691 !!!parse-error (type => 'end tag attribute');
1692 } else {
1693 ## NOTE: This state should never be reached.
1694 !!!cp (124.6);
1695 }
1696 } else {
1697 die "$0: $self->{ct}->{type}: Unknown token type";
1698 }
1699 ## XML5: "Tag attribute name before state".
1700 $self->{state} = DATA_STATE;
1701 $self->{s_kwd} = '';
1702 ## Reconsume.
1703 !!!emit ($self->{ct}); # start tag or end tag
1704 redo A;
1705 } else {
1706 !!!cp ('124.4');
1707 !!!parse-error (type => 'nestc');
1708 ## TODO: This error type is wrong.
1709 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1710 ## Reconsume.
1711 redo A;
1712 }
1713 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1714 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1715
1716 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1717 ## consumes characters on a one-by-one basis.
1718
1719 if ($self->{nc} == 0x003E) { # >
1720 if ($self->{in_subset}) {
1721 !!!cp (123);
1722 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1723 } else {
1724 !!!cp (124);
1725 $self->{state} = DATA_STATE;
1726 $self->{s_kwd} = '';
1727 }
1728 !!!next-input-character;
1729
1730 !!!emit ($self->{ct}); # comment
1731 redo A;
1732 } elsif ($self->{nc} == -1) {
1733 if ($self->{in_subset}) {
1734 !!!cp (125.1);
1735 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1736 } else {
1737 !!!cp (125);
1738 $self->{state} = DATA_STATE;
1739 $self->{s_kwd} = '';
1740 }
1741 ## reconsume
1742
1743 !!!emit ($self->{ct}); # comment
1744 redo A;
1745 } else {
1746 !!!cp (126);
1747 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1748 $self->{read_until}->($self->{ct}->{data},
1749 q[>],
1750 length $self->{ct}->{data});
1751
1752 ## Stay in the state.
1753 !!!next-input-character;
1754 redo A;
1755 }
1756 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1757 ## XML5: "Markup declaration state".
1758
1759 if ($self->{nc} == 0x002D) { # -
1760 !!!cp (133);
1761 $self->{state} = MD_HYPHEN_STATE;
1762 !!!next-input-character;
1763 redo A;
1764 } elsif ($self->{nc} == 0x0044 or # D
1765 $self->{nc} == 0x0064) { # d
1766 ## ASCII case-insensitive.
1767 !!!cp (130);
1768 $self->{state} = MD_DOCTYPE_STATE;
1769 $self->{kwd} = chr $self->{nc};
1770 !!!next-input-character;
1771 redo A;
1772 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1773 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1774 $self->{is_xml}) and
1775 $self->{nc} == 0x005B) { # [
1776 !!!cp (135.4);
1777 $self->{state} = MD_CDATA_STATE;
1778 $self->{kwd} = '[';
1779 !!!next-input-character;
1780 redo A;
1781 } else {
1782 !!!cp (136);
1783 }
1784
1785 !!!parse-error (type => 'bogus comment',
1786 line => $self->{line_prev},
1787 column => $self->{column_prev} - 1);
1788 ## Reconsume.
1789 $self->{state} = BOGUS_COMMENT_STATE;
1790 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1791 line => $self->{line_prev},
1792 column => $self->{column_prev} - 1,
1793 };
1794 redo A;
1795 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1796 if ($self->{nc} == 0x002D) { # -
1797 !!!cp (127);
1798 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1799 line => $self->{line_prev},
1800 column => $self->{column_prev} - 2,
1801 };
1802 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1803 !!!next-input-character;
1804 redo A;
1805 } else {
1806 !!!cp (128);
1807 !!!parse-error (type => 'bogus comment',
1808 line => $self->{line_prev},
1809 column => $self->{column_prev} - 2);
1810 $self->{state} = BOGUS_COMMENT_STATE;
1811 ## Reconsume.
1812 $self->{ct} = {type => COMMENT_TOKEN,
1813 data => '-',
1814 line => $self->{line_prev},
1815 column => $self->{column_prev} - 2,
1816 };
1817 redo A;
1818 }
1819 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1820 ## ASCII case-insensitive.
1821 if ($self->{nc} == [
1822 undef,
1823 0x004F, # O
1824 0x0043, # C
1825 0x0054, # T
1826 0x0059, # Y
1827 0x0050, # P
1828 ]->[length $self->{kwd}] or
1829 $self->{nc} == [
1830 undef,
1831 0x006F, # o
1832 0x0063, # c
1833 0x0074, # t
1834 0x0079, # y
1835 0x0070, # p
1836 ]->[length $self->{kwd}]) {
1837 !!!cp (131);
1838 ## Stay in the state.
1839 $self->{kwd} .= chr $self->{nc};
1840 !!!next-input-character;
1841 redo A;
1842 } elsif ((length $self->{kwd}) == 6 and
1843 ($self->{nc} == 0x0045 or # E
1844 $self->{nc} == 0x0065)) { # e
1845 if ($self->{is_xml} and
1846 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1847 !!!cp (129);
1848 ## XML5: case-sensitive.
1849 !!!parse-error (type => 'lowercase keyword', ## TODO
1850 text => 'DOCTYPE',
1851 line => $self->{line_prev},
1852 column => $self->{column_prev} - 5);
1853 } else {
1854 !!!cp (129.1);
1855 }
1856 $self->{state} = DOCTYPE_STATE;
1857 $self->{ct} = {type => DOCTYPE_TOKEN,
1858 quirks => 1,
1859 line => $self->{line_prev},
1860 column => $self->{column_prev} - 7,
1861 };
1862 !!!next-input-character;
1863 redo A;
1864 } else {
1865 !!!cp (132);
1866 !!!parse-error (type => 'bogus comment',
1867 line => $self->{line_prev},
1868 column => $self->{column_prev} - 1 - length $self->{kwd});
1869 $self->{state} = BOGUS_COMMENT_STATE;
1870 ## Reconsume.
1871 $self->{ct} = {type => COMMENT_TOKEN,
1872 data => $self->{kwd},
1873 line => $self->{line_prev},
1874 column => $self->{column_prev} - 1 - length $self->{kwd},
1875 };
1876 redo A;
1877 }
1878 } elsif ($self->{state} == MD_CDATA_STATE) {
1879 if ($self->{nc} == {
1880 '[' => 0x0043, # C
1881 '[C' => 0x0044, # D
1882 '[CD' => 0x0041, # A
1883 '[CDA' => 0x0054, # T
1884 '[CDAT' => 0x0041, # A
1885 }->{$self->{kwd}}) {
1886 !!!cp (135.1);
1887 ## Stay in the state.
1888 $self->{kwd} .= chr $self->{nc};
1889 !!!next-input-character;
1890 redo A;
1891 } elsif ($self->{kwd} eq '[CDATA' and
1892 $self->{nc} == 0x005B) { # [
1893 if ($self->{is_xml} and
1894 not $self->{tainted} and
1895 @{$self->{open_elements} or []} == 0) {
1896 !!!cp (135.2);
1897 !!!parse-error (type => 'cdata outside of root element',
1898 line => $self->{line_prev},
1899 column => $self->{column_prev} - 7);
1900 $self->{tainted} = 1;
1901 } else {
1902 !!!cp (135.21);
1903 }
1904
1905 $self->{ct} = {type => CHARACTER_TOKEN,
1906 data => '',
1907 line => $self->{line_prev},
1908 column => $self->{column_prev} - 7};
1909 $self->{state} = CDATA_SECTION_STATE;
1910 !!!next-input-character;
1911 redo A;
1912 } else {
1913 !!!cp (135.3);
1914 !!!parse-error (type => 'bogus comment',
1915 line => $self->{line_prev},
1916 column => $self->{column_prev} - 1 - length $self->{kwd});
1917 $self->{state} = BOGUS_COMMENT_STATE;
1918 ## Reconsume.
1919 $self->{ct} = {type => COMMENT_TOKEN,
1920 data => $self->{kwd},
1921 line => $self->{line_prev},
1922 column => $self->{column_prev} - 1 - length $self->{kwd},
1923 };
1924 redo A;
1925 }
1926 } elsif ($self->{state} == COMMENT_START_STATE) {
1927 if ($self->{nc} == 0x002D) { # -
1928 !!!cp (137);
1929 $self->{state} = COMMENT_START_DASH_STATE;
1930 !!!next-input-character;
1931 redo A;
1932 } elsif ($self->{nc} == 0x003E) { # >
1933 !!!parse-error (type => 'bogus comment');
1934 if ($self->{in_subset}) {
1935 !!!cp (138.1);
1936 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937 } else {
1938 !!!cp (138);
1939 $self->{state} = DATA_STATE;
1940 $self->{s_kwd} = '';
1941 }
1942 !!!next-input-character;
1943
1944 !!!emit ($self->{ct}); # comment
1945
1946 redo A;
1947 } elsif ($self->{nc} == -1) {
1948 !!!parse-error (type => 'unclosed comment');
1949 if ($self->{in_subset}) {
1950 !!!cp (139.1);
1951 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1952 } else {
1953 !!!cp (139);
1954 $self->{state} = DATA_STATE;
1955 $self->{s_kwd} = '';
1956 }
1957 ## reconsume
1958
1959 !!!emit ($self->{ct}); # comment
1960
1961 redo A;
1962 } else {
1963 !!!cp (140);
1964 $self->{ct}->{data} # comment
1965 .= chr ($self->{nc});
1966 $self->{state} = COMMENT_STATE;
1967 !!!next-input-character;
1968 redo A;
1969 }
1970 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1971 if ($self->{nc} == 0x002D) { # -
1972 !!!cp (141);
1973 $self->{state} = COMMENT_END_STATE;
1974 !!!next-input-character;
1975 redo A;
1976 } elsif ($self->{nc} == 0x003E) { # >
1977 !!!parse-error (type => 'bogus comment');
1978 if ($self->{in_subset}) {
1979 !!!cp (142.1);
1980 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1981 } else {
1982 !!!cp (142);
1983 $self->{state} = DATA_STATE;
1984 $self->{s_kwd} = '';
1985 }
1986 !!!next-input-character;
1987
1988 !!!emit ($self->{ct}); # comment
1989
1990 redo A;
1991 } elsif ($self->{nc} == -1) {
1992 !!!parse-error (type => 'unclosed comment');
1993 if ($self->{in_subset}) {
1994 !!!cp (143.1);
1995 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1996 } else {
1997 !!!cp (143);
1998 $self->{state} = DATA_STATE;
1999 $self->{s_kwd} = '';
2000 }
2001 ## reconsume
2002
2003 !!!emit ($self->{ct}); # comment
2004
2005 redo A;
2006 } else {
2007 !!!cp (144);
2008 $self->{ct}->{data} # comment
2009 .= '-' . chr ($self->{nc});
2010 $self->{state} = COMMENT_STATE;
2011 !!!next-input-character;
2012 redo A;
2013 }
2014 } elsif ($self->{state} == COMMENT_STATE) {
2015 ## XML5: "Comment state" and "DOCTYPE comment state".
2016
2017 if ($self->{nc} == 0x002D) { # -
2018 !!!cp (145);
2019 $self->{state} = COMMENT_END_DASH_STATE;
2020 !!!next-input-character;
2021 redo A;
2022 } elsif ($self->{nc} == -1) {
2023 !!!parse-error (type => 'unclosed comment');
2024 if ($self->{in_subset}) {
2025 !!!cp (146.1);
2026 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2027 } else {
2028 !!!cp (146);
2029 $self->{state} = DATA_STATE;
2030 $self->{s_kwd} = '';
2031 }
2032 ## reconsume
2033
2034 !!!emit ($self->{ct}); # comment
2035
2036 redo A;
2037 } else {
2038 !!!cp (147);
2039 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2040 $self->{read_until}->($self->{ct}->{data},
2041 q[-],
2042 length $self->{ct}->{data});
2043
2044 ## Stay in the state
2045 !!!next-input-character;
2046 redo A;
2047 }
2048 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2049 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2050
2051 if ($self->{nc} == 0x002D) { # -
2052 !!!cp (148);
2053 $self->{state} = COMMENT_END_STATE;
2054 !!!next-input-character;
2055 redo A;
2056 } elsif ($self->{nc} == -1) {
2057 !!!parse-error (type => 'unclosed comment');
2058 if ($self->{in_subset}) {
2059 !!!cp (149.1);
2060 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2061 } else {
2062 !!!cp (149);
2063 $self->{state} = DATA_STATE;
2064 $self->{s_kwd} = '';
2065 }
2066 ## reconsume
2067
2068 !!!emit ($self->{ct}); # comment
2069
2070 redo A;
2071 } else {
2072 !!!cp (150);
2073 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2074 $self->{state} = COMMENT_STATE;
2075 !!!next-input-character;
2076 redo A;
2077 }
2078 } elsif ($self->{state} == COMMENT_END_STATE) {
2079 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2080
2081 if ($self->{nc} == 0x003E) { # >
2082 if ($self->{in_subset}) {
2083 !!!cp (151.1);
2084 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2085 } else {
2086 !!!cp (151);
2087 $self->{state} = DATA_STATE;
2088 $self->{s_kwd} = '';
2089 }
2090 !!!next-input-character;
2091
2092 !!!emit ($self->{ct}); # comment
2093
2094 redo A;
2095 } elsif ($self->{nc} == 0x002D) { # -
2096 !!!cp (152);
2097 ## XML5: Not a parse error.
2098 !!!parse-error (type => 'dash in comment',
2099 line => $self->{line_prev},
2100 column => $self->{column_prev});
2101 $self->{ct}->{data} .= '-'; # comment
2102 ## Stay in the state
2103 !!!next-input-character;
2104 redo A;
2105 } elsif ($self->{nc} == -1) {
2106 !!!parse-error (type => 'unclosed comment');
2107 if ($self->{in_subset}) {
2108 !!!cp (153.1);
2109 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2110 } else {
2111 !!!cp (153);
2112 $self->{state} = DATA_STATE;
2113 $self->{s_kwd} = '';
2114 }
2115 ## reconsume
2116
2117 !!!emit ($self->{ct}); # comment
2118
2119 redo A;
2120 } else {
2121 !!!cp (154);
2122 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2123 $self->{state} = COMMENT_STATE;
2124 !!!next-input-character;
2125 redo A;
2126 }
2127 } elsif ($self->{state} == DOCTYPE_STATE) {
2128 if ($is_space->{$self->{nc}}) {
2129 !!!cp (155);
2130 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2131 !!!next-input-character;
2132 redo A;
2133 } else {
2134 !!!cp (156);
2135 ## XML5: Unless EOF, switch to the bogus comment state.
2136 !!!parse-error (type => 'no space before DOCTYPE name');
2137 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2138 ## reconsume
2139 redo A;
2140 }
2141 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2142 ## XML5: "DOCTYPE root name before state".
2143
2144 if ($is_space->{$self->{nc}}) {
2145 !!!cp (157);
2146 ## Stay in the state
2147 !!!next-input-character;
2148 redo A;
2149 } elsif ($self->{nc} == 0x003E) { # >
2150 !!!cp (158);
2151 ## XML5: No parse error.
2152 !!!parse-error (type => 'no DOCTYPE name');
2153 $self->{state} = DATA_STATE;
2154 $self->{s_kwd} = '';
2155 !!!next-input-character;
2156
2157 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2158
2159 redo A;
2160 } elsif ($self->{nc} == -1) {
2161 !!!cp (159);
2162 !!!parse-error (type => 'no DOCTYPE name');
2163 $self->{state} = DATA_STATE;
2164 $self->{s_kwd} = '';
2165 ## reconsume
2166
2167 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2168
2169 redo A;
2170 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2171 !!!cp (159.1);
2172 !!!parse-error (type => 'no DOCTYPE name');
2173 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2174 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2175 $self->{in_subset} = 1;
2176 !!!next-input-character;
2177 !!!emit ($self->{ct}); # DOCTYPE
2178 redo A;
2179 } else {
2180 !!!cp (160);
2181 $self->{ct}->{name} = chr $self->{nc};
2182 delete $self->{ct}->{quirks};
2183 $self->{state} = DOCTYPE_NAME_STATE;
2184 !!!next-input-character;
2185 redo A;
2186 }
2187 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2188 ## XML5: "DOCTYPE root name state".
2189
2190 ## ISSUE: Redundant "First," in the spec.
2191
2192 if ($is_space->{$self->{nc}}) {
2193 !!!cp (161);
2194 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2195 !!!next-input-character;
2196 redo A;
2197 } elsif ($self->{nc} == 0x003E) { # >
2198 !!!cp (162);
2199 $self->{state} = DATA_STATE;
2200 $self->{s_kwd} = '';
2201 !!!next-input-character;
2202
2203 !!!emit ($self->{ct}); # DOCTYPE
2204
2205 redo A;
2206 } elsif ($self->{nc} == -1) {
2207 !!!cp (163);
2208 !!!parse-error (type => 'unclosed DOCTYPE');
2209 $self->{state} = DATA_STATE;
2210 $self->{s_kwd} = '';
2211 ## reconsume
2212
2213 $self->{ct}->{quirks} = 1;
2214 !!!emit ($self->{ct}); # DOCTYPE
2215
2216 redo A;
2217 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2218 !!!cp (163.1);
2219 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2220 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2221 $self->{in_subset} = 1;
2222 !!!next-input-character;
2223 !!!emit ($self->{ct}); # DOCTYPE
2224 redo A;
2225 } else {
2226 !!!cp (164);
2227 $self->{ct}->{name}
2228 .= chr ($self->{nc}); # DOCTYPE
2229 ## Stay in the state
2230 !!!next-input-character;
2231 redo A;
2232 }
2233 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2234 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2235 ## state", but implemented differently.
2236
2237 if ($is_space->{$self->{nc}}) {
2238 !!!cp (165);
2239 ## Stay in the state
2240 !!!next-input-character;
2241 redo A;
2242 } elsif ($self->{nc} == 0x003E) { # >
2243 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2244 !!!cp (166);
2245 $self->{state} = DATA_STATE;
2246 $self->{s_kwd} = '';
2247 } else {
2248 !!!cp (166.1);
2249 !!!parse-error (type => 'no md def'); ## TODO: type
2250 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2251 }
2252
2253 !!!next-input-character;
2254 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2255 redo A;
2256 } elsif ($self->{nc} == -1) {
2257 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2258 !!!cp (167);
2259 !!!parse-error (type => 'unclosed DOCTYPE');
2260 $self->{state} = DATA_STATE;
2261 $self->{s_kwd} = '';
2262 $self->{ct}->{quirks} = 1;
2263 } else {
2264 !!!cp (167.12);
2265 !!!parse-error (type => 'unclosed md'); ## TODO: type
2266 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2267 }
2268
2269 ## Reconsume.
2270 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2271 redo A;
2272 } elsif ($self->{nc} == 0x0050 or # P
2273 $self->{nc} == 0x0070) { # p
2274 !!!cp (167.1);
2275 $self->{state} = PUBLIC_STATE;
2276 $self->{kwd} = chr $self->{nc};
2277 !!!next-input-character;
2278 redo A;
2279 } elsif ($self->{nc} == 0x0053 or # S
2280 $self->{nc} == 0x0073) { # s
2281 !!!cp (167.2);
2282 $self->{state} = SYSTEM_STATE;
2283 $self->{kwd} = chr $self->{nc};
2284 !!!next-input-character;
2285 redo A;
2286 } elsif ($self->{nc} == 0x0022 and # "
2287 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2288 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2289 !!!cp (167.21);
2290 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2291 $self->{ct}->{value} = ''; # ENTITY
2292 !!!next-input-character;
2293 redo A;
2294 } elsif ($self->{nc} == 0x0027 and # '
2295 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2296 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2297 !!!cp (167.22);
2298 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2299 $self->{ct}->{value} = ''; # ENTITY
2300 !!!next-input-character;
2301 redo A;
2302 } elsif ($self->{is_xml} and
2303 $self->{ct}->{type} == DOCTYPE_TOKEN and
2304 $self->{nc} == 0x005B) { # [
2305 !!!cp (167.3);
2306 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2307 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2308 $self->{in_subset} = 1;
2309 !!!next-input-character;
2310 !!!emit ($self->{ct}); # DOCTYPE
2311 redo A;
2312 } else {
2313 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2314
2315 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2316 !!!cp (180);
2317 $self->{ct}->{quirks} = 1;
2318 $self->{state} = BOGUS_DOCTYPE_STATE;
2319 } else {
2320 !!!cp (180.1);
2321 $self->{state} = BOGUS_MD_STATE;
2322 }
2323
2324 !!!next-input-character;
2325 redo A;
2326 }
2327 } elsif ($self->{state} == PUBLIC_STATE) {
2328 ## ASCII case-insensitive
2329 if ($self->{nc} == [
2330 undef,
2331 0x0055, # U
2332 0x0042, # B
2333 0x004C, # L
2334 0x0049, # I
2335 ]->[length $self->{kwd}] or
2336 $self->{nc} == [
2337 undef,
2338 0x0075, # u
2339 0x0062, # b
2340 0x006C, # l
2341 0x0069, # i
2342 ]->[length $self->{kwd}]) {
2343 !!!cp (175);
2344 ## Stay in the state.
2345 $self->{kwd} .= chr $self->{nc};
2346 !!!next-input-character;
2347 redo A;
2348 } elsif ((length $self->{kwd}) == 5 and
2349 ($self->{nc} == 0x0043 or # C
2350 $self->{nc} == 0x0063)) { # c
2351 if ($self->{is_xml} and
2352 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2353 !!!cp (168.1);
2354 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2355 text => 'PUBLIC',
2356 line => $self->{line_prev},
2357 column => $self->{column_prev} - 4);
2358 } else {
2359 !!!cp (168);
2360 }
2361 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2362 !!!next-input-character;
2363 redo A;
2364 } else {
2365 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2366 line => $self->{line_prev},
2367 column => $self->{column_prev} + 1 - length $self->{kwd});
2368 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2369 !!!cp (169);
2370 $self->{ct}->{quirks} = 1;
2371 $self->{state} = BOGUS_DOCTYPE_STATE;
2372 } else {
2373 !!!cp (169.1);
2374 $self->{state} = BOGUS_MD_STATE;
2375 }
2376 ## Reconsume.
2377 redo A;
2378 }
2379 } elsif ($self->{state} == SYSTEM_STATE) {
2380 ## ASCII case-insensitive
2381 if ($self->{nc} == [
2382 undef,
2383 0x0059, # Y
2384 0x0053, # S
2385 0x0054, # T
2386 0x0045, # E
2387 ]->[length $self->{kwd}] or
2388 $self->{nc} == [
2389 undef,
2390 0x0079, # y
2391 0x0073, # s
2392 0x0074, # t
2393 0x0065, # e
2394 ]->[length $self->{kwd}]) {
2395 !!!cp (170);
2396 ## Stay in the state.
2397 $self->{kwd} .= chr $self->{nc};
2398 !!!next-input-character;
2399 redo A;
2400 } elsif ((length $self->{kwd}) == 5 and
2401 ($self->{nc} == 0x004D or # M
2402 $self->{nc} == 0x006D)) { # m
2403 if ($self->{is_xml} and
2404 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2405 !!!cp (171.1);
2406 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2407 text => 'SYSTEM',
2408 line => $self->{line_prev},
2409 column => $self->{column_prev} - 4);
2410 } else {
2411 !!!cp (171);
2412 }
2413 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2414 !!!next-input-character;
2415 redo A;
2416 } else {
2417 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2418 line => $self->{line_prev},
2419 column => $self->{column_prev} + 1 - length $self->{kwd});
2420 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2421 !!!cp (172);
2422 $self->{ct}->{quirks} = 1;
2423 $self->{state} = BOGUS_DOCTYPE_STATE;
2424 } else {
2425 !!!cp (172.1);
2426 $self->{state} = BOGUS_MD_STATE;
2427 }
2428 ## Reconsume.
2429 redo A;
2430 }
2431 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2432 if ($is_space->{$self->{nc}}) {
2433 !!!cp (181);
2434 ## Stay in the state
2435 !!!next-input-character;
2436 redo A;
2437 } elsif ($self->{nc} eq 0x0022) { # "
2438 !!!cp (182);
2439 $self->{ct}->{pubid} = ''; # DOCTYPE
2440 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2441 !!!next-input-character;
2442 redo A;
2443 } elsif ($self->{nc} eq 0x0027) { # '
2444 !!!cp (183);
2445 $self->{ct}->{pubid} = ''; # DOCTYPE
2446 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2447 !!!next-input-character;
2448 redo A;
2449 } elsif ($self->{nc} eq 0x003E) { # >
2450 !!!parse-error (type => 'no PUBLIC literal');
2451
2452 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2453 !!!cp (184);
2454 $self->{state} = DATA_STATE;
2455 $self->{s_kwd} = '';
2456 $self->{ct}->{quirks} = 1;
2457 } else {
2458 !!!cp (184.1);
2459 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2460 }
2461
2462 !!!next-input-character;
2463 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2464 redo A;
2465 } elsif ($self->{nc} == -1) {
2466 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2467 !!!cp (185);
2468 !!!parse-error (type => 'unclosed DOCTYPE');
2469 $self->{state} = DATA_STATE;
2470 $self->{s_kwd} = '';
2471 $self->{ct}->{quirks} = 1;
2472 } else {
2473 !!!cp (185.1);
2474 !!!parse-error (type => 'unclosed md'); ## TODO: type
2475 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2476 }
2477
2478 ## reconsume
2479 !!!emit ($self->{ct}); # DOCTYPE
2480 redo A;
2481 } elsif ($self->{is_xml} and
2482 $self->{ct}->{type} == DOCTYPE_TOKEN and
2483 $self->{nc} == 0x005B) { # [
2484 !!!cp (186.1);
2485 !!!parse-error (type => 'no PUBLIC literal');
2486 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2487 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2488 $self->{in_subset} = 1;
2489 !!!next-input-character;
2490 !!!emit ($self->{ct}); # DOCTYPE
2491 redo A;
2492 } else {
2493 !!!parse-error (type => 'string after PUBLIC');
2494
2495 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2496 !!!cp (186);
2497 $self->{ct}->{quirks} = 1;
2498 $self->{state} = BOGUS_DOCTYPE_STATE;
2499 } else {
2500 !!!cp (186.2);
2501 $self->{state} = BOGUS_MD_STATE;
2502 }
2503
2504 !!!next-input-character;
2505 redo A;
2506 }
2507 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2508 if ($self->{nc} == 0x0022) { # "
2509 !!!cp (187);
2510 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2511 !!!next-input-character;
2512 redo A;
2513 } elsif ($self->{nc} == 0x003E) { # >
2514 !!!parse-error (type => 'unclosed PUBLIC literal');
2515
2516 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2517 !!!cp (188);
2518 $self->{state} = DATA_STATE;
2519 $self->{s_kwd} = '';
2520 $self->{ct}->{quirks} = 1;
2521 } else {
2522 !!!cp (188.1);
2523 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2524 }
2525
2526 !!!next-input-character;
2527 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2528 redo A;
2529 } elsif ($self->{nc} == -1) {
2530 !!!parse-error (type => 'unclosed PUBLIC literal');
2531
2532 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2533 !!!cp (189);
2534 $self->{state} = DATA_STATE;
2535 $self->{s_kwd} = '';
2536 $self->{ct}->{quirks} = 1;
2537 } else {
2538 !!!cp (189.1);
2539 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2540 }
2541
2542 ## Reconsume.
2543 !!!emit ($self->{ct}); # DOCTYPE
2544 redo A;
2545 } else {
2546 !!!cp (190);
2547 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2548 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2549 length $self->{ct}->{pubid});
2550
2551 ## Stay in the state
2552 !!!next-input-character;
2553 redo A;
2554 }
2555 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2556 if ($self->{nc} == 0x0027) { # '
2557 !!!cp (191);
2558 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2559 !!!next-input-character;
2560 redo A;
2561 } elsif ($self->{nc} == 0x003E) { # >
2562 !!!parse-error (type => 'unclosed PUBLIC literal');
2563
2564 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2565 !!!cp (192);
2566 $self->{state} = DATA_STATE;
2567 $self->{s_kwd} = '';
2568 $self->{ct}->{quirks} = 1;
2569 } else {
2570 !!!cp (192.1);
2571 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2572 }
2573
2574 !!!next-input-character;
2575 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2576 redo A;
2577 } elsif ($self->{nc} == -1) {
2578 !!!parse-error (type => 'unclosed PUBLIC literal');
2579
2580 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2581 !!!cp (193);
2582 $self->{state} = DATA_STATE;
2583 $self->{s_kwd} = '';
2584 $self->{ct}->{quirks} = 1;
2585 } else {
2586 !!!cp (193.1);
2587 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2588 }
2589
2590 ## reconsume
2591 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2592 redo A;
2593 } else {
2594 !!!cp (194);
2595 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2596 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2597 length $self->{ct}->{pubid});
2598
2599 ## Stay in the state
2600 !!!next-input-character;
2601 redo A;
2602 }
2603 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2604 if ($is_space->{$self->{nc}}) {
2605 !!!cp (195);
2606 ## Stay in the state
2607 !!!next-input-character;
2608 redo A;
2609 } elsif ($self->{nc} == 0x0022) { # "
2610 !!!cp (196);
2611 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2612 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2613 !!!next-input-character;
2614 redo A;
2615 } elsif ($self->{nc} == 0x0027) { # '
2616 !!!cp (197);
2617 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2618 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2619 !!!next-input-character;
2620 redo A;
2621 } elsif ($self->{nc} == 0x003E) { # >
2622 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2623 if ($self->{is_xml}) {
2624 !!!cp (198.1);
2625 !!!parse-error (type => 'no SYSTEM literal');
2626 } else {
2627 !!!cp (198);
2628 }
2629 $self->{state} = DATA_STATE;
2630 $self->{s_kwd} = '';
2631 } else {
2632 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2633 !!!cp (198.2);
2634 } else {
2635 !!!cp (198.3);
2636 !!!parse-error (type => 'no SYSTEM literal');
2637 }
2638 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2639 }
2640
2641 !!!next-input-character;
2642 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2643 redo A;
2644 } elsif ($self->{nc} == -1) {
2645 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2646 !!!cp (199);
2647 !!!parse-error (type => 'unclosed DOCTYPE');
2648
2649 $self->{state} = DATA_STATE;
2650 $self->{s_kwd} = '';
2651 $self->{ct}->{quirks} = 1;
2652 } else {
2653 !!!parse-error (type => 'unclosed md'); ## TODO: type
2654 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2655 }
2656
2657 ## reconsume
2658 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2659 redo A;
2660 } elsif ($self->{is_xml} and
2661 $self->{ct}->{type} == DOCTYPE_TOKEN and
2662 $self->{nc} == 0x005B) { # [
2663 !!!cp (200.1);
2664 !!!parse-error (type => 'no SYSTEM literal');
2665 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2666 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2667 $self->{in_subset} = 1;
2668 !!!next-input-character;
2669 !!!emit ($self->{ct}); # DOCTYPE
2670 redo A;
2671 } else {
2672 !!!parse-error (type => 'string after PUBLIC literal');
2673
2674 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2675 !!!cp (200);
2676 $self->{ct}->{quirks} = 1;
2677 $self->{state} = BOGUS_DOCTYPE_STATE;
2678 } else {
2679 !!!cp (200.2);
2680 $self->{state} = BOGUS_MD_STATE;
2681 }
2682
2683 !!!next-input-character;
2684 redo A;
2685 }
2686 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2687 if ($is_space->{$self->{nc}}) {
2688 !!!cp (201);
2689 ## Stay in the state
2690 !!!next-input-character;
2691 redo A;
2692 } elsif ($self->{nc} == 0x0022) { # "
2693 !!!cp (202);
2694 $self->{ct}->{sysid} = ''; # DOCTYPE
2695 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2696 !!!next-input-character;
2697 redo A;
2698 } elsif ($self->{nc} == 0x0027) { # '
2699 !!!cp (203);
2700 $self->{ct}->{sysid} = ''; # DOCTYPE
2701 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2702 !!!next-input-character;
2703 redo A;
2704 } elsif ($self->{nc} == 0x003E) { # >
2705 !!!parse-error (type => 'no SYSTEM literal');
2706 !!!next-input-character;
2707
2708 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2709 !!!cp (204);
2710 $self->{state} = DATA_STATE;
2711 $self->{s_kwd} = '';
2712 $self->{ct}->{quirks} = 1;
2713 } else {
2714 !!!cp (204.1);
2715 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2716 }
2717
2718 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2719 redo A;
2720 } elsif ($self->{nc} == -1) {
2721 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2722 !!!cp (205);
2723 !!!parse-error (type => 'unclosed DOCTYPE');
2724 $self->{state} = DATA_STATE;
2725 $self->{s_kwd} = '';
2726 $self->{ct}->{quirks} = 1;
2727 } else {
2728 !!!cp (205.1);
2729 !!!parse-error (type => 'unclosed md'); ## TODO: type
2730 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2731 }
2732
2733 ## reconsume
2734 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2735 redo A;
2736 } elsif ($self->{is_xml} and
2737 $self->{ct}->{type} == DOCTYPE_TOKEN and
2738 $self->{nc} == 0x005B) { # [
2739 !!!cp (206.1);
2740 !!!parse-error (type => 'no SYSTEM literal');
2741
2742 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2743 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2744 $self->{in_subset} = 1;
2745 !!!next-input-character;
2746 !!!emit ($self->{ct}); # DOCTYPE
2747 redo A;
2748 } else {
2749 !!!parse-error (type => 'string after SYSTEM');
2750
2751 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2752 !!!cp (206);
2753 $self->{ct}->{quirks} = 1;
2754 $self->{state} = BOGUS_DOCTYPE_STATE;
2755 } else {
2756 !!!cp (206.2);
2757 $self->{state} = BOGUS_MD_STATE;
2758 }
2759
2760 !!!next-input-character;
2761 redo A;
2762 }
2763 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2764 if ($self->{nc} == 0x0022) { # "
2765 !!!cp (207);
2766 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2767 !!!next-input-character;
2768 redo A;
2769 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2770 !!!parse-error (type => 'unclosed SYSTEM literal');
2771
2772 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2773 !!!cp (208);
2774 $self->{state} = DATA_STATE;
2775 $self->{s_kwd} = '';
2776 $self->{ct}->{quirks} = 1;
2777 } else {
2778 !!!cp (208.1);
2779 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2780 }
2781
2782 !!!next-input-character;
2783 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2784 redo A;
2785 } elsif ($self->{nc} == -1) {
2786 !!!parse-error (type => 'unclosed SYSTEM literal');
2787
2788 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2789 !!!cp (209);
2790 $self->{state} = DATA_STATE;
2791 $self->{s_kwd} = '';
2792 $self->{ct}->{quirks} = 1;
2793 } else {
2794 !!!cp (209.1);
2795 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2796 }
2797
2798 ## reconsume
2799 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2800 redo A;
2801 } else {
2802 !!!cp (210);
2803 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2804 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2805 length $self->{ct}->{sysid});
2806
2807 ## Stay in the state
2808 !!!next-input-character;
2809 redo A;
2810 }
2811 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2812 if ($self->{nc} == 0x0027) { # '
2813 !!!cp (211);
2814 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2815 !!!next-input-character;
2816 redo A;
2817 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2818 !!!cp (212);
2819 !!!parse-error (type => 'unclosed SYSTEM literal');
2820
2821 $self->{state} = DATA_STATE;
2822 $self->{s_kwd} = '';
2823 !!!next-input-character;
2824
2825 $self->{ct}->{quirks} = 1;
2826 !!!emit ($self->{ct}); # DOCTYPE
2827
2828 redo A;
2829 } elsif ($self->{nc} == -1) {
2830 !!!parse-error (type => 'unclosed SYSTEM literal');
2831
2832 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2833 !!!cp (213);
2834 $self->{state} = DATA_STATE;
2835 $self->{s_kwd} = '';
2836 $self->{ct}->{quirks} = 1;
2837 } else {
2838 !!!cp (213.1);
2839 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2840 }
2841
2842 ## reconsume
2843 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2844 redo A;
2845 } else {
2846 !!!cp (214);
2847 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2848 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2849 length $self->{ct}->{sysid});
2850
2851 ## Stay in the state
2852 !!!next-input-character;
2853 redo A;
2854 }
2855 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2856 if ($is_space->{$self->{nc}}) {
2857 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2858 !!!cp (215.1);
2859 $self->{state} = BEFORE_NDATA_STATE;
2860 } else {
2861 !!!cp (215);
2862 ## Stay in the state
2863 }
2864 !!!next-input-character;
2865 redo A;
2866 } elsif ($self->{nc} == 0x003E) { # >
2867 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2868 !!!cp (216);
2869 $self->{state} = DATA_STATE;
2870 $self->{s_kwd} = '';
2871 } else {
2872 !!!cp (216.1);
2873 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2874 }
2875
2876 !!!next-input-character;
2877 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2878 redo A;
2879 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2880 ($self->{nc} == 0x004E or # N
2881 $self->{nc} == 0x006E)) { # n
2882 !!!cp (216.2);
2883 !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2884 $self->{state} = NDATA_STATE;
2885 $self->{kwd} = chr $self->{nc};
2886 !!!next-input-character;
2887 redo A;
2888 } elsif ($self->{nc} == -1) {
2889 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2890 !!!cp (217);
2891 !!!parse-error (type => 'unclosed DOCTYPE');
2892 $self->{state} = DATA_STATE;
2893 $self->{s_kwd} = '';
2894 $self->{ct}->{quirks} = 1;
2895 } else {
2896 !!!cp (217.1);
2897 !!!parse-error (type => 'unclosed md'); ## TODO: type
2898 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2899 }
2900
2901 ## reconsume
2902 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2903 redo A;
2904 } elsif ($self->{is_xml} and
2905 $self->{ct}->{type} == DOCTYPE_TOKEN and
2906 $self->{nc} == 0x005B) { # [
2907 !!!cp (218.1);
2908 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2909 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2910 $self->{in_subset} = 1;
2911 !!!next-input-character;
2912 !!!emit ($self->{ct}); # DOCTYPE
2913 redo A;
2914 } else {
2915 !!!parse-error (type => 'string after SYSTEM literal');
2916
2917 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2918 !!!cp (218);
2919 #$self->{ct}->{quirks} = 1;
2920 $self->{state} = BOGUS_DOCTYPE_STATE;
2921 } else {
2922 !!!cp (218.2);
2923 $self->{state} = BOGUS_MD_STATE;
2924 }
2925
2926 !!!next-input-character;
2927 redo A;
2928 }
2929 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2930 if ($is_space->{$self->{nc}}) {
2931 !!!cp (218.3);
2932 ## Stay in the state.
2933 !!!next-input-character;
2934 redo A;
2935 } elsif ($self->{nc} == 0x003E) { # >
2936 !!!cp (218.4);
2937 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2938 !!!next-input-character;
2939 !!!emit ($self->{ct}); # ENTITY
2940 redo A;
2941 } elsif ($self->{nc} == 0x004E or # N
2942 $self->{nc} == 0x006E) { # n
2943 !!!cp (218.5);
2944 $self->{state} = NDATA_STATE;
2945 $self->{kwd} = chr $self->{nc};
2946 !!!next-input-character;
2947 redo A;
2948 } elsif ($self->{nc} == -1) {
2949 !!!cp (218.6);
2950 !!!parse-error (type => 'unclosed md'); ## TODO: type
2951 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2952 ## reconsume
2953 !!!emit ($self->{ct}); # ENTITY
2954 redo A;
2955 } else {
2956 !!!cp (218.7);
2957 !!!parse-error (type => 'string after SYSTEM literal');
2958 $self->{state} = BOGUS_MD_STATE;
2959 !!!next-input-character;
2960 redo A;
2961 }
2962 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2963 if ($self->{nc} == 0x003E) { # >
2964 !!!cp (219);
2965 $self->{state} = DATA_STATE;
2966 $self->{s_kwd} = '';
2967 !!!next-input-character;
2968
2969 !!!emit ($self->{ct}); # DOCTYPE
2970
2971 redo A;
2972 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2973 !!!cp (220.1);
2974 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2975 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2976 $self->{in_subset} = 1;
2977 !!!next-input-character;
2978 !!!emit ($self->{ct}); # DOCTYPE
2979 redo A;
2980 } elsif ($self->{nc} == -1) {
2981 !!!cp (220);
2982 $self->{state} = DATA_STATE;
2983 $self->{s_kwd} = '';
2984 ## reconsume
2985
2986 !!!emit ($self->{ct}); # DOCTYPE
2987
2988 redo A;
2989 } else {
2990 !!!cp (221);
2991 my $s = '';
2992 $self->{read_until}->($s, q{>[}, 0);
2993
2994 ## Stay in the state
2995 !!!next-input-character;
2996 redo A;
2997 }
2998 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2999 ## NOTE: "CDATA section state" in the spec is jointly implemented
3000 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3001 ## and |CDATA_SECTION_MSE2_STATE|.
3002
3003 ## XML5: "CDATA state".
3004
3005 if ($self->{nc} == 0x005D) { # ]
3006 !!!cp (221.1);
3007 $self->{state} = CDATA_SECTION_MSE1_STATE;
3008 !!!next-input-character;
3009 redo A;
3010 } elsif ($self->{nc} == -1) {
3011 if ($self->{is_xml}) {
3012 !!!cp (221.11);
3013 !!!parse-error (type => 'no mse'); ## TODO: type
3014 } else {
3015 !!!cp (221.12);
3016 }
3017
3018 $self->{state} = DATA_STATE;
3019 $self->{s_kwd} = '';
3020 ## Reconsume.
3021 if (length $self->{ct}->{data}) { # character
3022 !!!cp (221.2);
3023 !!!emit ($self->{ct}); # character
3024 } else {
3025 !!!cp (221.3);
3026 ## No token to emit. $self->{ct} is discarded.
3027 }
3028 redo A;
3029 } else {
3030 !!!cp (221.4);
3031 $self->{ct}->{data} .= chr $self->{nc};
3032 $self->{read_until}->($self->{ct}->{data},
3033 q<]>,
3034 length $self->{ct}->{data});
3035
3036 ## Stay in the state.
3037 !!!next-input-character;
3038 redo A;
3039 }
3040
3041 ## ISSUE: "text tokens" in spec.
3042 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3043 ## XML5: "CDATA bracket state".
3044
3045 if ($self->{nc} == 0x005D) { # ]
3046 !!!cp (221.5);
3047 $self->{state} = CDATA_SECTION_MSE2_STATE;
3048 !!!next-input-character;
3049 redo A;
3050 } else {
3051 !!!cp (221.6);
3052 ## XML5: If EOF, "]" is not appended and changed to the data state.
3053 $self->{ct}->{data} .= ']';
3054 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3055 ## Reconsume.
3056 redo A;
3057 }
3058 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3059 ## XML5: "CDATA end state".
3060
3061 if ($self->{nc} == 0x003E) { # >
3062 $self->{state} = DATA_STATE;
3063 $self->{s_kwd} = '';
3064 !!!next-input-character;
3065 if (length $self->{ct}->{data}) { # character
3066 !!!cp (221.7);
3067 !!!emit ($self->{ct}); # character
3068 } else {
3069 !!!cp (221.8);
3070 ## No token to emit. $self->{ct} is discarded.
3071 }
3072 redo A;
3073 } elsif ($self->{nc} == 0x005D) { # ]
3074 !!!cp (221.9); # character
3075 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3076 ## Stay in the state.
3077 !!!next-input-character;
3078 redo A;
3079 } else {
3080 !!!cp (221.11);
3081 $self->{ct}->{data} .= ']]'; # character
3082 $self->{state} = CDATA_SECTION_STATE;
3083 ## Reconsume. ## XML5: Emit.
3084 redo A;
3085 }
3086 } elsif ($self->{state} == ENTITY_STATE) {
3087 if ($is_space->{$self->{nc}} or
3088 {
3089 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3090 $self->{entity_add} => 1,
3091 }->{$self->{nc}}) {
3092 if ($self->{is_xml}) {
3093 !!!cp (1001.1);
3094 !!!parse-error (type => 'bare ero',
3095 line => $self->{line_prev},
3096 column => $self->{column_prev}
3097 + ($self->{nc} == -1 ? 1 : 0));
3098 } else {
3099 !!!cp (1001);
3100 ## No error
3101 }
3102 ## Don't consume
3103 ## Return nothing.
3104 #
3105 } elsif ($self->{nc} == 0x0023) { # #
3106 !!!cp (999);
3107 $self->{state} = ENTITY_HASH_STATE;
3108 $self->{kwd} = '#';
3109 !!!next-input-character;
3110 redo A;
3111 } elsif ($self->{is_xml} or
3112 (0x0041 <= $self->{nc} and
3113 $self->{nc} <= 0x005A) or # A..Z
3114 (0x0061 <= $self->{nc} and
3115 $self->{nc} <= 0x007A)) { # a..z
3116 !!!cp (998);
3117 require Whatpm::_NamedEntityList;
3118 $self->{state} = ENTITY_NAME_STATE;
3119 $self->{kwd} = chr $self->{nc};
3120 $self->{entity__value} = $self->{kwd};
3121 $self->{entity__match} = 0;
3122 !!!next-input-character;
3123 redo A;
3124 } else {
3125 !!!cp (1027);
3126 !!!parse-error (type => 'bare ero');
3127 ## Return nothing.
3128 #
3129 }
3130
3131 ## NOTE: No character is consumed by the "consume a character
3132 ## reference" algorithm. In other words, there is an "&" character
3133 ## that does not introduce a character reference, which would be
3134 ## appended to the parent element or the attribute value in later
3135 ## processing by the tokenizer.
3136
3137 if ($self->{prev_state} == DATA_STATE) {
3138 !!!cp (997);
3139 $self->{state} = $self->{prev_state};
3140 $self->{s_kwd} = '';
3141 ## Reconsume.
3142 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3143 line => $self->{line_prev},
3144 column => $self->{column_prev},
3145 });
3146 redo A;
3147 } else {
3148 !!!cp (996);
3149 $self->{ca}->{value} .= '&';
3150 $self->{state} = $self->{prev_state};
3151 $self->{s_kwd} = '';
3152 ## Reconsume.
3153 redo A;
3154 }
3155 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3156 if ($self->{nc} == 0x0078) { # x
3157 !!!cp (995);
3158 $self->{state} = HEXREF_X_STATE;
3159 $self->{kwd} .= chr $self->{nc};
3160 !!!next-input-character;
3161 redo A;
3162 } elsif ($self->{nc} == 0x0058) { # X
3163 !!!cp (995.1);
3164 if ($self->{is_xml}) {
3165 !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3166 }
3167 $self->{state} = HEXREF_X_STATE;
3168 $self->{kwd} .= chr $self->{nc};
3169 !!!next-input-character;
3170 redo A;
3171 } elsif (0x0030 <= $self->{nc} and
3172 $self->{nc} <= 0x0039) { # 0..9
3173 !!!cp (994);
3174 $self->{state} = NCR_NUM_STATE;
3175 $self->{kwd} = $self->{nc} - 0x0030;
3176 !!!next-input-character;
3177 redo A;
3178 } else {
3179 !!!parse-error (type => 'bare nero',
3180 line => $self->{line_prev},
3181 column => $self->{column_prev} - 1);
3182
3183 ## NOTE: According to the spec algorithm, nothing is returned,
3184 ## and then "&#" is appended to the parent element or the attribute
3185 ## value in the later processing.
3186
3187 if ($self->{prev_state} == DATA_STATE) {
3188 !!!cp (1019);
3189 $self->{state} = $self->{prev_state};
3190 $self->{s_kwd} = '';
3191 ## Reconsume.
3192 !!!emit ({type => CHARACTER_TOKEN,
3193 data => '&#',
3194 line => $self->{line_prev},
3195 column => $self->{column_prev} - 1,
3196 });
3197 redo A;
3198 } else {
3199 !!!cp (993);
3200 $self->{ca}->{value} .= '&#';
3201 $self->{state} = $self->{prev_state};
3202 $self->{s_kwd} = '';
3203 ## Reconsume.
3204 redo A;
3205 }
3206 }
3207 } elsif ($self->{state} == NCR_NUM_STATE) {
3208 if (0x0030 <= $self->{nc} and
3209 $self->{nc} <= 0x0039) { # 0..9
3210 !!!cp (1012);
3211 $self->{kwd} *= 10;
3212 $self->{kwd} += $self->{nc} - 0x0030;
3213
3214 ## Stay in the state.
3215 !!!next-input-character;
3216 redo A;
3217 } elsif ($self->{nc} == 0x003B) { # ;
3218 !!!cp (1013);
3219 !!!next-input-character;
3220 #
3221 } else {
3222 !!!cp (1014);
3223 !!!parse-error (type => 'no refc');
3224 ## Reconsume.
3225 #
3226 }
3227
3228 my $code = $self->{kwd};
3229 my $l = $self->{line_prev};
3230 my $c = $self->{column_prev};
3231 if ((not $self->{is_xml} and $charref_map->{$code}) or
3232 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3233 ($self->{is_xml} and $code == 0x0000)) {
3234 !!!cp (1015);
3235 !!!parse-error (type => 'invalid character reference',
3236 text => (sprintf 'U+%04X', $code),
3237 line => $l, column => $c);
3238 $code = $charref_map->{$code};
3239 } elsif ($code > 0x10FFFF) {
3240 !!!cp (1016);
3241 !!!parse-error (type => 'invalid character reference',
3242 text => (sprintf 'U-%08X', $code),
3243 line => $l, column => $c);
3244 $code = 0xFFFD;
3245 }
3246
3247 if ($self->{prev_state} == DATA_STATE) {
3248 !!!cp (992);
3249 $self->{state} = $self->{prev_state};
3250 $self->{s_kwd} = '';
3251 ## Reconsume.
3252 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3253 has_reference => 1,
3254 line => $l, column => $c,
3255 });
3256 redo A;
3257 } else {
3258 !!!cp (991);
3259 $self->{ca}->{value} .= chr $code;
3260 $self->{ca}->{has_reference} = 1;
3261 $self->{state} = $self->{prev_state};
3262 $self->{s_kwd} = '';
3263 ## Reconsume.
3264 redo A;
3265 }
3266 } elsif ($self->{state} == HEXREF_X_STATE) {
3267 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3268 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3269 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3270 # 0..9, A..F, a..f
3271 !!!cp (990);
3272 $self->{state} = HEXREF_HEX_STATE;
3273 $self->{kwd} = 0;
3274 ## Reconsume.
3275 redo A;
3276 } else {
3277 !!!parse-error (type => 'bare hcro',
3278 line => $self->{line_prev},
3279 column => $self->{column_prev} - 2);
3280
3281 ## NOTE: According to the spec algorithm, nothing is returned,
3282 ## and then "&#" followed by "X" or "x" is appended to the parent
3283 ## element or the attribute value in the later processing.
3284
3285 if ($self->{prev_state} == DATA_STATE) {
3286 !!!cp (1005);
3287 $self->{state} = $self->{prev_state};
3288 $self->{s_kwd} = '';
3289 ## Reconsume.
3290 !!!emit ({type => CHARACTER_TOKEN,
3291 data => '&' . $self->{kwd},
3292 line => $self->{line_prev},
3293 column => $self->{column_prev} - length $self->{kwd},
3294 });
3295 redo A;
3296 } else {
3297 !!!cp (989);
3298 $self->{ca}->{value} .= '&' . $self->{kwd};
3299 $self->{state} = $self->{prev_state};
3300 $self->{s_kwd} = '';
3301 ## Reconsume.
3302 redo A;
3303 }
3304 }
3305 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3306 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3307 # 0..9
3308 !!!cp (1002);
3309 $self->{kwd} *= 0x10;
3310 $self->{kwd} += $self->{nc} - 0x0030;
3311 ## Stay in the state.
3312 !!!next-input-character;
3313 redo A;
3314 } elsif (0x0061 <= $self->{nc} and
3315 $self->{nc} <= 0x0066) { # a..f
3316 !!!cp (1003);
3317 $self->{kwd} *= 0x10;
3318 $self->{kwd} += $self->{nc} - 0x0060 + 9;
3319 ## Stay in the state.
3320 !!!next-input-character;
3321 redo A;
3322 } elsif (0x0041 <= $self->{nc} and
3323 $self->{nc} <= 0x0046) { # A..F
3324 !!!cp (1004);
3325 $self->{kwd} *= 0x10;
3326 $self->{kwd} += $self->{nc} - 0x0040 + 9;
3327 ## Stay in the state.
3328 !!!next-input-character;
3329 redo A;
3330 } elsif ($self->{nc} == 0x003B) { # ;
3331 !!!cp (1006);
3332 !!!next-input-character;
3333 #
3334 } else {
3335 !!!cp (1007);
3336 !!!parse-error (type => 'no refc',
3337 line => $self->{line},
3338 column => $self->{column});
3339 ## Reconsume.
3340 #
3341 }
3342
3343 my $code = $self->{kwd};
3344 my $l = $self->{line_prev};
3345 my $c = $self->{column_prev};
3346 if ((not $self->{is_xml} and $charref_map->{$code}) or
3347 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3348 ($self->{is_xml} and $code == 0x0000)) {
3349 !!!cp (1008);
3350 !!!parse-error (type => 'invalid character reference',
3351 text => (sprintf 'U+%04X', $code),
3352 line => $l, column => $c);
3353 $code = $charref_map->{$code};
3354 } elsif ($code > 0x10FFFF) {
3355 !!!cp (1009);
3356 !!!parse-error (type => 'invalid character reference',
3357 text => (sprintf 'U-%08X', $code),
3358 line => $l, column => $c);
3359 $code = 0xFFFD;
3360 }
3361
3362 if ($self->{prev_state} == DATA_STATE) {
3363 !!!cp (988);
3364 $self->{state} = $self->{prev_state};
3365 $self->{s_kwd} = '';
3366 ## Reconsume.
3367 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3368 has_reference => 1,
3369 line => $l, column => $c,
3370 });
3371 redo A;
3372 } else {
3373 !!!cp (987);
3374 $self->{ca}->{value} .= chr $code;
3375 $self->{ca}->{has_reference} = 1;
3376 $self->{state} = $self->{prev_state};
3377 $self->{s_kwd} = '';
3378 ## Reconsume.
3379 redo A;
3380 }
3381 } elsif ($self->{state} == ENTITY_NAME_STATE) {
3382 if ((0x0041 <= $self->{nc} and # A
3383 $self->{nc} <= 0x005A) or # Z
3384 (0x0061 <= $self->{nc} and # a
3385 $self->{nc} <= 0x007A) or # z
3386 (0x0030 <= $self->{nc} and # 0
3387 $self->{nc} <= 0x0039) or # 9
3388 $self->{nc} == 0x003B or # ;
3389 ($self->{is_xml} and
3390 not ($is_space->{$self->{nc}} or
3391 {
3392 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3393 $self->{entity_add} => 1,
3394 }->{$self->{nc}}))) {
3395 our $EntityChar;
3396 $self->{kwd} .= chr $self->{nc};
3397 if (defined $EntityChar->{$self->{kwd}} or
3398 $self->{ge}->{$self->{kwd}}) {
3399 if ($self->{nc} == 0x003B) { # ;
3400 if (defined $self->{ge}->{$self->{kwd}}) {
3401 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3402 !!!cp (1020.1);
3403 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3404 } else {
3405 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3406 !!!cp (1020.2);
3407 !!!parse-error (type => 'unparsed entity', ## TODO: type
3408 value => $self->{kwd});
3409 } else {
3410 !!!cp (1020.3);
3411 }
3412 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3413 }
3414 } else {
3415 if ($self->{is_xml}) {
3416 !!!cp (1020.4);
3417 !!!parse-error (type => 'entity not declared', ## TODO: type
3418 value => $self->{kwd},
3419 level => {
3420 'amp;' => $self->{level}->{warn},
3421 'quot;' => $self->{level}->{warn},
3422 'lt;' => $self->{level}->{warn},
3423 'gt;' => $self->{level}->{warn},
3424 'apos;' => $self->{level}->{warn},
3425 }->{$self->{kwd}} ||
3426 $self->{level}->{must});
3427 } else {
3428 !!!cp (1020);
3429 }
3430 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3431 }
3432 $self->{entity__match} = 1;
3433 !!!next-input-character;
3434 #
3435 } else {
3436 !!!cp (1021);
3437 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3438 $self->{entity__match} = -1;
3439 ## Stay in the state.
3440 !!!next-input-character;
3441 redo A;
3442 }
3443 } else {
3444 !!!cp (1022);
3445 $self->{entity__value} .= chr $self->{nc};
3446 $self->{entity__match} *= 2;
3447 ## Stay in the state.
3448 !!!next-input-character;
3449 redo A;
3450 }
3451 }
3452
3453 my $data;
3454 my $has_ref;
3455 if ($self->{entity__match} > 0) {
3456 !!!cp (1023);
3457 $data = $self->{entity__value};
3458 $has_ref = 1;
3459 #
3460 } elsif ($self->{entity__match} < 0) {
3461 !!!parse-error (type => 'no refc');
3462 if ($self->{prev_state} != DATA_STATE and # in attribute
3463 $self->{entity__match} < -1) {
3464 !!!cp (1024);
3465 $data = '&' . $self->{kwd};
3466 #
3467 } else {
3468 !!!cp (1025);
3469 $data = $self->{entity__value};
3470 $has_ref = 1;
3471 #
3472 }
3473 } else {
3474 !!!cp (1026);
3475 !!!parse-error (type => 'bare ero',
3476 line => $self->{line_prev},
3477 column => $self->{column_prev} - length $self->{kwd});
3478 $data = '&' . $self->{kwd};
3479 #
3480 }
3481
3482 ## NOTE: According to the spec algorithm, when a character reference
3483 ## is found in these cases, it is consumed and a character token is
3484 ## returned; otherwise, nothing is consumed and nothing is returned.
3485 ## In this implementation, everything that has been examined by the
3486 ## tokenizer is appended to the parent element or the attribute value
3487 ## as a string at this stage: the literal string when there is no
3488 ## character reference, or the entity-replaced string otherwise. This
3489 ## is equivalent, since any characters that would not be consumed are
3490 ## appended in the data state or in an appropriate attribute value state anyway.
3491
3492 if ($self->{prev_state} == DATA_STATE) {
3493 !!!cp (986);
3494 $self->{state} = $self->{prev_state};
3495 $self->{s_kwd} = '';
3496 ## Reconsume.
3497 !!!emit ({type => CHARACTER_TOKEN,
3498 data => $data,
3499 has_reference => $has_ref,
3500 line => $self->{line_prev},
3501 column => $self->{column_prev} + 1 - length $self->{kwd},
3502 });
3503 redo A;
3504 } else {
3505 !!!cp (985);
3506 $self->{ca}->{value} .= $data;
3507 $self->{ca}->{has_reference} = 1 if $has_ref;
3508 $self->{state} = $self->{prev_state};
3509 $self->{s_kwd} = '';
3510 ## Reconsume.
3511 redo A;
3512 }
3513
3514 ## XML-only states
3515
3516 } elsif ($self->{state} == PI_STATE) {
3517 ## XML5: "Pi state" and "DOCTYPE pi state".
3518
3519 if ($is_space->{$self->{nc}} or
3520 $self->{nc} == 0x003F or # ?
3521 $self->{nc} == -1) {
3522 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3523 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3524 ## "DOCTYPE pi state": Parse error, switch to the "data
3525 ## state".
3526 !!!parse-error (type => 'bare pio', ## TODO: type
3527 line => $self->{line_prev},
3528 column => $self->{column_prev}
3529 - 1 * ($self->{nc} != -1));
3530 $self->{state} = BOGUS_COMMENT_STATE;
3531 ## Reconsume.
3532 $self->{ct} = {type => COMMENT_TOKEN,
3533 data => '?',
3534 line => $self->{line_prev},
3535 column => $self->{column_prev}
3536 - 1 * ($self->{nc} != -1),
3537 };
3538 redo A;
3539 } else {
3540 ## XML5: "DOCTYPE pi state": Stay in the state.
3541 $self->{ct} = {type => PI_TOKEN,
3542 target => chr $self->{nc},
3543 data => '',
3544 line => $self->{line_prev},
3545 column => $self->{column_prev} - 1,
3546 };
3547 $self->{state} = PI_TARGET_STATE;
3548 !!!next-input-character;
3549 redo A;
3550 }
3551 } elsif ($self->{state} == PI_TARGET_STATE) {
3552 if ($is_space->{$self->{nc}}) {
3553 $self->{state} = PI_TARGET_AFTER_STATE;
3554 !!!next-input-character;
3555 redo A;
3556 } elsif ($self->{nc} == -1) {
3557 !!!parse-error (type => 'no pic'); ## TODO: type
3558 if ($self->{in_subset}) {
3559 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3560 } else {
3561 $self->{state} = DATA_STATE;
3562 $self->{s_kwd} = '';
3563 }
3564 ## Reconsume.
3565 !!!emit ($self->{ct}); # pi
3566 redo A;
3567 } elsif ($self->{nc} == 0x003F) { # ?
3568 $self->{state} = PI_AFTER_STATE;
3569 !!!next-input-character;
3570 redo A;
3571 } else {
3572 ## XML5: typo ("tag name" -> "target")
3573 $self->{ct}->{target} .= chr $self->{nc}; # pi
3574 !!!next-input-character;
3575 redo A;
3576 }
3577 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3578 if ($is_space->{$self->{nc}}) {
3579 ## Stay in the state.
3580 !!!next-input-character;
3581 redo A;
3582 } else {
3583 $self->{state} = PI_DATA_STATE;
3584 ## Reprocess.
3585 redo A;
3586 }
3587 } elsif ($self->{state} == PI_DATA_STATE) {
3588 if ($self->{nc} == 0x003F) { # ?
3589 $self->{state} = PI_DATA_AFTER_STATE;
3590 !!!next-input-character;
3591 redo A;
3592 } elsif ($self->{nc} == -1) {
3593 !!!parse-error (type => 'no pic'); ## TODO: type
3594 if ($self->{in_subset}) {
3595 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3596 } else {
3597 $self->{state} = DATA_STATE;
3598 $self->{s_kwd} = '';
3599 }
3600 ## Reprocess.
3601 !!!emit ($self->{ct}); # pi
3602 redo A;
3603 } else {
3604 $self->{ct}->{data} .= chr $self->{nc}; # pi
3605 $self->{read_until}->($self->{ct}->{data}, q[?],
3606 length $self->{ct}->{data});
3607 ## Stay in the state.
3608 !!!next-input-character;
3609 ## Reprocess.
3610 redo A;
3611 }
3612 } elsif ($self->{state} == PI_AFTER_STATE) {
3613 ## XML5: Part of "Pi after state".
3614
3615 if ($self->{nc} == 0x003E) { # >
3616 if ($self->{in_subset}) {
3617 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3618 } else {
3619 $self->{state} = DATA_STATE;
3620 $self->{s_kwd} = '';
3621 }
3622 !!!next-input-character;
3623 !!!emit ($self->{ct}); # pi
3624 redo A;
3625 } elsif ($self->{nc} == 0x003F) { # ?
3626 !!!parse-error (type => 'no s after target', ## TODO: type
3627 line => $self->{line_prev},
3628 column => $self->{column_prev}); ## XML5: no error
3629 $self->{ct}->{data} .= '?';
3630 $self->{state} = PI_DATA_AFTER_STATE;
3631 !!!next-input-character;
3632 redo A;
3633 } else {
3634 !!!parse-error (type => 'no s after target', ## TODO: type
3635 line => $self->{line_prev},
3636 column => $self->{column_prev}
3637 + 1 * ($self->{nc} == -1)); ## XML5: no error
3638 $self->{ct}->{data} .= '?'; ## XML5: not appended
3639 $self->{state} = PI_DATA_STATE;
3640 ## Reprocess.
3641 redo A;
3642 }
3643 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3644 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3645
3646 if ($self->{nc} == 0x003E) { # >
3647 if ($self->{in_subset}) {
3648 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3649 } else {
3650 $self->{state} = DATA_STATE;
3651 $self->{s_kwd} = '';
3652 }
3653 !!!next-input-character;
3654 !!!emit ($self->{ct}); # pi
3655 redo A;
3656 } elsif ($self->{nc} == 0x003F) { # ?
3657 $self->{ct}->{data} .= '?';
3658 ## Stay in the state.
3659 !!!next-input-character;
3660 redo A;
3661 } else {
3662 $self->{ct}->{data} .= '?'; ## XML5: not appended
3663 $self->{state} = PI_DATA_STATE;
3664 ## Reprocess.
3665 redo A;
3666 }
3667
3668 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3669 if ($self->{nc} == 0x003C) { # <
3670 $self->{state} = DOCTYPE_TAG_STATE;
3671 !!!next-input-character;
3672 redo A;
3673 } elsif ($self->{nc} == 0x0025) { # %
3674 ## XML5: Not defined yet.
3675
3676 ## TODO:
3677
3678 if (not $self->{stop_processing} and
3679 not $self->{document}->xml_standalone) {
3680 !!!parse-error (type => 'stop processing', ## TODO: type
3681 level => $self->{level}->{info});
3682 $self->{stop_processing} = 1;
3683 }
3684
3685 !!!next-input-character;
3686 redo A;
3687 } elsif ($self->{nc} == 0x005D) { # ]
3688 delete $self->{in_subset};
3689 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3690 !!!next-input-character;
3691 redo A;
3692 } elsif ($is_space->{$self->{nc}}) {
3693 ## Stay in the state.
3694 !!!next-input-character;
3695 redo A;
3696 } elsif ($self->{nc} == -1) {
3697 !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3698 delete $self->{in_subset};
3699 $self->{state} = DATA_STATE;
3700 $self->{s_kwd} = '';
3701 ## Reconsume.
3702 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3703 redo A;
3704 } else {
3705 unless ($self->{internal_subset_tainted}) {
3706 ## XML5: No parse error.
3707 !!!parse-error (type => 'string in internal subset');
3708 $self->{internal_subset_tainted} = 1;
3709 }
3710 ## Stay in the state.
3711 !!!next-input-character;
3712 redo A;
3713 }
3714 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3715 if ($self->{nc} == 0x003E) { # >
3716 $self->{state} = DATA_STATE;
3717 $self->{s_kwd} = '';
3718 !!!next-input-character;
3719 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3720 redo A;
3721 } elsif ($self->{nc} == -1) {
3722 !!!parse-error (type => 'unclosed DOCTYPE');
3723 $self->{state} = DATA_STATE;
3724 $self->{s_kwd} = '';
3725 ## Reconsume.
3726 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3727 redo A;
3728 } else {
3729 ## XML5: No parse error and stay in the state.
3730 !!!parse-error (type => 'string after internal subset'); ## TODO: type
3731
3732 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3733 !!!next-input-character;
3734 redo A;
3735 }
3736 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3737 if ($self->{nc} == 0x003E) { # >
3738 $self->{state} = DATA_STATE;
3739 $self->{s_kwd} = '';
3740 !!!next-input-character;
3741 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3742 redo A;
3743 } elsif ($self->{nc} == -1) {
3744 $self->{state} = DATA_STATE;
3745 $self->{s_kwd} = '';
3746 ## Reconsume.
3747 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3748 redo A;
3749 } else {
3750 ## Stay in the state.
3751 !!!next-input-character;
3752 redo A;
3753 }
3754 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3755 if ($self->{nc} == 0x0021) { # !
3756 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3757 !!!next-input-character;
3758 redo A;
3759 } elsif ($self->{nc} == 0x003F) { # ?
3760 $self->{state} = PI_STATE;
3761 !!!next-input-character;
3762 redo A;
3763 } elsif ($self->{nc} == -1) {
3764 !!!parse-error (type => 'bare stago');
3765 $self->{state} = DATA_STATE;
3766 $self->{s_kwd} = '';
3767 ## Reconsume.
3768 redo A;
3769 } else {
3770 !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3771 line => $self->{line_prev},
3772 column => $self->{column_prev});
3773 $self->{state} = BOGUS_COMMENT_STATE;
3774 $self->{ct} = {type => COMMENT_TOKEN,
3775 data => '',
3776 }; ## NOTE: Will be discarded.
3777 !!!next-input-character;
3778 redo A;
3779 }
3780 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3781 ## XML5: "DOCTYPE markup declaration state".
3782
3783 if ($self->{nc} == 0x002D) { # -
3784 $self->{state} = MD_HYPHEN_STATE;
3785 !!!next-input-character;
3786 redo A;
3787 } elsif ($self->{nc} == 0x0045 or # E
3788 $self->{nc} == 0x0065) { # e
3789 $self->{state} = MD_E_STATE;
3790 $self->{kwd} = chr $self->{nc};
3791 !!!next-input-character;
3792 redo A;
3793 } elsif ($self->{nc} == 0x0041 or # A
3794 $self->{nc} == 0x0061) { # a
3795 $self->{state} = MD_ATTLIST_STATE;
3796 $self->{kwd} = chr $self->{nc};
3797 !!!next-input-character;
3798 redo A;
3799 } elsif ($self->{nc} == 0x004E or # N
3800 $self->{nc} == 0x006E) { # n
3801 $self->{state} = MD_NOTATION_STATE;
3802 $self->{kwd} = chr $self->{nc};
3803 !!!next-input-character;
3804 redo A;
3805 } else {
3806 #
3807 }
3808
3809 ## XML5: No parse error.
3810 !!!parse-error (type => 'bogus comment',
3811 line => $self->{line_prev},
3812 column => $self->{column_prev} - 1);
3813 ## Reconsume.
3814 $self->{state} = BOGUS_COMMENT_STATE;
3815 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3816 redo A;
3817 } elsif ($self->{state} == MD_E_STATE) {
3818 if ($self->{nc} == 0x004E or # N
3819 $self->{nc} == 0x006E) { # n
3820 $self->{state} = MD_ENTITY_STATE;
3821 $self->{kwd} .= chr $self->{nc};
3822 !!!next-input-character;
3823 redo A;
3824 } elsif ($self->{nc} == 0x004C or # L
3825 $self->{nc} == 0x006C) { # l
3826 ## XML5: <!ELEMENT> not supported.
3827 $self->{state} = MD_ELEMENT_STATE;
3828 $self->{kwd} .= chr $self->{nc};
3829 !!!next-input-character;
3830 redo A;
3831 } else {
3832 ## XML5: No parse error.
3833 !!!parse-error (type => 'bogus comment',
3834 line => $self->{line_prev},
3835 column => $self->{column_prev} - 2
3836 + 1 * ($self->{nc} == -1));
3837 ## Reconsume.
3838 $self->{state} = BOGUS_COMMENT_STATE;
3839 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3840 redo A;
3841 }
3842 } elsif ($self->{state} == MD_ENTITY_STATE) {
3843 if ($self->{nc} == [
3844 undef,
3845 undef,
3846 0x0054, # T
3847 0x0049, # I
3848 0x0054, # T
3849 ]->[length $self->{kwd}] or
3850 $self->{nc} == [
3851 undef,
3852 undef,
3853 0x0074, # t
3854 0x0069, # i
3855 0x0074, # t
3856 ]->[length $self->{kwd}]) {
3857 ## Stay in the state.
3858 $self->{kwd} .= chr $self->{nc};
3859 !!!next-input-character;
3860 redo A;
3861 } elsif ((length $self->{kwd}) == 5 and
3862 ($self->{nc} == 0x0059 or # Y
3863 $self->{nc} == 0x0079)) { # y
3864 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3865 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3866 text => 'ENTITY',
3867 line => $self->{line_prev},
3868 column => $self->{column_prev} - 4);
3869 }
3870 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3871 line => $self->{line_prev},
3872 column => $self->{column_prev} - 6};
3873 $self->{state} = DOCTYPE_MD_STATE;
3874 !!!next-input-character;
3875 redo A;
3876 } else {
3877 !!!parse-error (type => 'bogus comment',
3878 line => $self->{line_prev},
3879 column => $self->{column_prev} - 1
3880 - (length $self->{kwd})
3881 + 1 * ($self->{nc} == -1));
3882 $self->{state} = BOGUS_COMMENT_STATE;
3883 ## Reconsume.
3884 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3885 redo A;
3886 }
3887 } elsif ($self->{state} == MD_ELEMENT_STATE) {
3888 if ($self->{nc} == [
3889 undef,
3890 undef,
3891 0x0045, # E
3892 0x004D, # M
3893 0x0045, # E
3894 0x004E, # N
3895 ]->[length $self->{kwd}] or
3896 $self->{nc} == [
3897 undef,
3898 undef,
3899 0x0065, # e
3900 0x006D, # m
3901 0x0065, # e
3902 0x006E, # n
3903 ]->[length $self->{kwd}]) {
3904 ## Stay in the state.
3905 $self->{kwd} .= chr $self->{nc};
3906 !!!next-input-character;
3907 redo A;
3908 } elsif ((length $self->{kwd}) == 6 and
3909 ($self->{nc} == 0x0054 or # T
3910 $self->{nc} == 0x0074)) { # t
3911 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3912 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3913 text => 'ELEMENT',
3914 line => $self->{line_prev},
3915 column => $self->{column_prev} - 5);
3916 }
3917 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3918 line => $self->{line_prev},
3919 column => $self->{column_prev} - 7};
3920 $self->{state} = DOCTYPE_MD_STATE;
3921 !!!next-input-character;
3922 redo A;
3923 } else {
3924 !!!parse-error (type => 'bogus comment',
3925 line => $self->{line_prev},
3926 column => $self->{column_prev} - 1
3927 - (length $self->{kwd})
3928 + 1 * ($self->{nc} == -1));
3929 $self->{state} = BOGUS_COMMENT_STATE;
3930 ## Reconsume.
3931 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3932 redo A;
3933 }
3934 } elsif ($self->{state} == MD_ATTLIST_STATE) {
3935 if ($self->{nc} == [
3936 undef,
3937 0x0054, # T
3938 0x0054, # T
3939 0x004C, # L
3940 0x0049, # I
3941 0x0053, # S
3942 ]->[length $self->{kwd}] or
3943 $self->{nc} == [
3944 undef,
3945 0x0074, # t
3946 0x0074, # t
3947 0x006C, # l
3948 0x0069, # i
3949 0x0073, # s
3950 ]->[length $self->{kwd}]) {
3951 ## Stay in the state.
3952 $self->{kwd} .= chr $self->{nc};
3953 !!!next-input-character;
3954 redo A;
3955 } elsif ((length $self->{kwd}) == 6 and
3956 ($self->{nc} == 0x0054 or # T
3957 $self->{nc} == 0x0074)) { # t
3958 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3959 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3960 text => 'ATTLIST',
3961 line => $self->{line_prev},
3962 column => $self->{column_prev} - 5);
3963 }
3964 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3965 attrdefs => [],
3966 line => $self->{line_prev},
3967 column => $self->{column_prev} - 7};
3968 $self->{state} = DOCTYPE_MD_STATE;
3969 !!!next-input-character;
3970 redo A;
3971 } else {
3972 !!!parse-error (type => 'bogus comment',
3973 line => $self->{line_prev},
3974 column => $self->{column_prev} - 1
3975 - (length $self->{kwd})
3976 + 1 * ($self->{nc} == -1));
3977 $self->{state} = BOGUS_COMMENT_STATE;
3978 ## Reconsume.
3979 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3980 redo A;
3981 }
3982 } elsif ($self->{state} == MD_NOTATION_STATE) {
3983 if ($self->{nc} == [
3984 undef,
3985 0x004F, # O
3986 0x0054, # T
3987 0x0041, # A
3988 0x0054, # T
3989 0x0049, # I
3990 0x004F, # O
3991 ]->[length $self->{kwd}] or
3992 $self->{nc} == [
3993 undef,
3994 0x006F, # o
3995 0x0074, # t
3996 0x0061, # a
3997 0x0074, # t
3998 0x0069, # i
3999 0x006F, # o
4000 ]->[length $self->{kwd}]) {
4001 ## Stay in the state.
4002 $self->{kwd} .= chr $self->{nc};
4003 !!!next-input-character;
4004 redo A;
4005 } elsif ((length $self->{kwd}) == 7 and
4006 ($self->{nc} == 0x004E or # N
4007 $self->{nc} == 0x006E)) { # n
4008 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4009 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4010 text => 'NOTATION',
4011 line => $self->{line_prev},
4012 column => $self->{column_prev} - 6);
4013 }
4014 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4015 line => $self->{line_prev},
4016 column => $self->{column_prev} - 8};
4017 $self->{state} = DOCTYPE_MD_STATE;
4018 !!!next-input-character;
4019 redo A;
4020 } else {
4021 !!!parse-error (type => 'bogus comment',
4022 line => $self->{line_prev},
4023 column => $self->{column_prev} - 1
4024 - (length $self->{kwd})
4025 + 1 * ($self->{nc} == -1));
4026 $self->{state} = BOGUS_COMMENT_STATE;
4027 ## Reconsume.
4028 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4029 redo A;
4030 }
4031 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4032 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4033 ## "DOCTYPE NOTATION state".
4034
4035 if ($is_space->{$self->{nc}}) {
4036 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4037 $self->{state} = BEFORE_MD_NAME_STATE;
4038 !!!next-input-character;
4039 redo A;
4040 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4041 $self->{nc} == 0x0025) { # %
4042 ## XML5: Switch to the "DOCTYPE bogus comment state".
4043 !!!parse-error (type => 'no space before md name'); ## TODO: type
4044 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4045 !!!next-input-character;
4046 redo A;
4047 } elsif ($self->{nc} == -1) {
4048 !!!parse-error (type => 'unclosed md'); ## TODO: type
4049 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4050 ## Reconsume.
4051 redo A;
4052 } elsif ($self->{nc} == 0x003E) { # >
4053 ## XML5: Switch to the "DOCTYPE bogus comment state".
4054 !!!parse-error (type => 'no md name'); ## TODO: type
4055 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4056 !!!next-input-character;
4057 redo A;
4058 } else {
4059 ## XML5: Switch to the "DOCTYPE bogus comment state".
4060 !!!parse-error (type => 'no space before md name'); ## TODO: type
4061 $self->{state} = BEFORE_MD_NAME_STATE;
4062 redo A;
4063 }
4064 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4065 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4066 ## before state", "DOCTYPE ATTLIST name before state".
4067
4068 if ($is_space->{$self->{nc}}) {
4069 ## Stay in the state.
4070 !!!next-input-character;
4071 redo A;
4072 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4073 $self->{nc} == 0x0025) { # %
4074 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4075 !!!next-input-character;
4076 redo A;
4077 } elsif ($self->{nc} == 0x003E) { # >
4078 ## XML5: Same as "Anything else".
4079 !!!parse-error (type => 'no md name'); ## TODO: type
4080 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4081 !!!next-input-character;
4082 redo A;
4083 } elsif ($self->{nc} == -1) {
4084 !!!parse-error (type => 'unclosed md'); ## TODO: type
4085 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4086 ## Reconsume.
4087 redo A;
4088 } else {
4089 ## XML5: [ATTLIST] Not defined yet.
4090 $self->{ct}->{name} .= chr $self->{nc};
4091 $self->{state} = MD_NAME_STATE;
4092 !!!next-input-character;
4093 redo A;
4094 }
4095 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4096 if ($is_space->{$self->{nc}}) {
4097 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4098 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4099 $self->{state} = BEFORE_MD_NAME_STATE;
4100 !!!next-input-character;
4101 redo A;
4102 } elsif ($self->{nc} == 0x003E) { # >
4103 ## XML5: Same as "Anything else".
4104 !!!parse-error (type => 'no md name'); ## TODO: type
4105 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4106 !!!next-input-character;
4107 redo A;
4108 } elsif ($self->{nc} == -1) {
4109 !!!parse-error (type => 'unclosed md');
4110 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4111 ## Reconsume.
4112 redo A;
4113 } else {
4114 ## XML5: No parse error.
4115 !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4116 $self->{state} = BOGUS_COMMENT_STATE;
4117 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4118 ## Reconsume.
4119 redo A;
4120 }
4121 } elsif ($self->{state} == MD_NAME_STATE) {
4122 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4123
4124 if ($is_space->{$self->{nc}}) {
4125 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4126 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4127 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4128 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4129 } else { # ENTITY/NOTATION
4130 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4131 }
4132 !!!next-input-character;
4133 redo A;
4134 } elsif ($self->{nc} == 0x003E) { # >
4135 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4136 #
4137 } else {
4138 !!!parse-error (type => 'no md def'); ## TODO: type
4139 }
4140 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4141 !!!next-input-character;
4142 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4143 redo A;
4144 } elsif ($self->{nc} == -1) {
4145 ## XML5: [ATTLIST] No parse error.
4146 !!!parse-error (type => 'unclosed md');
4147 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4148 ## Reconsume.
4149 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4150 redo A;
4151 } else {
4152 ## XML5: [ATTLIST] Not defined yet.
4153 $self->{ct}->{name} .= chr $self->{nc};
4154 ## Stay in the state.
4155 !!!next-input-character;
4156 redo A;
4157 }
4158 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4159 if ($is_space->{$self->{nc}}) {
4160 ## Stay in the state.
4161 !!!next-input-character;
4162 redo A;
4163 } elsif ($self->{nc} == 0x003E) { # >
4164 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4165 !!!next-input-character;
4166 !!!emit ($self->{ct}); # ATTLIST
4167 redo A;
4168 } elsif ($self->{nc} == -1) {
4169 ## XML5: No parse error.
4170 !!!parse-error (type => 'unclosed md'); ## TODO: type
4171 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4172 !!!emit ($self->{ct});
4173 redo A;
4174 } else {
4175 ## XML5: Not defined yet.
4176 $self->{ca} = {name => chr ($self->{nc}), # attrdef
4177 tokens => [],
4178 line => $self->{line}, column => $self->{column}};
4179 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4180 !!!next-input-character;
4181 redo A;
4182 }
4183 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4184 if ($is_space->{$self->{nc}}) {
4185 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4186 !!!next-input-character;
4187 redo A;
4188 } elsif ($self->{nc} == 0x003E) { # >
4189 ## XML5: Same as "anything else".
4190 !!!parse-error (type => 'no attr type'); ## TODO: type
4191 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4192 !!!next-input-character;
4193 !!!emit ($self->{ct}); # ATTLIST
4194 redo A;
4195 } elsif ($self->{nc} == 0x0028) { # (
4196 ## XML5: Same as "anything else".
4197 !!!parse-error (type => 'no space before paren'); ## TODO: type
4198 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4199 !!!next-input-character;
4200 redo A;
4201 } elsif ($self->{nc} == -1) {
4202 ## XML5: No parse error.
4203 !!!parse-error (type => 'unclosed md'); ## TODO: type
4204 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4205 !!!next-input-character;
4206 !!!emit ($self->{ct}); # ATTLIST
4207 redo A;
4208 } else {
4209 ## XML5: Not defined yet.
4210 $self->{ca}->{name} .= chr $self->{nc};
4211 ## Stay in the state.
4212 !!!next-input-character;
4213 redo A;
4214 }
4215 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4216 if ($is_space->{$self->{nc}}) {
4217 ## Stay in the state.
4218 !!!next-input-character;
4219 redo A;
4220 } elsif ($self->{nc} == 0x003E) { # >
4221 ## XML5: Same as "anything else".
4222 !!!parse-error (type => 'no attr type'); ## TODO: type
4223 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4224 !!!next-input-character;
4225 !!!emit ($self->{ct}); # ATTLIST
4226 redo A;
4227 } elsif ($self->{nc} == 0x0028) { # (
4228 ## XML5: Same as "anything else".
4229 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4230 !!!next-input-character;
4231 redo A;
4232 } elsif ($self->{nc} == -1) {
4233 ## XML5: No parse error.
4234 !!!parse-error (type => 'unclosed md'); ## TODO: type
4235 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4236 !!!next-input-character;
4237 !!!emit ($self->{ct});
4238 redo A;
4239 } else {
4240 ## XML5: Not defined yet.
4241 $self->{ca}->{type} = chr $self->{nc};
4242 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4243 !!!next-input-character;
4244 redo A;
4245 }
4246 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4247 if ($is_space->{$self->{nc}}) {
4248 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4249 !!!next-input-character;
4250 redo A;
4251 } elsif ($self->{nc} == 0x0023) { # #
4252 ## XML5: Same as "anything else".
4253 !!!parse-error (type => 'no space before default value'); ## TODO: type
4254 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4255 !!!next-input-character;
4256 redo A;
4257 } elsif ($self->{nc} == 0x0022) { # "
4258 ## XML5: Same as "anything else".
4259 !!!parse-error (type => 'no space before default value'); ## TODO: type
4260 $self->{ca}->{value} = '';
4261 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4262 !!!next-input-character;
4263 redo A;
4264 } elsif ($self->{nc} == 0x0027) { # '
4265 ## XML5: Same as "anything else".
4266 !!!parse-error (type => 'no space before default value'); ## TODO: type
4267 $self->{ca}->{value} = '';
4268 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4269 !!!next-input-character;
4270 redo A;
4271 } elsif ($self->{nc} == 0x003E) { # >
4272 ## XML5: Same as "anything else".
4273 !!!parse-error (type => 'no attr default'); ## TODO: type
4274 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4275 !!!next-input-character;
4276 !!!emit ($self->{ct}); # ATTLIST
4277 redo A;
4278 } elsif ($self->{nc} == 0x0028) { # (
4279 ## XML5: Same as "anything else".
4280 !!!parse-error (type => 'no space before paren'); ## TODO: type
4281 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4282 !!!next-input-character;
4283 redo A;
4284 } elsif ($self->{nc} == -1) {
4285 ## XML5: No parse error.
4286 !!!parse-error (type => 'unclosed md'); ## TODO: type
4287 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4288 !!!next-input-character;
4289 !!!emit ($self->{ct});
4290 redo A;
4291 } else {
4292 ## XML5: Not defined yet.
4293 $self->{ca}->{type} .= chr $self->{nc};
4294 ## Stay in the state.
4295 !!!next-input-character;
4296 redo A;
4297 }
4298 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4299 if ($is_space->{$self->{nc}}) {
4300 ## Stay in the state.
4301 !!!next-input-character;
4302 redo A;
4303 } elsif ($self->{nc} == 0x0028) { # (
4304 ## XML5: Same as "anything else".
4305 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4306 !!!next-input-character;
4307 redo A;
4308 } elsif ($self->{nc} == 0x0023) { # #
4309 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4310 !!!next-input-character;
4311 redo A;
4312 } elsif ($self->{nc} == 0x0022) { # "
4313 ## XML5: Same as "anything else".
4314 $self->{ca}->{value} = '';
4315 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4316 !!!next-input-character;
4317 redo A;
4318 } elsif ($self->{nc} == 0x0027) { # '
4319 ## XML5: Same as "anything else".
4320 $self->{ca}->{value} = '';
4321 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4322 !!!next-input-character;
4323 redo A;
4324 } elsif ($self->{nc} == 0x003E) { # >
4325 ## XML5: Same as "anything else".
4326 !!!parse-error (type => 'no attr default'); ## TODO: type
4327 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4328 !!!next-input-character;
4329 !!!emit ($self->{ct}); # ATTLIST
4330 redo A;
4331 } elsif ($self->{nc} == -1) {
4332 ## XML5: No parse error.
4333 !!!parse-error (type => 'unclosed md'); ## TODO: type
4334 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4335 !!!next-input-character;
4336 !!!emit ($self->{ct});
4337 redo A;
4338 } else {
4339 ## XML5: Switch to the "DOCTYPE bogus comment state".
4340 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4341 $self->{ca}->{value} = '';
4342 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4343 ## Reconsume.
4344 redo A;
4345 }
4346 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4347 if ($is_space->{$self->{nc}}) {
4348 ## Stay in the state.
4349 !!!next-input-character;
4350 redo A;
4351 } elsif ($self->{nc} == 0x007C) { # |
4352 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4353 ## Stay in the state.
4354 !!!next-input-character;
4355 redo A;
4356 } elsif ($self->{nc} == 0x0029) { # )
4357 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4358 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4359 !!!next-input-character;
4360 redo A;
4361 } elsif ($self->{nc} == 0x003E) { # >
4362 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4363 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4364 !!!next-input-character;
4365 !!!emit ($self->{ct}); # ATTLIST
4366 redo A;
4367 } elsif ($self->{nc} == -1) {
4368 ## XML5: No parse error.
4369 !!!parse-error (type => 'unclosed md'); ## TODO: type
4370 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4371 !!!next-input-character;
4372 !!!emit ($self->{ct});
4373 redo A;
4374 } else {
4375 push @{$self->{ca}->{tokens}}, chr $self->{nc};
4376 $self->{state} = ALLOWED_TOKEN_STATE;
4377 !!!next-input-character;
4378 redo A;
4379 }
4380 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4381 if ($is_space->{$self->{nc}}) {
4382 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4383 !!!next-input-character;
4384 redo A;
4385 } elsif ($self->{nc} == 0x007C) { # |
4386 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4387 !!!next-input-character;
4388 redo A;
4389 } elsif ($self->{nc} == 0x0029) { # )
4390 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4391 !!!next-input-character;
4392 redo A;
4393 } elsif ($self->{nc} == 0x003E) { # >
4394 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4395 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4396 !!!next-input-character;
4397 !!!emit ($self->{ct}); # ATTLIST
4398 redo A;
4399 } elsif ($self->{nc} == -1) {
4400 ## XML5: No parse error.
4401 !!!parse-error (type => 'unclosed md'); ## TODO: type
4402 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4403 !!!next-input-character;
4404 !!!emit ($self->{ct});
4405 redo A;
4406 } else {
4407 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4408 ## Stay in the state.
4409 !!!next-input-character;
4410 redo A;
4411 }
4412 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4413 if ($is_space->{$self->{nc}}) {
4414 ## Stay in the state.
4415 !!!next-input-character;
4416 redo A;
4417 } elsif ($self->{nc} == 0x007C) { # |
4418 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4419 !!!next-input-character;
4420 redo A;
4421 } elsif ($self->{nc} == 0x0029) { # )
4422 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4423 !!!next-input-character;
4424 redo A;
4425 } elsif ($self->{nc} == 0x003E) { # >
4426 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4427 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4428 !!!next-input-character;
4429 !!!emit ($self->{ct}); # ATTLIST
4430 redo A;
4431 } elsif ($self->{nc} == -1) {
4432 ## XML5: No parse error.
4433 !!!parse-error (type => 'unclosed md'); ## TODO: type
4434 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4435 !!!next-input-character;
4436 !!!emit ($self->{ct});
4437 redo A;
4438 } else {
4439 !!!parse-error (type => 'space in allowed token', ## TODO: type
4440 line => $self->{line_prev},
4441 column => $self->{column_prev});
4442 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4443 $self->{state} = ALLOWED_TOKEN_STATE;
4444 !!!next-input-character;
4445 redo A;
4446 }
4447 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4448 if ($is_space->{$self->{nc}}) {
4449 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4450 !!!next-input-character;
4451 redo A;
4452 } elsif ($self->{nc} == 0x0023) { # #
4453 !!!parse-error (type => 'no space before default value'); ## TODO: type
4454 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4455 !!!next-input-character;
4456 redo A;
4457 } elsif ($self->{nc} == 0x0022) { # "
4458 !!!parse-error (type => 'no space before default value'); ## TODO: type
4459 $self->{ca}->{value} = '';
4460 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4461 !!!next-input-character;
4462 redo A;
4463 } elsif ($self->{nc} == 0x0027) { # '
4464 !!!parse-error (type => 'no space before default value'); ## TODO: type
4465 $self->{ca}->{value} = '';
4466 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4467 !!!next-input-character;
4468 redo A;
4469 } elsif ($self->{nc} == 0x003E) { # >
4470 !!!parse-error (type => 'no attr default'); ## TODO: type
4471 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4472 !!!next-input-character;
4473 !!!emit ($self->{ct}); # ATTLIST
4474 redo A;
4475 } elsif ($self->{nc} == -1) {
4476 !!!parse-error (type => 'unclosed md'); ## TODO: type
4477 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4478 !!!next-input-character;
4479 !!!emit ($self->{ct});
4480 redo A;
4481 } else {
4482 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4483 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4484 ## Reconsume.
4485 redo A;
4486 }
4487 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4488 if ($is_space->{$self->{nc}}) {
4489 ## Stay in the state.
4490 !!!next-input-character;
4491 redo A;
4492 } elsif ($self->{nc} == 0x0023) { # #
4493 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4494 !!!next-input-character;
4495 redo A;
4496 } elsif ($self->{nc} == 0x0022) { # "
4497 $self->{ca}->{value} = '';
4498 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4499 !!!next-input-character;
4500 redo A;
4501 } elsif ($self->{nc} == 0x0027) { # '
4502 $self->{ca}->{value} = '';
4503 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4504 !!!next-input-character;
4505 redo A;
4506 } elsif ($self->{nc} == 0x003E) { # >
4507 !!!parse-error (type => 'no attr default'); ## TODO: type
4508 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4509 !!!next-input-character;
4510 !!!emit ($self->{ct}); # ATTLIST
4511 redo A;
4512 } elsif ($self->{nc} == -1) {
4513 !!!parse-error (type => 'unclosed md'); ## TODO: type
4514 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4515 !!!next-input-character;
4516 !!!emit ($self->{ct});
4517 redo A;
4518 } else {
4519 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4520 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4521 ## Reconsume.
4522 redo A;
4523 }
4524 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4525 if ($is_space->{$self->{nc}}) {
4526 ## XML5: No parse error.
4527 !!!parse-error (type => 'no default type'); ## TODO: type
4528 $self->{state} = BOGUS_MD_STATE;
4529 ## Reconsume.
4530 redo A;
4531 } elsif ($self->{nc} == 0x0022) { # "
4532 ## XML5: Same as "anything else".
4533 $self->{ca}->{value} = '';
4534 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4535 !!!next-input-character;
4536 redo A;
4537 } elsif ($self->{nc} == 0x0027) { # '
4538 ## XML5: Same as "anything else".
4539 $self->{ca}->{value} = '';
4540 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4541 !!!next-input-character;
4542 redo A;
4543 } elsif ($self->{nc} == 0x003E) { # >
4544 ## XML5: Same as "anything else".
4545 !!!parse-error (type => 'no attr default'); ## TODO: type
4546 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4547 !!!next-input-character;
4548 !!!emit ($self->{ct}); # ATTLIST
4549 redo A;
4550 } elsif ($self->{nc} == -1) {
4551 ## XML5: No parse error.
4552 !!!parse-error (type => 'unclosed md'); ## TODO: type
4553 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4554 !!!next-input-character;
4555 !!!emit ($self->{ct});
4556 redo A;
4557 } else {
4558 $self->{ca}->{default} = chr $self->{nc};
4559 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4560 !!!next-input-character;
4561 redo A;
4562 }
4563 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4564 if ($is_space->{$self->{nc}}) {
4565 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4566 !!!next-input-character;
4567 redo A;
4568 } elsif ($self->{nc} == 0x0022) { # "
4569 ## XML5: Same as "anything else".
4570 !!!parse-error (type => 'no space before default value'); ## TODO: type
4571 $self->{ca}->{value} = '';
4572 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4573 !!!next-input-character;
4574 redo A;
4575 } elsif ($self->{nc} == 0x0027) { # '
4576 ## XML5: Same as "anything else".
4577 !!!parse-error (type => 'no space before default value'); ## TODO: type
4578 $self->{ca}->{value} = '';
4579 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4580 !!!next-input-character;
4581 redo A;
4582 } elsif ($self->{nc} == 0x003E) { # >
4583 ## XML5: Same as "anything else".
4584 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4585 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4586 !!!next-input-character;
4587 !!!emit ($self->{ct}); # ATTLIST
4588 redo A;
4589 } elsif ($self->{nc} == -1) {
4590 ## XML5: No parse error.
4591 !!!parse-error (type => 'unclosed md'); ## TODO: type
4592 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4593 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4594 !!!next-input-character;
4595 !!!emit ($self->{ct});
4596 redo A;
4597 } else {
4598 $self->{ca}->{default} .= chr $self->{nc};
4599 ## Stay in the state.
4600 !!!next-input-character;
4601 redo A;
4602 }
4603 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4604 if ($is_space->{$self->{nc}}) {
4605 ## Stay in the state.
4606 !!!next-input-character;
4607 redo A;
4608 } elsif ($self->{nc} == 0x0022) { # "
4609 $self->{ca}->{value} = '';
4610 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4611 !!!next-input-character;
4612 redo A;
4613 } elsif ($self->{nc} == 0x0027) { # '
4614 $self->{ca}->{value} = '';
4615 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4616 !!!next-input-character;
4617 redo A;
4618 } elsif ($self->{nc} == 0x003E) { # >
4619 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4620 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4621 !!!next-input-character;
4622 !!!emit ($self->{ct}); # ATTLIST
4623 redo A;
4624 } elsif ($self->{nc} == -1) {
4625 ## XML5: No parse error.
4626 !!!parse-error (type => 'unclosed md'); ## TODO: type
4627 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4628 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4629 !!!next-input-character;
4630 !!!emit ($self->{ct});
4631 redo A;
4632 } else {
4633 ## XML5: Not defined yet.
4634 if ($self->{ca}->{default} eq 'FIXED') {
4635 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4636 } else {
4637 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4638 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4639 }
4640 ## Reconsume.
4641 redo A;
4642 }
4643 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4644 if ($is_space->{$self->{nc}} or
4645 $self->{nc} == -1 or
4646 $self->{nc} == 0x003E) { # >
4647 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4648 ## Reconsume.
4649 redo A;
4650 } else {
4651 !!!parse-error (type => 'no space before attr name'); ## TODO: type
4652 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4653 ## Reconsume.
4654 redo A;
4655 }
4656 } elsif ($self->{state} == NDATA_STATE) {
4657 ## ASCII case-insensitive
4658 if ($self->{nc} == [
4659 undef,
4660 0x0044, # D
4661 0x0041, # A
4662 0x0054, # T
4663 ]->[length $self->{kwd}] or
4664 $self->{nc} == [
4665 undef,
4666 0x0064, # d
4667 0x0061, # a
4668 0x0074, # t
4669 ]->[length $self->{kwd}]) {
4670 !!!cp (172.2);
4671 ## Stay in the state.
4672 $self->{kwd} .= chr $self->{nc};
4673 !!!next-input-character;
4674 redo A;
4675 } elsif ((length $self->{kwd}) == 4 and
4676 ($self->{nc} == 0x0041 or # A
4677 $self->{nc} == 0x0061)) { # a
4678 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4679 !!!cp (172.3);
4680 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4681 text => 'NDATA',
4682 line => $self->{line_prev},
4683 column => $self->{column_prev} - 4);
4684 } else {
4685 !!!cp (172.4);
4686 }
4687 $self->{state} = AFTER_NDATA_STATE;
4688 !!!next-input-character;
4689 redo A;
4690 } else {
4691 !!!parse-error (type => 'string after literal', ## TODO: type
4692 line => $self->{line_prev},
4693 column => $self->{column_prev} + 1
4694 - length $self->{kwd});
4695 !!!cp (172.5);
4696 $self->{state} = BOGUS_MD_STATE;
4697 ## Reconsume.
4698 redo A;
4699 }
4700 } elsif ($self->{state} == AFTER_NDATA_STATE) {
4701 if ($is_space->{$self->{nc}}) {
4702 $self->{state} = BEFORE_NOTATION_NAME_STATE;
4703 !!!next-input-character;
4704 redo A;
4705 } elsif ($self->{nc} == 0x003E) { # >
4706 !!!parse-error (type => 'no notation name'); ## TODO: type
4707 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4708 !!!next-input-character;
4709 !!!emit ($self->{ct}); # ENTITY
4710 redo A;
4711 } elsif ($self->{nc} == -1) {
4712 !!!parse-error (type => 'unclosed md'); ## TODO: type
4713 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4714 !!!next-input-character;
4715 !!!emit ($self->{ct}); # ENTITY
4716 redo A;
4717 } else {
4718 !!!parse-error (type => 'string after literal', ## TODO: type
4719 line => $self->{line_prev},
4720 column => $self->{column_prev} + 1
4721 - length $self->{kwd});
4722 $self->{state} = BOGUS_MD_STATE;
4723 ## Reconsume.
4724 redo A;
4725 }
4726 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4727 if ($is_space->{$self->{nc}}) {
4728 ## Stay in the state.
4729 !!!next-input-character;
4730 redo A;
4731 } elsif ($self->{nc} == 0x003E) { # >
4732 !!!parse-error (type => 'no notation name'); ## TODO: type
4733 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4734 !!!next-input-character;
4735 !!!emit ($self->{ct}); # ENTITY
4736 redo A;
4737 } elsif ($self->{nc} == -1) {
4738 !!!parse-error (type => 'unclosed md'); ## TODO: type
4739 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4740 !!!next-input-character;
4741 !!!emit ($self->{ct}); # ENTITY
4742 redo A;
4743 } else {
4744 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4745 $self->{state} = NOTATION_NAME_STATE;
4746 !!!next-input-character;
4747 redo A;
4748 }
4749 } elsif ($self->{state} == NOTATION_NAME_STATE) {
4750 if ($is_space->{$self->{nc}}) {
4751 $self->{state} = AFTER_MD_DEF_STATE;
4752 !!!next-input-character;
4753 redo A;
4754 } elsif ($self->{nc} == 0x003E) { # >
4755 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4756 !!!next-input-character;
4757 !!!emit ($self->{ct}); # ENTITY
4758 redo A;
4759 } elsif ($self->{nc} == -1) {
4760 !!!parse-error (type => 'unclosed md'); ## TODO: type
4761 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4762 !!!next-input-character;
4763 !!!emit ($self->{ct}); # ENTITY
4764 redo A;
4765 } else {
4766 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4767 ## Stay in the state.
4768 !!!next-input-character;
4769 redo A;
4770 }
4771 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4772 if ($self->{nc} == 0x0022) { # "
4773 $self->{state} = AFTER_MD_DEF_STATE;
4774 !!!next-input-character;
4775 redo A;
4776 } elsif ($self->{nc} == 0x0026) { # &
4777 $self->{prev_state} = $self->{state};
4778 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4779 $self->{entity_add} = 0x0022; # "
4780 !!!next-input-character;
4781 redo A;
4782 ## TODO: %
4783 } elsif ($self->{nc} == -1) {
4784 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4785 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4786 ## Reconsume.
4787 !!!emit ($self->{ct}); # ENTITY
4788 redo A;
4789 } else {
4790 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4791 !!!next-input-character;
4792 redo A;
4793 }
4794 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4795 if ($self->{nc} == 0x0027) { # '
4796 $self->{state} = AFTER_MD_DEF_STATE;
4797 !!!next-input-character;
4798 redo A;
4799 } elsif ($self->{nc} == 0x0026) { # &
4800 $self->{prev_state} = $self->{state};
4801 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4802 $self->{entity_add} = 0x0027; # '
4803 !!!next-input-character;
4804 redo A;
4805 ## TODO: %
4806 } elsif ($self->{nc} == -1) {
4807 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4808 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4809 ## Reconsume.
4810 !!!emit ($self->{ct}); # ENTITY
4811 redo A;
4812 } else {
4813 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4814 !!!next-input-character;
4815 redo A;
4816 }
4817 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4818 if ($is_space->{$self->{nc}} or
4819 {
4820 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4821 $self->{entity_add} => 1,
4822 }->{$self->{nc}}) {
4823 !!!parse-error (type => 'bare ero',
4824 line => $self->{line_prev},
4825 column => $self->{column_prev}
4826 + ($self->{nc} == -1 ? 1 : 0));
4827 ## Don't consume
4828 ## Return nothing.
4829 #
4830 } elsif ($self->{nc} == 0x0023) { # #
4831 $self->{ca} = $self->{ct};
4832 $self->{state} = ENTITY_HASH_STATE;
4833 $self->{kwd} = '#';
4834 !!!next-input-character;
4835 redo A;
4836 } else {
4837 #
4838 }
4839
4840 $self->{ct}->{value} .= '&';
4841 $self->{state} = $self->{prev_state};
4842 ## Reconsume.
4843 redo A;
4844 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4845 if ($is_space->{$self->{nc}}) {
4846 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4847 !!!next-input-character;
4848 redo A;
4849 } elsif ($self->{nc} == 0x0028) { # (
4850 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4851 $self->{ct}->{content} = ['('];
4852 $self->{group_depth} = 1;
4853 !!!next-input-character;
4854 redo A;
4855 } elsif ($self->{nc} == 0x003E) { # >
4856 !!!parse-error (type => 'no md def'); ## TODO: type
4857 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4858 !!!next-input-character;
4859 !!!emit ($self->{ct}); # ELEMENT
4860 redo A;
4861 } elsif ($self->{nc} == -1) {
4862 !!!parse-error (type => 'unclosed md'); ## TODO: type
4863 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4864 !!!next-input-character;
4865 !!!emit ($self->{ct}); # ELEMENT
4866 redo A;
4867 } else {
4868 $self->{ct}->{content} = [chr $self->{nc}];
4869 $self->{state} = CONTENT_KEYWORD_STATE;
4870 !!!next-input-character;
4871 redo A;
4872 }
4873 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4874 if ($is_space->{$self->{nc}}) {
4875 $self->{state} = AFTER_MD_DEF_STATE;
4876 !!!next-input-character;
4877 redo A;
4878 } elsif ($self->{nc} == 0x003E) { # >
4879 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4880 !!!next-input-character;
4881 !!!emit ($self->{ct}); # ELEMENT
4882 redo A;
4883 } elsif ($self->{nc} == -1) {
4884 !!!parse-error (type => 'unclosed md'); ## TODO: type
4885 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4886 !!!next-input-character;
4887 !!!emit ($self->{ct}); # ELEMENT
4888 redo A;
4889 } else {
4890 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4891 ## Stay in the state.
4892 !!!next-input-character;
4893 redo A;
4894 }
4895 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4896 if ($is_space->{$self->{nc}}) {
4897 ## Stay in the state.
4898 !!!next-input-character;
4899 redo A;
4900 } elsif ($self->{nc} == 0x0028) { # (
4901 $self->{group_depth}++;
4902 push @{$self->{ct}->{content}}, chr $self->{nc};
4903 ## Stay in the state.
4904 !!!next-input-character;
4905 redo A;
4906 } elsif ($self->{nc} == 0x007C or # |
4907 $self->{nc} == 0x002C) { # ,
4908 !!!parse-error (type => 'empty element name'); ## TODO: type
4909 ## Stay in the state.
4910 !!!next-input-character;
4911 redo A;
4912 } elsif ($self->{nc} == 0x0029) { # )
4913 !!!parse-error (type => 'empty element name'); ## TODO: type
4914 push @{$self->{ct}->{content}}, chr $self->{nc};
4915 $self->{group_depth}--;
4916 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4917 !!!next-input-character;
4918 redo A;
4919 } elsif ($self->{nc} == 0x003E) { # >
4920 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4921 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4922 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4923 !!!next-input-character;
4924 !!!emit ($self->{ct}); # ELEMENT
4925 redo A;
4926 } elsif ($self->{nc} == -1) {
4927 !!!parse-error (type => 'unclosed md'); ## TODO: type
4928 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4929 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4930 !!!next-input-character;
4931 !!!emit ($self->{ct}); # ELEMENT
4932 redo A;
4933 } else {
4934 push @{$self->{ct}->{content}}, chr $self->{nc};
4935 $self->{state} = CM_ELEMENT_NAME_STATE;
4936 !!!next-input-character;
4937 redo A;
4938 }
4939 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4940 if ($is_space->{$self->{nc}}) {
4941 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4942 !!!next-input-character;
4943 redo A;
4944 } elsif ($self->{nc} == 0x002A or # *
4945 $self->{nc} == 0x002B or # +
4946 $self->{nc} == 0x003F) { # ?
4947 push @{$self->{ct}->{content}}, chr $self->{nc};
4948 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4949 !!!next-input-character;
4950 redo A;
4951 } elsif ($self->{nc} == 0x007C or # |
4952 $self->{nc} == 0x002C) { # ,
4953 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4954 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4955 !!!next-input-character;
4956 redo A;
4957 } elsif ($self->{nc} == 0x0029) { # )
4958 $self->{group_depth}--;
4959 push @{$self->{ct}->{content}}, chr $self->{nc};
4960 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4961 !!!next-input-character;
4962 redo A;
4963 } elsif ($self->{nc} == 0x003E) { # >
4964 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4965 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4966 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4967 !!!next-input-character;
4968 !!!emit ($self->{ct}); # ELEMENT
4969 redo A;
4970 } elsif ($self->{nc} == -1) {
4971 !!!parse-error (type => 'unclosed md'); ## TODO: type
4972 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4973 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4974 !!!next-input-character;
4975 !!!emit ($self->{ct}); # ELEMENT
4976 redo A;
4977 } else {
4978 $self->{ct}->{content}->[-1] .= chr $self->{nc};
4979 ## Stay in the state.
4980 !!!next-input-character;
4981 redo A;
4982 }
4983 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4984 if ($is_space->{$self->{nc}}) {
4985 ## Stay in the state.
4986 !!!next-input-character;
4987 redo A;
4988 } elsif ($self->{nc} == 0x007C or # |
4989 $self->{nc} == 0x002C) { # ,
4990 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4991 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4992 !!!next-input-character;
4993 redo A;
4994 } elsif ($self->{nc} == 0x0029) { # )
4995 $self->{group_depth}--;
4996 push @{$self->{ct}->{content}}, chr $self->{nc};
4997 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4998 !!!next-input-character;
4999 redo A;
5000 } elsif ($self->{nc} == 0x003E) { # >
5001 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5002 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5003 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5004 !!!next-input-character;
5005 !!!emit ($self->{ct}); # ELEMENT
5006 redo A;
5007 } elsif ($self->{nc} == -1) {
5008 !!!parse-error (type => 'unclosed md'); ## TODO: type
5009 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5010 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5011 !!!next-input-character;
5012 !!!emit ($self->{ct}); # ELEMENT
5013 redo A;
5014 } else {
5015 !!!parse-error (type => 'after element name'); ## TODO: type
5016 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5017 $self->{state} = BOGUS_MD_STATE;
5018 !!!next-input-character;
5019 redo A;
5020 }
5021 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5022 if ($is_space->{$self->{nc}}) {
5023 if ($self->{group_depth}) {
5024 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5025 } else {
5026 $self->{state} = AFTER_MD_DEF_STATE;
5027 }
5028 !!!next-input-character;
5029 redo A;
5030 } elsif ($self->{nc} == 0x002A or # *
5031 $self->{nc} == 0x002B or # +
5032 $self->{nc} == 0x003F) { # ?
5033 push @{$self->{ct}->{content}}, chr $self->{nc};
5034 if ($self->{group_depth}) {
5035 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5036 } else {
5037 $self->{state} = AFTER_MD_DEF_STATE;
5038 }
5039 !!!next-input-character;
5040 redo A;
5041 } elsif ($self->{nc} == 0x0029) { # )
5042 if ($self->{group_depth}) {
5043 $self->{group_depth}--;
5044 push @{$self->{ct}->{content}}, chr $self->{nc};
5045 ## Stay in the state.
5046 !!!next-input-character;
5047 redo A;
5048 } else {
5049 !!!parse-error (type => 'string after md def'); ## TODO: type
5050 $self->{state} = BOGUS_MD_STATE;
5051 ## Reconsume.
5052 redo A;
5053 }
5054 } elsif ($self->{nc} == 0x003E) { # >
5055 if ($self->{group_depth}) {
5056 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5057 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5058 }
5059 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5060 !!!next-input-character;
5061 !!!emit ($self->{ct}); # ELEMENT
5062 redo A;
5063 } elsif ($self->{nc} == -1) {
5064 !!!parse-error (type => 'unclosed md'); ## TODO: type
5065 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5066 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5067 !!!next-input-character;
5068 !!!emit ($self->{ct}); # ELEMENT
5069 redo A;
5070 } else {
5071 if ($self->{group_depth}) {
5072 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5073 } else {
5074 !!!parse-error (type => 'string after md def'); ## TODO: type
5075 $self->{state} = BOGUS_MD_STATE;
5076 }
5077 ## Reconsume.
5078 redo A;
5079 }
5080 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5081 if ($is_space->{$self->{nc}}) {
5082 ## Stay in the state.
5083 !!!next-input-character;
5084 redo A;
5085 } elsif ($self->{nc} == 0x003E) { # >
5086 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5087 !!!next-input-character;
5088 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5089 redo A;
5090 } elsif ($self->{nc} == -1) {
5091 !!!parse-error (type => 'unclosed md'); ## TODO: type
5092 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5093 !!!next-input-character;
5094 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5095 redo A;
5096 } else {
5097 !!!parse-error (type => 'string after md def'); ## TODO: type
5098 $self->{state} = BOGUS_MD_STATE;
5099 ## Reconsume.
5100 redo A;
5101 }
5102 } elsif ($self->{state} == BOGUS_MD_STATE) {
5103 if ($self->{nc} == 0x003E) { # >
5104 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5105 !!!next-input-character;
5106 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5107 redo A;
5108 } elsif ($self->{nc} == -1) {
5109 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5110 ## Reconsume.
5111 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5112 redo A;
5113 } else {
5114 ## Stay in the state.
5115 !!!next-input-character;
5116 redo A;
5117 }
5118 } else {
5119 die "$0: $self->{state}: Unknown state";
5120 }
5121 } # A
5122
5123 die "$0: _get_next_token: unexpected case";
5124 } # _get_next_token
5125
5126 1;
5127 ## $Date: 2009/07/02 21:42:43 $
5128

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24