/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.31 - (show annotations) (download) (as text)
Sat Sep 5 09:26:55 2009 UTC (15 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.30: +31 -12 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	5 Sep 2009 09:26:39 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: Added test cases for "comment end bang
	state" (HTML5 revision 3191).

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 09:26:12 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src (_get_next_token): Implemented the "comment end
	bang state" (HTML5 revision 3191).

1 package Whatpm::HTML::Tokenizer;
2 use strict;
## Derive $VERSION from the revision-control "Revision" keyword: collect
## its digit groups and format them as "<first>.<second, at least two
## digits>" (so "Revision: 1.30" yields "1.30").
3 our $VERSION=do{my @r=(q$Revision: 1.30 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
## Compile-time export setup: the token type constants defined below are
## offered for import, either by name or all at once via the ':token' tag.
5 BEGIN {
## Load Exporter and inherit from it so that |import| is available.
6 require Exporter;
7 push our @ISA, 'Exporter';
8
## Names that may be imported individually on request (nothing is
## exported by default).
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 );
25
## The ':token' tag lists exactly the same names as @EXPORT_OK; keep the
## two lists in sync when adding a token type.
26 our %EXPORT_TAGS = (
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 }
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49
## Numeric identifiers stored in a token's |type| field.  Each is a sub
## with an empty prototype, so it can be used as a bareword constant.
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 }
52 sub START_TAG_TOKEN () { 3 }
53 sub END_TAG_TOKEN () { 4 }
54 sub END_OF_FILE_TOKEN () { 5 }
55 sub CHARACTER_TOKEN () { 6 }
56 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} flag set
71 ## to an empty string.
72
## Everything from here on is defined in the |Whatpm::HTML| namespace,
## not in |Whatpm::HTML::Tokenizer|.
73 package Whatpm::HTML;
74
## Import the token type constants at compile time so that the code
## below can refer to them as barewords (e.g. |CHARACTER_TOKEN|).
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78
## Individual bit flags; the tokenizer tests them against
## |$self->{content_model}| with the bitwise "&" operator.
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82
## Content models, expressed as combinations of the flags above.
83 sub PLAINTEXT_CONTENT_MODEL () { 0 }
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87
88 ## Tokenizer states
89
## Values held in |$self->{state}| and compared numerically by
## |_get_next_token|.  Numbers 1 and 12 are not in use (their subs are
## commented out); the ENTITY_* states further down implement that
## functionality instead.
90 sub DATA_STATE () { 0 }
91 #sub ENTITY_DATA_STATE () { 1 }
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
## Numbered 102, out of sequence with its neighbours.  The "LAST" tag
## presumably marks the highest state number allocated so far -- TODO
## confirm before allocating a new number.
108 sub COMMENT_END_BANG_STATE () { 102 } ## LAST
109 sub COMMENT_END_DASH_STATE () { 18 }
110 sub BOGUS_COMMENT_STATE () { 19 }
111 sub DOCTYPE_STATE () { 20 }
112 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
113 sub DOCTYPE_NAME_STATE () { 22 }
114 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
115 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
116 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
117 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
118 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
119 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
120 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
121 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
122 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
123 sub BOGUS_DOCTYPE_STATE () { 32 }
124 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
125 sub SELF_CLOSING_START_TAG_STATE () { 34 }
126 sub CDATA_SECTION_STATE () { 35 }
## States 36-43 split single spec states into several implementation
## states; the trailing comment names the spec state each belongs to.
127 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
128 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
129 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
130 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
131 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
132 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
133 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
134 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
135 ## NOTE: "Entity data state", "entity in attribute value state", and
136 ## "consume a character reference" algorithm are jointly implemented
137 ## using the following six states:
138 sub ENTITY_STATE () { 44 }
139 sub ENTITY_HASH_STATE () { 45 }
140 sub NCR_NUM_STATE () { 46 }
141 sub HEXREF_X_STATE () { 47 }
142 sub HEXREF_HEX_STATE () { 48 }
143 sub ENTITY_NAME_STATE () { 49 }
144 sub PCDATA_STATE () { 50 } # "data state" in the spec
145
146 ## XML-only states
## State numbers for XML processing, numbered consecutively from 51 to
## 101.  They share the |$self->{state}| field with the states above.
147 sub PI_STATE () { 51 }
148 sub PI_TARGET_STATE () { 52 }
149 sub PI_TARGET_AFTER_STATE () { 53 }
150 sub PI_DATA_STATE () { 54 }
151 sub PI_AFTER_STATE () { 55 }
152 sub PI_DATA_AFTER_STATE () { 56 }
153 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
154 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
155 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
156 sub DOCTYPE_TAG_STATE () { 60 }
157 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
158 sub MD_ATTLIST_STATE () { 62 }
159 sub MD_E_STATE () { 63 }
160 sub MD_ELEMENT_STATE () { 64 }
161 sub MD_ENTITY_STATE () { 65 }
162 sub MD_NOTATION_STATE () { 66 }
163 sub DOCTYPE_MD_STATE () { 67 }
164 sub BEFORE_MD_NAME_STATE () { 68 }
165 sub MD_NAME_STATE () { 69 }
166 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
167 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
168 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
171 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
172 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
173 sub ALLOWED_TOKEN_STATE () { 77 }
174 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
175 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
176 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
177 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
179 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
180 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
181 sub BEFORE_NDATA_STATE () { 85 }
182 sub NDATA_STATE () { 86 }
183 sub AFTER_NDATA_STATE () { 87 }
184 sub BEFORE_NOTATION_NAME_STATE () { 88 }
185 sub NOTATION_NAME_STATE () { 89 }
186 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
187 sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
188 sub ENTITY_VALUE_ENTITY_STATE () { 92 }
189 sub AFTER_ELEMENT_NAME_STATE () { 93 }
190 sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
191 sub CONTENT_KEYWORD_STATE () { 95 }
192 sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
193 sub CM_ELEMENT_NAME_STATE () { 97 }
194 sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
195 sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
196 sub AFTER_MD_DEF_STATE () { 100 }
197 sub BOGUS_MD_STATE () { 101 }
198
199 ## Tree constructor state constants (see Whatpm::HTML for the full
200 ## list and descriptions)
201
## NOTE(review): both binary literals below evaluate to the same number
## (a 1 followed by eleven 0 bits, i.e. 2048).  Confirm against the
## definitions in Whatpm::HTML that sharing a value is intended.
202 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
203 sub FOREIGN_EL () { 0b1_00000000000 }
204
205 ## Character reference mappings
206
## Replacement table keyed by code point: each key is a code point that
## must not be used as-is, and the value is the code point to substitute.
## Presumably consulted when a numeric character reference is resolved;
## the lookup itself is not in this part of the file -- TODO confirm.
207 my $charref_map = {
## CR is replaced by LF.
208 0x0D => 0x000A,
## 0x80..0x9F are replaced one by one; the entries with no specific
## replacement (0x81, 0x8D, 0x8F, 0x90, 0x9D) become U+FFFD.
209 0x80 => 0x20AC,
210 0x81 => 0xFFFD,
211 0x82 => 0x201A,
212 0x83 => 0x0192,
213 0x84 => 0x201E,
214 0x85 => 0x2026,
215 0x86 => 0x2020,
216 0x87 => 0x2021,
217 0x88 => 0x02C6,
218 0x89 => 0x2030,
219 0x8A => 0x0160,
220 0x8B => 0x2039,
221 0x8C => 0x0152,
222 0x8D => 0xFFFD,
223 0x8E => 0x017D,
224 0x8F => 0xFFFD,
225 0x90 => 0xFFFD,
226 0x91 => 0x2018,
227 0x92 => 0x2019,
228 0x93 => 0x201C,
229 0x94 => 0x201D,
230 0x95 => 0x2022,
231 0x96 => 0x2013,
232 0x97 => 0x2014,
233 0x98 => 0x02DC,
234 0x99 => 0x2122,
235 0x9A => 0x0161,
236 0x9B => 0x203A,
237 0x9C => 0x0153,
238 0x9D => 0xFFFD,
239 0x9E => 0x017E,
240 0x9F => 0x0178,
241 }; # $charref_map
## Every code point in the list below maps to U+FFFD: control characters
## other than HT, LF, FF and CR, plus 0x7F, the surrogate range, and the
## code points ending in FFFE/FFFF in each plane.  None of these keys
## overlaps with the entries set above.  The 0xFDD0 range stops at
## 0xFDDF; the ISSUE marker below questions whether it should extend to
## 0xFDEF.
242 $charref_map->{$_} = 0xFFFD
243 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
244 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
245 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
246 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
247 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
248 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
249 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
250
251 ## Implementations MUST act as if they used the state machine in the spec.
252
## Reset all per-parse tokenizer fields on |$self| and prime the first
## input character.  Takes the parser object as its only argument.
253 sub _initialize_tokenizer ($) {
254 my $self = shift;
255
256 ## NOTE: Fields set by |new| constructor:
257 #$self->{level}
258 #$self->{set_nc}
259 #$self->{parse_error}
260 #$self->{is_xml} (if XML)
261
262 $self->{state} = DATA_STATE; # MUST
263 $self->{s_kwd} = ''; # Data state keyword
264 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
265 #$self->{entity__value}; # initialized when used
266 #$self->{entity__match}; # initialized when used
267 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
268 undef $self->{ct}; # current token
269 undef $self->{ca}; # current attribute
270 undef $self->{last_stag_name}; # last emitted start tag name
271 #$self->{prev_state}; # initialized when used
## Clear any self-closing flag left over from a previous run.
272 delete $self->{self_closing};
273 $self->{char_buffer} = '';
274 $self->{char_buffer_pos} = 0;
## -1 is the value |_get_next_token| treats as end of file.
275 $self->{nc} = -1; # next input character
276 #$self->{next_nc}
## Preprocessor macro (this file is a .src template); presumably loads
## the first input character into |$self->{nc}| -- its expansion is not
## visible here.
277 !!!next-input-character;
## Queue of pushed-back tokens; |_get_next_token| returns from this
## queue first when it is non-empty.
278 $self->{token} = [];
## The |escape| flag is not reset here.
279 # $self->{escape}
280 } # _initialize_tokenizer
281
282 ## A token has:
283 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
284 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
285 ## ->{name} (DOCTYPE_TOKEN)
286 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
287 ## ->{target} (PI_TOKEN)
288 ## ->{pubid} (DOCTYPE_TOKEN)
289 ## ->{sysid} (DOCTYPE_TOKEN)
290 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
291 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
292 ## ->{name}
293 ## ->{value}
294 ## ->{has_reference} == 1 or 0
295 ## ->{index}: Index of the attribute in a tag.
296 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
297 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
298 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
299 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
300
301 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
302 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
303 ## while the token is pushed back to the stack.
304
305 ## Emitted token MUST immediately be handled by the tree construction state.
306
307 ## Before each step, UA MAY check to see if either one of the scripts in
308 ## "list of scripts that will execute as soon as possible" or the first
309 ## script in the "list of scripts that will execute asynchronously",
310 ## has completed loading. If one has, then it MUST be executed
311 ## and removed from the list.
312
313 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
314 ## (This requirement was dropped from HTML5 spec, unfortunately.)
315
## Lookup table keyed by code point: a true value means the tokenizer
## treats the character as a space character.  VT and CR are listed only
## as commented-out entries and therefore are not treated as spaces.
316 my $is_space = {
317 0x0009 => 1, # CHARACTER TABULATION (HT)
318 0x000A => 1, # LINE FEED (LF)
319 #0x000B => 0, # LINE TABULATION (VT)
320 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
321 #0x000D => 1, # CARRIAGE RETURN (CR)
322 0x0020 => 1, # SPACE (SP)
323 };
324
325 sub _get_next_token ($) {
326 my $self = shift;
327
328 if ($self->{self_closing}) {
329 !!!parse-error (type => 'nestc', token => $self->{ct});
330 ## NOTE: The |self_closing| flag is only set by start tag token.
331 ## In addition, when a start tag token is emitted, it is always set to
332 ## |ct|.
333 delete $self->{self_closing};
334 }
335
336 if (@{$self->{token}}) {
337 $self->{self_closing} = $self->{token}->[0]->{self_closing};
338 return shift @{$self->{token}};
339 }
340
341 A: {
342 if ($self->{state} == PCDATA_STATE) {
343 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
344
345 if ($self->{nc} == 0x0026) { # &
346 !!!cp (0.1);
347 ## NOTE: In the spec, the tokenizer is switched to the
348 ## "entity data state". In this implementation, the tokenizer
349 ## is switched to the |ENTITY_STATE|, which is an implementation
350 ## of the "consume a character reference" algorithm.
351 $self->{entity_add} = -1;
352 $self->{prev_state} = DATA_STATE;
353 $self->{state} = ENTITY_STATE;
354 !!!next-input-character;
355 redo A;
356 } elsif ($self->{nc} == 0x003C) { # <
357 !!!cp (0.2);
358 $self->{state} = TAG_OPEN_STATE;
359 !!!next-input-character;
360 redo A;
361 } elsif ($self->{nc} == -1) {
362 !!!cp (0.3);
363 !!!emit ({type => END_OF_FILE_TOKEN,
364 line => $self->{line}, column => $self->{column}});
365 last A; ## TODO: ok?
366 } else {
367 !!!cp (0.4);
368 #
369 }
370
371 # Anything else
372 my $token = {type => CHARACTER_TOKEN,
373 data => chr $self->{nc},
374 line => $self->{line}, column => $self->{column},
375 };
376 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
377
378 ## Stay in the state.
379 !!!next-input-character;
380 !!!emit ($token);
381 redo A;
382 } elsif ($self->{state} == DATA_STATE) {
383 $self->{s_kwd} = '' unless defined $self->{s_kwd};
384 if ($self->{nc} == 0x0026) { # &
385 $self->{s_kwd} = '';
386 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
387 not $self->{escape}) {
388 !!!cp (1);
389 ## NOTE: In the spec, the tokenizer is switched to the
390 ## "entity data state". In this implementation, the tokenizer
391 ## is switched to the |ENTITY_STATE|, which is an implementation
392 ## of the "consume a character reference" algorithm.
393 $self->{entity_add} = -1;
394 $self->{prev_state} = DATA_STATE;
395 $self->{state} = ENTITY_STATE;
396 !!!next-input-character;
397 redo A;
398 } else {
399 !!!cp (2);
400 #
401 }
402 } elsif ($self->{nc} == 0x002D) { # -
403 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
404 if ($self->{s_kwd} eq '<!-') {
405 !!!cp (3);
406 $self->{escape} = 1; # unless $self->{escape};
407 $self->{s_kwd} = '--';
408 #
409 } elsif ($self->{s_kwd} eq '-') {
410 !!!cp (4);
411 $self->{s_kwd} = '--';
412 #
413 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
414 !!!cp (4.1);
415 $self->{s_kwd} .= '-';
416 #
417 } else {
418 !!!cp (5);
419 $self->{s_kwd} = '-';
420 #
421 }
422 }
423
424 #
425 } elsif ($self->{nc} == 0x0021) { # !
426 if (length $self->{s_kwd}) {
427 !!!cp (5.1);
428 $self->{s_kwd} .= '!';
429 #
430 } else {
431 !!!cp (5.2);
432 #$self->{s_kwd} = '';
433 #
434 }
435 #
436 } elsif ($self->{nc} == 0x003C) { # <
437 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
438 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
439 not $self->{escape})) {
440 !!!cp (6);
441 $self->{state} = TAG_OPEN_STATE;
442 !!!next-input-character;
443 redo A;
444 } else {
445 !!!cp (7);
446 $self->{s_kwd} = '';
447 #
448 }
449 } elsif ($self->{nc} == 0x003E) { # >
450 if ($self->{escape} and
451 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
452 if ($self->{s_kwd} eq '--') {
453 !!!cp (8);
454 delete $self->{escape};
455 #
456 } else {
457 !!!cp (9);
458 #
459 }
460 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
461 !!!cp (9.1);
462 !!!parse-error (type => 'unmatched mse', ## TODO: type
463 line => $self->{line_prev},
464 column => $self->{column_prev} - 1);
465 #
466 } else {
467 !!!cp (10);
468 #
469 }
470
471 $self->{s_kwd} = '';
472 #
473 } elsif ($self->{nc} == 0x005D) { # ]
474 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
475 !!!cp (10.1);
476 $self->{s_kwd} .= ']';
477 } elsif ($self->{s_kwd} eq ']]') {
478 !!!cp (10.2);
479 #
480 } else {
481 !!!cp (10.3);
482 $self->{s_kwd} = '';
483 }
484 #
485 } elsif ($self->{nc} == -1) {
486 !!!cp (11);
487 $self->{s_kwd} = '';
488 !!!emit ({type => END_OF_FILE_TOKEN,
489 line => $self->{line}, column => $self->{column}});
490 last A; ## TODO: ok?
491 } else {
492 !!!cp (12);
493 $self->{s_kwd} = '';
494 #
495 }
496
497 # Anything else
498 my $token = {type => CHARACTER_TOKEN,
499 data => chr $self->{nc},
500 line => $self->{line}, column => $self->{column},
501 };
502 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
503 length $token->{data})) {
504 $self->{s_kwd} = '';
505 }
506
507 ## Stay in the data state.
508 if (not $self->{is_xml} and
509 $self->{content_model} == PCDATA_CONTENT_MODEL) {
510 !!!cp (13);
511 $self->{state} = PCDATA_STATE;
512 } else {
513 !!!cp (14);
514 ## Stay in the state.
515 }
516 !!!next-input-character;
517 !!!emit ($token);
518 redo A;
519 } elsif ($self->{state} == TAG_OPEN_STATE) {
520 ## XML5: "tag state".
521
522 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
523 if ($self->{nc} == 0x002F) { # /
524 !!!cp (15);
525 !!!next-input-character;
526 $self->{state} = CLOSE_TAG_OPEN_STATE;
527 redo A;
528 } elsif ($self->{nc} == 0x0021) { # !
529 !!!cp (15.1);
530 $self->{s_kwd} = $self->{escaped} ? '' : '<';
531 #
532 } else {
533 !!!cp (16);
534 $self->{s_kwd} = '';
535 #
536 }
537
538 ## reconsume
539 $self->{state} = DATA_STATE;
540 !!!emit ({type => CHARACTER_TOKEN, data => '<',
541 line => $self->{line_prev},
542 column => $self->{column_prev},
543 });
544 redo A;
545 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
546 if ($self->{nc} == 0x0021) { # !
547 !!!cp (17);
548 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
549 !!!next-input-character;
550 redo A;
551 } elsif ($self->{nc} == 0x002F) { # /
552 !!!cp (18);
553 $self->{state} = CLOSE_TAG_OPEN_STATE;
554 !!!next-input-character;
555 redo A;
556 } elsif (0x0041 <= $self->{nc} and
557 $self->{nc} <= 0x005A) { # A..Z
558 !!!cp (19);
559 $self->{ct}
560 = {type => START_TAG_TOKEN,
561 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
562 line => $self->{line_prev},
563 column => $self->{column_prev}};
564 $self->{state} = TAG_NAME_STATE;
565 !!!next-input-character;
566 redo A;
567 } elsif (0x0061 <= $self->{nc} and
568 $self->{nc} <= 0x007A) { # a..z
569 !!!cp (20);
570 $self->{ct} = {type => START_TAG_TOKEN,
571 tag_name => chr ($self->{nc}),
572 line => $self->{line_prev},
573 column => $self->{column_prev}};
574 $self->{state} = TAG_NAME_STATE;
575 !!!next-input-character;
576 redo A;
577 } elsif ($self->{nc} == 0x003E) { # >
578 !!!cp (21);
579 !!!parse-error (type => 'empty start tag',
580 line => $self->{line_prev},
581 column => $self->{column_prev});
582 $self->{state} = DATA_STATE;
583 $self->{s_kwd} = '';
584 !!!next-input-character;
585
586 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
587 line => $self->{line_prev},
588 column => $self->{column_prev},
589 });
590
591 redo A;
592 } elsif ($self->{nc} == 0x003F) { # ?
593 if ($self->{is_xml}) {
594 !!!cp (22.1);
595 $self->{state} = PI_STATE;
596 !!!next-input-character;
597 redo A;
598 } else {
599 !!!cp (22);
600 !!!parse-error (type => 'pio',
601 line => $self->{line_prev},
602 column => $self->{column_prev});
603 $self->{state} = BOGUS_COMMENT_STATE;
604 $self->{ct} = {type => COMMENT_TOKEN, data => '',
605 line => $self->{line_prev},
606 column => $self->{column_prev},
607 };
608 ## $self->{nc} is intentionally left as is
609 redo A;
610 }
611 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
612 !!!cp (23);
613 !!!parse-error (type => 'bare stago',
614 line => $self->{line_prev},
615 column => $self->{column_prev});
616 $self->{state} = DATA_STATE;
617 $self->{s_kwd} = '';
618 ## reconsume
619
620 !!!emit ({type => CHARACTER_TOKEN, data => '<',
621 line => $self->{line_prev},
622 column => $self->{column_prev},
623 });
624
625 redo A;
626 } else {
627 ## XML5: "<:" is a parse error.
628 !!!cp (23.1);
629 $self->{ct} = {type => START_TAG_TOKEN,
630 tag_name => chr ($self->{nc}),
631 line => $self->{line_prev},
632 column => $self->{column_prev}};
633 $self->{state} = TAG_NAME_STATE;
634 !!!next-input-character;
635 redo A;
636 }
637 } else {
638 die "$0: $self->{content_model} in tag open";
639 }
640 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
641 ## NOTE: The "close tag open state" in the spec is implemented as
642 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
643
644 ## XML5: "end tag state".
645
646 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
647 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
648 if (defined $self->{last_stag_name}) {
649 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
650 $self->{kwd} = '';
651 ## Reconsume.
652 redo A;
653 } else {
654 ## No start tag token has ever been emitted
655 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
656 !!!cp (28);
657 $self->{state} = DATA_STATE;
658 $self->{s_kwd} = '';
659 ## Reconsume.
660 !!!emit ({type => CHARACTER_TOKEN, data => '</',
661 line => $l, column => $c,
662 });
663 redo A;
664 }
665 }
666
667 if (0x0041 <= $self->{nc} and
668 $self->{nc} <= 0x005A) { # A..Z
669 !!!cp (29);
670 $self->{ct}
671 = {type => END_TAG_TOKEN,
672 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
673 line => $l, column => $c};
674 $self->{state} = TAG_NAME_STATE;
675 !!!next-input-character;
676 redo A;
677 } elsif (0x0061 <= $self->{nc} and
678 $self->{nc} <= 0x007A) { # a..z
679 !!!cp (30);
680 $self->{ct} = {type => END_TAG_TOKEN,
681 tag_name => chr ($self->{nc}),
682 line => $l, column => $c};
683 $self->{state} = TAG_NAME_STATE;
684 !!!next-input-character;
685 redo A;
686 } elsif ($self->{nc} == 0x003E) { # >
687 !!!parse-error (type => 'empty end tag',
688 line => $self->{line_prev}, ## "<" in "</>"
689 column => $self->{column_prev} - 1);
690 $self->{state} = DATA_STATE;
691 $self->{s_kwd} = '';
692 if ($self->{is_xml}) {
693 !!!cp (31);
694 ## XML5: No parse error.
695
696 ## NOTE: This parser raises a parse error, since it supports
697 ## XML1, not XML5.
698
699 ## NOTE: A short end tag token.
700 my $ct = {type => END_TAG_TOKEN,
701 tag_name => '',
702 line => $self->{line_prev},
703 column => $self->{column_prev} - 1,
704 };
705 !!!next-input-character;
706 !!!emit ($ct);
707 } else {
708 !!!cp (31.1);
709 !!!next-input-character;
710 }
711 redo A;
712 } elsif ($self->{nc} == -1) {
713 !!!cp (32);
714 !!!parse-error (type => 'bare etago');
715 $self->{s_kwd} = '';
716 $self->{state} = DATA_STATE;
717 # reconsume
718
719 !!!emit ({type => CHARACTER_TOKEN, data => '</',
720 line => $l, column => $c,
721 });
722
723 redo A;
724 } elsif (not $self->{is_xml} or
725 $is_space->{$self->{nc}}) {
726 !!!cp (33);
727 !!!parse-error (type => 'bogus end tag',
728 line => $self->{line_prev}, # "<" of "</"
729 column => $self->{column_prev} - 1);
730 $self->{state} = BOGUS_COMMENT_STATE;
731 $self->{ct} = {type => COMMENT_TOKEN, data => '',
732 line => $self->{line_prev}, # "<" of "</"
733 column => $self->{column_prev} - 1,
734 };
735 ## NOTE: $self->{nc} is intentionally left as is.
736 ## Although the "anything else" case of the spec not explicitly
737 ## states that the next input character is to be reconsumed,
738 ## it will be included to the |data| of the comment token
739 ## generated from the bogus end tag, as defined in the
740 ## "bogus comment state" entry.
741 redo A;
742 } else {
743 ## XML5: "</:" is a parse error.
744 !!!cp (30.1);
745 $self->{ct} = {type => END_TAG_TOKEN,
746 tag_name => chr ($self->{nc}),
747 line => $l, column => $c};
748 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
749 !!!next-input-character;
750 redo A;
751 }
752 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
753 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
754 if (length $ch) {
755 my $CH = $ch;
756 $ch =~ tr/a-z/A-Z/;
757 my $nch = chr $self->{nc};
758 if ($nch eq $ch or $nch eq $CH) {
759 !!!cp (24);
760 ## Stay in the state.
761 $self->{kwd} .= $nch;
762 !!!next-input-character;
763 redo A;
764 } else {
765 !!!cp (25);
766 $self->{state} = DATA_STATE;
767 $self->{s_kwd} = '';
768 ## Reconsume.
769 !!!emit ({type => CHARACTER_TOKEN,
770 data => '</' . $self->{kwd},
771 line => $self->{line_prev},
772 column => $self->{column_prev} - 1 - length $self->{kwd},
773 });
774 redo A;
775 }
776 } else { # after "<{tag-name}"
777 unless ($is_space->{$self->{nc}} or
778 {
779 0x003E => 1, # >
780 0x002F => 1, # /
781 -1 => 1, # EOF
782 }->{$self->{nc}}) {
783 !!!cp (26);
784 ## Reconsume.
785 $self->{state} = DATA_STATE;
786 $self->{s_kwd} = '';
787 !!!emit ({type => CHARACTER_TOKEN,
788 data => '</' . $self->{kwd},
789 line => $self->{line_prev},
790 column => $self->{column_prev} - 1 - length $self->{kwd},
791 });
792 redo A;
793 } else {
794 !!!cp (27);
795 $self->{ct}
796 = {type => END_TAG_TOKEN,
797 tag_name => $self->{last_stag_name},
798 line => $self->{line_prev},
799 column => $self->{column_prev} - 1 - length $self->{kwd}};
800 $self->{state} = TAG_NAME_STATE;
801 ## Reconsume.
802 redo A;
803 }
804 }
805 } elsif ($self->{state} == TAG_NAME_STATE) {
806 if ($is_space->{$self->{nc}}) {
807 !!!cp (34);
808 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
809 !!!next-input-character;
810 redo A;
811 } elsif ($self->{nc} == 0x003E) { # >
812 if ($self->{ct}->{type} == START_TAG_TOKEN) {
813 !!!cp (35);
814 $self->{last_stag_name} = $self->{ct}->{tag_name};
815 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
816 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
817 #if ($self->{ct}->{attributes}) {
818 # ## NOTE: This should never be reached.
819 # !!! cp (36);
820 # !!! parse-error (type => 'end tag attribute');
821 #} else {
822 !!!cp (37);
823 #}
824 } else {
825 die "$0: $self->{ct}->{type}: Unknown token type";
826 }
827 $self->{state} = DATA_STATE;
828 $self->{s_kwd} = '';
829 !!!next-input-character;
830
831 !!!emit ($self->{ct}); # start tag or end tag
832
833 redo A;
834 } elsif (0x0041 <= $self->{nc} and
835 $self->{nc} <= 0x005A) { # A..Z
836 !!!cp (38);
837 $self->{ct}->{tag_name}
838 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
839 # start tag or end tag
840 ## Stay in this state
841 !!!next-input-character;
842 redo A;
843 } elsif ($self->{nc} == -1) {
844 !!!parse-error (type => 'unclosed tag');
845 if ($self->{ct}->{type} == START_TAG_TOKEN) {
846 !!!cp (39);
847 $self->{last_stag_name} = $self->{ct}->{tag_name};
848 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
849 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
850 #if ($self->{ct}->{attributes}) {
851 # ## NOTE: This state should never be reached.
852 # !!! cp (40);
853 # !!! parse-error (type => 'end tag attribute');
854 #} else {
855 !!!cp (41);
856 #}
857 } else {
858 die "$0: $self->{ct}->{type}: Unknown token type";
859 }
860 $self->{state} = DATA_STATE;
861 $self->{s_kwd} = '';
862 # reconsume
863
864 !!!emit ($self->{ct}); # start tag or end tag
865
866 redo A;
867 } elsif ($self->{nc} == 0x002F) { # /
868 !!!cp (42);
869 $self->{state} = SELF_CLOSING_START_TAG_STATE;
870 !!!next-input-character;
871 redo A;
872 } else {
873 !!!cp (44);
874 $self->{ct}->{tag_name} .= chr $self->{nc};
875 # start tag or end tag
876 ## Stay in the state
877 !!!next-input-character;
878 redo A;
879 }
880 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
881 ## XML5: "Tag attribute name before state".
882
883 if ($is_space->{$self->{nc}}) {
884 !!!cp (45);
885 ## Stay in the state
886 !!!next-input-character;
887 redo A;
888 } elsif ($self->{nc} == 0x003E) { # >
889 if ($self->{ct}->{type} == START_TAG_TOKEN) {
890 !!!cp (46);
891 $self->{last_stag_name} = $self->{ct}->{tag_name};
892 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
893 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
894 if ($self->{ct}->{attributes}) {
895 !!!cp (47);
896 !!!parse-error (type => 'end tag attribute');
897 } else {
898 !!!cp (48);
899 }
900 } else {
901 die "$0: $self->{ct}->{type}: Unknown token type";
902 }
903 $self->{state} = DATA_STATE;
904 $self->{s_kwd} = '';
905 !!!next-input-character;
906
907 !!!emit ($self->{ct}); # start tag or end tag
908
909 redo A;
910 } elsif (0x0041 <= $self->{nc} and
911 $self->{nc} <= 0x005A) { # A..Z
912 !!!cp (49);
913 $self->{ca}
914 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
915 value => '',
916 line => $self->{line}, column => $self->{column}};
917 $self->{state} = ATTRIBUTE_NAME_STATE;
918 !!!next-input-character;
919 redo A;
920 } elsif ($self->{nc} == 0x002F) { # /
921 !!!cp (50);
922 $self->{state} = SELF_CLOSING_START_TAG_STATE;
923 !!!next-input-character;
924 redo A;
925 } elsif ($self->{nc} == -1) {
926 !!!parse-error (type => 'unclosed tag');
927 if ($self->{ct}->{type} == START_TAG_TOKEN) {
928 !!!cp (52);
929 $self->{last_stag_name} = $self->{ct}->{tag_name};
930 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
931 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
932 if ($self->{ct}->{attributes}) {
933 !!!cp (53);
934 !!!parse-error (type => 'end tag attribute');
935 } else {
936 !!!cp (54);
937 }
938 } else {
939 die "$0: $self->{ct}->{type}: Unknown token type";
940 }
941 $self->{state} = DATA_STATE;
942 $self->{s_kwd} = '';
943 # reconsume
944
945 !!!emit ($self->{ct}); # start tag or end tag
946
947 redo A;
948 } else {
949 if ({
950 0x0022 => 1, # "
951 0x0027 => 1, # '
952 0x003C => 1, # <
953 0x003D => 1, # =
954 }->{$self->{nc}}) {
955 !!!cp (55);
956 ## XML5: Not a parse error.
957 !!!parse-error (type => 'bad attribute name');
958 } else {
959 !!!cp (56);
960 ## XML5: ":" raises a parse error and is ignored.
961 }
962 $self->{ca}
963 = {name => chr ($self->{nc}),
964 value => '',
965 line => $self->{line}, column => $self->{column}};
966 $self->{state} = ATTRIBUTE_NAME_STATE;
967 !!!next-input-character;
968 redo A;
969 }
970 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
971 ## XML5: "Tag attribute name state".
972
973 my $before_leave = sub {
974 if (exists $self->{ct}->{attributes} # start tag or end tag
975 ->{$self->{ca}->{name}}) { # MUST
976 !!!cp (57);
977 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
978 ## Discard $self->{ca} # MUST
979 } else {
980 !!!cp (58);
981 $self->{ct}->{attributes}->{$self->{ca}->{name}}
982 = $self->{ca};
983 $self->{ca}->{index} = ++$self->{ct}->{last_index};
984 }
985 }; # $before_leave
986
987 if ($is_space->{$self->{nc}}) {
988 !!!cp (59);
989 $before_leave->();
990 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
991 !!!next-input-character;
992 redo A;
993 } elsif ($self->{nc} == 0x003D) { # =
994 !!!cp (60);
995 $before_leave->();
996 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
997 !!!next-input-character;
998 redo A;
999 } elsif ($self->{nc} == 0x003E) { # >
1000 if ($self->{is_xml}) {
1001 !!!cp (60.1);
1002 ## XML5: Not a parse error.
1003 !!!parse-error (type => 'no attr value'); ## TODO: type
1004 } else {
1005 !!!cp (60.2);
1006 }
1007
1008 $before_leave->();
1009 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1010 !!!cp (61);
1011 $self->{last_stag_name} = $self->{ct}->{tag_name};
1012 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1013 !!!cp (62);
1014 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1015 if ($self->{ct}->{attributes}) {
1016 !!!parse-error (type => 'end tag attribute');
1017 }
1018 } else {
1019 die "$0: $self->{ct}->{type}: Unknown token type";
1020 }
1021 $self->{state} = DATA_STATE;
1022 $self->{s_kwd} = '';
1023 !!!next-input-character;
1024
1025 !!!emit ($self->{ct}); # start tag or end tag
1026
1027 redo A;
1028 } elsif (0x0041 <= $self->{nc} and
1029 $self->{nc} <= 0x005A) { # A..Z
1030 !!!cp (63);
1031 $self->{ca}->{name}
1032 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1033 ## Stay in the state
1034 !!!next-input-character;
1035 redo A;
1036 } elsif ($self->{nc} == 0x002F) { # /
1037 if ($self->{is_xml}) {
1038 !!!cp (64);
1039 ## XML5: Not a parse error.
1040 !!!parse-error (type => 'no attr value'); ## TODO: type
1041 } else {
1042 !!!cp (64.1);
1043 }
1044
1045 $before_leave->();
1046 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1047 !!!next-input-character;
1048 redo A;
1049 } elsif ($self->{nc} == -1) {
1050 !!!parse-error (type => 'unclosed tag');
1051 $before_leave->();
1052 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1053 !!!cp (66);
1054 $self->{last_stag_name} = $self->{ct}->{tag_name};
1055 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1056 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1057 if ($self->{ct}->{attributes}) {
1058 !!!cp (67);
1059 !!!parse-error (type => 'end tag attribute');
1060 } else {
1061 ## NOTE: This state should never be reached.
1062 !!!cp (68);
1063 }
1064 } else {
1065 die "$0: $self->{ct}->{type}: Unknown token type";
1066 }
1067 $self->{state} = DATA_STATE;
1068 $self->{s_kwd} = '';
1069 # reconsume
1070
1071 !!!emit ($self->{ct}); # start tag or end tag
1072
1073 redo A;
1074 } else {
1075 if ({
1076 0x0022 => 1, # "
1077 0x0027 => 1, # '
1078 0x003C => 1, # <
1079 }->{$self->{nc}}) {
1080 !!!cp (69);
1081 ## XML5: Not a parse error.
1082 !!!parse-error (type => 'bad attribute name');
1083 } else {
1084 !!!cp (70);
1085 }
1086 $self->{ca}->{name} .= chr ($self->{nc});
1087 ## Stay in the state
1088 !!!next-input-character;
1089 redo A;
1090 }
1091 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1092 ## XML5: "Tag attribute name after state".
1093
1094 if ($is_space->{$self->{nc}}) {
1095 !!!cp (71);
1096 ## Stay in the state
1097 !!!next-input-character;
1098 redo A;
1099 } elsif ($self->{nc} == 0x003D) { # =
1100 !!!cp (72);
1101 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1102 !!!next-input-character;
1103 redo A;
1104 } elsif ($self->{nc} == 0x003E) { # >
1105 if ($self->{is_xml}) {
1106 !!!cp (72.1);
1107 ## XML5: Not a parse error.
1108 !!!parse-error (type => 'no attr value'); ## TODO: type
1109 } else {
1110 !!!cp (72.2);
1111 }
1112
1113 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1114 !!!cp (73);
1115 $self->{last_stag_name} = $self->{ct}->{tag_name};
1116 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1117 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1118 if ($self->{ct}->{attributes}) {
1119 !!!cp (74);
1120 !!!parse-error (type => 'end tag attribute');
1121 } else {
1122 ## NOTE: This state should never be reached.
1123 !!!cp (75);
1124 }
1125 } else {
1126 die "$0: $self->{ct}->{type}: Unknown token type";
1127 }
1128 $self->{state} = DATA_STATE;
1129 $self->{s_kwd} = '';
1130 !!!next-input-character;
1131
1132 !!!emit ($self->{ct}); # start tag or end tag
1133
1134 redo A;
1135 } elsif (0x0041 <= $self->{nc} and
1136 $self->{nc} <= 0x005A) { # A..Z
1137 !!!cp (76);
1138 $self->{ca}
1139 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1140 value => '',
1141 line => $self->{line}, column => $self->{column}};
1142 $self->{state} = ATTRIBUTE_NAME_STATE;
1143 !!!next-input-character;
1144 redo A;
1145 } elsif ($self->{nc} == 0x002F) { # /
1146 if ($self->{is_xml}) {
1147 !!!cp (77);
1148 ## XML5: Not a parse error.
1149 !!!parse-error (type => 'no attr value'); ## TODO: type
1150 } else {
1151 !!!cp (77.1);
1152 }
1153
1154 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1155 !!!next-input-character;
1156 redo A;
1157 } elsif ($self->{nc} == -1) {
1158 !!!parse-error (type => 'unclosed tag');
1159 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1160 !!!cp (79);
1161 $self->{last_stag_name} = $self->{ct}->{tag_name};
1162 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1163 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1164 if ($self->{ct}->{attributes}) {
1165 !!!cp (80);
1166 !!!parse-error (type => 'end tag attribute');
1167 } else {
1168 ## NOTE: This state should never be reached.
1169 !!!cp (81);
1170 }
1171 } else {
1172 die "$0: $self->{ct}->{type}: Unknown token type";
1173 }
1174 $self->{s_kwd} = '';
1175 $self->{state} = DATA_STATE;
1176 # reconsume
1177
1178 !!!emit ($self->{ct}); # start tag or end tag
1179
1180 redo A;
1181 } else {
1182 if ($self->{is_xml}) {
1183 !!!cp (78.1);
1184 ## XML5: Not a parse error.
1185 !!!parse-error (type => 'no attr value'); ## TODO: type
1186 } else {
1187 !!!cp (78.2);
1188 }
1189
1190 if ({
1191 0x0022 => 1, # "
1192 0x0027 => 1, # '
1193 0x003C => 1, # <
1194 }->{$self->{nc}}) {
1195 !!!cp (78);
1196 ## XML5: Not a parse error.
1197 !!!parse-error (type => 'bad attribute name');
1198 } else {
1199 !!!cp (82);
1200 }
1201 $self->{ca}
1202 = {name => chr ($self->{nc}),
1203 value => '',
1204 line => $self->{line}, column => $self->{column}};
1205 $self->{state} = ATTRIBUTE_NAME_STATE;
1206 !!!next-input-character;
1207 redo A;
1208 }
1209 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1210 ## XML5: "Tag attribute value before state".
1211
1212 if ($is_space->{$self->{nc}}) {
1213 !!!cp (83);
1214 ## Stay in the state
1215 !!!next-input-character;
1216 redo A;
1217 } elsif ($self->{nc} == 0x0022) { # "
1218 !!!cp (84);
1219 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1220 !!!next-input-character;
1221 redo A;
1222 } elsif ($self->{nc} == 0x0026) { # &
1223 !!!cp (85);
1224 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1225 ## reconsume
1226 redo A;
1227 } elsif ($self->{nc} == 0x0027) { # '
1228 !!!cp (86);
1229 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1230 !!!next-input-character;
1231 redo A;
1232 } elsif ($self->{nc} == 0x003E) { # >
1233 !!!parse-error (type => 'empty unquoted attribute value');
1234 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1235 !!!cp (87);
1236 $self->{last_stag_name} = $self->{ct}->{tag_name};
1237 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1238 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1239 if ($self->{ct}->{attributes}) {
1240 !!!cp (88);
1241 !!!parse-error (type => 'end tag attribute');
1242 } else {
1243 ## NOTE: This state should never be reached.
1244 !!!cp (89);
1245 }
1246 } else {
1247 die "$0: $self->{ct}->{type}: Unknown token type";
1248 }
1249 $self->{state} = DATA_STATE;
1250 $self->{s_kwd} = '';
1251 !!!next-input-character;
1252
1253 !!!emit ($self->{ct}); # start tag or end tag
1254
1255 redo A;
1256 } elsif ($self->{nc} == -1) {
1257 !!!parse-error (type => 'unclosed tag');
1258 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1259 !!!cp (90);
1260 $self->{last_stag_name} = $self->{ct}->{tag_name};
1261 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1262 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1263 if ($self->{ct}->{attributes}) {
1264 !!!cp (91);
1265 !!!parse-error (type => 'end tag attribute');
1266 } else {
1267 ## NOTE: This state should never be reached.
1268 !!!cp (92);
1269 }
1270 } else {
1271 die "$0: $self->{ct}->{type}: Unknown token type";
1272 }
1273 $self->{state} = DATA_STATE;
1274 $self->{s_kwd} = '';
1275 ## reconsume
1276
1277 !!!emit ($self->{ct}); # start tag or end tag
1278
1279 redo A;
1280 } else {
1281 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1282 !!!cp (93);
1283 ## XML5: Not a parse error.
1284 !!!parse-error (type => 'bad attribute value');
1285 } elsif ($self->{is_xml}) {
1286 !!!cp (93.1);
1287 ## XML5: No parse error.
1288 !!!parse-error (type => 'unquoted attr value'); ## TODO
1289 } else {
1290 !!!cp (94);
1291 }
1292 $self->{ca}->{value} .= chr ($self->{nc});
1293 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1294 !!!next-input-character;
1295 redo A;
1296 }
1297 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1298 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1299 ## ATTLIST attribute value double quoted state".
1300
1301 if ($self->{nc} == 0x0022) { # "
1302 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1303 !!!cp (95.1);
1304 ## XML5: "DOCTYPE ATTLIST name after state".
1305 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1306 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1307 } else {
1308 !!!cp (95);
1309 ## XML5: "Tag attribute name before state".
1310 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1311 }
1312 !!!next-input-character;
1313 redo A;
1314 } elsif ($self->{nc} == 0x0026) { # &
1315 !!!cp (96);
1316 ## XML5: Not defined yet.
1317
1318 ## NOTE: In the spec, the tokenizer is switched to the
1319 ## "entity in attribute value state". In this implementation, the
1320 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1321 ## implementation of the "consume a character reference" algorithm.
1322 $self->{prev_state} = $self->{state};
1323 $self->{entity_add} = 0x0022; # "
1324 $self->{state} = ENTITY_STATE;
1325 !!!next-input-character;
1326 redo A;
1327 } elsif ($self->{is_xml} and
1328 $is_space->{$self->{nc}}) {
1329 !!!cp (97.1);
1330 $self->{ca}->{value} .= ' ';
1331 ## Stay in the state.
1332 !!!next-input-character;
1333 redo A;
1334 } elsif ($self->{nc} == -1) {
1335 !!!parse-error (type => 'unclosed attribute value');
1336 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1337 !!!cp (97);
1338 $self->{last_stag_name} = $self->{ct}->{tag_name};
1339
1340 $self->{state} = DATA_STATE;
1341 $self->{s_kwd} = '';
1342 ## reconsume
1343 !!!emit ($self->{ct}); # start tag
1344 redo A;
1345 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1346 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1347 if ($self->{ct}->{attributes}) {
1348 !!!cp (98);
1349 !!!parse-error (type => 'end tag attribute');
1350 } else {
1351 ## NOTE: This state should never be reached.
1352 !!!cp (99);
1353 }
1354
1355 $self->{state} = DATA_STATE;
1356 $self->{s_kwd} = '';
1357 ## reconsume
1358 !!!emit ($self->{ct}); # end tag
1359 redo A;
1360 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1361 ## XML5: No parse error above; not defined yet.
1362 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1363 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1364 ## Reconsume.
1365 !!!emit ($self->{ct}); # ATTLIST
1366 redo A;
1367 } else {
1368 die "$0: $self->{ct}->{type}: Unknown token type";
1369 }
1370 } else {
1371 ## XML5 [ATTLIST]: Not defined yet.
1372 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1373 !!!cp (100);
1374 ## XML5: Not a parse error.
1375 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1376 } else {
1377 !!!cp (100.1);
1378 }
1379 $self->{ca}->{value} .= chr ($self->{nc});
1380 $self->{read_until}->($self->{ca}->{value},
1381 qq["&<\x09\x0C\x20],
1382 length $self->{ca}->{value});
1383
1384 ## Stay in the state
1385 !!!next-input-character;
1386 redo A;
1387 }
1388 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1389 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1390 ## ATTLIST attribute value single quoted state".
1391
1392 if ($self->{nc} == 0x0027) { # '
1393 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1394 !!!cp (101.1);
1395 ## XML5: "DOCTYPE ATTLIST name after state".
1396 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1397 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1398 } else {
1399 !!!cp (101);
1400 ## XML5: "Before attribute name state" (sic).
1401 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1402 }
1403 !!!next-input-character;
1404 redo A;
1405 } elsif ($self->{nc} == 0x0026) { # &
1406 !!!cp (102);
1407 ## XML5: Not defined yet.
1408
1409 ## NOTE: In the spec, the tokenizer is switched to the
1410 ## "entity in attribute value state". In this implementation, the
1411 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1412 ## implementation of the "consume a character reference" algorithm.
1413 $self->{entity_add} = 0x0027; # '
1414 $self->{prev_state} = $self->{state};
1415 $self->{state} = ENTITY_STATE;
1416 !!!next-input-character;
1417 redo A;
1418 } elsif ($self->{is_xml} and
1419 $is_space->{$self->{nc}}) {
1420 !!!cp (103.1);
1421 $self->{ca}->{value} .= ' ';
1422 ## Stay in the state.
1423 !!!next-input-character;
1424 redo A;
1425 } elsif ($self->{nc} == -1) {
1426 !!!parse-error (type => 'unclosed attribute value');
1427 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1428 !!!cp (103);
1429 $self->{last_stag_name} = $self->{ct}->{tag_name};
1430
1431 $self->{state} = DATA_STATE;
1432 $self->{s_kwd} = '';
1433 ## reconsume
1434 !!!emit ($self->{ct}); # start tag
1435 redo A;
1436 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1437 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1438 if ($self->{ct}->{attributes}) {
1439 !!!cp (104);
1440 !!!parse-error (type => 'end tag attribute');
1441 } else {
1442 ## NOTE: This state should never be reached.
1443 !!!cp (105);
1444 }
1445
1446 $self->{state} = DATA_STATE;
1447 $self->{s_kwd} = '';
1448 ## reconsume
1449 !!!emit ($self->{ct}); # end tag
1450 redo A;
1451 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1452 ## XML5: No parse error above; not defined yet.
1453 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1454 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1455 ## Reconsume.
1456 !!!emit ($self->{ct}); # ATTLIST
1457 redo A;
1458 } else {
1459 die "$0: $self->{ct}->{type}: Unknown token type";
1460 }
1461 } else {
1462 ## XML5 [ATTLIST]: Not defined yet.
1463 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1464 !!!cp (106);
1465 ## XML5: Not a parse error.
1466 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1467 } else {
1468 !!!cp (106.1);
1469 }
1470 $self->{ca}->{value} .= chr ($self->{nc});
1471 $self->{read_until}->($self->{ca}->{value},
1472 qq['&<\x09\x0C\x20],
1473 length $self->{ca}->{value});
1474
1475 ## Stay in the state
1476 !!!next-input-character;
1477 redo A;
1478 }
1479 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1480 ## XML5: "Tag attribute value unquoted state".
1481
1482 if ($is_space->{$self->{nc}}) {
1483 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1484 !!!cp (107.1);
1485 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1486 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1487 } else {
1488 !!!cp (107);
1489 ## XML5: "Tag attribute name before state".
1490 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1491 }
1492 !!!next-input-character;
1493 redo A;
1494 } elsif ($self->{nc} == 0x0026) { # &
1495 !!!cp (108);
1496
1497 ## XML5: Not defined yet.
1498
1499 ## NOTE: In the spec, the tokenizer is switched to the
1500 ## "entity in attribute value state". In this implementation, the
1501 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1502 ## implementation of the "consume a character reference" algorithm.
1503 $self->{entity_add} = -1;
1504 $self->{prev_state} = $self->{state};
1505 $self->{state} = ENTITY_STATE;
1506 !!!next-input-character;
1507 redo A;
1508 } elsif ($self->{nc} == 0x003E) { # >
1509 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1510 !!!cp (109);
1511 $self->{last_stag_name} = $self->{ct}->{tag_name};
1512
1513 $self->{state} = DATA_STATE;
1514 $self->{s_kwd} = '';
1515 !!!next-input-character;
1516 !!!emit ($self->{ct}); # start tag
1517 redo A;
1518 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1519 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1520 if ($self->{ct}->{attributes}) {
1521 !!!cp (110);
1522 !!!parse-error (type => 'end tag attribute');
1523 } else {
1524 ## NOTE: This state should never be reached.
1525 !!!cp (111);
1526 }
1527
1528 $self->{state} = DATA_STATE;
1529 $self->{s_kwd} = '';
1530 !!!next-input-character;
1531 !!!emit ($self->{ct}); # end tag
1532 redo A;
1533 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1534 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1535 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1536 !!!next-input-character;
1537 !!!emit ($self->{ct}); # ATTLIST
1538 redo A;
1539 } else {
1540 die "$0: $self->{ct}->{type}: Unknown token type";
1541 }
1542 } elsif ($self->{nc} == -1) {
1543 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1544 !!!cp (112);
1545 !!!parse-error (type => 'unclosed tag');
1546 $self->{last_stag_name} = $self->{ct}->{tag_name};
1547
1548 $self->{state} = DATA_STATE;
1549 $self->{s_kwd} = '';
1550 ## reconsume
1551 !!!emit ($self->{ct}); # start tag
1552 redo A;
1553 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1554 !!!parse-error (type => 'unclosed tag');
1555 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1556 if ($self->{ct}->{attributes}) {
1557 !!!cp (113);
1558 !!!parse-error (type => 'end tag attribute');
1559 } else {
1560 ## NOTE: This state should never be reached.
1561 !!!cp (114);
1562 }
1563
1564 $self->{state} = DATA_STATE;
1565 $self->{s_kwd} = '';
1566 ## reconsume
1567 !!!emit ($self->{ct}); # end tag
1568 redo A;
1569 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1570 !!!parse-error (type => 'unclosed md'); ## TODO: type
1571 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1572 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1573 ## Reconsume.
1574 !!!emit ($self->{ct}); # ATTLIST
1575 redo A;
1576 } else {
1577 die "$0: $self->{ct}->{type}: Unknown token type";
1578 }
1579 } else {
1580 if ({
1581 0x0022 => 1, # "
1582 0x0027 => 1, # '
1583 0x003D => 1, # =
1584 0x003C => 1, # <
1585 }->{$self->{nc}}) {
1586 !!!cp (115);
1587 ## XML5: Not a parse error.
1588 !!!parse-error (type => 'bad attribute value');
1589 } else {
1590 !!!cp (116);
1591 }
1592 $self->{ca}->{value} .= chr ($self->{nc});
1593 $self->{read_until}->($self->{ca}->{value},
1594 qq["'=& \x09\x0C>],
1595 length $self->{ca}->{value});
1596
1597 ## Stay in the state
1598 !!!next-input-character;
1599 redo A;
1600 }
1601 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1602 if ($is_space->{$self->{nc}}) {
1603 !!!cp (118);
1604 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1605 !!!next-input-character;
1606 redo A;
1607 } elsif ($self->{nc} == 0x003E) { # >
1608 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1609 !!!cp (119);
1610 $self->{last_stag_name} = $self->{ct}->{tag_name};
1611 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1612 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1613 if ($self->{ct}->{attributes}) {
1614 !!!cp (120);
1615 !!!parse-error (type => 'end tag attribute');
1616 } else {
1617 ## NOTE: This state should never be reached.
1618 !!!cp (121);
1619 }
1620 } else {
1621 die "$0: $self->{ct}->{type}: Unknown token type";
1622 }
1623 $self->{state} = DATA_STATE;
1624 $self->{s_kwd} = '';
1625 !!!next-input-character;
1626
1627 !!!emit ($self->{ct}); # start tag or end tag
1628
1629 redo A;
1630 } elsif ($self->{nc} == 0x002F) { # /
1631 !!!cp (122);
1632 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1633 !!!next-input-character;
1634 redo A;
1635 } elsif ($self->{nc} == -1) {
1636 !!!parse-error (type => 'unclosed tag');
1637 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1638 !!!cp (122.3);
1639 $self->{last_stag_name} = $self->{ct}->{tag_name};
1640 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1641 if ($self->{ct}->{attributes}) {
1642 !!!cp (122.1);
1643 !!!parse-error (type => 'end tag attribute');
1644 } else {
1645 ## NOTE: This state should never be reached.
1646 !!!cp (122.2);
1647 }
1648 } else {
1649 die "$0: $self->{ct}->{type}: Unknown token type";
1650 }
1651 $self->{state} = DATA_STATE;
1652 $self->{s_kwd} = '';
1653 ## Reconsume.
1654 !!!emit ($self->{ct}); # start tag or end tag
1655 redo A;
1656 } else {
1657 !!!cp ('124.1');
1658 !!!parse-error (type => 'no space between attributes');
1659 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1660 ## reconsume
1661 redo A;
1662 }
1663 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1664 ## XML5: "Empty tag state".
1665
1666 if ($self->{nc} == 0x003E) { # >
1667 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1668 !!!cp ('124.2');
1669 !!!parse-error (type => 'nestc', token => $self->{ct});
1670 ## TODO: Different type than slash in start tag
1671 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1672 if ($self->{ct}->{attributes}) {
1673 !!!cp ('124.4');
1674 !!!parse-error (type => 'end tag attribute');
1675 } else {
1676 !!!cp ('124.5');
1677 }
1678 ## TODO: Test |<title></title/>|
1679 } else {
1680 !!!cp ('124.3');
1681 $self->{self_closing} = 1;
1682 }
1683
1684 $self->{state} = DATA_STATE;
1685 $self->{s_kwd} = '';
1686 !!!next-input-character;
1687
1688 !!!emit ($self->{ct}); # start tag or end tag
1689
1690 redo A;
1691 } elsif ($self->{nc} == -1) {
1692 !!!parse-error (type => 'unclosed tag');
1693 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1694 !!!cp (124.7);
1695 $self->{last_stag_name} = $self->{ct}->{tag_name};
1696 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1697 if ($self->{ct}->{attributes}) {
1698 !!!cp (124.5);
1699 !!!parse-error (type => 'end tag attribute');
1700 } else {
1701 ## NOTE: This state should never be reached.
1702 !!!cp (124.6);
1703 }
1704 } else {
1705 die "$0: $self->{ct}->{type}: Unknown token type";
1706 }
1707 ## XML5: "Tag attribute name before state".
1708 $self->{state} = DATA_STATE;
1709 $self->{s_kwd} = '';
1710 ## Reconsume.
1711 !!!emit ($self->{ct}); # start tag or end tag
1712 redo A;
1713 } else {
1714 !!!cp ('124.4');
1715 !!!parse-error (type => 'nestc');
1716 ## TODO: This error type is wrong.
1717 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1718 ## Reconsume.
1719 redo A;
1720 }
1721 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1722 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1723
1724 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1725 ## consumes characters on a one-by-one basis.
1726
1727 if ($self->{nc} == 0x003E) { # >
1728 if ($self->{in_subset}) {
1729 !!!cp (123);
1730 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1731 } else {
1732 !!!cp (124);
1733 $self->{state} = DATA_STATE;
1734 $self->{s_kwd} = '';
1735 }
1736 !!!next-input-character;
1737
1738 !!!emit ($self->{ct}); # comment
1739 redo A;
1740 } elsif ($self->{nc} == -1) {
1741 if ($self->{in_subset}) {
1742 !!!cp (125.1);
1743 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1744 } else {
1745 !!!cp (125);
1746 $self->{state} = DATA_STATE;
1747 $self->{s_kwd} = '';
1748 }
1749 ## reconsume
1750
1751 !!!emit ($self->{ct}); # comment
1752 redo A;
1753 } else {
1754 !!!cp (126);
1755 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1756 $self->{read_until}->($self->{ct}->{data},
1757 q[>],
1758 length $self->{ct}->{data});
1759
1760 ## Stay in the state.
1761 !!!next-input-character;
1762 redo A;
1763 }
1764 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1765 ## XML5: "Markup declaration state".
1766
1767 if ($self->{nc} == 0x002D) { # -
1768 !!!cp (133);
1769 $self->{state} = MD_HYPHEN_STATE;
1770 !!!next-input-character;
1771 redo A;
1772 } elsif ($self->{nc} == 0x0044 or # D
1773 $self->{nc} == 0x0064) { # d
1774 ## ASCII case-insensitive.
1775 !!!cp (130);
1776 $self->{state} = MD_DOCTYPE_STATE;
1777 $self->{kwd} = chr $self->{nc};
1778 !!!next-input-character;
1779 redo A;
1780 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1781 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1782 $self->{is_xml}) and
1783 $self->{nc} == 0x005B) { # [
1784 !!!cp (135.4);
1785 $self->{state} = MD_CDATA_STATE;
1786 $self->{kwd} = '[';
1787 !!!next-input-character;
1788 redo A;
1789 } else {
1790 !!!cp (136);
1791 }
1792
1793 !!!parse-error (type => 'bogus comment',
1794 line => $self->{line_prev},
1795 column => $self->{column_prev} - 1);
1796 ## Reconsume.
1797 $self->{state} = BOGUS_COMMENT_STATE;
1798 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1799 line => $self->{line_prev},
1800 column => $self->{column_prev} - 1,
1801 };
1802 redo A;
1803 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1804 if ($self->{nc} == 0x002D) { # -
1805 !!!cp (127);
1806 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1807 line => $self->{line_prev},
1808 column => $self->{column_prev} - 2,
1809 };
1810 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1811 !!!next-input-character;
1812 redo A;
1813 } else {
1814 !!!cp (128);
1815 !!!parse-error (type => 'bogus comment',
1816 line => $self->{line_prev},
1817 column => $self->{column_prev} - 2);
1818 $self->{state} = BOGUS_COMMENT_STATE;
1819 ## Reconsume.
1820 $self->{ct} = {type => COMMENT_TOKEN,
1821 data => '-',
1822 line => $self->{line_prev},
1823 column => $self->{column_prev} - 2,
1824 };
1825 redo A;
1826 }
1827 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1828 ## ASCII case-insensitive.
1829 if ($self->{nc} == [
1830 undef,
1831 0x004F, # O
1832 0x0043, # C
1833 0x0054, # T
1834 0x0059, # Y
1835 0x0050, # P
1836 ]->[length $self->{kwd}] or
1837 $self->{nc} == [
1838 undef,
1839 0x006F, # o
1840 0x0063, # c
1841 0x0074, # t
1842 0x0079, # y
1843 0x0070, # p
1844 ]->[length $self->{kwd}]) {
1845 !!!cp (131);
1846 ## Stay in the state.
1847 $self->{kwd} .= chr $self->{nc};
1848 !!!next-input-character;
1849 redo A;
1850 } elsif ((length $self->{kwd}) == 6 and
1851 ($self->{nc} == 0x0045 or # E
1852 $self->{nc} == 0x0065)) { # e
1853 if ($self->{is_xml} and
1854 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1855 !!!cp (129);
1856 ## XML5: case-sensitive.
1857 !!!parse-error (type => 'lowercase keyword', ## TODO
1858 text => 'DOCTYPE',
1859 line => $self->{line_prev},
1860 column => $self->{column_prev} - 5);
1861 } else {
1862 !!!cp (129.1);
1863 }
1864 $self->{state} = DOCTYPE_STATE;
1865 $self->{ct} = {type => DOCTYPE_TOKEN,
1866 quirks => 1,
1867 line => $self->{line_prev},
1868 column => $self->{column_prev} - 7,
1869 };
1870 !!!next-input-character;
1871 redo A;
1872 } else {
1873 !!!cp (132);
1874 !!!parse-error (type => 'bogus comment',
1875 line => $self->{line_prev},
1876 column => $self->{column_prev} - 1 - length $self->{kwd});
1877 $self->{state} = BOGUS_COMMENT_STATE;
1878 ## Reconsume.
1879 $self->{ct} = {type => COMMENT_TOKEN,
1880 data => $self->{kwd},
1881 line => $self->{line_prev},
1882 column => $self->{column_prev} - 1 - length $self->{kwd},
1883 };
1884 redo A;
1885 }
1886 } elsif ($self->{state} == MD_CDATA_STATE) {
1887 if ($self->{nc} == {
1888 '[' => 0x0043, # C
1889 '[C' => 0x0044, # D
1890 '[CD' => 0x0041, # A
1891 '[CDA' => 0x0054, # T
1892 '[CDAT' => 0x0041, # A
1893 }->{$self->{kwd}}) {
1894 !!!cp (135.1);
1895 ## Stay in the state.
1896 $self->{kwd} .= chr $self->{nc};
1897 !!!next-input-character;
1898 redo A;
1899 } elsif ($self->{kwd} eq '[CDATA' and
1900 $self->{nc} == 0x005B) { # [
1901 if ($self->{is_xml} and
1902 not $self->{tainted} and
1903 @{$self->{open_elements} or []} == 0) {
1904 !!!cp (135.2);
1905 !!!parse-error (type => 'cdata outside of root element',
1906 line => $self->{line_prev},
1907 column => $self->{column_prev} - 7);
1908 $self->{tainted} = 1;
1909 } else {
1910 !!!cp (135.21);
1911 }
1912
1913 $self->{ct} = {type => CHARACTER_TOKEN,
1914 data => '',
1915 line => $self->{line_prev},
1916 column => $self->{column_prev} - 7};
1917 $self->{state} = CDATA_SECTION_STATE;
1918 !!!next-input-character;
1919 redo A;
1920 } else {
1921 !!!cp (135.3);
1922 !!!parse-error (type => 'bogus comment',
1923 line => $self->{line_prev},
1924 column => $self->{column_prev} - 1 - length $self->{kwd});
1925 $self->{state} = BOGUS_COMMENT_STATE;
1926 ## Reconsume.
1927 $self->{ct} = {type => COMMENT_TOKEN,
1928 data => $self->{kwd},
1929 line => $self->{line_prev},
1930 column => $self->{column_prev} - 1 - length $self->{kwd},
1931 };
1932 redo A;
1933 }
1934 } elsif ($self->{state} == COMMENT_START_STATE) {
1935 if ($self->{nc} == 0x002D) { # -
1936 !!!cp (137);
1937 $self->{state} = COMMENT_START_DASH_STATE;
1938 !!!next-input-character;
1939 redo A;
1940 } elsif ($self->{nc} == 0x003E) { # >
1941 !!!parse-error (type => 'bogus comment');
1942 if ($self->{in_subset}) {
1943 !!!cp (138.1);
1944 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1945 } else {
1946 !!!cp (138);
1947 $self->{state} = DATA_STATE;
1948 $self->{s_kwd} = '';
1949 }
1950 !!!next-input-character;
1951
1952 !!!emit ($self->{ct}); # comment
1953
1954 redo A;
1955 } elsif ($self->{nc} == -1) {
1956 !!!parse-error (type => 'unclosed comment');
1957 if ($self->{in_subset}) {
1958 !!!cp (139.1);
1959 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1960 } else {
1961 !!!cp (139);
1962 $self->{state} = DATA_STATE;
1963 $self->{s_kwd} = '';
1964 }
1965 ## reconsume
1966
1967 !!!emit ($self->{ct}); # comment
1968
1969 redo A;
1970 } else {
1971 !!!cp (140);
1972 $self->{ct}->{data} # comment
1973 .= chr ($self->{nc});
1974 $self->{state} = COMMENT_STATE;
1975 !!!next-input-character;
1976 redo A;
1977 }
1978 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1979 if ($self->{nc} == 0x002D) { # -
1980 !!!cp (141);
1981 $self->{state} = COMMENT_END_STATE;
1982 !!!next-input-character;
1983 redo A;
1984 } elsif ($self->{nc} == 0x003E) { # >
1985 !!!parse-error (type => 'bogus comment');
1986 if ($self->{in_subset}) {
1987 !!!cp (142.1);
1988 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1989 } else {
1990 !!!cp (142);
1991 $self->{state} = DATA_STATE;
1992 $self->{s_kwd} = '';
1993 }
1994 !!!next-input-character;
1995
1996 !!!emit ($self->{ct}); # comment
1997
1998 redo A;
1999 } elsif ($self->{nc} == -1) {
2000 !!!parse-error (type => 'unclosed comment');
2001 if ($self->{in_subset}) {
2002 !!!cp (143.1);
2003 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2004 } else {
2005 !!!cp (143);
2006 $self->{state} = DATA_STATE;
2007 $self->{s_kwd} = '';
2008 }
2009 ## reconsume
2010
2011 !!!emit ($self->{ct}); # comment
2012
2013 redo A;
2014 } else {
2015 !!!cp (144);
2016 $self->{ct}->{data} # comment
2017 .= '-' . chr ($self->{nc});
2018 $self->{state} = COMMENT_STATE;
2019 !!!next-input-character;
2020 redo A;
2021 }
2022 } elsif ($self->{state} == COMMENT_STATE) {
2023 ## XML5: "Comment state" and "DOCTYPE comment state".
2024
2025 if ($self->{nc} == 0x002D) { # -
2026 !!!cp (145);
2027 $self->{state} = COMMENT_END_DASH_STATE;
2028 !!!next-input-character;
2029 redo A;
2030 } elsif ($self->{nc} == -1) {
2031 !!!parse-error (type => 'unclosed comment');
2032 if ($self->{in_subset}) {
2033 !!!cp (146.1);
2034 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2035 } else {
2036 !!!cp (146);
2037 $self->{state} = DATA_STATE;
2038 $self->{s_kwd} = '';
2039 }
2040 ## reconsume
2041
2042 !!!emit ($self->{ct}); # comment
2043
2044 redo A;
2045 } else {
2046 !!!cp (147);
2047 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2048 $self->{read_until}->($self->{ct}->{data},
2049 q[-],
2050 length $self->{ct}->{data});
2051
2052 ## Stay in the state
2053 !!!next-input-character;
2054 redo A;
2055 }
2056 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2057 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2058
2059 if ($self->{nc} == 0x002D) { # -
2060 !!!cp (148);
2061 $self->{state} = COMMENT_END_STATE;
2062 !!!next-input-character;
2063 redo A;
2064 } elsif ($self->{nc} == -1) {
2065 !!!parse-error (type => 'unclosed comment');
2066 if ($self->{in_subset}) {
2067 !!!cp (149.1);
2068 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2069 } else {
2070 !!!cp (149);
2071 $self->{state} = DATA_STATE;
2072 $self->{s_kwd} = '';
2073 }
2074 ## reconsume
2075
2076 !!!emit ($self->{ct}); # comment
2077
2078 redo A;
2079 } else {
2080 !!!cp (150);
2081 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2082 $self->{state} = COMMENT_STATE;
2083 !!!next-input-character;
2084 redo A;
2085 }
2086 } elsif ($self->{state} == COMMENT_END_STATE or
2087 $self->{state} == COMMENT_END_BANG_STATE) {
2088 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2089 ## (No comment end bang state.)
2090
2091 if ($self->{nc} == 0x003E) { # >
2092 if ($self->{in_subset}) {
2093 !!!cp (151.1);
2094 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2095 } else {
2096 !!!cp (151);
2097 $self->{state} = DATA_STATE;
2098 $self->{s_kwd} = '';
2099 }
2100 !!!next-input-character;
2101
2102 !!!emit ($self->{ct}); # comment
2103
2104 redo A;
2105 } elsif ($self->{nc} == 0x002D) { # -
2106 if ($self->{state} == COMMENT_END_BANG_STATE) {
2107 !!!cp (154.3);
2108 $self->{ct}->{data} .= '--!'; # comment
2109 $self->{state} = COMMENT_END_DASH_STATE;
2110 } else {
2111 !!!cp (152);
2112 ## XML5: Not a parse error.
2113 !!!parse-error (type => 'dash in comment',
2114 line => $self->{line_prev},
2115 column => $self->{column_prev});
2116 $self->{ct}->{data} .= '-'; # comment
2117 ## Stay in the state
2118 }
2119 !!!next-input-character;
2120 redo A;
2121 } elsif ($self->{nc} == 0x0021 and # !
2122 $self->{state} != COMMENT_END_BANG_STATE) {
2123 !!!parse-error (type => 'comment end bang'); # XXX error type
2124 $self->{state} = COMMENT_END_BANG_STATE;
2125 !!!next-input-character;
2126 redo A;
2127 } elsif ($self->{nc} == -1) {
2128 !!!parse-error (type => 'unclosed comment');
2129 if ($self->{in_subset}) {
2130 !!!cp (153.1);
2131 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2132 } else {
2133 !!!cp (153);
2134 $self->{state} = DATA_STATE;
2135 $self->{s_kwd} = '';
2136 }
2137 ## Reconsume.
2138
2139 !!!emit ($self->{ct}); # comment
2140
2141 redo A;
2142 } else {
2143 !!!cp (154);
2144 if ($self->{state} == COMMENT_END_BANG_STATE) {
2145 $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
2146 } else {
2147 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2148 }
2149 $self->{state} = COMMENT_STATE;
2150 !!!next-input-character;
2151 redo A;
2152 }
2153 } elsif ($self->{state} == DOCTYPE_STATE) {
2154 if ($is_space->{$self->{nc}}) {
2155 !!!cp (155);
2156 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2157 !!!next-input-character;
2158 redo A;
2159 } elsif ($self->{nc} == -1) {
2160 !!!cp (155.1);
2161 !!!parse-error (type => 'unclosed DOCTYPE');
2162 $self->{ct}->{quirks} = 1;
2163
2164 $self->{state} = DATA_STATE;
2165 ## Reconsume.
2166 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2167
2168 redo A;
2169 } else {
2170 !!!cp (156);
2171 ## XML5: Switch to the bogus comment state.
2172 !!!parse-error (type => 'no space before DOCTYPE name');
2173 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2174 ## reconsume
2175 redo A;
2176 }
2177 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2178 ## XML5: "DOCTYPE root name before state".
2179
2180 if ($is_space->{$self->{nc}}) {
2181 !!!cp (157);
2182 ## Stay in the state
2183 !!!next-input-character;
2184 redo A;
2185 } elsif ($self->{nc} == 0x003E) { # >
2186 !!!cp (158);
2187 ## XML5: No parse error.
2188 !!!parse-error (type => 'no DOCTYPE name');
2189 $self->{state} = DATA_STATE;
2190 $self->{s_kwd} = '';
2191 !!!next-input-character;
2192
2193 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2194
2195 redo A;
2196 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2197 !!!cp (158.1);
2198 $self->{ct}->{name} # DOCTYPE
2199 = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2200 delete $self->{ct}->{quirks};
2201 $self->{state} = DOCTYPE_NAME_STATE;
2202 !!!next-input-character;
2203 redo A;
2204 } elsif ($self->{nc} == -1) {
2205 !!!cp (159);
2206 !!!parse-error (type => 'no DOCTYPE name');
2207 $self->{state} = DATA_STATE;
2208 $self->{s_kwd} = '';
2209 ## reconsume
2210
2211 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2212
2213 redo A;
2214 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2215 !!!cp (159.1);
2216 !!!parse-error (type => 'no DOCTYPE name');
2217 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2218 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2219 $self->{in_subset} = 1;
2220 !!!next-input-character;
2221 !!!emit ($self->{ct}); # DOCTYPE
2222 redo A;
2223 } else {
2224 !!!cp (160);
2225 $self->{ct}->{name} = chr $self->{nc};
2226 delete $self->{ct}->{quirks};
2227 $self->{state} = DOCTYPE_NAME_STATE;
2228 !!!next-input-character;
2229 redo A;
2230 }
2231 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2232 ## XML5: "DOCTYPE root name state".
2233
2234 ## ISSUE: Redundant "First," in the spec.
2235
2236 if ($is_space->{$self->{nc}}) {
2237 !!!cp (161);
2238 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2239 !!!next-input-character;
2240 redo A;
2241 } elsif ($self->{nc} == 0x003E) { # >
2242 !!!cp (162);
2243 $self->{state} = DATA_STATE;
2244 $self->{s_kwd} = '';
2245 !!!next-input-character;
2246
2247 !!!emit ($self->{ct}); # DOCTYPE
2248
2249 redo A;
2250 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2251 !!!cp (162.1);
2252 $self->{ct}->{name} # DOCTYPE
2253 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2254 delete $self->{ct}->{quirks};
2255 ## Stay in the state.
2256 !!!next-input-character;
2257 redo A;
2258 } elsif ($self->{nc} == -1) {
2259 !!!cp (163);
2260 !!!parse-error (type => 'unclosed DOCTYPE');
2261 $self->{state} = DATA_STATE;
2262 $self->{s_kwd} = '';
2263 ## reconsume
2264
2265 $self->{ct}->{quirks} = 1;
2266 !!!emit ($self->{ct}); # DOCTYPE
2267
2268 redo A;
2269 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2270 !!!cp (163.1);
2271 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2272 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2273 $self->{in_subset} = 1;
2274 !!!next-input-character;
2275 !!!emit ($self->{ct}); # DOCTYPE
2276 redo A;
2277 } else {
2278 !!!cp (164);
2279 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
2280 ## Stay in the state.
2281 !!!next-input-character;
2282 redo A;
2283 }
2284 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2285 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2286 ## state", but implemented differently.
2287
2288 if ($is_space->{$self->{nc}}) {
2289 !!!cp (165);
2290 ## Stay in the state
2291 !!!next-input-character;
2292 redo A;
2293 } elsif ($self->{nc} == 0x003E) { # >
2294 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2295 !!!cp (166);
2296 $self->{state} = DATA_STATE;
2297 $self->{s_kwd} = '';
2298 } else {
2299 !!!cp (166.1);
2300 !!!parse-error (type => 'no md def'); ## TODO: type
2301 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2302 }
2303
2304 !!!next-input-character;
2305 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2306 redo A;
2307 } elsif ($self->{nc} == -1) {
2308 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2309 !!!cp (167);
2310 !!!parse-error (type => 'unclosed DOCTYPE');
2311 $self->{state} = DATA_STATE;
2312 $self->{s_kwd} = '';
2313 $self->{ct}->{quirks} = 1;
2314 } else {
2315 !!!cp (167.12);
2316 !!!parse-error (type => 'unclosed md'); ## TODO: type
2317 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2318 }
2319
2320 ## Reconsume.
2321 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2322 redo A;
2323 } elsif ($self->{nc} == 0x0050 or # P
2324 $self->{nc} == 0x0070) { # p
2325 !!!cp (167.1);
2326 $self->{state} = PUBLIC_STATE;
2327 $self->{kwd} = chr $self->{nc};
2328 !!!next-input-character;
2329 redo A;
2330 } elsif ($self->{nc} == 0x0053 or # S
2331 $self->{nc} == 0x0073) { # s
2332 !!!cp (167.2);
2333 $self->{state} = SYSTEM_STATE;
2334 $self->{kwd} = chr $self->{nc};
2335 !!!next-input-character;
2336 redo A;
2337 } elsif ($self->{nc} == 0x0022 and # "
2338 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2339 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2340 !!!cp (167.21);
2341 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2342 $self->{ct}->{value} = ''; # ENTITY
2343 !!!next-input-character;
2344 redo A;
2345 } elsif ($self->{nc} == 0x0027 and # '
2346 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2347 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2348 !!!cp (167.22);
2349 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2350 $self->{ct}->{value} = ''; # ENTITY
2351 !!!next-input-character;
2352 redo A;
2353 } elsif ($self->{is_xml} and
2354 $self->{ct}->{type} == DOCTYPE_TOKEN and
2355 $self->{nc} == 0x005B) { # [
2356 !!!cp (167.3);
2357 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2358 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2359 $self->{in_subset} = 1;
2360 !!!next-input-character;
2361 !!!emit ($self->{ct}); # DOCTYPE
2362 redo A;
2363 } else {
2364 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2365
2366 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2367 !!!cp (180);
2368 $self->{ct}->{quirks} = 1;
2369 $self->{state} = BOGUS_DOCTYPE_STATE;
2370 } else {
2371 !!!cp (180.1);
2372 $self->{state} = BOGUS_MD_STATE;
2373 }
2374
2375 !!!next-input-character;
2376 redo A;
2377 }
2378 } elsif ($self->{state} == PUBLIC_STATE) {
2379 ## ASCII case-insensitive
2380 if ($self->{nc} == [
2381 undef,
2382 0x0055, # U
2383 0x0042, # B
2384 0x004C, # L
2385 0x0049, # I
2386 ]->[length $self->{kwd}] or
2387 $self->{nc} == [
2388 undef,
2389 0x0075, # u
2390 0x0062, # b
2391 0x006C, # l
2392 0x0069, # i
2393 ]->[length $self->{kwd}]) {
2394 !!!cp (175);
2395 ## Stay in the state.
2396 $self->{kwd} .= chr $self->{nc};
2397 !!!next-input-character;
2398 redo A;
2399 } elsif ((length $self->{kwd}) == 5 and
2400 ($self->{nc} == 0x0043 or # C
2401 $self->{nc} == 0x0063)) { # c
2402 if ($self->{is_xml} and
2403 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2404 !!!cp (168.1);
2405 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2406 text => 'PUBLIC',
2407 line => $self->{line_prev},
2408 column => $self->{column_prev} - 4);
2409 } else {
2410 !!!cp (168);
2411 }
2412 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2413 !!!next-input-character;
2414 redo A;
2415 } else {
2416 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2417 line => $self->{line_prev},
2418 column => $self->{column_prev} + 1 - length $self->{kwd});
2419 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2420 !!!cp (169);
2421 $self->{ct}->{quirks} = 1;
2422 $self->{state} = BOGUS_DOCTYPE_STATE;
2423 } else {
2424 !!!cp (169.1);
2425 $self->{state} = BOGUS_MD_STATE;
2426 }
2427 ## Reconsume.
2428 redo A;
2429 }
2430 } elsif ($self->{state} == SYSTEM_STATE) {
2431 ## ASCII case-insensitive
2432 if ($self->{nc} == [
2433 undef,
2434 0x0059, # Y
2435 0x0053, # S
2436 0x0054, # T
2437 0x0045, # E
2438 ]->[length $self->{kwd}] or
2439 $self->{nc} == [
2440 undef,
2441 0x0079, # y
2442 0x0073, # s
2443 0x0074, # t
2444 0x0065, # e
2445 ]->[length $self->{kwd}]) {
2446 !!!cp (170);
2447 ## Stay in the state.
2448 $self->{kwd} .= chr $self->{nc};
2449 !!!next-input-character;
2450 redo A;
2451 } elsif ((length $self->{kwd}) == 5 and
2452 ($self->{nc} == 0x004D or # M
2453 $self->{nc} == 0x006D)) { # m
2454 if ($self->{is_xml} and
2455 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2456 !!!cp (171.1);
2457 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2458 text => 'SYSTEM',
2459 line => $self->{line_prev},
2460 column => $self->{column_prev} - 4);
2461 } else {
2462 !!!cp (171);
2463 }
2464 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2465 !!!next-input-character;
2466 redo A;
2467 } else {
2468 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2469 line => $self->{line_prev},
2470 column => $self->{column_prev} + 1 - length $self->{kwd});
2471 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2472 !!!cp (172);
2473 $self->{ct}->{quirks} = 1;
2474 $self->{state} = BOGUS_DOCTYPE_STATE;
2475 } else {
2476 !!!cp (172.1);
2477 $self->{state} = BOGUS_MD_STATE;
2478 }
2479 ## Reconsume.
2480 redo A;
2481 }
2482 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2483 if ($is_space->{$self->{nc}}) {
2484 !!!cp (181);
2485 ## Stay in the state
2486 !!!next-input-character;
2487 redo A;
2488 } elsif ($self->{nc} eq 0x0022) { # "
2489 !!!cp (182);
2490 $self->{ct}->{pubid} = ''; # DOCTYPE
2491 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2492 !!!next-input-character;
2493 redo A;
2494 } elsif ($self->{nc} eq 0x0027) { # '
2495 !!!cp (183);
2496 $self->{ct}->{pubid} = ''; # DOCTYPE
2497 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2498 !!!next-input-character;
2499 redo A;
2500 } elsif ($self->{nc} eq 0x003E) { # >
2501 !!!parse-error (type => 'no PUBLIC literal');
2502
2503 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2504 !!!cp (184);
2505 $self->{state} = DATA_STATE;
2506 $self->{s_kwd} = '';
2507 $self->{ct}->{quirks} = 1;
2508 } else {
2509 !!!cp (184.1);
2510 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2511 }
2512
2513 !!!next-input-character;
2514 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2515 redo A;
2516 } elsif ($self->{nc} == -1) {
2517 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2518 !!!cp (185);
2519 !!!parse-error (type => 'unclosed DOCTYPE');
2520 $self->{state} = DATA_STATE;
2521 $self->{s_kwd} = '';
2522 $self->{ct}->{quirks} = 1;
2523 } else {
2524 !!!cp (185.1);
2525 !!!parse-error (type => 'unclosed md'); ## TODO: type
2526 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2527 }
2528
2529 ## reconsume
2530 !!!emit ($self->{ct}); # DOCTYPE
2531 redo A;
2532 } elsif ($self->{is_xml} and
2533 $self->{ct}->{type} == DOCTYPE_TOKEN and
2534 $self->{nc} == 0x005B) { # [
2535 !!!cp (186.1);
2536 !!!parse-error (type => 'no PUBLIC literal');
2537 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2538 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2539 $self->{in_subset} = 1;
2540 !!!next-input-character;
2541 !!!emit ($self->{ct}); # DOCTYPE
2542 redo A;
2543 } else {
2544 !!!parse-error (type => 'string after PUBLIC');
2545
2546 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2547 !!!cp (186);
2548 $self->{ct}->{quirks} = 1;
2549 $self->{state} = BOGUS_DOCTYPE_STATE;
2550 } else {
2551 !!!cp (186.2);
2552 $self->{state} = BOGUS_MD_STATE;
2553 }
2554
2555 !!!next-input-character;
2556 redo A;
2557 }
2558 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2559 if ($self->{nc} == 0x0022) { # "
2560 !!!cp (187);
2561 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2562 !!!next-input-character;
2563 redo A;
2564 } elsif ($self->{nc} == 0x003E) { # >
2565 !!!parse-error (type => 'unclosed PUBLIC literal');
2566
2567 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2568 !!!cp (188);
2569 $self->{state} = DATA_STATE;
2570 $self->{s_kwd} = '';
2571 $self->{ct}->{quirks} = 1;
2572 } else {
2573 !!!cp (188.1);
2574 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2575 }
2576
2577 !!!next-input-character;
2578 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2579 redo A;
2580 } elsif ($self->{nc} == -1) {
2581 !!!parse-error (type => 'unclosed PUBLIC literal');
2582
2583 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2584 !!!cp (189);
2585 $self->{state} = DATA_STATE;
2586 $self->{s_kwd} = '';
2587 $self->{ct}->{quirks} = 1;
2588 } else {
2589 !!!cp (189.1);
2590 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2591 }
2592
2593 ## Reconsume.
2594 !!!emit ($self->{ct}); # DOCTYPE
2595 redo A;
2596 } else {
2597 !!!cp (190);
2598 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2599 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2600 length $self->{ct}->{pubid});
2601
2602 ## Stay in the state
2603 !!!next-input-character;
2604 redo A;
2605 }
2606 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2607 if ($self->{nc} == 0x0027) { # '
2608 !!!cp (191);
2609 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2610 !!!next-input-character;
2611 redo A;
2612 } elsif ($self->{nc} == 0x003E) { # >
2613 !!!parse-error (type => 'unclosed PUBLIC literal');
2614
2615 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2616 !!!cp (192);
2617 $self->{state} = DATA_STATE;
2618 $self->{s_kwd} = '';
2619 $self->{ct}->{quirks} = 1;
2620 } else {
2621 !!!cp (192.1);
2622 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2623 }
2624
2625 !!!next-input-character;
2626 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2627 redo A;
2628 } elsif ($self->{nc} == -1) {
2629 !!!parse-error (type => 'unclosed PUBLIC literal');
2630
2631 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2632 !!!cp (193);
2633 $self->{state} = DATA_STATE;
2634 $self->{s_kwd} = '';
2635 $self->{ct}->{quirks} = 1;
2636 } else {
2637 !!!cp (193.1);
2638 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2639 }
2640
2641 ## reconsume
2642 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2643 redo A;
2644 } else {
2645 !!!cp (194);
2646 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2647 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2648 length $self->{ct}->{pubid});
2649
2650 ## Stay in the state
2651 !!!next-input-character;
2652 redo A;
2653 }
2654 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2655 if ($is_space->{$self->{nc}}) {
2656 !!!cp (195);
2657 ## Stay in the state
2658 !!!next-input-character;
2659 redo A;
2660 } elsif ($self->{nc} == 0x0022) { # "
2661 !!!cp (196);
2662 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2663 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2664 !!!next-input-character;
2665 redo A;
2666 } elsif ($self->{nc} == 0x0027) { # '
2667 !!!cp (197);
2668 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2669 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2670 !!!next-input-character;
2671 redo A;
2672 } elsif ($self->{nc} == 0x003E) { # >
2673 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2674 if ($self->{is_xml}) {
2675 !!!cp (198.1);
2676 !!!parse-error (type => 'no SYSTEM literal');
2677 } else {
2678 !!!cp (198);
2679 }
2680 $self->{state} = DATA_STATE;
2681 $self->{s_kwd} = '';
2682 } else {
2683 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2684 !!!cp (198.2);
2685 } else {
2686 !!!cp (198.3);
2687 !!!parse-error (type => 'no SYSTEM literal');
2688 }
2689 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2690 }
2691
2692 !!!next-input-character;
2693 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2694 redo A;
2695 } elsif ($self->{nc} == -1) {
2696 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2697 !!!cp (199);
2698 !!!parse-error (type => 'unclosed DOCTYPE');
2699
2700 $self->{state} = DATA_STATE;
2701 $self->{s_kwd} = '';
2702 $self->{ct}->{quirks} = 1;
2703 } else {
2704 !!!parse-error (type => 'unclosed md'); ## TODO: type
2705 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2706 }
2707
2708 ## reconsume
2709 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2710 redo A;
2711 } elsif ($self->{is_xml} and
2712 $self->{ct}->{type} == DOCTYPE_TOKEN and
2713 $self->{nc} == 0x005B) { # [
2714 !!!cp (200.1);
2715 !!!parse-error (type => 'no SYSTEM literal');
2716 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2717 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2718 $self->{in_subset} = 1;
2719 !!!next-input-character;
2720 !!!emit ($self->{ct}); # DOCTYPE
2721 redo A;
2722 } else {
2723 !!!parse-error (type => 'string after PUBLIC literal');
2724
2725 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2726 !!!cp (200);
2727 $self->{ct}->{quirks} = 1;
2728 $self->{state} = BOGUS_DOCTYPE_STATE;
2729 } else {
2730 !!!cp (200.2);
2731 $self->{state} = BOGUS_MD_STATE;
2732 }
2733
2734 !!!next-input-character;
2735 redo A;
2736 }
2737 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2738 if ($is_space->{$self->{nc}}) {
2739 !!!cp (201);
2740 ## Stay in the state
2741 !!!next-input-character;
2742 redo A;
2743 } elsif ($self->{nc} == 0x0022) { # "
2744 !!!cp (202);
2745 $self->{ct}->{sysid} = ''; # DOCTYPE
2746 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2747 !!!next-input-character;
2748 redo A;
2749 } elsif ($self->{nc} == 0x0027) { # '
2750 !!!cp (203);
2751 $self->{ct}->{sysid} = ''; # DOCTYPE
2752 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2753 !!!next-input-character;
2754 redo A;
2755 } elsif ($self->{nc} == 0x003E) { # >
2756 !!!parse-error (type => 'no SYSTEM literal');
2757 !!!next-input-character;
2758
2759 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2760 !!!cp (204);
2761 $self->{state} = DATA_STATE;
2762 $self->{s_kwd} = '';
2763 $self->{ct}->{quirks} = 1;
2764 } else {
2765 !!!cp (204.1);
2766 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2767 }
2768
2769 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2770 redo A;
2771 } elsif ($self->{nc} == -1) {
2772 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2773 !!!cp (205);
2774 !!!parse-error (type => 'unclosed DOCTYPE');
2775 $self->{state} = DATA_STATE;
2776 $self->{s_kwd} = '';
2777 $self->{ct}->{quirks} = 1;
2778 } else {
2779 !!!cp (205.1);
2780 !!!parse-error (type => 'unclosed md'); ## TODO: type
2781 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2782 }
2783
2784 ## reconsume
2785 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2786 redo A;
2787 } elsif ($self->{is_xml} and
2788 $self->{ct}->{type} == DOCTYPE_TOKEN and
2789 $self->{nc} == 0x005B) { # [
2790 !!!cp (206.1);
2791 !!!parse-error (type => 'no SYSTEM literal');
2792
2793 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2794 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2795 $self->{in_subset} = 1;
2796 !!!next-input-character;
2797 !!!emit ($self->{ct}); # DOCTYPE
2798 redo A;
2799 } else {
2800 !!!parse-error (type => 'string after SYSTEM');
2801
2802 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2803 !!!cp (206);
2804 $self->{ct}->{quirks} = 1;
2805 $self->{state} = BOGUS_DOCTYPE_STATE;
2806 } else {
2807 !!!cp (206.2);
2808 $self->{state} = BOGUS_MD_STATE;
2809 }
2810
2811 !!!next-input-character;
2812 redo A;
2813 }
2814 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2815 if ($self->{nc} == 0x0022) { # "
2816 !!!cp (207);
2817 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2818 !!!next-input-character;
2819 redo A;
2820 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2821 !!!parse-error (type => 'unclosed SYSTEM literal');
2822
2823 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2824 !!!cp (208);
2825 $self->{state} = DATA_STATE;
2826 $self->{s_kwd} = '';
2827 $self->{ct}->{quirks} = 1;
2828 } else {
2829 !!!cp (208.1);
2830 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2831 }
2832
2833 !!!next-input-character;
2834 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2835 redo A;
2836 } elsif ($self->{nc} == -1) {
2837 !!!parse-error (type => 'unclosed SYSTEM literal');
2838
2839 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2840 !!!cp (209);
2841 $self->{state} = DATA_STATE;
2842 $self->{s_kwd} = '';
2843 $self->{ct}->{quirks} = 1;
2844 } else {
2845 !!!cp (209.1);
2846 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2847 }
2848
2849 ## reconsume
2850 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2851 redo A;
2852 } else {
2853 !!!cp (210);
2854 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2855 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2856 length $self->{ct}->{sysid});
2857
2858 ## Stay in the state
2859 !!!next-input-character;
2860 redo A;
2861 }
2862 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2863 if ($self->{nc} == 0x0027) { # '
2864 !!!cp (211);
2865 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2866 !!!next-input-character;
2867 redo A;
2868 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2869 !!!cp (212);
2870 !!!parse-error (type => 'unclosed SYSTEM literal');
2871
2872 $self->{state} = DATA_STATE;
2873 $self->{s_kwd} = '';
2874 !!!next-input-character;
2875
2876 $self->{ct}->{quirks} = 1;
2877 !!!emit ($self->{ct}); # DOCTYPE
2878
2879 redo A;
2880 } elsif ($self->{nc} == -1) {
2881 !!!parse-error (type => 'unclosed SYSTEM literal');
2882
2883 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2884 !!!cp (213);
2885 $self->{state} = DATA_STATE;
2886 $self->{s_kwd} = '';
2887 $self->{ct}->{quirks} = 1;
2888 } else {
2889 !!!cp (213.1);
2890 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2891 }
2892
2893 ## reconsume
2894 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2895 redo A;
2896 } else {
2897 !!!cp (214);
2898 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2899 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2900 length $self->{ct}->{sysid});
2901
2902 ## Stay in the state
2903 !!!next-input-character;
2904 redo A;
2905 }
2906 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2907 if ($is_space->{$self->{nc}}) {
2908 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2909 !!!cp (215.1);
2910 $self->{state} = BEFORE_NDATA_STATE;
2911 } else {
2912 !!!cp (215);
2913 ## Stay in the state
2914 }
2915 !!!next-input-character;
2916 redo A;
2917 } elsif ($self->{nc} == 0x003E) { # >
2918 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2919 !!!cp (216);
2920 $self->{state} = DATA_STATE;
2921 $self->{s_kwd} = '';
2922 } else {
2923 !!!cp (216.1);
2924 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2925 }
2926
2927 !!!next-input-character;
2928 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2929 redo A;
2930 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2931 ($self->{nc} == 0x004E or # N
2932 $self->{nc} == 0x006E)) { # n
2933 !!!cp (216.2);
2934 !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2935 $self->{state} = NDATA_STATE;
2936 $self->{kwd} = chr $self->{nc};
2937 !!!next-input-character;
2938 redo A;
2939 } elsif ($self->{nc} == -1) {
2940 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2941 !!!cp (217);
2942 !!!parse-error (type => 'unclosed DOCTYPE');
2943 $self->{state} = DATA_STATE;
2944 $self->{s_kwd} = '';
2945 $self->{ct}->{quirks} = 1;
2946 } else {
2947 !!!cp (217.1);
2948 !!!parse-error (type => 'unclosed md'); ## TODO: type
2949 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2950 }
2951
2952 ## reconsume
2953 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2954 redo A;
2955 } elsif ($self->{is_xml} and
2956 $self->{ct}->{type} == DOCTYPE_TOKEN and
2957 $self->{nc} == 0x005B) { # [
2958 !!!cp (218.1);
2959 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2960 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2961 $self->{in_subset} = 1;
2962 !!!next-input-character;
2963 !!!emit ($self->{ct}); # DOCTYPE
2964 redo A;
2965 } else {
2966 !!!parse-error (type => 'string after SYSTEM literal');
2967
2968 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2969 !!!cp (218);
2970 #$self->{ct}->{quirks} = 1;
2971 $self->{state} = BOGUS_DOCTYPE_STATE;
2972 } else {
2973 !!!cp (218.2);
2974 $self->{state} = BOGUS_MD_STATE;
2975 }
2976
2977 !!!next-input-character;
2978 redo A;
2979 }
2980 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2981 if ($is_space->{$self->{nc}}) {
2982 !!!cp (218.3);
2983 ## Stay in the state.
2984 !!!next-input-character;
2985 redo A;
2986 } elsif ($self->{nc} == 0x003E) { # >
2987 !!!cp (218.4);
2988 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2989 !!!next-input-character;
2990 !!!emit ($self->{ct}); # ENTITY
2991 redo A;
2992 } elsif ($self->{nc} == 0x004E or # N
2993 $self->{nc} == 0x006E) { # n
2994 !!!cp (218.5);
2995 $self->{state} = NDATA_STATE;
2996 $self->{kwd} = chr $self->{nc};
2997 !!!next-input-character;
2998 redo A;
2999 } elsif ($self->{nc} == -1) {
3000 !!!cp (218.6);
3001 !!!parse-error (type => 'unclosed md'); ## TODO: type
3002 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3003 ## reconsume
3004 !!!emit ($self->{ct}); # ENTITY
3005 redo A;
3006 } else {
3007 !!!cp (218.7);
3008 !!!parse-error (type => 'string after SYSTEM literal');
3009 $self->{state} = BOGUS_MD_STATE;
3010 !!!next-input-character;
3011 redo A;
3012 }
3013 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3014 if ($self->{nc} == 0x003E) { # >
3015 !!!cp (219);
3016 $self->{state} = DATA_STATE;
3017 $self->{s_kwd} = '';
3018 !!!next-input-character;
3019
3020 !!!emit ($self->{ct}); # DOCTYPE
3021
3022 redo A;
3023 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3024 !!!cp (220.1);
3025 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3026 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3027 $self->{in_subset} = 1;
3028 !!!next-input-character;
3029 !!!emit ($self->{ct}); # DOCTYPE
3030 redo A;
3031 } elsif ($self->{nc} == -1) {
3032 !!!cp (220);
3033 $self->{state} = DATA_STATE;
3034 $self->{s_kwd} = '';
3035 ## reconsume
3036
3037 !!!emit ($self->{ct}); # DOCTYPE
3038
3039 redo A;
3040 } else {
3041 !!!cp (221);
3042 my $s = '';
3043 $self->{read_until}->($s, q{>[}, 0);
3044
3045 ## Stay in the state
3046 !!!next-input-character;
3047 redo A;
3048 }
3049 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3050 ## NOTE: "CDATA section state" in the spec is jointly implemented
3051 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3052 ## and |CDATA_SECTION_MSE2_STATE|.
3053
3054 ## XML5: "CDATA state".
3055
3056 if ($self->{nc} == 0x005D) { # ]
3057 !!!cp (221.1);
3058 $self->{state} = CDATA_SECTION_MSE1_STATE;
3059 !!!next-input-character;
3060 redo A;
3061 } elsif ($self->{nc} == -1) {
3062 if ($self->{is_xml}) {
3063 !!!cp (221.11);
3064 !!!parse-error (type => 'no mse'); ## TODO: type
3065 } else {
3066 !!!cp (221.12);
3067 }
3068
3069 $self->{state} = DATA_STATE;
3070 $self->{s_kwd} = '';
3071 ## Reconsume.
3072 if (length $self->{ct}->{data}) { # character
3073 !!!cp (221.2);
3074 !!!emit ($self->{ct}); # character
3075 } else {
3076 !!!cp (221.3);
3077 ## No token to emit. $self->{ct} is discarded.
3078 }
3079 redo A;
3080 } else {
3081 !!!cp (221.4);
3082 $self->{ct}->{data} .= chr $self->{nc};
3083 $self->{read_until}->($self->{ct}->{data},
3084 q<]>,
3085 length $self->{ct}->{data});
3086
3087 ## Stay in the state.
3088 !!!next-input-character;
3089 redo A;
3090 }
3091
3092 ## ISSUE: "text tokens" in spec.
3093 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3094 ## XML5: "CDATA bracket state".
3095
3096 if ($self->{nc} == 0x005D) { # ]
3097 !!!cp (221.5);
3098 $self->{state} = CDATA_SECTION_MSE2_STATE;
3099 !!!next-input-character;
3100 redo A;
3101 } else {
3102 !!!cp (221.6);
3103 ## XML5: If EOF, "]" is not appended and changed to the data state.
3104 $self->{ct}->{data} .= ']';
3105 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3106 ## Reconsume.
3107 redo A;
3108 }
3109 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3110 ## XML5: "CDATA end state".
3111
3112 if ($self->{nc} == 0x003E) { # >
3113 $self->{state} = DATA_STATE;
3114 $self->{s_kwd} = '';
3115 !!!next-input-character;
3116 if (length $self->{ct}->{data}) { # character
3117 !!!cp (221.7);
3118 !!!emit ($self->{ct}); # character
3119 } else {
3120 !!!cp (221.8);
3121 ## No token to emit. $self->{ct} is discarded.
3122 }
3123 redo A;
3124 } elsif ($self->{nc} == 0x005D) { # ]
3125 !!!cp (221.9); # character
3126 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3127 ## Stay in the state.
3128 !!!next-input-character;
3129 redo A;
3130 } else {
3131 !!!cp (221.11);
3132 $self->{ct}->{data} .= ']]'; # character
3133 $self->{state} = CDATA_SECTION_STATE;
3134 ## Reconsume. ## XML5: Emit.
3135 redo A;
3136 }
3137 } elsif ($self->{state} == ENTITY_STATE) {
3138 if ($is_space->{$self->{nc}} or
3139 {
3140 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3141 $self->{entity_add} => 1,
3142 }->{$self->{nc}}) {
3143 if ($self->{is_xml}) {
3144 !!!cp (1001.1);
3145 !!!parse-error (type => 'bare ero',
3146 line => $self->{line_prev},
3147 column => $self->{column_prev}
3148 + ($self->{nc} == -1 ? 1 : 0));
3149 } else {
3150 !!!cp (1001);
3151 ## No error
3152 }
3153 ## Don't consume
3154 ## Return nothing.
3155 #
3156 } elsif ($self->{nc} == 0x0023) { # #
3157 !!!cp (999);
3158 $self->{state} = ENTITY_HASH_STATE;
3159 $self->{kwd} = '#';
3160 !!!next-input-character;
3161 redo A;
3162 } elsif ($self->{is_xml} or
3163 (0x0041 <= $self->{nc} and
3164 $self->{nc} <= 0x005A) or # A..Z
3165 (0x0061 <= $self->{nc} and
3166 $self->{nc} <= 0x007A)) { # a..z
3167 !!!cp (998);
3168 require Whatpm::_NamedEntityList;
3169 $self->{state} = ENTITY_NAME_STATE;
3170 $self->{kwd} = chr $self->{nc};
3171 $self->{entity__value} = $self->{kwd};
3172 $self->{entity__match} = 0;
3173 !!!next-input-character;
3174 redo A;
3175 } else {
3176 !!!cp (1027);
3177 !!!parse-error (type => 'bare ero');
3178 ## Return nothing.
3179 #
3180 }
3181
3182 ## NOTE: No character is consumed by the "consume a character
3183 ## reference" algorithm. In other words, there is an "&" character
3184 ## that does not introduce a character reference, which would be
3185 ## appended to the parent element or the attribute value in later
3186 ## process of the tokenizer.
3187
3188 if ($self->{prev_state} == DATA_STATE) {
3189 !!!cp (997);
3190 $self->{state} = $self->{prev_state};
3191 $self->{s_kwd} = '';
3192 ## Reconsume.
3193 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3194 line => $self->{line_prev},
3195 column => $self->{column_prev},
3196 });
3197 redo A;
3198 } else {
3199 !!!cp (996);
3200 $self->{ca}->{value} .= '&';
3201 $self->{state} = $self->{prev_state};
3202 $self->{s_kwd} = '';
3203 ## Reconsume.
3204 redo A;
3205 }
3206 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3207 if ($self->{nc} == 0x0078) { # x
3208 !!!cp (995);
3209 $self->{state} = HEXREF_X_STATE;
3210 $self->{kwd} .= chr $self->{nc};
3211 !!!next-input-character;
3212 redo A;
3213 } elsif ($self->{nc} == 0x0058) { # X
3214 !!!cp (995.1);
3215 if ($self->{is_xml}) {
3216 !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3217 }
3218 $self->{state} = HEXREF_X_STATE;
3219 $self->{kwd} .= chr $self->{nc};
3220 !!!next-input-character;
3221 redo A;
3222 } elsif (0x0030 <= $self->{nc} and
3223 $self->{nc} <= 0x0039) { # 0..9
3224 !!!cp (994);
3225 $self->{state} = NCR_NUM_STATE;
3226 $self->{kwd} = $self->{nc} - 0x0030;
3227 !!!next-input-character;
3228 redo A;
3229 } else {
3230 !!!parse-error (type => 'bare nero',
3231 line => $self->{line_prev},
3232 column => $self->{column_prev} - 1);
3233
3234 ## NOTE: According to the spec algorithm, nothing is returned,
3235 ## and then "&#" is appended to the parent element or the attribute
3236 ## value in the later processing.
3237
3238 if ($self->{prev_state} == DATA_STATE) {
3239 !!!cp (1019);
3240 $self->{state} = $self->{prev_state};
3241 $self->{s_kwd} = '';
3242 ## Reconsume.
3243 !!!emit ({type => CHARACTER_TOKEN,
3244 data => '&#',
3245 line => $self->{line_prev},
3246 column => $self->{column_prev} - 1,
3247 });
3248 redo A;
3249 } else {
3250 !!!cp (993);
3251 $self->{ca}->{value} .= '&#';
3252 $self->{state} = $self->{prev_state};
3253 $self->{s_kwd} = '';
3254 ## Reconsume.
3255 redo A;
3256 }
3257 }
3258 } elsif ($self->{state} == NCR_NUM_STATE) {
3259 if (0x0030 <= $self->{nc} and
3260 $self->{nc} <= 0x0039) { # 0..9
3261 !!!cp (1012);
3262 $self->{kwd} *= 10;
3263 $self->{kwd} += $self->{nc} - 0x0030;
3264
3265 ## Stay in the state.
3266 !!!next-input-character;
3267 redo A;
3268 } elsif ($self->{nc} == 0x003B) { # ;
3269 !!!cp (1013);
3270 !!!next-input-character;
3271 #
3272 } else {
3273 !!!cp (1014);
3274 !!!parse-error (type => 'no refc');
3275 ## Reconsume.
3276 #
3277 }
3278
3279 my $code = $self->{kwd};
3280 my $l = $self->{line_prev};
3281 my $c = $self->{column_prev};
3282 if ((not $self->{is_xml} and $charref_map->{$code}) or
3283 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3284 ($self->{is_xml} and $code == 0x0000)) {
3285 !!!cp (1015);
3286 !!!parse-error (type => 'invalid character reference',
3287 text => (sprintf 'U+%04X', $code),
3288 line => $l, column => $c);
3289 $code = $charref_map->{$code};
3290 } elsif ($code > 0x10FFFF) {
3291 !!!cp (1016);
3292 !!!parse-error (type => 'invalid character reference',
3293 text => (sprintf 'U-%08X', $code),
3294 line => $l, column => $c);
3295 $code = 0xFFFD;
3296 }
3297
3298 if ($self->{prev_state} == DATA_STATE) {
3299 !!!cp (992);
3300 $self->{state} = $self->{prev_state};
3301 $self->{s_kwd} = '';
3302 ## Reconsume.
3303 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3304 has_reference => 1,
3305 line => $l, column => $c,
3306 });
3307 redo A;
3308 } else {
3309 !!!cp (991);
3310 $self->{ca}->{value} .= chr $code;
3311 $self->{ca}->{has_reference} = 1;
3312 $self->{state} = $self->{prev_state};
3313 $self->{s_kwd} = '';
3314 ## Reconsume.
3315 redo A;
3316 }
3317 } elsif ($self->{state} == HEXREF_X_STATE) {
3318 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3319 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3320 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3321 # 0..9, A..F, a..f
3322 !!!cp (990);
3323 $self->{state} = HEXREF_HEX_STATE;
3324 $self->{kwd} = 0;
3325 ## Reconsume.
3326 redo A;
3327 } else {
3328 !!!parse-error (type => 'bare hcro',
3329 line => $self->{line_prev},
3330 column => $self->{column_prev} - 2);
3331
3332 ## NOTE: According to the spec algorithm, nothing is returned,
3333 ## and then "&#" followed by "X" or "x" is appended to the parent
3334 ## element or the attribute value in the later processing.
3335
3336 if ($self->{prev_state} == DATA_STATE) {
3337 !!!cp (1005);
3338 $self->{state} = $self->{prev_state};
3339 $self->{s_kwd} = '';
3340 ## Reconsume.
3341 !!!emit ({type => CHARACTER_TOKEN,
3342 data => '&' . $self->{kwd},
3343 line => $self->{line_prev},
3344 column => $self->{column_prev} - length $self->{kwd},
3345 });
3346 redo A;
3347 } else {
3348 !!!cp (989);
3349 $self->{ca}->{value} .= '&' . $self->{kwd};
3350 $self->{state} = $self->{prev_state};
3351 $self->{s_kwd} = '';
3352 ## Reconsume.
3353 redo A;
3354 }
3355 }
3356 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3357 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3358 # 0..9
3359 !!!cp (1002);
3360 $self->{kwd} *= 0x10;
3361 $self->{kwd} += $self->{nc} - 0x0030;
3362 ## Stay in the state.
3363 !!!next-input-character;
3364 redo A;
3365 } elsif (0x0061 <= $self->{nc} and
3366 $self->{nc} <= 0x0066) { # a..f
3367 !!!cp (1003);
3368 $self->{kwd} *= 0x10;
3369 $self->{kwd} += $self->{nc} - 0x0060 + 9;
3370 ## Stay in the state.
3371 !!!next-input-character;
3372 redo A;
3373 } elsif (0x0041 <= $self->{nc} and
3374 $self->{nc} <= 0x0046) { # A..F
3375 !!!cp (1004);
3376 $self->{kwd} *= 0x10;
3377 $self->{kwd} += $self->{nc} - 0x0040 + 9;
3378 ## Stay in the state.
3379 !!!next-input-character;
3380 redo A;
3381 } elsif ($self->{nc} == 0x003B) { # ;
3382 !!!cp (1006);
3383 !!!next-input-character;
3384 #
3385 } else {
3386 !!!cp (1007);
3387 !!!parse-error (type => 'no refc',
3388 line => $self->{line},
3389 column => $self->{column});
3390 ## Reconsume.
3391 #
3392 }
3393
3394 my $code = $self->{kwd};
3395 my $l = $self->{line_prev};
3396 my $c = $self->{column_prev};
3397 if ((not $self->{is_xml} and $charref_map->{$code}) or
3398 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3399 ($self->{is_xml} and $code == 0x0000)) {
3400 !!!cp (1008);
3401 !!!parse-error (type => 'invalid character reference',
3402 text => (sprintf 'U+%04X', $code),
3403 line => $l, column => $c);
3404 $code = $charref_map->{$code};
3405 } elsif ($code > 0x10FFFF) {
3406 !!!cp (1009);
3407 !!!parse-error (type => 'invalid character reference',
3408 text => (sprintf 'U-%08X', $code),
3409 line => $l, column => $c);
3410 $code = 0xFFFD;
3411 }
3412
3413 if ($self->{prev_state} == DATA_STATE) {
3414 !!!cp (988);
3415 $self->{state} = $self->{prev_state};
3416 $self->{s_kwd} = '';
3417 ## Reconsume.
3418 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3419 has_reference => 1,
3420 line => $l, column => $c,
3421 });
3422 redo A;
3423 } else {
3424 !!!cp (987);
3425 $self->{ca}->{value} .= chr $code;
3426 $self->{ca}->{has_reference} = 1;
3427 $self->{state} = $self->{prev_state};
3428 $self->{s_kwd} = '';
3429 ## Reconsume.
3430 redo A;
3431 }
3432 } elsif ($self->{state} == ENTITY_NAME_STATE) {
3433 if ((0x0041 <= $self->{nc} and # A
3434 $self->{nc} <= 0x005A) or # Z
3435 (0x0061 <= $self->{nc} and # a
3436 $self->{nc} <= 0x007A) or # z
3437 (0x0030 <= $self->{nc} and # 0
3438 $self->{nc} <= 0x0039) or # 9
3439 $self->{nc} == 0x003B or # ;
3440 ($self->{is_xml} and
3441 not ($is_space->{$self->{nc}} or
3442 {
3443 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3444 $self->{entity_add} => 1,
3445 }->{$self->{nc}}))) {
3446 our $EntityChar;
3447 $self->{kwd} .= chr $self->{nc};
3448 if (defined $EntityChar->{$self->{kwd}} or
3449 $self->{ge}->{$self->{kwd}}) {
3450 if ($self->{nc} == 0x003B) { # ;
3451 if (defined $self->{ge}->{$self->{kwd}}) {
3452 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3453 !!!cp (1020.1);
3454 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3455 } else {
3456 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3457 !!!cp (1020.2);
3458 !!!parse-error (type => 'unparsed entity', ## TODO: type
3459 value => $self->{kwd});
3460 } else {
3461 !!!cp (1020.3);
3462 }
3463 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3464 }
3465 } else {
3466 if ($self->{is_xml}) {
3467 !!!cp (1020.4);
3468 !!!parse-error (type => 'entity not declared', ## TODO: type
3469 value => $self->{kwd},
3470 level => {
3471 'amp;' => $self->{level}->{warn},
3472 'quot;' => $self->{level}->{warn},
3473 'lt;' => $self->{level}->{warn},
3474 'gt;' => $self->{level}->{warn},
3475 'apos;' => $self->{level}->{warn},
3476 }->{$self->{kwd}} ||
3477 $self->{level}->{must});
3478 } else {
3479 !!!cp (1020);
3480 }
3481 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3482 }
3483 $self->{entity__match} = 1;
3484 !!!next-input-character;
3485 #
3486 } else {
3487 !!!cp (1021);
3488 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3489 $self->{entity__match} = -1;
3490 ## Stay in the state.
3491 !!!next-input-character;
3492 redo A;
3493 }
3494 } else {
3495 !!!cp (1022);
3496 $self->{entity__value} .= chr $self->{nc};
3497 $self->{entity__match} *= 2;
3498 ## Stay in the state.
3499 !!!next-input-character;
3500 redo A;
3501 }
3502 }
3503
3504 my $data;
3505 my $has_ref;
3506 if ($self->{entity__match} > 0) {
3507 !!!cp (1023);
3508 $data = $self->{entity__value};
3509 $has_ref = 1;
3510 #
3511 } elsif ($self->{entity__match} < 0) {
3512 !!!parse-error (type => 'no refc');
3513 if ($self->{prev_state} != DATA_STATE and # in attribute
3514 $self->{entity__match} < -1) {
3515 !!!cp (1024);
3516 $data = '&' . $self->{kwd};
3517 #
3518 } else {
3519 !!!cp (1025);
3520 $data = $self->{entity__value};
3521 $has_ref = 1;
3522 #
3523 }
3524 } else {
3525 !!!cp (1026);
3526 !!!parse-error (type => 'bare ero',
3527 line => $self->{line_prev},
3528 column => $self->{column_prev} - length $self->{kwd});
3529 $data = '&' . $self->{kwd};
3530 #
3531 }
3532
3533 ## NOTE: In these cases, when a character reference is found,
3534 ## it is consumed and a character token is returned, or, otherwise,
3535 ## nothing is consumed and returned, according to the spec algorithm.
3536 ## In this implementation, anything that has been examined by the
3537 ## tokenizer is appended to the parent element or the attribute value
3538 ## as string, either literal string when no character reference or
3539 ## entity-replaced string otherwise, in this stage, since any characters
3540 ## that would not be consumed are appended in the data state or in an
3541 ## appropriate attribute value state anyway.
3542
3543 if ($self->{prev_state} == DATA_STATE) {
3544 !!!cp (986);
3545 $self->{state} = $self->{prev_state};
3546 $self->{s_kwd} = '';
3547 ## Reconsume.
3548 !!!emit ({type => CHARACTER_TOKEN,
3549 data => $data,
3550 has_reference => $has_ref,
3551 line => $self->{line_prev},
3552 column => $self->{column_prev} + 1 - length $self->{kwd},
3553 });
3554 redo A;
3555 } else {
3556 !!!cp (985);
3557 $self->{ca}->{value} .= $data;
3558 $self->{ca}->{has_reference} = 1 if $has_ref;
3559 $self->{state} = $self->{prev_state};
3560 $self->{s_kwd} = '';
3561 ## Reconsume.
3562 redo A;
3563 }
3564
3565 ## XML-only states
3566
3567 } elsif ($self->{state} == PI_STATE) {
3568 ## XML5: "Pi state" and "DOCTYPE pi state".
3569
3570 if ($is_space->{$self->{nc}} or
3571 $self->{nc} == 0x003F or # ?
3572 $self->{nc} == -1) {
3573 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3574 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3575 ## "DOCTYPE pi state": Parse error, switch to the "data
3576 ## state".
3577 !!!parse-error (type => 'bare pio', ## TODO: type
3578 line => $self->{line_prev},
3579 column => $self->{column_prev}
3580 - 1 * ($self->{nc} != -1));
3581 $self->{state} = BOGUS_COMMENT_STATE;
3582 ## Reconsume.
3583 $self->{ct} = {type => COMMENT_TOKEN,
3584 data => '?',
3585 line => $self->{line_prev},
3586 column => $self->{column_prev}
3587 - 1 * ($self->{nc} != -1),
3588 };
3589 redo A;
3590 } else {
3591 ## XML5: "DOCTYPE pi state": Stay in the state.
3592 $self->{ct} = {type => PI_TOKEN,
3593 target => chr $self->{nc},
3594 data => '',
3595 line => $self->{line_prev},
3596 column => $self->{column_prev} - 1,
3597 };
3598 $self->{state} = PI_TARGET_STATE;
3599 !!!next-input-character;
3600 redo A;
3601 }
3602 } elsif ($self->{state} == PI_TARGET_STATE) {
3603 if ($is_space->{$self->{nc}}) {
3604 $self->{state} = PI_TARGET_AFTER_STATE;
3605 !!!next-input-character;
3606 redo A;
3607 } elsif ($self->{nc} == -1) {
3608 !!!parse-error (type => 'no pic'); ## TODO: type
3609 if ($self->{in_subset}) {
3610 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3611 } else {
3612 $self->{state} = DATA_STATE;
3613 $self->{s_kwd} = '';
3614 }
3615 ## Reconsume.
3616 !!!emit ($self->{ct}); # pi
3617 redo A;
3618 } elsif ($self->{nc} == 0x003F) { # ?
3619 $self->{state} = PI_AFTER_STATE;
3620 !!!next-input-character;
3621 redo A;
3622 } else {
3623 ## XML5: typo ("tag name" -> "target")
3624 $self->{ct}->{target} .= chr $self->{nc}; # pi
3625 !!!next-input-character;
3626 redo A;
3627 }
3628 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3629 if ($is_space->{$self->{nc}}) {
3630 ## Stay in the state.
3631 !!!next-input-character;
3632 redo A;
3633 } else {
3634 $self->{state} = PI_DATA_STATE;
3635 ## Reprocess.
3636 redo A;
3637 }
3638 } elsif ($self->{state} == PI_DATA_STATE) {
3639 if ($self->{nc} == 0x003F) { # ?
3640 $self->{state} = PI_DATA_AFTER_STATE;
3641 !!!next-input-character;
3642 redo A;
3643 } elsif ($self->{nc} == -1) {
3644 !!!parse-error (type => 'no pic'); ## TODO: type
3645 if ($self->{in_subset}) {
3646 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3647 } else {
3648 $self->{state} = DATA_STATE;
3649 $self->{s_kwd} = '';
3650 }
3651 ## Reprocess.
3652 !!!emit ($self->{ct}); # pi
3653 redo A;
3654 } else {
3655 $self->{ct}->{data} .= chr $self->{nc}; # pi
3656 $self->{read_until}->($self->{ct}->{data}, q[?],
3657 length $self->{ct}->{data});
3658 ## Stay in the state.
3659 !!!next-input-character;
3660 ## Reprocess.
3661 redo A;
3662 }
3663 } elsif ($self->{state} == PI_AFTER_STATE) {
3664 ## XML5: Part of "Pi after state".
3665
3666 if ($self->{nc} == 0x003E) { # >
3667 if ($self->{in_subset}) {
3668 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3669 } else {
3670 $self->{state} = DATA_STATE;
3671 $self->{s_kwd} = '';
3672 }
3673 !!!next-input-character;
3674 !!!emit ($self->{ct}); # pi
3675 redo A;
3676 } elsif ($self->{nc} == 0x003F) { # ?
3677 !!!parse-error (type => 'no s after target', ## TODO: type
3678 line => $self->{line_prev},
3679 column => $self->{column_prev}); ## XML5: no error
3680 $self->{ct}->{data} .= '?';
3681 $self->{state} = PI_DATA_AFTER_STATE;
3682 !!!next-input-character;
3683 redo A;
3684 } else {
3685 !!!parse-error (type => 'no s after target', ## TODO: type
3686 line => $self->{line_prev},
3687 column => $self->{column_prev}
3688 + 1 * ($self->{nc} == -1)); ## XML5: no error
3689 $self->{ct}->{data} .= '?'; ## XML5: not appended
3690 $self->{state} = PI_DATA_STATE;
3691 ## Reprocess.
3692 redo A;
3693 }
3694 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3695 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3696
3697 if ($self->{nc} == 0x003E) { # >
3698 if ($self->{in_subset}) {
3699 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3700 } else {
3701 $self->{state} = DATA_STATE;
3702 $self->{s_kwd} = '';
3703 }
3704 !!!next-input-character;
3705 !!!emit ($self->{ct}); # pi
3706 redo A;
3707 } elsif ($self->{nc} == 0x003F) { # ?
3708 $self->{ct}->{data} .= '?';
3709 ## Stay in the state.
3710 !!!next-input-character;
3711 redo A;
3712 } else {
3713 $self->{ct}->{data} .= '?'; ## XML5: not appended
3714 $self->{state} = PI_DATA_STATE;
3715 ## Reprocess.
3716 redo A;
3717 }
3718
3719 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3720 if ($self->{nc} == 0x003C) { # <
3721 $self->{state} = DOCTYPE_TAG_STATE;
3722 !!!next-input-character;
3723 redo A;
3724 } elsif ($self->{nc} == 0x0025) { # %
3725 ## XML5: Not defined yet.
3726
3727 ## TODO:
3728
3729 if (not $self->{stop_processing} and
3730 not $self->{document}->xml_standalone) {
3731 !!!parse-error (type => 'stop processing', ## TODO: type
3732 level => $self->{level}->{info});
3733 $self->{stop_processing} = 1;
3734 }
3735
3736 !!!next-input-character;
3737 redo A;
3738 } elsif ($self->{nc} == 0x005D) { # ]
3739 delete $self->{in_subset};
3740 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3741 !!!next-input-character;
3742 redo A;
3743 } elsif ($is_space->{$self->{nc}}) {
3744 ## Stay in the state.
3745 !!!next-input-character;
3746 redo A;
3747 } elsif ($self->{nc} == -1) {
3748 !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3749 delete $self->{in_subset};
3750 $self->{state} = DATA_STATE;
3751 $self->{s_kwd} = '';
3752 ## Reconsume.
3753 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3754 redo A;
3755 } else {
3756 unless ($self->{internal_subset_tainted}) {
3757 ## XML5: No parse error.
3758 !!!parse-error (type => 'string in internal subset');
3759 $self->{internal_subset_tainted} = 1;
3760 }
3761 ## Stay in the state.
3762 !!!next-input-character;
3763 redo A;
3764 }
3765 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3766 if ($self->{nc} == 0x003E) { # >
3767 $self->{state} = DATA_STATE;
3768 $self->{s_kwd} = '';
3769 !!!next-input-character;
3770 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3771 redo A;
3772 } elsif ($self->{nc} == -1) {
3773 !!!parse-error (type => 'unclosed DOCTYPE');
3774 $self->{state} = DATA_STATE;
3775 $self->{s_kwd} = '';
3776 ## Reconsume.
3777 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3778 redo A;
3779 } else {
3780 ## XML5: No parse error and stay in the state.
3781 !!!parse-error (type => 'string after internal subset'); ## TODO: type
3782
3783 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3784 !!!next-input-character;
3785 redo A;
3786 }
3787 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3788 if ($self->{nc} == 0x003E) { # >
3789 $self->{state} = DATA_STATE;
3790 $self->{s_kwd} = '';
3791 !!!next-input-character;
3792 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3793 redo A;
3794 } elsif ($self->{nc} == -1) {
3795 $self->{state} = DATA_STATE;
3796 $self->{s_kwd} = '';
3797 ## Reconsume.
3798 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3799 redo A;
3800 } else {
3801 ## Stay in the state.
3802 !!!next-input-character;
3803 redo A;
3804 }
3805 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3806 if ($self->{nc} == 0x0021) { # !
3807 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3808 !!!next-input-character;
3809 redo A;
3810 } elsif ($self->{nc} == 0x003F) { # ?
3811 $self->{state} = PI_STATE;
3812 !!!next-input-character;
3813 redo A;
3814 } elsif ($self->{nc} == -1) {
3815 !!!parse-error (type => 'bare stago');
3816 $self->{state} = DATA_STATE;
3817 $self->{s_kwd} = '';
3818 ## Reconsume.
3819 redo A;
3820 } else {
3821 !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3822 line => $self->{line_prev},
3823 column => $self->{column_prev});
3824 $self->{state} = BOGUS_COMMENT_STATE;
3825 $self->{ct} = {type => COMMENT_TOKEN,
3826 data => '',
3827 }; ## NOTE: Will be discarded.
3828 !!!next-input-character;
3829 redo A;
3830 }
3831 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3832 ## XML5: "DOCTYPE markup declaration state".
3833
3834 if ($self->{nc} == 0x002D) { # -
3835 $self->{state} = MD_HYPHEN_STATE;
3836 !!!next-input-character;
3837 redo A;
3838 } elsif ($self->{nc} == 0x0045 or # E
3839 $self->{nc} == 0x0065) { # e
3840 $self->{state} = MD_E_STATE;
3841 $self->{kwd} = chr $self->{nc};
3842 !!!next-input-character;
3843 redo A;
3844 } elsif ($self->{nc} == 0x0041 or # A
3845 $self->{nc} == 0x0061) { # a
3846 $self->{state} = MD_ATTLIST_STATE;
3847 $self->{kwd} = chr $self->{nc};
3848 !!!next-input-character;
3849 redo A;
3850 } elsif ($self->{nc} == 0x004E or # N
3851 $self->{nc} == 0x006E) { # n
3852 $self->{state} = MD_NOTATION_STATE;
3853 $self->{kwd} = chr $self->{nc};
3854 !!!next-input-character;
3855 redo A;
3856 } else {
3857 #
3858 }
3859
3860 ## XML5: No parse error.
3861 !!!parse-error (type => 'bogus comment',
3862 line => $self->{line_prev},
3863 column => $self->{column_prev} - 1);
3864 ## Reconsume.
3865 $self->{state} = BOGUS_COMMENT_STATE;
3866 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3867 redo A;
3868 } elsif ($self->{state} == MD_E_STATE) {
3869 if ($self->{nc} == 0x004E or # N
3870 $self->{nc} == 0x006E) { # n
3871 $self->{state} = MD_ENTITY_STATE;
3872 $self->{kwd} .= chr $self->{nc};
3873 !!!next-input-character;
3874 redo A;
3875 } elsif ($self->{nc} == 0x004C or # L
3876 $self->{nc} == 0x006C) { # l
3877 ## XML5: <!ELEMENT> not supported.
3878 $self->{state} = MD_ELEMENT_STATE;
3879 $self->{kwd} .= chr $self->{nc};
3880 !!!next-input-character;
3881 redo A;
3882 } else {
3883 ## XML5: No parse error.
3884 !!!parse-error (type => 'bogus comment',
3885 line => $self->{line_prev},
3886 column => $self->{column_prev} - 2
3887 + 1 * ($self->{nc} == -1));
3888 ## Reconsume.
3889 $self->{state} = BOGUS_COMMENT_STATE;
3890 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3891 redo A;
3892 }
3893 } elsif ($self->{state} == MD_ENTITY_STATE) {
3894 if ($self->{nc} == [
3895 undef,
3896 undef,
3897 0x0054, # T
3898 0x0049, # I
3899 0x0054, # T
3900 ]->[length $self->{kwd}] or
3901 $self->{nc} == [
3902 undef,
3903 undef,
3904 0x0074, # t
3905 0x0069, # i
3906 0x0074, # t
3907 ]->[length $self->{kwd}]) {
3908 ## Stay in the state.
3909 $self->{kwd} .= chr $self->{nc};
3910 !!!next-input-character;
3911 redo A;
3912 } elsif ((length $self->{kwd}) == 5 and
3913 ($self->{nc} == 0x0059 or # Y
3914 $self->{nc} == 0x0079)) { # y
3915 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3916 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3917 text => 'ENTITY',
3918 line => $self->{line_prev},
3919 column => $self->{column_prev} - 4);
3920 }
3921 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3922 line => $self->{line_prev},
3923 column => $self->{column_prev} - 6};
3924 $self->{state} = DOCTYPE_MD_STATE;
3925 !!!next-input-character;
3926 redo A;
3927 } else {
3928 !!!parse-error (type => 'bogus comment',
3929 line => $self->{line_prev},
3930 column => $self->{column_prev} - 1
3931 - (length $self->{kwd})
3932 + 1 * ($self->{nc} == -1));
3933 $self->{state} = BOGUS_COMMENT_STATE;
3934 ## Reconsume.
3935 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3936 redo A;
3937 }
3938 } elsif ($self->{state} == MD_ELEMENT_STATE) {
3939 if ($self->{nc} == [
3940 undef,
3941 undef,
3942 0x0045, # E
3943 0x004D, # M
3944 0x0045, # E
3945 0x004E, # N
3946 ]->[length $self->{kwd}] or
3947 $self->{nc} == [
3948 undef,
3949 undef,
3950 0x0065, # e
3951 0x006D, # m
3952 0x0065, # e
3953 0x006E, # n
3954 ]->[length $self->{kwd}]) {
3955 ## Stay in the state.
3956 $self->{kwd} .= chr $self->{nc};
3957 !!!next-input-character;
3958 redo A;
3959 } elsif ((length $self->{kwd}) == 6 and
3960 ($self->{nc} == 0x0054 or # T
3961 $self->{nc} == 0x0074)) { # t
3962 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3963 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3964 text => 'ELEMENT',
3965 line => $self->{line_prev},
3966 column => $self->{column_prev} - 5);
3967 }
3968 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3969 line => $self->{line_prev},
3970 column => $self->{column_prev} - 7};
3971 $self->{state} = DOCTYPE_MD_STATE;
3972 !!!next-input-character;
3973 redo A;
3974 } else {
3975 !!!parse-error (type => 'bogus comment',
3976 line => $self->{line_prev},
3977 column => $self->{column_prev} - 1
3978 - (length $self->{kwd})
3979 + 1 * ($self->{nc} == -1));
3980 $self->{state} = BOGUS_COMMENT_STATE;
3981 ## Reconsume.
3982 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3983 redo A;
3984 }
3985 } elsif ($self->{state} == MD_ATTLIST_STATE) {
3986 if ($self->{nc} == [
3987 undef,
3988 0x0054, # T
3989 0x0054, # T
3990 0x004C, # L
3991 0x0049, # I
3992 0x0053, # S
3993 ]->[length $self->{kwd}] or
3994 $self->{nc} == [
3995 undef,
3996 0x0074, # t
3997 0x0074, # t
3998 0x006C, # l
3999 0x0069, # i
4000 0x0073, # s
4001 ]->[length $self->{kwd}]) {
4002 ## Stay in the state.
4003 $self->{kwd} .= chr $self->{nc};
4004 !!!next-input-character;
4005 redo A;
4006 } elsif ((length $self->{kwd}) == 6 and
4007 ($self->{nc} == 0x0054 or # T
4008 $self->{nc} == 0x0074)) { # t
4009 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
4010 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4011 text => 'ATTLIST',
4012 line => $self->{line_prev},
4013 column => $self->{column_prev} - 5);
4014 }
4015 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
4016 attrdefs => [],
4017 line => $self->{line_prev},
4018 column => $self->{column_prev} - 7};
4019 $self->{state} = DOCTYPE_MD_STATE;
4020 !!!next-input-character;
4021 redo A;
4022 } else {
4023 !!!parse-error (type => 'bogus comment',
4024 line => $self->{line_prev},
4025 column => $self->{column_prev} - 1
4026 - (length $self->{kwd})
4027 + 1 * ($self->{nc} == -1));
4028 $self->{state} = BOGUS_COMMENT_STATE;
4029 ## Reconsume.
4030 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4031 redo A;
4032 }
4033 } elsif ($self->{state} == MD_NOTATION_STATE) {
4034 if ($self->{nc} == [
4035 undef,
4036 0x004F, # O
4037 0x0054, # T
4038 0x0041, # A
4039 0x0054, # T
4040 0x0049, # I
4041 0x004F, # O
4042 ]->[length $self->{kwd}] or
4043 $self->{nc} == [
4044 undef,
4045 0x006F, # o
4046 0x0074, # t
4047 0x0061, # a
4048 0x0074, # t
4049 0x0069, # i
4050 0x006F, # o
4051 ]->[length $self->{kwd}]) {
4052 ## Stay in the state.
4053 $self->{kwd} .= chr $self->{nc};
4054 !!!next-input-character;
4055 redo A;
4056 } elsif ((length $self->{kwd}) == 7 and
4057 ($self->{nc} == 0x004E or # N
4058 $self->{nc} == 0x006E)) { # n
4059 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4060 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4061 text => 'NOTATION',
4062 line => $self->{line_prev},
4063 column => $self->{column_prev} - 6);
4064 }
4065 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4066 line => $self->{line_prev},
4067 column => $self->{column_prev} - 8};
4068 $self->{state} = DOCTYPE_MD_STATE;
4069 !!!next-input-character;
4070 redo A;
4071 } else {
4072 !!!parse-error (type => 'bogus comment',
4073 line => $self->{line_prev},
4074 column => $self->{column_prev} - 1
4075 - (length $self->{kwd})
4076 + 1 * ($self->{nc} == -1));
4077 $self->{state} = BOGUS_COMMENT_STATE;
4078 ## Reconsume.
4079 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4080 redo A;
4081 }
4082 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4083 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4084 ## "DOCTYPE NOTATION state".
4085
4086 if ($is_space->{$self->{nc}}) {
4087 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4088 $self->{state} = BEFORE_MD_NAME_STATE;
4089 !!!next-input-character;
4090 redo A;
4091 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4092 $self->{nc} == 0x0025) { # %
4093 ## XML5: Switch to the "DOCTYPE bogus comment state".
4094 !!!parse-error (type => 'no space before md name'); ## TODO: type
4095 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4096 !!!next-input-character;
4097 redo A;
4098 } elsif ($self->{nc} == -1) {
4099 !!!parse-error (type => 'unclosed md'); ## TODO: type
4100 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4101 ## Reconsume.
4102 redo A;
4103 } elsif ($self->{nc} == 0x003E) { # >
4104 ## XML5: Switch to the "DOCTYPE bogus comment state".
4105 !!!parse-error (type => 'no md name'); ## TODO: type
4106 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4107 !!!next-input-character;
4108 redo A;
4109 } else {
4110 ## XML5: Switch to the "DOCTYPE bogus comment state".
4111 !!!parse-error (type => 'no space before md name'); ## TODO: type
4112 $self->{state} = BEFORE_MD_NAME_STATE;
4113 redo A;
4114 }
4115 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4116 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4117 ## before state", "DOCTYPE ATTLIST name before state".
4118
4119 if ($is_space->{$self->{nc}}) {
4120 ## Stay in the state.
4121 !!!next-input-character;
4122 redo A;
4123 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4124 $self->{nc} == 0x0025) { # %
4125 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4126 !!!next-input-character;
4127 redo A;
4128 } elsif ($self->{nc} == 0x003E) { # >
4129 ## XML5: Same as "Anything else".
4130 !!!parse-error (type => 'no md name'); ## TODO: type
4131 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4132 !!!next-input-character;
4133 redo A;
4134 } elsif ($self->{nc} == -1) {
4135 !!!parse-error (type => 'unclosed md'); ## TODO: type
4136 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4137 ## Reconsume.
4138 redo A;
4139 } else {
4140 ## XML5: [ATTLIST] Not defined yet.
4141 $self->{ct}->{name} .= chr $self->{nc};
4142 $self->{state} = MD_NAME_STATE;
4143 !!!next-input-character;
4144 redo A;
4145 }
4146 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4147 if ($is_space->{$self->{nc}}) {
4148 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4149 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4150 $self->{state} = BEFORE_MD_NAME_STATE;
4151 !!!next-input-character;
4152 redo A;
4153 } elsif ($self->{nc} == 0x003E) { # >
4154 ## XML5: Same as "Anything else".
4155 !!!parse-error (type => 'no md name'); ## TODO: type
4156 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4157 !!!next-input-character;
4158 redo A;
4159 } elsif ($self->{nc} == -1) {
4160 !!!parse-error (type => 'unclosed md');
4161 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4162 ## Reconsume.
4163 redo A;
4164 } else {
4165 ## XML5: No parse error.
4166 !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4167 $self->{state} = BOGUS_COMMENT_STATE;
4168 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4169 ## Reconsume.
4170 redo A;
4171 }
4172 } elsif ($self->{state} == MD_NAME_STATE) {
4173 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4174
4175 if ($is_space->{$self->{nc}}) {
4176 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4177 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4178 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4179 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4180 } else { # ENTITY/NOTATION
4181 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4182 }
4183 !!!next-input-character;
4184 redo A;
4185 } elsif ($self->{nc} == 0x003E) { # >
4186 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4187 #
4188 } else {
4189 !!!parse-error (type => 'no md def'); ## TODO: type
4190 }
4191 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4192 !!!next-input-character;
4193 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4194 redo A;
4195 } elsif ($self->{nc} == -1) {
4196 ## XML5: [ATTLIST] No parse error.
4197 !!!parse-error (type => 'unclosed md');
4198 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4199 ## Reconsume.
4200 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4201 redo A;
4202 } else {
4203 ## XML5: [ATTLIST] Not defined yet.
4204 $self->{ct}->{name} .= chr $self->{nc};
4205 ## Stay in the state.
4206 !!!next-input-character;
4207 redo A;
4208 }
4209 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4210 if ($is_space->{$self->{nc}}) {
4211 ## Stay in the state.
4212 !!!next-input-character;
4213 redo A;
4214 } elsif ($self->{nc} == 0x003E) { # >
4215 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4216 !!!next-input-character;
4217 !!!emit ($self->{ct}); # ATTLIST
4218 redo A;
4219 } elsif ($self->{nc} == -1) {
4220 ## XML5: No parse error.
4221 !!!parse-error (type => 'unclosed md'); ## TODO: type
4222 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4223 !!!emit ($self->{ct});
4224 redo A;
4225 } else {
4226 ## XML5: Not defined yet.
4227 $self->{ca} = {name => chr ($self->{nc}), # attrdef
4228 tokens => [],
4229 line => $self->{line}, column => $self->{column}};
4230 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4231 !!!next-input-character;
4232 redo A;
4233 }
4234 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4235 if ($is_space->{$self->{nc}}) {
4236 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4237 !!!next-input-character;
4238 redo A;
4239 } elsif ($self->{nc} == 0x003E) { # >
4240 ## XML5: Same as "anything else".
4241 !!!parse-error (type => 'no attr type'); ## TODO: type
4242 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4243 !!!next-input-character;
4244 !!!emit ($self->{ct}); # ATTLIST
4245 redo A;
4246 } elsif ($self->{nc} == 0x0028) { # (
4247 ## XML5: Same as "anything else".
4248 !!!parse-error (type => 'no space before paren'); ## TODO: type
4249 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4250 !!!next-input-character;
4251 redo A;
4252 } elsif ($self->{nc} == -1) {
4253 ## XML5: No parse error.
4254 !!!parse-error (type => 'unclosed md'); ## TODO: type
4255 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4256 !!!next-input-character;
4257 !!!emit ($self->{ct}); # ATTLIST
4258 redo A;
4259 } else {
4260 ## XML5: Not defined yet.
4261 $self->{ca}->{name} .= chr $self->{nc};
4262 ## Stay in the state.
4263 !!!next-input-character;
4264 redo A;
4265 }
4266 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4267 if ($is_space->{$self->{nc}}) {
4268 ## Stay in the state.
4269 !!!next-input-character;
4270 redo A;
4271 } elsif ($self->{nc} == 0x003E) { # >
4272 ## XML5: Same as "anything else".
4273 !!!parse-error (type => 'no attr type'); ## TODO: type
4274 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4275 !!!next-input-character;
4276 !!!emit ($self->{ct}); # ATTLIST
4277 redo A;
4278 } elsif ($self->{nc} == 0x0028) { # (
4279 ## XML5: Same as "anything else".
4280 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4281 !!!next-input-character;
4282 redo A;
4283 } elsif ($self->{nc} == -1) {
4284 ## XML5: No parse error.
4285 !!!parse-error (type => 'unclosed md'); ## TODO: type
4286 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4287 !!!next-input-character;
4288 !!!emit ($self->{ct});
4289 redo A;
4290 } else {
4291 ## XML5: Not defined yet.
4292 $self->{ca}->{type} = chr $self->{nc};
4293 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4294 !!!next-input-character;
4295 redo A;
4296 }
4297 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4298 if ($is_space->{$self->{nc}}) {
4299 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4300 !!!next-input-character;
4301 redo A;
4302 } elsif ($self->{nc} == 0x0023) { # #
4303 ## XML5: Same as "anything else".
4304 !!!parse-error (type => 'no space before default value'); ## TODO: type
4305 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4306 !!!next-input-character;
4307 redo A;
4308 } elsif ($self->{nc} == 0x0022) { # "
4309 ## XML5: Same as "anything else".
4310 !!!parse-error (type => 'no space before default value'); ## TODO: type
4311 $self->{ca}->{value} = '';
4312 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4313 !!!next-input-character;
4314 redo A;
4315 } elsif ($self->{nc} == 0x0027) { # '
4316 ## XML5: Same as "anything else".
4317 !!!parse-error (type => 'no space before default value'); ## TODO: type
4318 $self->{ca}->{value} = '';
4319 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4320 !!!next-input-character;
4321 redo A;
4322 } elsif ($self->{nc} == 0x003E) { # >
4323 ## XML5: Same as "anything else".
4324 !!!parse-error (type => 'no attr default'); ## TODO: type
4325 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4326 !!!next-input-character;
4327 !!!emit ($self->{ct}); # ATTLIST
4328 redo A;
4329 } elsif ($self->{nc} == 0x0028) { # (
4330 ## XML5: Same as "anything else".
4331 !!!parse-error (type => 'no space before paren'); ## TODO: type
4332 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4333 !!!next-input-character;
4334 redo A;
4335 } elsif ($self->{nc} == -1) {
4336 ## XML5: No parse error.
4337 !!!parse-error (type => 'unclosed md'); ## TODO: type
4338 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4339 !!!next-input-character;
4340 !!!emit ($self->{ct});
4341 redo A;
4342 } else {
4343 ## XML5: Not defined yet.
4344 $self->{ca}->{type} .= chr $self->{nc};
4345 ## Stay in the state.
4346 !!!next-input-character;
4347 redo A;
4348 }
4349 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4350 if ($is_space->{$self->{nc}}) {
4351 ## Stay in the state.
4352 !!!next-input-character;
4353 redo A;
4354 } elsif ($self->{nc} == 0x0028) { # (
4355 ## XML5: Same as "anything else".
4356 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4357 !!!next-input-character;
4358 redo A;
4359 } elsif ($self->{nc} == 0x0023) { # #
4360 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4361 !!!next-input-character;
4362 redo A;
4363 } elsif ($self->{nc} == 0x0022) { # "
4364 ## XML5: Same as "anything else".
4365 $self->{ca}->{value} = '';
4366 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4367 !!!next-input-character;
4368 redo A;
4369 } elsif ($self->{nc} == 0x0027) { # '
4370 ## XML5: Same as "anything else".
4371 $self->{ca}->{value} = '';
4372 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4373 !!!next-input-character;
4374 redo A;
4375 } elsif ($self->{nc} == 0x003E) { # >
4376 ## XML5: Same as "anything else".
4377 !!!parse-error (type => 'no attr default'); ## TODO: type
4378 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4379 !!!next-input-character;
4380 !!!emit ($self->{ct}); # ATTLIST
4381 redo A;
4382 } elsif ($self->{nc} == -1) {
4383 ## XML5: No parse error.
4384 !!!parse-error (type => 'unclosed md'); ## TODO: type
4385 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4386 !!!next-input-character;
4387 !!!emit ($self->{ct});
4388 redo A;
4389 } else {
4390 ## XML5: Switch to the "DOCTYPE bogus comment state".
4391 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4392 $self->{ca}->{value} = '';
4393 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4394 ## Reconsume.
4395 redo A;
4396 }
4397 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4398 if ($is_space->{$self->{nc}}) {
4399 ## Stay in the state.
4400 !!!next-input-character;
4401 redo A;
4402 } elsif ($self->{nc} == 0x007C) { # |
4403 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4404 ## Stay in the state.
4405 !!!next-input-character;
4406 redo A;
4407 } elsif ($self->{nc} == 0x0029) { # )
4408 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4409 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4410 !!!next-input-character;
4411 redo A;
4412 } elsif ($self->{nc} == 0x003E) { # >
4413 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4414 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4415 !!!next-input-character;
4416 !!!emit ($self->{ct}); # ATTLIST
4417 redo A;
4418 } elsif ($self->{nc} == -1) {
4419 ## XML5: No parse error.
4420 !!!parse-error (type => 'unclosed md'); ## TODO: type
4421 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4422 !!!next-input-character;
4423 !!!emit ($self->{ct});
4424 redo A;
4425 } else {
4426 push @{$self->{ca}->{tokens}}, chr $self->{nc};
4427 $self->{state} = ALLOWED_TOKEN_STATE;
4428 !!!next-input-character;
4429 redo A;
4430 }
4431 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4432 if ($is_space->{$self->{nc}}) {
4433 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4434 !!!next-input-character;
4435 redo A;
4436 } elsif ($self->{nc} == 0x007C) { # |
4437 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4438 !!!next-input-character;
4439 redo A;
4440 } elsif ($self->{nc} == 0x0029) { # )
4441 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4442 !!!next-input-character;
4443 redo A;
4444 } elsif ($self->{nc} == 0x003E) { # >
4445 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4446 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4447 !!!next-input-character;
4448 !!!emit ($self->{ct}); # ATTLIST
4449 redo A;
4450 } elsif ($self->{nc} == -1) {
4451 ## XML5: No parse error.
4452 !!!parse-error (type => 'unclosed md'); ## TODO: type
4453 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4454 !!!next-input-character;
4455 !!!emit ($self->{ct});
4456 redo A;
4457 } else {
4458 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4459 ## Stay in the state.
4460 !!!next-input-character;
4461 redo A;
4462 }
4463 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4464 if ($is_space->{$self->{nc}}) {
4465 ## Stay in the state.
4466 !!!next-input-character;
4467 redo A;
4468 } elsif ($self->{nc} == 0x007C) { # |
4469 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4470 !!!next-input-character;
4471 redo A;
4472 } elsif ($self->{nc} == 0x0029) { # )
4473 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4474 !!!next-input-character;
4475 redo A;
4476 } elsif ($self->{nc} == 0x003E) { # >
4477 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4478 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4479 !!!next-input-character;
4480 !!!emit ($self->{ct}); # ATTLIST
4481 redo A;
4482 } elsif ($self->{nc} == -1) {
4483 ## XML5: No parse error.
4484 !!!parse-error (type => 'unclosed md'); ## TODO: type
4485 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4486 !!!next-input-character;
4487 !!!emit ($self->{ct});
4488 redo A;
4489 } else {
4490 !!!parse-error (type => 'space in allowed token', ## TODO: type
4491 line => $self->{line_prev},
4492 column => $self->{column_prev});
4493 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4494 $self->{state} = ALLOWED_TOKEN_STATE;
4495 !!!next-input-character;
4496 redo A;
4497 }
4498 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4499 if ($is_space->{$self->{nc}}) {
4500 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4501 !!!next-input-character;
4502 redo A;
4503 } elsif ($self->{nc} == 0x0023) { # #
4504 !!!parse-error (type => 'no space before default value'); ## TODO: type
4505 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4506 !!!next-input-character;
4507 redo A;
4508 } elsif ($self->{nc} == 0x0022) { # "
4509 !!!parse-error (type => 'no space before default value'); ## TODO: type
4510 $self->{ca}->{value} = '';
4511 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4512 !!!next-input-character;
4513 redo A;
4514 } elsif ($self->{nc} == 0x0027) { # '
4515 !!!parse-error (type => 'no space before default value'); ## TODO: type
4516 $self->{ca}->{value} = '';
4517 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4518 !!!next-input-character;
4519 redo A;
4520 } elsif ($self->{nc} == 0x003E) { # >
4521 !!!parse-error (type => 'no attr default'); ## TODO: type
4522 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4523 !!!next-input-character;
4524 !!!emit ($self->{ct}); # ATTLIST
4525 redo A;
4526 } elsif ($self->{nc} == -1) {
4527 !!!parse-error (type => 'unclosed md'); ## TODO: type
4528 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4529 !!!next-input-character;
4530 !!!emit ($self->{ct});
4531 redo A;
4532 } else {
4533 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4534 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4535 ## Reconsume.
4536 redo A;
4537 }
4538 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4539 if ($is_space->{$self->{nc}}) {
4540 ## Stay in the state.
4541 !!!next-input-character;
4542 redo A;
4543 } elsif ($self->{nc} == 0x0023) { # #
4544 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4545 !!!next-input-character;
4546 redo A;
4547 } elsif ($self->{nc} == 0x0022) { # "
4548 $self->{ca}->{value} = '';
4549 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4550 !!!next-input-character;
4551 redo A;
4552 } elsif ($self->{nc} == 0x0027) { # '
4553 $self->{ca}->{value} = '';
4554 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4555 !!!next-input-character;
4556 redo A;
4557 } elsif ($self->{nc} == 0x003E) { # >
4558 !!!parse-error (type => 'no attr default'); ## TODO: type
4559 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4560 !!!next-input-character;
4561 !!!emit ($self->{ct}); # ATTLIST
4562 redo A;
4563 } elsif ($self->{nc} == -1) {
4564 !!!parse-error (type => 'unclosed md'); ## TODO: type
4565 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4566 !!!next-input-character;
4567 !!!emit ($self->{ct});
4568 redo A;
4569 } else {
4570 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4571 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4572 ## Reconsume.
4573 redo A;
4574 }
4575 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4576 if ($is_space->{$self->{nc}}) {
4577 ## XML5: No parse error.
4578 !!!parse-error (type => 'no default type'); ## TODO: type
4579 $self->{state} = BOGUS_MD_STATE;
4580 ## Reconsume.
4581 redo A;
4582 } elsif ($self->{nc} == 0x0022) { # "
4583 ## XML5: Same as "anything else".
4584 $self->{ca}->{value} = '';
4585 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4586 !!!next-input-character;
4587 redo A;
4588 } elsif ($self->{nc} == 0x0027) { # '
4589 ## XML5: Same as "anything else".
4590 $self->{ca}->{value} = '';
4591 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4592 !!!next-input-character;
4593 redo A;
4594 } elsif ($self->{nc} == 0x003E) { # >
4595 ## XML5: Same as "anything else".
4596 !!!parse-error (type => 'no attr default'); ## TODO: type
4597 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4598 !!!next-input-character;
4599 !!!emit ($self->{ct}); # ATTLIST
4600 redo A;
4601 } elsif ($self->{nc} == -1) {
4602 ## XML5: No parse error.
4603 !!!parse-error (type => 'unclosed md'); ## TODO: type
4604 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4605 !!!next-input-character;
4606 !!!emit ($self->{ct});
4607 redo A;
4608 } else {
4609 $self->{ca}->{default} = chr $self->{nc};
4610 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4611 !!!next-input-character;
4612 redo A;
4613 }
4614 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4615 if ($is_space->{$self->{nc}}) {
4616 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4617 !!!next-input-character;
4618 redo A;
4619 } elsif ($self->{nc} == 0x0022) { # "
4620 ## XML5: Same as "anything else".
4621 !!!parse-error (type => 'no space before default value'); ## TODO: type
4622 $self->{ca}->{value} = '';
4623 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4624 !!!next-input-character;
4625 redo A;
4626 } elsif ($self->{nc} == 0x0027) { # '
4627 ## XML5: Same as "anything else".
4628 !!!parse-error (type => 'no space before default value'); ## TODO: type
4629 $self->{ca}->{value} = '';
4630 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4631 !!!next-input-character;
4632 redo A;
4633 } elsif ($self->{nc} == 0x003E) { # >
4634 ## XML5: Same as "anything else".
4635 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4636 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4637 !!!next-input-character;
4638 !!!emit ($self->{ct}); # ATTLIST
4639 redo A;
4640 } elsif ($self->{nc} == -1) {
4641 ## XML5: No parse error.
4642 !!!parse-error (type => 'unclosed md'); ## TODO: type
4643 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4644 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4645 !!!next-input-character;
4646 !!!emit ($self->{ct});
4647 redo A;
4648 } else {
4649 $self->{ca}->{default} .= chr $self->{nc};
4650 ## Stay in the state.
4651 !!!next-input-character;
4652 redo A;
4653 }
4654 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4655 if ($is_space->{$self->{nc}}) {
4656 ## Stay in the state.
4657 !!!next-input-character;
4658 redo A;
4659 } elsif ($self->{nc} == 0x0022) { # "
4660 $self->{ca}->{value} = '';
4661 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4662 !!!next-input-character;
4663 redo A;
4664 } elsif ($self->{nc} == 0x0027) { # '
4665 $self->{ca}->{value} = '';
4666 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4667 !!!next-input-character;
4668 redo A;
4669 } elsif ($self->{nc} == 0x003E) { # >
4670 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4671 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4672 !!!next-input-character;
4673 !!!emit ($self->{ct}); # ATTLIST
4674 redo A;
4675 } elsif ($self->{nc} == -1) {
4676 ## XML5: No parse error.
4677 !!!parse-error (type => 'unclosed md'); ## TODO: type
4678 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4679 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4680 !!!next-input-character;
4681 !!!emit ($self->{ct});
4682 redo A;
4683 } else {
4684 ## XML5: Not defined yet.
4685 if ($self->{ca}->{default} eq 'FIXED') {
4686 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4687 } else {
4688 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4689 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4690 }
4691 ## Reconsume.
4692 redo A;
4693 }
4694 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4695 if ($is_space->{$self->{nc}} or
4696 $self->{nc} == -1 or
4697 $self->{nc} == 0x003E) { # >
4698 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4699 ## Reconsume.
4700 redo A;
4701 } else {
4702 !!!parse-error (type => 'no space before attr name'); ## TODO: type
4703 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4704 ## Reconsume.
4705 redo A;
4706 }
4707 } elsif ($self->{state} == NDATA_STATE) {
4708 ## ASCII case-insensitive
4709 if ($self->{nc} == [
4710 undef,
4711 0x0044, # D
4712 0x0041, # A
4713 0x0054, # T
4714 ]->[length $self->{kwd}] or
4715 $self->{nc} == [
4716 undef,
4717 0x0064, # d
4718 0x0061, # a
4719 0x0074, # t
4720 ]->[length $self->{kwd}]) {
4721 !!!cp (172.2);
4722 ## Stay in the state.
4723 $self->{kwd} .= chr $self->{nc};
4724 !!!next-input-character;
4725 redo A;
4726 } elsif ((length $self->{kwd}) == 4 and
4727 ($self->{nc} == 0x0041 or # A
4728 $self->{nc} == 0x0061)) { # a
4729 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4730 !!!cp (172.3);
4731 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4732 text => 'NDATA',
4733 line => $self->{line_prev},
4734 column => $self->{column_prev} - 4);
4735 } else {
4736 !!!cp (172.4);
4737 }
4738 $self->{state} = AFTER_NDATA_STATE;
4739 !!!next-input-character;
4740 redo A;
4741 } else {
4742 !!!parse-error (type => 'string after literal', ## TODO: type
4743 line => $self->{line_prev},
4744 column => $self->{column_prev} + 1
4745 - length $self->{kwd});
4746 !!!cp (172.5);
4747 $self->{state} = BOGUS_MD_STATE;
4748 ## Reconsume.
4749 redo A;
4750 }
4751 } elsif ($self->{state} == AFTER_NDATA_STATE) {
4752 if ($is_space->{$self->{nc}}) {
4753 $self->{state} = BEFORE_NOTATION_NAME_STATE;
4754 !!!next-input-character;
4755 redo A;
4756 } elsif ($self->{nc} == 0x003E) { # >
4757 !!!parse-error (type => 'no notation name'); ## TODO: type
4758 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4759 !!!next-input-character;
4760 !!!emit ($self->{ct}); # ENTITY
4761 redo A;
4762 } elsif ($self->{nc} == -1) {
4763 !!!parse-error (type => 'unclosed md'); ## TODO: type
4764 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4765 !!!next-input-character;
4766 !!!emit ($self->{ct}); # ENTITY
4767 redo A;
4768 } else {
4769 !!!parse-error (type => 'string after literal', ## TODO: type
4770 line => $self->{line_prev},
4771 column => $self->{column_prev} + 1
4772 - length $self->{kwd});
4773 $self->{state} = BOGUS_MD_STATE;
4774 ## Reconsume.
4775 redo A;
4776 }
4777 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4778 if ($is_space->{$self->{nc}}) {
4779 ## Stay in the state.
4780 !!!next-input-character;
4781 redo A;
4782 } elsif ($self->{nc} == 0x003E) { # >
4783 !!!parse-error (type => 'no notation name'); ## TODO: type
4784 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4785 !!!next-input-character;
4786 !!!emit ($self->{ct}); # ENTITY
4787 redo A;
4788 } elsif ($self->{nc} == -1) {
4789 !!!parse-error (type => 'unclosed md'); ## TODO: type
4790 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4791 !!!next-input-character;
4792 !!!emit ($self->{ct}); # ENTITY
4793 redo A;
4794 } else {
4795 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4796 $self->{state} = NOTATION_NAME_STATE;
4797 !!!next-input-character;
4798 redo A;
4799 }
4800 } elsif ($self->{state} == NOTATION_NAME_STATE) {
4801 if ($is_space->{$self->{nc}}) {
4802 $self->{state} = AFTER_MD_DEF_STATE;
4803 !!!next-input-character;
4804 redo A;
4805 } elsif ($self->{nc} == 0x003E) { # >
4806 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4807 !!!next-input-character;
4808 !!!emit ($self->{ct}); # ENTITY
4809 redo A;
4810 } elsif ($self->{nc} == -1) {
4811 !!!parse-error (type => 'unclosed md'); ## TODO: type
4812 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4813 !!!next-input-character;
4814 !!!emit ($self->{ct}); # ENTITY
4815 redo A;
4816 } else {
4817 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4818 ## Stay in the state.
4819 !!!next-input-character;
4820 redo A;
4821 }
4822 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4823 if ($self->{nc} == 0x0022) { # "
4824 $self->{state} = AFTER_MD_DEF_STATE;
4825 !!!next-input-character;
4826 redo A;
4827 } elsif ($self->{nc} == 0x0026) { # &
4828 $self->{prev_state} = $self->{state};
4829 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4830 $self->{entity_add} = 0x0022; # "
4831 !!!next-input-character;
4832 redo A;
4833 ## TODO: %
4834 } elsif ($self->{nc} == -1) {
4835 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4836 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4837 ## Reconsume.
4838 !!!emit ($self->{ct}); # ENTITY
4839 redo A;
4840 } else {
4841 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4842 !!!next-input-character;
4843 redo A;
4844 }
4845 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4846 if ($self->{nc} == 0x0027) { # '
4847 $self->{state} = AFTER_MD_DEF_STATE;
4848 !!!next-input-character;
4849 redo A;
4850 } elsif ($self->{nc} == 0x0026) { # &
4851 $self->{prev_state} = $self->{state};
4852 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4853 $self->{entity_add} = 0x0027; # '
4854 !!!next-input-character;
4855 redo A;
4856 ## TODO: %
4857 } elsif ($self->{nc} == -1) {
4858 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4859 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4860 ## Reconsume.
4861 !!!emit ($self->{ct}); # ENTITY
4862 redo A;
4863 } else {
4864 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4865 !!!next-input-character;
4866 redo A;
4867 }
4868 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4869 if ($is_space->{$self->{nc}} or
4870 {
4871 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4872 $self->{entity_add} => 1,
4873 }->{$self->{nc}}) {
4874 !!!parse-error (type => 'bare ero',
4875 line => $self->{line_prev},
4876 column => $self->{column_prev}
4877 + ($self->{nc} == -1 ? 1 : 0));
4878 ## Don't consume
4879 ## Return nothing.
4880 #
4881 } elsif ($self->{nc} == 0x0023) { # #
4882 $self->{ca} = $self->{ct};
4883 $self->{state} = ENTITY_HASH_STATE;
4884 $self->{kwd} = '#';
4885 !!!next-input-character;
4886 redo A;
4887 } else {
4888 #
4889 }
4890
4891 $self->{ct}->{value} .= '&';
4892 $self->{state} = $self->{prev_state};
4893 ## Reconsume.
4894 redo A;
4895 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4896 if ($is_space->{$self->{nc}}) {
4897 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4898 !!!next-input-character;
4899 redo A;
4900 } elsif ($self->{nc} == 0x0028) { # (
4901 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4902 $self->{ct}->{content} = ['('];
4903 $self->{group_depth} = 1;
4904 !!!next-input-character;
4905 redo A;
4906 } elsif ($self->{nc} == 0x003E) { # >
4907 !!!parse-error (type => 'no md def'); ## TODO: type
4908 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4909 !!!next-input-character;
4910 !!!emit ($self->{ct}); # ELEMENT
4911 redo A;
4912 } elsif ($self->{nc} == -1) {
4913 !!!parse-error (type => 'unclosed md'); ## TODO: type
4914 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4915 !!!next-input-character;
4916 !!!emit ($self->{ct}); # ELEMENT
4917 redo A;
4918 } else {
4919 $self->{ct}->{content} = [chr $self->{nc}];
4920 $self->{state} = CONTENT_KEYWORD_STATE;
4921 !!!next-input-character;
4922 redo A;
4923 }
4924 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4925 if ($is_space->{$self->{nc}}) {
4926 $self->{state} = AFTER_MD_DEF_STATE;
4927 !!!next-input-character;
4928 redo A;
4929 } elsif ($self->{nc} == 0x003E) { # >
4930 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4931 !!!next-input-character;
4932 !!!emit ($self->{ct}); # ELEMENT
4933 redo A;
4934 } elsif ($self->{nc} == -1) {
4935 !!!parse-error (type => 'unclosed md'); ## TODO: type
4936 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4937 !!!next-input-character;
4938 !!!emit ($self->{ct}); # ELEMENT
4939 redo A;
4940 } else {
4941 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4942 ## Stay in the state.
4943 !!!next-input-character;
4944 redo A;
4945 }
4946 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4947 if ($is_space->{$self->{nc}}) {
4948 ## Stay in the state.
4949 !!!next-input-character;
4950 redo A;
4951 } elsif ($self->{nc} == 0x0028) { # (
4952 $self->{group_depth}++;
4953 push @{$self->{ct}->{content}}, chr $self->{nc};
4954 ## Stay in the state.
4955 !!!next-input-character;
4956 redo A;
4957 } elsif ($self->{nc} == 0x007C or # |
4958 $self->{nc} == 0x002C) { # ,
4959 !!!parse-error (type => 'empty element name'); ## TODO: type
4960 ## Stay in the state.
4961 !!!next-input-character;
4962 redo A;
4963 } elsif ($self->{nc} == 0x0029) { # )
4964 !!!parse-error (type => 'empty element name'); ## TODO: type
4965 push @{$self->{ct}->{content}}, chr $self->{nc};
4966 $self->{group_depth}--;
4967 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4968 !!!next-input-character;
4969 redo A;
4970 } elsif ($self->{nc} == 0x003E) { # >
4971 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4972 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4973 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4974 !!!next-input-character;
4975 !!!emit ($self->{ct}); # ELEMENT
4976 redo A;
4977 } elsif ($self->{nc} == -1) {
4978 !!!parse-error (type => 'unclosed md'); ## TODO: type
4979 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4980 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4981 !!!next-input-character;
4982 !!!emit ($self->{ct}); # ELEMENT
4983 redo A;
4984 } else {
4985 push @{$self->{ct}->{content}}, chr $self->{nc};
4986 $self->{state} = CM_ELEMENT_NAME_STATE;
4987 !!!next-input-character;
4988 redo A;
4989 }
4990 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4991 if ($is_space->{$self->{nc}}) {
4992 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4993 !!!next-input-character;
4994 redo A;
4995 } elsif ($self->{nc} == 0x002A or # *
4996 $self->{nc} == 0x002B or # +
4997 $self->{nc} == 0x003F) { # ?
4998 push @{$self->{ct}->{content}}, chr $self->{nc};
4999 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5000 !!!next-input-character;
5001 redo A;
5002 } elsif ($self->{nc} == 0x007C or # |
5003 $self->{nc} == 0x002C) { # ,
5004 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5005 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5006 !!!next-input-character;
5007 redo A;
5008 } elsif ($self->{nc} == 0x0029) { # )
5009 $self->{group_depth}--;
5010 push @{$self->{ct}->{content}}, chr $self->{nc};
5011 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5012 !!!next-input-character;
5013 redo A;
5014 } elsif ($self->{nc} == 0x003E) { # >
5015 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5016 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5017 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5018 !!!next-input-character;
5019 !!!emit ($self->{ct}); # ELEMENT
5020 redo A;
5021 } elsif ($self->{nc} == -1) {
5022 !!!parse-error (type => 'unclosed md'); ## TODO: type
5023 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5024 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5025 !!!next-input-character;
5026 !!!emit ($self->{ct}); # ELEMENT
5027 redo A;
5028 } else {
5029 $self->{ct}->{content}->[-1] .= chr $self->{nc};
5030 ## Stay in the state.
5031 !!!next-input-character;
5032 redo A;
5033 }
5034 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
5035 if ($is_space->{$self->{nc}}) {
5036 ## Stay in the state.
5037 !!!next-input-character;
5038 redo A;
5039 } elsif ($self->{nc} == 0x007C or # |
5040 $self->{nc} == 0x002C) { # ,
5041 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5042 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5043 !!!next-input-character;
5044 redo A;
5045 } elsif ($self->{nc} == 0x0029) { # )
5046 $self->{group_depth}--;
5047 push @{$self->{ct}->{content}}, chr $self->{nc};
5048 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5049 !!!next-input-character;
5050 redo A;
5051 } elsif ($self->{nc} == 0x003E) { # >
5052 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5053 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5054 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5055 !!!next-input-character;
5056 !!!emit ($self->{ct}); # ELEMENT
5057 redo A;
5058 } elsif ($self->{nc} == -1) {
5059 !!!parse-error (type => 'unclosed md'); ## TODO: type
5060 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5061 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5062 !!!next-input-character;
5063 !!!emit ($self->{ct}); # ELEMENT
5064 redo A;
5065 } else {
5066 !!!parse-error (type => 'after element name'); ## TODO: type
5067 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5068 $self->{state} = BOGUS_MD_STATE;
5069 !!!next-input-character;
5070 redo A;
5071 }
5072 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5073 if ($is_space->{$self->{nc}}) {
5074 if ($self->{group_depth}) {
5075 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5076 } else {
5077 $self->{state} = AFTER_MD_DEF_STATE;
5078 }
5079 !!!next-input-character;
5080 redo A;
5081 } elsif ($self->{nc} == 0x002A or # *
5082 $self->{nc} == 0x002B or # +
5083 $self->{nc} == 0x003F) { # ?
5084 push @{$self->{ct}->{content}}, chr $self->{nc};
5085 if ($self->{group_depth}) {
5086 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5087 } else {
5088 $self->{state} = AFTER_MD_DEF_STATE;
5089 }
5090 !!!next-input-character;
5091 redo A;
5092 } elsif ($self->{nc} == 0x0029) { # )
5093 if ($self->{group_depth}) {
5094 $self->{group_depth}--;
5095 push @{$self->{ct}->{content}}, chr $self->{nc};
5096 ## Stay in the state.
5097 !!!next-input-character;
5098 redo A;
5099 } else {
5100 !!!parse-error (type => 'string after md def'); ## TODO: type
5101 $self->{state} = BOGUS_MD_STATE;
5102 ## Reconsume.
5103 redo A;
5104 }
5105 } elsif ($self->{nc} == 0x003E) { # >
5106 if ($self->{group_depth}) {
5107 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5108 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5109 }
5110 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5111 !!!next-input-character;
5112 !!!emit ($self->{ct}); # ELEMENT
5113 redo A;
5114 } elsif ($self->{nc} == -1) {
5115 !!!parse-error (type => 'unclosed md'); ## TODO: type
5116 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5117 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5118 !!!next-input-character;
5119 !!!emit ($self->{ct}); # ELEMENT
5120 redo A;
5121 } else {
5122 if ($self->{group_depth}) {
5123 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5124 } else {
5125 !!!parse-error (type => 'string after md def'); ## TODO: type
5126 $self->{state} = BOGUS_MD_STATE;
5127 }
5128 ## Reconsume.
5129 redo A;
5130 }
5131 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5132 if ($is_space->{$self->{nc}}) {
5133 ## Stay in the state.
5134 !!!next-input-character;
5135 redo A;
5136 } elsif ($self->{nc} == 0x003E) { # >
5137 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5138 !!!next-input-character;
5139 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5140 redo A;
5141 } elsif ($self->{nc} == -1) {
5142 !!!parse-error (type => 'unclosed md'); ## TODO: type
5143 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5144 !!!next-input-character;
5145 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5146 redo A;
5147 } else {
5148 !!!parse-error (type => 'string after md def'); ## TODO: type
5149 $self->{state} = BOGUS_MD_STATE;
5150 ## Reconsume.
5151 redo A;
5152 }
5153 } elsif ($self->{state} == BOGUS_MD_STATE) {
5154 if ($self->{nc} == 0x003E) { # >
5155 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5156 !!!next-input-character;
5157 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5158 redo A;
5159 } elsif ($self->{nc} == -1) {
5160 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5161 ## Reconsume.
5162 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5163 redo A;
5164 } else {
5165 ## Stay in the state.
5166 !!!next-input-character;
5167 redo A;
5168 }
5169 } else {
5170 die "$0: $self->{state}: Unknown state";
5171 }
5172 } # A
5173
5174 die "$0: _get_next_token: unexpected case";
5175 } # _get_next_token
5176
5177 1;
5178 ## $Date: 2009/08/16 05:24:47 $
5179

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24