/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.32 - (show annotations) (download) (as text)
Sat Sep 5 09:57:55 2009 UTC (15 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.31: +60 -5 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	5 Sep 2009 09:57:06 -0000
	* tokenizer-test-1.test: Added test cases for "comment end space
	state" (HTML5 revision 3195).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 09:57:45 -0000
	space state" (HTML5 revision 3195).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src (_get_next_token): Implemented the "comment end

## Package preamble for the tokenizer constants defined below.
1 package Whatpm::HTML::Tokenizer;
2 use strict;
## $VERSION is derived from the CVS "Revision" keyword: every run of
## digits in the keyword string is collected, then formatted as "%d."
## followed by one "%02d" per remaining component (so "1.31" here).
3 our $VERSION=do{my @r=(q$Revision: 1.31 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
## Compile-time export setup.  Loads Exporter, appends it to this
## package's @ISA, and declares the token type constants as importable
## names.  The same list of names is also grouped under the "token"
## export tag (requested as ':token').
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
## Names that a caller may request explicitly.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 );
25
## Tag "token": the same fourteen names as @EXPORT_OK above.  The two
## lists are maintained by hand and must be kept in sync.
26 our %EXPORT_TAGS = (
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 }
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49
## Numeric identifiers stored in a token's |type| field.  Each is a
## sub with an empty prototype returning a literal, i.e. a Perl
## constant function.  The values are distinct integers 1..14; the
## tokenizer compares them with |==|.
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 }
52 sub START_TAG_TOKEN () { 3 }
53 sub END_TAG_TOKEN () { 4 }
54 sub END_OF_FILE_TOKEN () { 5 }
55 sub CHARACTER_TOKEN () { 6 }
56 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} flag set
71 ## to an empty string.
72
## Everything below this point is defined in package Whatpm::HTML,
## not in Whatpm::HTML::Tokenizer.
73 package Whatpm::HTML;
74
## Import the token type constants (export tag ':token') at compile
## time so that their bareword names can be used in this package.
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78
## Bit flags describing which markup is recognized in character data.
## They are combined with |, and tested with &, against
## |$self->{content_model}|.
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82
## The four content models, expressed as combinations of the flags
## above:
##   PLAINTEXT - no flag set (neither & nor < is markup)
##   CDATA     - limited < markup only
##   RCDATA    - & markup plus limited < markup
##   PCDATA    - & markup plus full < markup
83 sub PLAINTEXT_CONTENT_MODEL () { 0 }
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87
88 ## Tokenizer states
89
## State numbers held in |$self->{state}| and compared with |==| by
## |_get_next_token|.  Each value must be unique.  Values 1 and 12 are
## not in use: the constants that owned them are commented out below.
## The numbers do not follow declaration order everywhere (see 102 and
## 103 below), so check the whole list before assigning a new one.
90 sub DATA_STATE () { 0 }
91 #sub ENTITY_DATA_STATE () { 1 }
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
## The next two states were numbered after the XML-only states, hence
## 102 and 103.  "LAST" marks the highest state number assigned in
## this listing.
108 sub COMMENT_END_BANG_STATE () { 102 }
109 sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110 sub COMMENT_END_DASH_STATE () { 18 }
111 sub BOGUS_COMMENT_STATE () { 19 }
112 sub DOCTYPE_STATE () { 20 }
113 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
114 sub DOCTYPE_NAME_STATE () { 22 }
115 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
116 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
117 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
118 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
119 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
120 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
121 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
122 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
123 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
124 sub BOGUS_DOCTYPE_STATE () { 32 }
125 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
126 sub SELF_CLOSING_START_TAG_STATE () { 34 }
127 sub CDATA_SECTION_STATE () { 35 }
## The states below split a single spec state into several
## implementation states; the trailing comment on each line names the
## spec state it belongs to.
128 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
129 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
130 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
131 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
132 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
133 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
134 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
135 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
136 ## NOTE: "Entity data state", "entity in attribute value state", and
137 ## "consume a character reference" algorithm are jointly implemented
138 ## using the following six states:
139 sub ENTITY_STATE () { 44 }
140 sub ENTITY_HASH_STATE () { 45 }
141 sub NCR_NUM_STATE () { 46 }
142 sub HEXREF_X_STATE () { 47 }
143 sub HEXREF_HEX_STATE () { 48 }
144 sub ENTITY_NAME_STATE () { 49 }
145 sub PCDATA_STATE () { 50 } # "data state" in the spec
146
147 ## XML-only states
## Tokenizer states used only when parsing XML.  They share the
## numbering space of the HTML states and occupy 51..101 without gaps.
## Processing instruction states:
148 sub PI_STATE () { 51 }
149 sub PI_TARGET_STATE () { 52 }
150 sub PI_TARGET_AFTER_STATE () { 53 }
151 sub PI_DATA_STATE () { 54 }
152 sub PI_AFTER_STATE () { 55 }
153 sub PI_DATA_AFTER_STATE () { 56 }
## DOCTYPE internal subset and markup declaration states:
154 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
155 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
156 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
157 sub DOCTYPE_TAG_STATE () { 60 }
158 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
159 sub MD_ATTLIST_STATE () { 62 }
160 sub MD_E_STATE () { 63 }
161 sub MD_ELEMENT_STATE () { 64 }
162 sub MD_ENTITY_STATE () { 65 }
163 sub MD_NOTATION_STATE () { 66 }
164 sub DOCTYPE_MD_STATE () { 67 }
165 sub BEFORE_MD_NAME_STATE () { 68 }
166 sub MD_NAME_STATE () { 69 }
167 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
168 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
171 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
172 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
173 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
174 sub ALLOWED_TOKEN_STATE () { 77 }
175 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
176 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
177 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
179 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
180 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
181 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
182 sub BEFORE_NDATA_STATE () { 85 }
183 sub NDATA_STATE () { 86 }
184 sub AFTER_NDATA_STATE () { 87 }
185 sub BEFORE_NOTATION_NAME_STATE () { 88 }
186 sub NOTATION_NAME_STATE () { 89 }
187 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
188 sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
189 sub ENTITY_VALUE_ENTITY_STATE () { 92 }
190 sub AFTER_ELEMENT_NAME_STATE () { 93 }
191 sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
192 sub CONTENT_KEYWORD_STATE () { 95 }
193 sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
194 sub CM_ELEMENT_NAME_STATE () { 97 }
195 sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
196 sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
197 sub AFTER_MD_DEF_STATE () { 100 }
198 sub BOGUS_MD_STATE () { 101 }
199
200 ## Tree constructor state constants (see Whatpm::HTML for the full
201 ## list and descriptions)
202
## Both constants below have the same numeric value: a single set bit,
## bit 11 (2048).  The underscore in the second literal is only a digit
## separator.  NOTE(review): they are described above as copies of
## values from Whatpm::HTML; that module is not visible here, so confirm
## the two stay in sync with it.
203 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
204 sub FOREIGN_EL () { 0b1_00000000000 }
205
206 ## Character reference mappings
207
## Replacement table keyed by code point: |$charref_map->{$n}| is the
## code point to use in place of |$n|.  The code that consults the
## table is outside this listing; going by the section title it is the
## numeric character reference handling - TODO confirm.
##
## Explicit entries: 0x0D maps to LINE FEED (0x000A), and every value
## in 0x80..0x9F has an entry.  Most of these map to a specific code
## point (this looks like the Windows-1252 assignment of those bytes -
## verify); 0x81, 0x8D, 0x8F, 0x90 and 0x9D map to U+FFFD.
208 my $charref_map = {
209 0x0D => 0x000A,
210 0x80 => 0x20AC,
211 0x81 => 0xFFFD,
212 0x82 => 0x201A,
213 0x83 => 0x0192,
214 0x84 => 0x201E,
215 0x85 => 0x2026,
216 0x86 => 0x2020,
217 0x87 => 0x2021,
218 0x88 => 0x02C6,
219 0x89 => 0x2030,
220 0x8A => 0x0160,
221 0x8B => 0x2039,
222 0x8C => 0x0152,
223 0x8D => 0xFFFD,
224 0x8E => 0x017D,
225 0x8F => 0xFFFD,
226 0x90 => 0xFFFD,
227 0x91 => 0x2018,
228 0x92 => 0x2019,
229 0x93 => 0x201C,
230 0x94 => 0x201D,
231 0x95 => 0x2022,
232 0x96 => 0x2013,
233 0x97 => 0x2014,
234 0x98 => 0x02DC,
235 0x99 => 0x2122,
236 0x9A => 0x0161,
237 0x9B => 0x203A,
238 0x9C => 0x0153,
239 0x9D => 0xFFFD,
240 0x9E => 0x017E,
241 0x9F => 0x0178,
242 }; # $charref_map
## Additional code points that all map to U+FFFD:
##   - 0x00..0x1F except 0x09, 0x0A, 0x0C and 0x0D (0x0D is mapped to
##     LF above; the other three have no entry at all), plus 0x7F;
##   - the surrogate range 0xD800..0xDFFF;
##   - 0xFDD0..0xFDDF;
##   - the last two code points (xxFFFE, xxFFFF) of each of the 17
##     planes 0x00..0x10.
## NOTE(review): Unicode defines the noncharacter block as
## U+FDD0..U+FDEF, while the range below ends at 0xFDDF; the "ISSUE"
## remark on that line flags the same discrepancy.  Confirm against the
## spec revision being implemented before widening the range.
243 $charref_map->{$_} = 0xFFFD
244 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
245 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
246 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
247 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
248 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
249 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
250 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
251
252 ## Implementations MUST act as if they used the state machine in the spec.
253
## Resets the tokenizer-related fields of the parser object so that
## tokenization starts from a clean state.
##
## Takes the object itself as the only argument.  Fields listed in the
## NOTE below are expected to have been set beforehand and are left
## untouched.  No explicit return statement is present; the return
## value is not meant to be used.
254 sub _initialize_tokenizer ($) {
255 my $self = shift;
256
257 ## NOTE: Fields set by |new| constructor:
258 #$self->{level}
259 #$self->{set_nc}
260 #$self->{parse_error}
261 #$self->{is_xml} (if XML)
262
263 $self->{state} = DATA_STATE; # MUST
264 $self->{s_kwd} = ''; # Data state keyword
265 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
266 #$self->{entity__value}; # initialized when used
267 #$self->{entity__match}; # initialized when used
268 $self->{content_model} = PCDATA_CONTENT_MODEL; # start in the PCDATA content model
269 undef $self->{ct}; # current token
270 undef $self->{ca}; # current attribute
271 undef $self->{last_stag_name}; # last emitted start tag name
272 #$self->{prev_state}; # initialized when used
273 delete $self->{self_closing};
## Input buffer and read position.  NOTE(review): these are presumably
## consumed by the |!!!next-input-character| expansion, which is not
## visible in this file - confirm.
274 $self->{char_buffer} = '';
275 $self->{char_buffer_pos} = 0;
## -1 is the value the tokenizer tests for to detect end of input; it
## is set here as a placeholder before the first character is read.
276 $self->{nc} = -1; # next input character
277 #$self->{next_nc}
## |!!!next-input-character| is a macro of the .src preprocessor, not
## Perl syntax.  It presumably loads the first input character into
## |$self->{nc}| - TODO confirm against the macro definition.
278 !!!next-input-character;
## List of already-built tokens waiting to be returned;
## |_get_next_token| shifts from it before tokenizing further input.
279 $self->{token} = [];
280 # $self->{escape}
281 } # _initialize_tokenizer
282
283 ## A token has:
284 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
285 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
286 ## ->{name} (DOCTYPE_TOKEN)
287 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
288 ## ->{target} (PI_TOKEN)
289 ## ->{pubid} (DOCTYPE_TOKEN)
290 ## ->{sysid} (DOCTYPE_TOKEN)
291 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
292 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
293 ## ->{name}
294 ## ->{value}
295 ## ->{has_reference} == 1 or 0
296 ## ->{index}: Index of the attribute in a tag.
297 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
298 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
299 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
300 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
301
302 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
303 ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
304 ## while the token waits in the list of pushed-back tokens (|$self->{token}|).
305
306 ## Emitted token MUST immediately be handled by the tree construction state.
307
308 ## Before each step, UA MAY check to see if either one of the scripts in
309 ## "list of scripts that will execute as soon as possible" or the first
310 ## script in the "list of scripts that will execute asynchronously",
311 ## has completed loading. If one has, then it MUST be executed
312 ## and removed from the list.
313
314 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
315 ## (This requirement was dropped from HTML5 spec, unfortunately.)
316
## Lookup table of the code points the tokenizer treats as space
## characters: keys are code points, values are 1.  Any other code
## point yields undef (false).  LINE TABULATION and CARRIAGE RETURN are
## listed but commented out, so they are NOT treated as spaces here.
## NOTE(review): CR is presumably normalized away before it reaches the
## tokenizer - confirm in the input-reading code.
317 my $is_space = {
318 0x0009 => 1, # CHARACTER TABULATION (HT)
319 0x000A => 1, # LINE FEED (LF)
320 #0x000B => 0, # LINE TABULATION (VT)
321 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
322 #0x000D => 1, # CARRIAGE RETURN (CR)
323 0x0020 => 1, # SPACE (SP)
324 };
325
326 sub _get_next_token ($) {
327 my $self = shift;
328
329 if ($self->{self_closing}) {
330 !!!parse-error (type => 'nestc', token => $self->{ct});
331 ## NOTE: The |self_closing| flag is only set by start tag token.
332 ## In addition, when a start tag token is emitted, it is always set to
333 ## |ct|.
334 delete $self->{self_closing};
335 }
336
337 if (@{$self->{token}}) {
338 $self->{self_closing} = $self->{token}->[0]->{self_closing};
339 return shift @{$self->{token}};
340 }
341
342 A: {
343 if ($self->{state} == PCDATA_STATE) {
344 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
345
346 if ($self->{nc} == 0x0026) { # &
347 !!!cp (0.1);
348 ## NOTE: In the spec, the tokenizer is switched to the
349 ## "entity data state". In this implementation, the tokenizer
350 ## is switched to the |ENTITY_STATE|, which is an implementation
351 ## of the "consume a character reference" algorithm.
352 $self->{entity_add} = -1;
353 $self->{prev_state} = DATA_STATE;
354 $self->{state} = ENTITY_STATE;
355 !!!next-input-character;
356 redo A;
357 } elsif ($self->{nc} == 0x003C) { # <
358 !!!cp (0.2);
359 $self->{state} = TAG_OPEN_STATE;
360 !!!next-input-character;
361 redo A;
362 } elsif ($self->{nc} == -1) {
363 !!!cp (0.3);
364 !!!emit ({type => END_OF_FILE_TOKEN,
365 line => $self->{line}, column => $self->{column}});
366 last A; ## TODO: ok?
367 } else {
368 !!!cp (0.4);
369 #
370 }
371
372 # Anything else
373 my $token = {type => CHARACTER_TOKEN,
374 data => chr $self->{nc},
375 line => $self->{line}, column => $self->{column},
376 };
377 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
378
379 ## Stay in the state.
380 !!!next-input-character;
381 !!!emit ($token);
382 redo A;
383 } elsif ($self->{state} == DATA_STATE) {
384 $self->{s_kwd} = '' unless defined $self->{s_kwd};
385 if ($self->{nc} == 0x0026) { # &
386 $self->{s_kwd} = '';
387 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
388 not $self->{escape}) {
389 !!!cp (1);
390 ## NOTE: In the spec, the tokenizer is switched to the
391 ## "entity data state". In this implementation, the tokenizer
392 ## is switched to the |ENTITY_STATE|, which is an implementation
393 ## of the "consume a character reference" algorithm.
394 $self->{entity_add} = -1;
395 $self->{prev_state} = DATA_STATE;
396 $self->{state} = ENTITY_STATE;
397 !!!next-input-character;
398 redo A;
399 } else {
400 !!!cp (2);
401 #
402 }
403 } elsif ($self->{nc} == 0x002D) { # -
404 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
405 if ($self->{s_kwd} eq '<!-') {
406 !!!cp (3);
407 $self->{escape} = 1; # unless $self->{escape};
408 $self->{s_kwd} = '--';
409 #
410 } elsif ($self->{s_kwd} eq '-') {
411 !!!cp (4);
412 $self->{s_kwd} = '--';
413 #
414 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
415 !!!cp (4.1);
416 $self->{s_kwd} .= '-';
417 #
418 } else {
419 !!!cp (5);
420 $self->{s_kwd} = '-';
421 #
422 }
423 }
424
425 #
426 } elsif ($self->{nc} == 0x0021) { # !
427 if (length $self->{s_kwd}) {
428 !!!cp (5.1);
429 $self->{s_kwd} .= '!';
430 #
431 } else {
432 !!!cp (5.2);
433 #$self->{s_kwd} = '';
434 #
435 }
436 #
437 } elsif ($self->{nc} == 0x003C) { # <
438 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
439 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
440 not $self->{escape})) {
441 !!!cp (6);
442 $self->{state} = TAG_OPEN_STATE;
443 !!!next-input-character;
444 redo A;
445 } else {
446 !!!cp (7);
447 $self->{s_kwd} = '';
448 #
449 }
450 } elsif ($self->{nc} == 0x003E) { # >
451 if ($self->{escape} and
452 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
453 if ($self->{s_kwd} eq '--') {
454 !!!cp (8);
455 delete $self->{escape};
456 #
457 } else {
458 !!!cp (9);
459 #
460 }
461 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
462 !!!cp (9.1);
463 !!!parse-error (type => 'unmatched mse', ## TODO: type
464 line => $self->{line_prev},
465 column => $self->{column_prev} - 1);
466 #
467 } else {
468 !!!cp (10);
469 #
470 }
471
472 $self->{s_kwd} = '';
473 #
474 } elsif ($self->{nc} == 0x005D) { # ]
475 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
476 !!!cp (10.1);
477 $self->{s_kwd} .= ']';
478 } elsif ($self->{s_kwd} eq ']]') {
479 !!!cp (10.2);
480 #
481 } else {
482 !!!cp (10.3);
483 $self->{s_kwd} = '';
484 }
485 #
486 } elsif ($self->{nc} == -1) {
487 !!!cp (11);
488 $self->{s_kwd} = '';
489 !!!emit ({type => END_OF_FILE_TOKEN,
490 line => $self->{line}, column => $self->{column}});
491 last A; ## TODO: ok?
492 } else {
493 !!!cp (12);
494 $self->{s_kwd} = '';
495 #
496 }
497
498 # Anything else
499 my $token = {type => CHARACTER_TOKEN,
500 data => chr $self->{nc},
501 line => $self->{line}, column => $self->{column},
502 };
503 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
504 length $token->{data})) {
505 $self->{s_kwd} = '';
506 }
507
508 ## Stay in the data state.
509 if (not $self->{is_xml} and
510 $self->{content_model} == PCDATA_CONTENT_MODEL) {
511 !!!cp (13);
512 $self->{state} = PCDATA_STATE;
513 } else {
514 !!!cp (14);
515 ## Stay in the state.
516 }
517 !!!next-input-character;
518 !!!emit ($token);
519 redo A;
520 } elsif ($self->{state} == TAG_OPEN_STATE) {
521 ## XML5: "tag state".
522
523 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
524 if ($self->{nc} == 0x002F) { # /
525 !!!cp (15);
526 !!!next-input-character;
527 $self->{state} = CLOSE_TAG_OPEN_STATE;
528 redo A;
529 } elsif ($self->{nc} == 0x0021) { # !
530 !!!cp (15.1);
531 $self->{s_kwd} = $self->{escaped} ? '' : '<';
532 #
533 } else {
534 !!!cp (16);
535 $self->{s_kwd} = '';
536 #
537 }
538
539 ## reconsume
540 $self->{state} = DATA_STATE;
541 !!!emit ({type => CHARACTER_TOKEN, data => '<',
542 line => $self->{line_prev},
543 column => $self->{column_prev},
544 });
545 redo A;
546 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
547 if ($self->{nc} == 0x0021) { # !
548 !!!cp (17);
549 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
550 !!!next-input-character;
551 redo A;
552 } elsif ($self->{nc} == 0x002F) { # /
553 !!!cp (18);
554 $self->{state} = CLOSE_TAG_OPEN_STATE;
555 !!!next-input-character;
556 redo A;
557 } elsif (0x0041 <= $self->{nc} and
558 $self->{nc} <= 0x005A) { # A..Z
559 !!!cp (19);
560 $self->{ct}
561 = {type => START_TAG_TOKEN,
562 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
563 line => $self->{line_prev},
564 column => $self->{column_prev}};
565 $self->{state} = TAG_NAME_STATE;
566 !!!next-input-character;
567 redo A;
568 } elsif (0x0061 <= $self->{nc} and
569 $self->{nc} <= 0x007A) { # a..z
570 !!!cp (20);
571 $self->{ct} = {type => START_TAG_TOKEN,
572 tag_name => chr ($self->{nc}),
573 line => $self->{line_prev},
574 column => $self->{column_prev}};
575 $self->{state} = TAG_NAME_STATE;
576 !!!next-input-character;
577 redo A;
578 } elsif ($self->{nc} == 0x003E) { # >
579 !!!cp (21);
580 !!!parse-error (type => 'empty start tag',
581 line => $self->{line_prev},
582 column => $self->{column_prev});
583 $self->{state} = DATA_STATE;
584 $self->{s_kwd} = '';
585 !!!next-input-character;
586
587 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
588 line => $self->{line_prev},
589 column => $self->{column_prev},
590 });
591
592 redo A;
593 } elsif ($self->{nc} == 0x003F) { # ?
594 if ($self->{is_xml}) {
595 !!!cp (22.1);
596 $self->{state} = PI_STATE;
597 !!!next-input-character;
598 redo A;
599 } else {
600 !!!cp (22);
601 !!!parse-error (type => 'pio',
602 line => $self->{line_prev},
603 column => $self->{column_prev});
604 $self->{state} = BOGUS_COMMENT_STATE;
605 $self->{ct} = {type => COMMENT_TOKEN, data => '',
606 line => $self->{line_prev},
607 column => $self->{column_prev},
608 };
609 ## $self->{nc} is intentionally left as is
610 redo A;
611 }
612 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
613 !!!cp (23);
614 !!!parse-error (type => 'bare stago',
615 line => $self->{line_prev},
616 column => $self->{column_prev});
617 $self->{state} = DATA_STATE;
618 $self->{s_kwd} = '';
619 ## reconsume
620
621 !!!emit ({type => CHARACTER_TOKEN, data => '<',
622 line => $self->{line_prev},
623 column => $self->{column_prev},
624 });
625
626 redo A;
627 } else {
628 ## XML5: "<:" is a parse error.
629 !!!cp (23.1);
630 $self->{ct} = {type => START_TAG_TOKEN,
631 tag_name => chr ($self->{nc}),
632 line => $self->{line_prev},
633 column => $self->{column_prev}};
634 $self->{state} = TAG_NAME_STATE;
635 !!!next-input-character;
636 redo A;
637 }
638 } else {
639 die "$0: $self->{content_model} in tag open";
640 }
641 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
642 ## NOTE: The "close tag open state" in the spec is implemented as
643 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
644
645 ## XML5: "end tag state".
646
647 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
648 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
649 if (defined $self->{last_stag_name}) {
650 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
651 $self->{kwd} = '';
652 ## Reconsume.
653 redo A;
654 } else {
655 ## No start tag token has ever been emitted
656 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
657 !!!cp (28);
658 $self->{state} = DATA_STATE;
659 $self->{s_kwd} = '';
660 ## Reconsume.
661 !!!emit ({type => CHARACTER_TOKEN, data => '</',
662 line => $l, column => $c,
663 });
664 redo A;
665 }
666 }
667
668 if (0x0041 <= $self->{nc} and
669 $self->{nc} <= 0x005A) { # A..Z
670 !!!cp (29);
671 $self->{ct}
672 = {type => END_TAG_TOKEN,
673 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
674 line => $l, column => $c};
675 $self->{state} = TAG_NAME_STATE;
676 !!!next-input-character;
677 redo A;
678 } elsif (0x0061 <= $self->{nc} and
679 $self->{nc} <= 0x007A) { # a..z
680 !!!cp (30);
681 $self->{ct} = {type => END_TAG_TOKEN,
682 tag_name => chr ($self->{nc}),
683 line => $l, column => $c};
684 $self->{state} = TAG_NAME_STATE;
685 !!!next-input-character;
686 redo A;
687 } elsif ($self->{nc} == 0x003E) { # >
688 !!!parse-error (type => 'empty end tag',
689 line => $self->{line_prev}, ## "<" in "</>"
690 column => $self->{column_prev} - 1);
691 $self->{state} = DATA_STATE;
692 $self->{s_kwd} = '';
693 if ($self->{is_xml}) {
694 !!!cp (31);
695 ## XML5: No parse error.
696
697 ## NOTE: This parser raises a parse error, since it supports
698 ## XML1, not XML5.
699
700 ## NOTE: A short end tag token.
701 my $ct = {type => END_TAG_TOKEN,
702 tag_name => '',
703 line => $self->{line_prev},
704 column => $self->{column_prev} - 1,
705 };
706 !!!next-input-character;
707 !!!emit ($ct);
708 } else {
709 !!!cp (31.1);
710 !!!next-input-character;
711 }
712 redo A;
713 } elsif ($self->{nc} == -1) {
714 !!!cp (32);
715 !!!parse-error (type => 'bare etago');
716 $self->{s_kwd} = '';
717 $self->{state} = DATA_STATE;
718 # reconsume
719
720 !!!emit ({type => CHARACTER_TOKEN, data => '</',
721 line => $l, column => $c,
722 });
723
724 redo A;
725 } elsif (not $self->{is_xml} or
726 $is_space->{$self->{nc}}) {
727 !!!cp (33);
728 !!!parse-error (type => 'bogus end tag',
729 line => $self->{line_prev}, # "<" of "</"
730 column => $self->{column_prev} - 1);
731 $self->{state} = BOGUS_COMMENT_STATE;
732 $self->{ct} = {type => COMMENT_TOKEN, data => '',
733 line => $self->{line_prev}, # "<" of "</"
734 column => $self->{column_prev} - 1,
735 };
736 ## NOTE: $self->{nc} is intentionally left as is.
737 ## Although the "anything else" case of the spec not explicitly
738 ## states that the next input character is to be reconsumed,
739 ## it will be included to the |data| of the comment token
740 ## generated from the bogus end tag, as defined in the
741 ## "bogus comment state" entry.
742 redo A;
743 } else {
744 ## XML5: "</:" is a parse error.
745 !!!cp (30.1);
746 $self->{ct} = {type => END_TAG_TOKEN,
747 tag_name => chr ($self->{nc}),
748 line => $l, column => $c};
749 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
750 !!!next-input-character;
751 redo A;
752 }
753 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
754 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
755 if (length $ch) {
756 my $CH = $ch;
757 $ch =~ tr/a-z/A-Z/;
758 my $nch = chr $self->{nc};
759 if ($nch eq $ch or $nch eq $CH) {
760 !!!cp (24);
761 ## Stay in the state.
762 $self->{kwd} .= $nch;
763 !!!next-input-character;
764 redo A;
765 } else {
766 !!!cp (25);
767 $self->{state} = DATA_STATE;
768 $self->{s_kwd} = '';
769 ## Reconsume.
770 !!!emit ({type => CHARACTER_TOKEN,
771 data => '</' . $self->{kwd},
772 line => $self->{line_prev},
773 column => $self->{column_prev} - 1 - length $self->{kwd},
774 });
775 redo A;
776 }
777 } else { # after "<{tag-name}"
778 unless ($is_space->{$self->{nc}} or
779 {
780 0x003E => 1, # >
781 0x002F => 1, # /
782 -1 => 1, # EOF
783 }->{$self->{nc}}) {
784 !!!cp (26);
785 ## Reconsume.
786 $self->{state} = DATA_STATE;
787 $self->{s_kwd} = '';
788 !!!emit ({type => CHARACTER_TOKEN,
789 data => '</' . $self->{kwd},
790 line => $self->{line_prev},
791 column => $self->{column_prev} - 1 - length $self->{kwd},
792 });
793 redo A;
794 } else {
795 !!!cp (27);
796 $self->{ct}
797 = {type => END_TAG_TOKEN,
798 tag_name => $self->{last_stag_name},
799 line => $self->{line_prev},
800 column => $self->{column_prev} - 1 - length $self->{kwd}};
801 $self->{state} = TAG_NAME_STATE;
802 ## Reconsume.
803 redo A;
804 }
805 }
806 } elsif ($self->{state} == TAG_NAME_STATE) {
807 if ($is_space->{$self->{nc}}) {
808 !!!cp (34);
809 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
810 !!!next-input-character;
811 redo A;
812 } elsif ($self->{nc} == 0x003E) { # >
813 if ($self->{ct}->{type} == START_TAG_TOKEN) {
814 !!!cp (35);
815 $self->{last_stag_name} = $self->{ct}->{tag_name};
816 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
817 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
818 #if ($self->{ct}->{attributes}) {
819 # ## NOTE: This should never be reached.
820 # !!! cp (36);
821 # !!! parse-error (type => 'end tag attribute');
822 #} else {
823 !!!cp (37);
824 #}
825 } else {
826 die "$0: $self->{ct}->{type}: Unknown token type";
827 }
828 $self->{state} = DATA_STATE;
829 $self->{s_kwd} = '';
830 !!!next-input-character;
831
832 !!!emit ($self->{ct}); # start tag or end tag
833
834 redo A;
835 } elsif (0x0041 <= $self->{nc} and
836 $self->{nc} <= 0x005A) { # A..Z
837 !!!cp (38);
838 $self->{ct}->{tag_name}
839 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
840 # start tag or end tag
841 ## Stay in this state
842 !!!next-input-character;
843 redo A;
844 } elsif ($self->{nc} == -1) {
845 !!!parse-error (type => 'unclosed tag');
846 if ($self->{ct}->{type} == START_TAG_TOKEN) {
847 !!!cp (39);
848 $self->{last_stag_name} = $self->{ct}->{tag_name};
849 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
850 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
851 #if ($self->{ct}->{attributes}) {
852 # ## NOTE: This state should never be reached.
853 # !!! cp (40);
854 # !!! parse-error (type => 'end tag attribute');
855 #} else {
856 !!!cp (41);
857 #}
858 } else {
859 die "$0: $self->{ct}->{type}: Unknown token type";
860 }
861 $self->{state} = DATA_STATE;
862 $self->{s_kwd} = '';
863 # reconsume
864
865 !!!emit ($self->{ct}); # start tag or end tag
866
867 redo A;
868 } elsif ($self->{nc} == 0x002F) { # /
869 !!!cp (42);
870 $self->{state} = SELF_CLOSING_START_TAG_STATE;
871 !!!next-input-character;
872 redo A;
873 } else {
874 !!!cp (44);
875 $self->{ct}->{tag_name} .= chr $self->{nc};
876 # start tag or end tag
877 ## Stay in the state
878 !!!next-input-character;
879 redo A;
880 }
881 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
882 ## XML5: "Tag attribute name before state".
883
884 if ($is_space->{$self->{nc}}) {
885 !!!cp (45);
886 ## Stay in the state
887 !!!next-input-character;
888 redo A;
889 } elsif ($self->{nc} == 0x003E) { # >
890 if ($self->{ct}->{type} == START_TAG_TOKEN) {
891 !!!cp (46);
892 $self->{last_stag_name} = $self->{ct}->{tag_name};
893 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
894 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
895 if ($self->{ct}->{attributes}) {
896 !!!cp (47);
897 !!!parse-error (type => 'end tag attribute');
898 } else {
899 !!!cp (48);
900 }
901 } else {
902 die "$0: $self->{ct}->{type}: Unknown token type";
903 }
904 $self->{state} = DATA_STATE;
905 $self->{s_kwd} = '';
906 !!!next-input-character;
907
908 !!!emit ($self->{ct}); # start tag or end tag
909
910 redo A;
911 } elsif (0x0041 <= $self->{nc} and
912 $self->{nc} <= 0x005A) { # A..Z
913 !!!cp (49);
914 $self->{ca}
915 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
916 value => '',
917 line => $self->{line}, column => $self->{column}};
918 $self->{state} = ATTRIBUTE_NAME_STATE;
919 !!!next-input-character;
920 redo A;
921 } elsif ($self->{nc} == 0x002F) { # /
922 !!!cp (50);
923 $self->{state} = SELF_CLOSING_START_TAG_STATE;
924 !!!next-input-character;
925 redo A;
926 } elsif ($self->{nc} == -1) {
927 !!!parse-error (type => 'unclosed tag');
928 if ($self->{ct}->{type} == START_TAG_TOKEN) {
929 !!!cp (52);
930 $self->{last_stag_name} = $self->{ct}->{tag_name};
931 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
932 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
933 if ($self->{ct}->{attributes}) {
934 !!!cp (53);
935 !!!parse-error (type => 'end tag attribute');
936 } else {
937 !!!cp (54);
938 }
939 } else {
940 die "$0: $self->{ct}->{type}: Unknown token type";
941 }
942 $self->{state} = DATA_STATE;
943 $self->{s_kwd} = '';
944 # reconsume
945
946 !!!emit ($self->{ct}); # start tag or end tag
947
948 redo A;
949 } else {
950 if ({
951 0x0022 => 1, # "
952 0x0027 => 1, # '
953 0x003C => 1, # <
954 0x003D => 1, # =
955 }->{$self->{nc}}) {
956 !!!cp (55);
957 ## XML5: Not a parse error.
958 !!!parse-error (type => 'bad attribute name');
959 } else {
960 !!!cp (56);
961 ## XML5: ":" raises a parse error and is ignored.
962 }
963 $self->{ca}
964 = {name => chr ($self->{nc}),
965 value => '',
966 line => $self->{line}, column => $self->{column}};
967 $self->{state} = ATTRIBUTE_NAME_STATE;
968 !!!next-input-character;
969 redo A;
970 }
971 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
972 ## XML5: "Tag attribute name state".
973
974 my $before_leave = sub {
975 if (exists $self->{ct}->{attributes} # start tag or end tag
976 ->{$self->{ca}->{name}}) { # MUST
977 !!!cp (57);
978 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
979 ## Discard $self->{ca} # MUST
980 } else {
981 !!!cp (58);
982 $self->{ct}->{attributes}->{$self->{ca}->{name}}
983 = $self->{ca};
984 $self->{ca}->{index} = ++$self->{ct}->{last_index};
985 }
986 }; # $before_leave
987
988 if ($is_space->{$self->{nc}}) {
989 !!!cp (59);
990 $before_leave->();
991 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
992 !!!next-input-character;
993 redo A;
994 } elsif ($self->{nc} == 0x003D) { # =
995 !!!cp (60);
996 $before_leave->();
997 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
998 !!!next-input-character;
999 redo A;
1000 } elsif ($self->{nc} == 0x003E) { # >
1001 if ($self->{is_xml}) {
1002 !!!cp (60.1);
1003 ## XML5: Not a parse error.
1004 !!!parse-error (type => 'no attr value'); ## TODO: type
1005 } else {
1006 !!!cp (60.2);
1007 }
1008
1009 $before_leave->();
1010 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1011 !!!cp (61);
1012 $self->{last_stag_name} = $self->{ct}->{tag_name};
1013 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1014 !!!cp (62);
1015 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1016 if ($self->{ct}->{attributes}) {
1017 !!!parse-error (type => 'end tag attribute');
1018 }
1019 } else {
1020 die "$0: $self->{ct}->{type}: Unknown token type";
1021 }
1022 $self->{state} = DATA_STATE;
1023 $self->{s_kwd} = '';
1024 !!!next-input-character;
1025
1026 !!!emit ($self->{ct}); # start tag or end tag
1027
1028 redo A;
1029 } elsif (0x0041 <= $self->{nc} and
1030 $self->{nc} <= 0x005A) { # A..Z
1031 !!!cp (63);
1032 $self->{ca}->{name}
1033 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1034 ## Stay in the state
1035 !!!next-input-character;
1036 redo A;
1037 } elsif ($self->{nc} == 0x002F) { # /
1038 if ($self->{is_xml}) {
1039 !!!cp (64);
1040 ## XML5: Not a parse error.
1041 !!!parse-error (type => 'no attr value'); ## TODO: type
1042 } else {
1043 !!!cp (64.1);
1044 }
1045
1046 $before_leave->();
1047 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1048 !!!next-input-character;
1049 redo A;
1050 } elsif ($self->{nc} == -1) {
1051 !!!parse-error (type => 'unclosed tag');
1052 $before_leave->();
1053 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1054 !!!cp (66);
1055 $self->{last_stag_name} = $self->{ct}->{tag_name};
1056 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1057 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1058 if ($self->{ct}->{attributes}) {
1059 !!!cp (67);
1060 !!!parse-error (type => 'end tag attribute');
1061 } else {
1062 ## NOTE: This state should never be reached.
1063 !!!cp (68);
1064 }
1065 } else {
1066 die "$0: $self->{ct}->{type}: Unknown token type";
1067 }
1068 $self->{state} = DATA_STATE;
1069 $self->{s_kwd} = '';
1070 # reconsume
1071
1072 !!!emit ($self->{ct}); # start tag or end tag
1073
1074 redo A;
1075 } else {
1076 if ({
1077 0x0022 => 1, # "
1078 0x0027 => 1, # '
1079 0x003C => 1, # <
1080 }->{$self->{nc}}) {
1081 !!!cp (69);
1082 ## XML5: Not a parse error.
1083 !!!parse-error (type => 'bad attribute name');
1084 } else {
1085 !!!cp (70);
1086 }
1087 $self->{ca}->{name} .= chr ($self->{nc});
1088 ## Stay in the state
1089 !!!next-input-character;
1090 redo A;
1091 }
1092 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1093 ## XML5: "Tag attribute name after state".
1094
1095 if ($is_space->{$self->{nc}}) {
1096 !!!cp (71);
1097 ## Stay in the state
1098 !!!next-input-character;
1099 redo A;
1100 } elsif ($self->{nc} == 0x003D) { # =
1101 !!!cp (72);
1102 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1103 !!!next-input-character;
1104 redo A;
1105 } elsif ($self->{nc} == 0x003E) { # >
1106 if ($self->{is_xml}) {
1107 !!!cp (72.1);
1108 ## XML5: Not a parse error.
1109 !!!parse-error (type => 'no attr value'); ## TODO: type
1110 } else {
1111 !!!cp (72.2);
1112 }
1113
1114 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1115 !!!cp (73);
1116 $self->{last_stag_name} = $self->{ct}->{tag_name};
1117 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1118 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1119 if ($self->{ct}->{attributes}) {
1120 !!!cp (74);
1121 !!!parse-error (type => 'end tag attribute');
1122 } else {
1123 ## NOTE: This state should never be reached.
1124 !!!cp (75);
1125 }
1126 } else {
1127 die "$0: $self->{ct}->{type}: Unknown token type";
1128 }
1129 $self->{state} = DATA_STATE;
1130 $self->{s_kwd} = '';
1131 !!!next-input-character;
1132
1133 !!!emit ($self->{ct}); # start tag or end tag
1134
1135 redo A;
1136 } elsif (0x0041 <= $self->{nc} and
1137 $self->{nc} <= 0x005A) { # A..Z
1138 !!!cp (76);
1139 $self->{ca}
1140 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1141 value => '',
1142 line => $self->{line}, column => $self->{column}};
1143 $self->{state} = ATTRIBUTE_NAME_STATE;
1144 !!!next-input-character;
1145 redo A;
1146 } elsif ($self->{nc} == 0x002F) { # /
1147 if ($self->{is_xml}) {
1148 !!!cp (77);
1149 ## XML5: Not a parse error.
1150 !!!parse-error (type => 'no attr value'); ## TODO: type
1151 } else {
1152 !!!cp (77.1);
1153 }
1154
1155 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1156 !!!next-input-character;
1157 redo A;
1158 } elsif ($self->{nc} == -1) {
1159 !!!parse-error (type => 'unclosed tag');
1160 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1161 !!!cp (79);
1162 $self->{last_stag_name} = $self->{ct}->{tag_name};
1163 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1164 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1165 if ($self->{ct}->{attributes}) {
1166 !!!cp (80);
1167 !!!parse-error (type => 'end tag attribute');
1168 } else {
1169 ## NOTE: This state should never be reached.
1170 !!!cp (81);
1171 }
1172 } else {
1173 die "$0: $self->{ct}->{type}: Unknown token type";
1174 }
1175 $self->{s_kwd} = '';
1176 $self->{state} = DATA_STATE;
1177 # reconsume
1178
1179 !!!emit ($self->{ct}); # start tag or end tag
1180
1181 redo A;
1182 } else {
1183 if ($self->{is_xml}) {
1184 !!!cp (78.1);
1185 ## XML5: Not a parse error.
1186 !!!parse-error (type => 'no attr value'); ## TODO: type
1187 } else {
1188 !!!cp (78.2);
1189 }
1190
1191 if ({
1192 0x0022 => 1, # "
1193 0x0027 => 1, # '
1194 0x003C => 1, # <
1195 }->{$self->{nc}}) {
1196 !!!cp (78);
1197 ## XML5: Not a parse error.
1198 !!!parse-error (type => 'bad attribute name');
1199 } else {
1200 !!!cp (82);
1201 }
1202 $self->{ca}
1203 = {name => chr ($self->{nc}),
1204 value => '',
1205 line => $self->{line}, column => $self->{column}};
1206 $self->{state} = ATTRIBUTE_NAME_STATE;
1207 !!!next-input-character;
1208 redo A;
1209 }
1210 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1211 ## XML5: "Tag attribute value before state".
1212
1213 if ($is_space->{$self->{nc}}) {
1214 !!!cp (83);
1215 ## Stay in the state
1216 !!!next-input-character;
1217 redo A;
1218 } elsif ($self->{nc} == 0x0022) { # "
1219 !!!cp (84);
1220 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1221 !!!next-input-character;
1222 redo A;
1223 } elsif ($self->{nc} == 0x0026) { # &
1224 !!!cp (85);
1225 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1226 ## reconsume
1227 redo A;
1228 } elsif ($self->{nc} == 0x0027) { # '
1229 !!!cp (86);
1230 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1231 !!!next-input-character;
1232 redo A;
1233 } elsif ($self->{nc} == 0x003E) { # >
1234 !!!parse-error (type => 'empty unquoted attribute value');
1235 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1236 !!!cp (87);
1237 $self->{last_stag_name} = $self->{ct}->{tag_name};
1238 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1239 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1240 if ($self->{ct}->{attributes}) {
1241 !!!cp (88);
1242 !!!parse-error (type => 'end tag attribute');
1243 } else {
1244 ## NOTE: This state should never be reached.
1245 !!!cp (89);
1246 }
1247 } else {
1248 die "$0: $self->{ct}->{type}: Unknown token type";
1249 }
1250 $self->{state} = DATA_STATE;
1251 $self->{s_kwd} = '';
1252 !!!next-input-character;
1253
1254 !!!emit ($self->{ct}); # start tag or end tag
1255
1256 redo A;
1257 } elsif ($self->{nc} == -1) {
1258 !!!parse-error (type => 'unclosed tag');
1259 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1260 !!!cp (90);
1261 $self->{last_stag_name} = $self->{ct}->{tag_name};
1262 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1263 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1264 if ($self->{ct}->{attributes}) {
1265 !!!cp (91);
1266 !!!parse-error (type => 'end tag attribute');
1267 } else {
1268 ## NOTE: This state should never be reached.
1269 !!!cp (92);
1270 }
1271 } else {
1272 die "$0: $self->{ct}->{type}: Unknown token type";
1273 }
1274 $self->{state} = DATA_STATE;
1275 $self->{s_kwd} = '';
1276 ## reconsume
1277
1278 !!!emit ($self->{ct}); # start tag or end tag
1279
1280 redo A;
1281 } else {
1282 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1283 !!!cp (93);
1284 ## XML5: Not a parse error.
1285 !!!parse-error (type => 'bad attribute value');
1286 } elsif ($self->{is_xml}) {
1287 !!!cp (93.1);
1288 ## XML5: No parse error.
1289 !!!parse-error (type => 'unquoted attr value'); ## TODO
1290 } else {
1291 !!!cp (94);
1292 }
1293 $self->{ca}->{value} .= chr ($self->{nc});
1294 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1295 !!!next-input-character;
1296 redo A;
1297 }
1298 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1299 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1300 ## ATTLIST attribute value double quoted state".
1301
1302 if ($self->{nc} == 0x0022) { # "
1303 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1304 !!!cp (95.1);
1305 ## XML5: "DOCTYPE ATTLIST name after state".
1306 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1307 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1308 } else {
1309 !!!cp (95);
1310 ## XML5: "Tag attribute name before state".
1311 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1312 }
1313 !!!next-input-character;
1314 redo A;
1315 } elsif ($self->{nc} == 0x0026) { # &
1316 !!!cp (96);
1317 ## XML5: Not defined yet.
1318
1319 ## NOTE: In the spec, the tokenizer is switched to the
1320 ## "entity in attribute value state". In this implementation, the
1321 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1322 ## implementation of the "consume a character reference" algorithm.
1323 $self->{prev_state} = $self->{state};
1324 $self->{entity_add} = 0x0022; # "
1325 $self->{state} = ENTITY_STATE;
1326 !!!next-input-character;
1327 redo A;
1328 } elsif ($self->{is_xml} and
1329 $is_space->{$self->{nc}}) {
1330 !!!cp (97.1);
1331 $self->{ca}->{value} .= ' ';
1332 ## Stay in the state.
1333 !!!next-input-character;
1334 redo A;
1335 } elsif ($self->{nc} == -1) {
1336 !!!parse-error (type => 'unclosed attribute value');
1337 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338 !!!cp (97);
1339 $self->{last_stag_name} = $self->{ct}->{tag_name};
1340
1341 $self->{state} = DATA_STATE;
1342 $self->{s_kwd} = '';
1343 ## reconsume
1344 !!!emit ($self->{ct}); # start tag
1345 redo A;
1346 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1347 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1348 if ($self->{ct}->{attributes}) {
1349 !!!cp (98);
1350 !!!parse-error (type => 'end tag attribute');
1351 } else {
1352 ## NOTE: This state should never be reached.
1353 !!!cp (99);
1354 }
1355
1356 $self->{state} = DATA_STATE;
1357 $self->{s_kwd} = '';
1358 ## reconsume
1359 !!!emit ($self->{ct}); # end tag
1360 redo A;
1361 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1362 ## XML5: No parse error above; not defined yet.
1363 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1364 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1365 ## Reconsume.
1366 !!!emit ($self->{ct}); # ATTLIST
1367 redo A;
1368 } else {
1369 die "$0: $self->{ct}->{type}: Unknown token type";
1370 }
1371 } else {
1372 ## XML5 [ATTLIST]: Not defined yet.
1373 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1374 !!!cp (100);
1375 ## XML5: Not a parse error.
1376 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1377 } else {
1378 !!!cp (100.1);
1379 }
1380 $self->{ca}->{value} .= chr ($self->{nc});
1381 $self->{read_until}->($self->{ca}->{value},
1382 qq["&<\x09\x0C\x20],
1383 length $self->{ca}->{value});
1384
1385 ## Stay in the state
1386 !!!next-input-character;
1387 redo A;
1388 }
1389 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1390 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1391 ## ATTLIST attribute value single quoted state".
1392
1393 if ($self->{nc} == 0x0027) { # '
1394 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1395 !!!cp (101.1);
1396 ## XML5: "DOCTYPE ATTLIST name after state".
1397 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1398 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1399 } else {
1400 !!!cp (101);
1401 ## XML5: "Before attribute name state" (sic).
1402 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1403 }
1404 !!!next-input-character;
1405 redo A;
1406 } elsif ($self->{nc} == 0x0026) { # &
1407 !!!cp (102);
1408 ## XML5: Not defined yet.
1409
1410 ## NOTE: In the spec, the tokenizer is switched to the
1411 ## "entity in attribute value state". In this implementation, the
1412 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1413 ## implementation of the "consume a character reference" algorithm.
1414 $self->{entity_add} = 0x0027; # '
1415 $self->{prev_state} = $self->{state};
1416 $self->{state} = ENTITY_STATE;
1417 !!!next-input-character;
1418 redo A;
1419 } elsif ($self->{is_xml} and
1420 $is_space->{$self->{nc}}) {
1421 !!!cp (103.1);
1422 $self->{ca}->{value} .= ' ';
1423 ## Stay in the state.
1424 !!!next-input-character;
1425 redo A;
1426 } elsif ($self->{nc} == -1) {
1427 !!!parse-error (type => 'unclosed attribute value');
1428 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1429 !!!cp (103);
1430 $self->{last_stag_name} = $self->{ct}->{tag_name};
1431
1432 $self->{state} = DATA_STATE;
1433 $self->{s_kwd} = '';
1434 ## reconsume
1435 !!!emit ($self->{ct}); # start tag
1436 redo A;
1437 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1438 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1439 if ($self->{ct}->{attributes}) {
1440 !!!cp (104);
1441 !!!parse-error (type => 'end tag attribute');
1442 } else {
1443 ## NOTE: This state should never be reached.
1444 !!!cp (105);
1445 }
1446
1447 $self->{state} = DATA_STATE;
1448 $self->{s_kwd} = '';
1449 ## reconsume
1450 !!!emit ($self->{ct}); # end tag
1451 redo A;
1452 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1453 ## XML5: No parse error above; not defined yet.
1454 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1455 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1456 ## Reconsume.
1457 !!!emit ($self->{ct}); # ATTLIST
1458 redo A;
1459 } else {
1460 die "$0: $self->{ct}->{type}: Unknown token type";
1461 }
1462 } else {
1463 ## XML5 [ATTLIST]: Not defined yet.
1464 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1465 !!!cp (106);
1466 ## XML5: Not a parse error.
1467 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1468 } else {
1469 !!!cp (106.1);
1470 }
1471 $self->{ca}->{value} .= chr ($self->{nc});
1472 $self->{read_until}->($self->{ca}->{value},
1473 qq['&<\x09\x0C\x20],
1474 length $self->{ca}->{value});
1475
1476 ## Stay in the state
1477 !!!next-input-character;
1478 redo A;
1479 }
1480 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1481 ## XML5: "Tag attribute value unquoted state".
1482
1483 if ($is_space->{$self->{nc}}) {
1484 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1485 !!!cp (107.1);
1486 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1487 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1488 } else {
1489 !!!cp (107);
1490 ## XML5: "Tag attribute name before state".
1491 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1492 }
1493 !!!next-input-character;
1494 redo A;
1495 } elsif ($self->{nc} == 0x0026) { # &
1496 !!!cp (108);
1497
1498 ## XML5: Not defined yet.
1499
1500 ## NOTE: In the spec, the tokenizer is switched to the
1501 ## "entity in attribute value state". In this implementation, the
1502 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1503 ## implementation of the "consume a character reference" algorithm.
1504 $self->{entity_add} = -1;
1505 $self->{prev_state} = $self->{state};
1506 $self->{state} = ENTITY_STATE;
1507 !!!next-input-character;
1508 redo A;
1509 } elsif ($self->{nc} == 0x003E) { # >
1510 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1511 !!!cp (109);
1512 $self->{last_stag_name} = $self->{ct}->{tag_name};
1513
1514 $self->{state} = DATA_STATE;
1515 $self->{s_kwd} = '';
1516 !!!next-input-character;
1517 !!!emit ($self->{ct}); # start tag
1518 redo A;
1519 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1520 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1521 if ($self->{ct}->{attributes}) {
1522 !!!cp (110);
1523 !!!parse-error (type => 'end tag attribute');
1524 } else {
1525 ## NOTE: This state should never be reached.
1526 !!!cp (111);
1527 }
1528
1529 $self->{state} = DATA_STATE;
1530 $self->{s_kwd} = '';
1531 !!!next-input-character;
1532 !!!emit ($self->{ct}); # end tag
1533 redo A;
1534 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1535 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1536 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1537 !!!next-input-character;
1538 !!!emit ($self->{ct}); # ATTLIST
1539 redo A;
1540 } else {
1541 die "$0: $self->{ct}->{type}: Unknown token type";
1542 }
1543 } elsif ($self->{nc} == -1) {
1544 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1545 !!!cp (112);
1546 !!!parse-error (type => 'unclosed tag');
1547 $self->{last_stag_name} = $self->{ct}->{tag_name};
1548
1549 $self->{state} = DATA_STATE;
1550 $self->{s_kwd} = '';
1551 ## reconsume
1552 !!!emit ($self->{ct}); # start tag
1553 redo A;
1554 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1555 !!!parse-error (type => 'unclosed tag');
1556 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1557 if ($self->{ct}->{attributes}) {
1558 !!!cp (113);
1559 !!!parse-error (type => 'end tag attribute');
1560 } else {
1561 ## NOTE: This state should never be reached.
1562 !!!cp (114);
1563 }
1564
1565 $self->{state} = DATA_STATE;
1566 $self->{s_kwd} = '';
1567 ## reconsume
1568 !!!emit ($self->{ct}); # end tag
1569 redo A;
1570 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1571 !!!parse-error (type => 'unclosed md'); ## TODO: type
1572 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1573 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1574 ## Reconsume.
1575 !!!emit ($self->{ct}); # ATTLIST
1576 redo A;
1577 } else {
1578 die "$0: $self->{ct}->{type}: Unknown token type";
1579 }
1580 } else {
1581 if ({
1582 0x0022 => 1, # "
1583 0x0027 => 1, # '
1584 0x003D => 1, # =
1585 0x003C => 1, # <
1586 }->{$self->{nc}}) {
1587 !!!cp (115);
1588 ## XML5: Not a parse error.
1589 !!!parse-error (type => 'bad attribute value');
1590 } else {
1591 !!!cp (116);
1592 }
1593 $self->{ca}->{value} .= chr ($self->{nc});
1594 $self->{read_until}->($self->{ca}->{value},
1595 qq["'=& \x09\x0C>],
1596 length $self->{ca}->{value});
1597
1598 ## Stay in the state
1599 !!!next-input-character;
1600 redo A;
1601 }
1602 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1603 if ($is_space->{$self->{nc}}) {
1604 !!!cp (118);
1605 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1606 !!!next-input-character;
1607 redo A;
1608 } elsif ($self->{nc} == 0x003E) { # >
1609 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1610 !!!cp (119);
1611 $self->{last_stag_name} = $self->{ct}->{tag_name};
1612 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1613 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1614 if ($self->{ct}->{attributes}) {
1615 !!!cp (120);
1616 !!!parse-error (type => 'end tag attribute');
1617 } else {
1618 ## NOTE: This state should never be reached.
1619 !!!cp (121);
1620 }
1621 } else {
1622 die "$0: $self->{ct}->{type}: Unknown token type";
1623 }
1624 $self->{state} = DATA_STATE;
1625 $self->{s_kwd} = '';
1626 !!!next-input-character;
1627
1628 !!!emit ($self->{ct}); # start tag or end tag
1629
1630 redo A;
1631 } elsif ($self->{nc} == 0x002F) { # /
1632 !!!cp (122);
1633 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1634 !!!next-input-character;
1635 redo A;
1636 } elsif ($self->{nc} == -1) {
1637 !!!parse-error (type => 'unclosed tag');
1638 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1639 !!!cp (122.3);
1640 $self->{last_stag_name} = $self->{ct}->{tag_name};
1641 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1642 if ($self->{ct}->{attributes}) {
1643 !!!cp (122.1);
1644 !!!parse-error (type => 'end tag attribute');
1645 } else {
1646 ## NOTE: This state should never be reached.
1647 !!!cp (122.2);
1648 }
1649 } else {
1650 die "$0: $self->{ct}->{type}: Unknown token type";
1651 }
1652 $self->{state} = DATA_STATE;
1653 $self->{s_kwd} = '';
1654 ## Reconsume.
1655 !!!emit ($self->{ct}); # start tag or end tag
1656 redo A;
1657 } else {
1658 !!!cp ('124.1');
1659 !!!parse-error (type => 'no space between attributes');
1660 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1661 ## reconsume
1662 redo A;
1663 }
1664 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1665 ## XML5: "Empty tag state".
1666
1667 if ($self->{nc} == 0x003E) { # >
1668 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1669 !!!cp ('124.2');
1670 !!!parse-error (type => 'nestc', token => $self->{ct});
1671 ## TODO: Different type than slash in start tag
1672 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1673 if ($self->{ct}->{attributes}) {
1674 !!!cp ('124.4');
1675 !!!parse-error (type => 'end tag attribute');
1676 } else {
1677 !!!cp ('124.5');
1678 }
1679 ## TODO: Test |<title></title/>|
1680 } else {
1681 !!!cp ('124.3');
1682 $self->{self_closing} = 1;
1683 }
1684
1685 $self->{state} = DATA_STATE;
1686 $self->{s_kwd} = '';
1687 !!!next-input-character;
1688
1689 !!!emit ($self->{ct}); # start tag or end tag
1690
1691 redo A;
1692 } elsif ($self->{nc} == -1) {
1693 !!!parse-error (type => 'unclosed tag');
1694 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1695 !!!cp (124.7);
1696 $self->{last_stag_name} = $self->{ct}->{tag_name};
1697 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1698 if ($self->{ct}->{attributes}) {
1699 !!!cp (124.5);
1700 !!!parse-error (type => 'end tag attribute');
1701 } else {
1702 ## NOTE: This state should never be reached.
1703 !!!cp (124.6);
1704 }
1705 } else {
1706 die "$0: $self->{ct}->{type}: Unknown token type";
1707 }
1708 ## XML5: "Tag attribute name before state".
1709 $self->{state} = DATA_STATE;
1710 $self->{s_kwd} = '';
1711 ## Reconsume.
1712 !!!emit ($self->{ct}); # start tag or end tag
1713 redo A;
1714 } else {
1715 !!!cp ('124.4');
1716 !!!parse-error (type => 'nestc');
1717 ## TODO: This error type is wrong.
1718 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1719 ## Reconsume.
1720 redo A;
1721 }
1722 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1723 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1724
1725 ## NOTE: Unlike spec's "bogus comment state", this implementation
1726 ## consumes characters one-by-one basis.
1727
1728 if ($self->{nc} == 0x003E) { # >
1729 if ($self->{in_subset}) {
1730 !!!cp (123);
1731 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1732 } else {
1733 !!!cp (124);
1734 $self->{state} = DATA_STATE;
1735 $self->{s_kwd} = '';
1736 }
1737 !!!next-input-character;
1738
1739 !!!emit ($self->{ct}); # comment
1740 redo A;
1741 } elsif ($self->{nc} == -1) {
1742 if ($self->{in_subset}) {
1743 !!!cp (125.1);
1744 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1745 } else {
1746 !!!cp (125);
1747 $self->{state} = DATA_STATE;
1748 $self->{s_kwd} = '';
1749 }
1750 ## reconsume
1751
1752 !!!emit ($self->{ct}); # comment
1753 redo A;
1754 } else {
1755 !!!cp (126);
1756 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1757 $self->{read_until}->($self->{ct}->{data},
1758 q[>],
1759 length $self->{ct}->{data});
1760
1761 ## Stay in the state.
1762 !!!next-input-character;
1763 redo A;
1764 }
1765 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1766 ## XML5: "Markup declaration state".
1767
1768 if ($self->{nc} == 0x002D) { # -
1769 !!!cp (133);
1770 $self->{state} = MD_HYPHEN_STATE;
1771 !!!next-input-character;
1772 redo A;
1773 } elsif ($self->{nc} == 0x0044 or # D
1774 $self->{nc} == 0x0064) { # d
1775 ## ASCII case-insensitive.
1776 !!!cp (130);
1777 $self->{state} = MD_DOCTYPE_STATE;
1778 $self->{kwd} = chr $self->{nc};
1779 !!!next-input-character;
1780 redo A;
1781 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1782 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1783 $self->{is_xml}) and
1784 $self->{nc} == 0x005B) { # [
1785 !!!cp (135.4);
1786 $self->{state} = MD_CDATA_STATE;
1787 $self->{kwd} = '[';
1788 !!!next-input-character;
1789 redo A;
1790 } else {
1791 !!!cp (136);
1792 }
1793
1794 !!!parse-error (type => 'bogus comment',
1795 line => $self->{line_prev},
1796 column => $self->{column_prev} - 1);
1797 ## Reconsume.
1798 $self->{state} = BOGUS_COMMENT_STATE;
1799 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1800 line => $self->{line_prev},
1801 column => $self->{column_prev} - 1,
1802 };
1803 redo A;
1804 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1805 if ($self->{nc} == 0x002D) { # -
1806 !!!cp (127);
1807 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1808 line => $self->{line_prev},
1809 column => $self->{column_prev} - 2,
1810 };
1811 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1812 !!!next-input-character;
1813 redo A;
1814 } else {
1815 !!!cp (128);
1816 !!!parse-error (type => 'bogus comment',
1817 line => $self->{line_prev},
1818 column => $self->{column_prev} - 2);
1819 $self->{state} = BOGUS_COMMENT_STATE;
1820 ## Reconsume.
1821 $self->{ct} = {type => COMMENT_TOKEN,
1822 data => '-',
1823 line => $self->{line_prev},
1824 column => $self->{column_prev} - 2,
1825 };
1826 redo A;
1827 }
1828 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1829 ## ASCII case-insensitive.
1830 if ($self->{nc} == [
1831 undef,
1832 0x004F, # O
1833 0x0043, # C
1834 0x0054, # T
1835 0x0059, # Y
1836 0x0050, # P
1837 ]->[length $self->{kwd}] or
1838 $self->{nc} == [
1839 undef,
1840 0x006F, # o
1841 0x0063, # c
1842 0x0074, # t
1843 0x0079, # y
1844 0x0070, # p
1845 ]->[length $self->{kwd}]) {
1846 !!!cp (131);
1847 ## Stay in the state.
1848 $self->{kwd} .= chr $self->{nc};
1849 !!!next-input-character;
1850 redo A;
1851 } elsif ((length $self->{kwd}) == 6 and
1852 ($self->{nc} == 0x0045 or # E
1853 $self->{nc} == 0x0065)) { # e
1854 if ($self->{is_xml} and
1855 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1856 !!!cp (129);
1857 ## XML5: case-sensitive.
1858 !!!parse-error (type => 'lowercase keyword', ## TODO
1859 text => 'DOCTYPE',
1860 line => $self->{line_prev},
1861 column => $self->{column_prev} - 5);
1862 } else {
1863 !!!cp (129.1);
1864 }
1865 $self->{state} = DOCTYPE_STATE;
1866 $self->{ct} = {type => DOCTYPE_TOKEN,
1867 quirks => 1,
1868 line => $self->{line_prev},
1869 column => $self->{column_prev} - 7,
1870 };
1871 !!!next-input-character;
1872 redo A;
1873 } else {
1874 !!!cp (132);
1875 !!!parse-error (type => 'bogus comment',
1876 line => $self->{line_prev},
1877 column => $self->{column_prev} - 1 - length $self->{kwd});
1878 $self->{state} = BOGUS_COMMENT_STATE;
1879 ## Reconsume.
1880 $self->{ct} = {type => COMMENT_TOKEN,
1881 data => $self->{kwd},
1882 line => $self->{line_prev},
1883 column => $self->{column_prev} - 1 - length $self->{kwd},
1884 };
1885 redo A;
1886 }
1887 } elsif ($self->{state} == MD_CDATA_STATE) {
1888 if ($self->{nc} == {
1889 '[' => 0x0043, # C
1890 '[C' => 0x0044, # D
1891 '[CD' => 0x0041, # A
1892 '[CDA' => 0x0054, # T
1893 '[CDAT' => 0x0041, # A
1894 }->{$self->{kwd}}) {
1895 !!!cp (135.1);
1896 ## Stay in the state.
1897 $self->{kwd} .= chr $self->{nc};
1898 !!!next-input-character;
1899 redo A;
1900 } elsif ($self->{kwd} eq '[CDATA' and
1901 $self->{nc} == 0x005B) { # [
1902 if ($self->{is_xml} and
1903 not $self->{tainted} and
1904 @{$self->{open_elements} or []} == 0) {
1905 !!!cp (135.2);
1906 !!!parse-error (type => 'cdata outside of root element',
1907 line => $self->{line_prev},
1908 column => $self->{column_prev} - 7);
1909 $self->{tainted} = 1;
1910 } else {
1911 !!!cp (135.21);
1912 }
1913
1914 $self->{ct} = {type => CHARACTER_TOKEN,
1915 data => '',
1916 line => $self->{line_prev},
1917 column => $self->{column_prev} - 7};
1918 $self->{state} = CDATA_SECTION_STATE;
1919 !!!next-input-character;
1920 redo A;
1921 } else {
1922 !!!cp (135.3);
1923 !!!parse-error (type => 'bogus comment',
1924 line => $self->{line_prev},
1925 column => $self->{column_prev} - 1 - length $self->{kwd});
1926 $self->{state} = BOGUS_COMMENT_STATE;
1927 ## Reconsume.
1928 $self->{ct} = {type => COMMENT_TOKEN,
1929 data => $self->{kwd},
1930 line => $self->{line_prev},
1931 column => $self->{column_prev} - 1 - length $self->{kwd},
1932 };
1933 redo A;
1934 }
1935 } elsif ($self->{state} == COMMENT_START_STATE) {
1936 if ($self->{nc} == 0x002D) { # -
1937 !!!cp (137);
1938 $self->{state} = COMMENT_START_DASH_STATE;
1939 !!!next-input-character;
1940 redo A;
1941 } elsif ($self->{nc} == 0x003E) { # >
1942 !!!parse-error (type => 'bogus comment');
1943 if ($self->{in_subset}) {
1944 !!!cp (138.1);
1945 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1946 } else {
1947 !!!cp (138);
1948 $self->{state} = DATA_STATE;
1949 $self->{s_kwd} = '';
1950 }
1951 !!!next-input-character;
1952
1953 !!!emit ($self->{ct}); # comment
1954
1955 redo A;
1956 } elsif ($self->{nc} == -1) {
1957 !!!parse-error (type => 'unclosed comment');
1958 if ($self->{in_subset}) {
1959 !!!cp (139.1);
1960 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1961 } else {
1962 !!!cp (139);
1963 $self->{state} = DATA_STATE;
1964 $self->{s_kwd} = '';
1965 }
1966 ## reconsume
1967
1968 !!!emit ($self->{ct}); # comment
1969
1970 redo A;
1971 } else {
1972 !!!cp (140);
1973 $self->{ct}->{data} # comment
1974 .= chr ($self->{nc});
1975 $self->{state} = COMMENT_STATE;
1976 !!!next-input-character;
1977 redo A;
1978 }
1979 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1980 if ($self->{nc} == 0x002D) { # -
1981 !!!cp (141);
1982 $self->{state} = COMMENT_END_STATE;
1983 !!!next-input-character;
1984 redo A;
1985 } elsif ($self->{nc} == 0x003E) { # >
1986 !!!parse-error (type => 'bogus comment');
1987 if ($self->{in_subset}) {
1988 !!!cp (142.1);
1989 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1990 } else {
1991 !!!cp (142);
1992 $self->{state} = DATA_STATE;
1993 $self->{s_kwd} = '';
1994 }
1995 !!!next-input-character;
1996
1997 !!!emit ($self->{ct}); # comment
1998
1999 redo A;
2000 } elsif ($self->{nc} == -1) {
2001 !!!parse-error (type => 'unclosed comment');
2002 if ($self->{in_subset}) {
2003 !!!cp (143.1);
2004 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2005 } else {
2006 !!!cp (143);
2007 $self->{state} = DATA_STATE;
2008 $self->{s_kwd} = '';
2009 }
2010 ## reconsume
2011
2012 !!!emit ($self->{ct}); # comment
2013
2014 redo A;
2015 } else {
2016 !!!cp (144);
2017 $self->{ct}->{data} # comment
2018 .= '-' . chr ($self->{nc});
2019 $self->{state} = COMMENT_STATE;
2020 !!!next-input-character;
2021 redo A;
2022 }
2023 } elsif ($self->{state} == COMMENT_STATE) {
2024 ## XML5: "Comment state" and "DOCTYPE comment state".
2025
2026 if ($self->{nc} == 0x002D) { # -
2027 !!!cp (145);
2028 $self->{state} = COMMENT_END_DASH_STATE;
2029 !!!next-input-character;
2030 redo A;
2031 } elsif ($self->{nc} == -1) {
2032 !!!parse-error (type => 'unclosed comment');
2033 if ($self->{in_subset}) {
2034 !!!cp (146.1);
2035 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2036 } else {
2037 !!!cp (146);
2038 $self->{state} = DATA_STATE;
2039 $self->{s_kwd} = '';
2040 }
2041 ## reconsume
2042
2043 !!!emit ($self->{ct}); # comment
2044
2045 redo A;
2046 } else {
2047 !!!cp (147);
2048 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2049 $self->{read_until}->($self->{ct}->{data},
2050 q[-],
2051 length $self->{ct}->{data});
2052
2053 ## Stay in the state
2054 !!!next-input-character;
2055 redo A;
2056 }
2057 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2058 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2059
2060 if ($self->{nc} == 0x002D) { # -
2061 !!!cp (148);
2062 $self->{state} = COMMENT_END_STATE;
2063 !!!next-input-character;
2064 redo A;
2065 } elsif ($self->{nc} == -1) {
2066 !!!parse-error (type => 'unclosed comment');
2067 if ($self->{in_subset}) {
2068 !!!cp (149.1);
2069 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2070 } else {
2071 !!!cp (149);
2072 $self->{state} = DATA_STATE;
2073 $self->{s_kwd} = '';
2074 }
2075 ## reconsume
2076
2077 !!!emit ($self->{ct}); # comment
2078
2079 redo A;
2080 } else {
2081 !!!cp (150);
2082 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2083 $self->{state} = COMMENT_STATE;
2084 !!!next-input-character;
2085 redo A;
2086 }
2087 } elsif ($self->{state} == COMMENT_END_STATE or
2088 $self->{state} == COMMENT_END_BANG_STATE) {
2089 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2090 ## (No comment end bang state.)
2091
2092 if ($self->{nc} == 0x003E) { # >
2093 if ($self->{in_subset}) {
2094 !!!cp (151.1);
2095 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2096 } else {
2097 !!!cp (151);
2098 $self->{state} = DATA_STATE;
2099 $self->{s_kwd} = '';
2100 }
2101 !!!next-input-character;
2102
2103 !!!emit ($self->{ct}); # comment
2104
2105 redo A;
2106 } elsif ($self->{nc} == 0x002D) { # -
2107 if ($self->{state} == COMMENT_END_BANG_STATE) {
2108 !!!cp (154.3);
2109 $self->{ct}->{data} .= '--!'; # comment
2110 $self->{state} = COMMENT_END_DASH_STATE;
2111 } else {
2112 !!!cp (152);
2113 ## XML5: Not a parse error.
2114 !!!parse-error (type => 'dash in comment',
2115 line => $self->{line_prev},
2116 column => $self->{column_prev});
2117 $self->{ct}->{data} .= '-'; # comment
2118 ## Stay in the state
2119 }
2120 !!!next-input-character;
2121 redo A;
2122 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2123 $is_space->{$self->{nc}}) {
2124 !!!cp (152.1);
2125 !!!parse-error (type => 'comment end space'); # XXX error type
2126 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2127 $self->{state} = COMMENT_END_SPACE_STATE;
2128 !!!next-input-character;
2129 redo A;
2130 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2131 $self->{nc} == 0x0021) { # !
2132 !!!cp (152.2);
2133 !!!parse-error (type => 'comment end bang'); # XXX error type
2134 $self->{state} = COMMENT_END_BANG_STATE;
2135 !!!next-input-character;
2136 redo A;
2137 } elsif ($self->{nc} == -1) {
2138 !!!parse-error (type => 'unclosed comment');
2139 if ($self->{in_subset}) {
2140 !!!cp (153.1);
2141 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2142 } else {
2143 !!!cp (153);
2144 $self->{state} = DATA_STATE;
2145 $self->{s_kwd} = '';
2146 }
2147 ## Reconsume.
2148
2149 !!!emit ($self->{ct}); # comment
2150
2151 redo A;
2152 } else {
2153 !!!cp (154);
2154 if ($self->{state} == COMMENT_END_BANG_STATE) {
2155 $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
2156 } else {
2157 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2158 }
2159 $self->{state} = COMMENT_STATE;
2160 !!!next-input-character;
2161 redo A;
2162 }
2163 } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
2164 ## XML5: Not exist.
2165
2166 if ($self->{nc} == 0x003E) { # >
2167 if ($self->{in_subset}) {
2168 !!!cp (154.4);
2169 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2170 } else {
2171 !!!cp (154.5);
2172 $self->{state} = DATA_STATE;
2173 $self->{s_kwd} = '';
2174 }
2175 !!!next-input-character;
2176
2177 !!!emit ($self->{ct}); # comment
2178
2179 redo A;
2180 } elsif ($is_space->{$self->{nc}}) {
2181 !!!cp (154.6);
2182 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2183 ## Stay in the state.
2184 !!!next-input-character;
2185 redo A;
2186 } elsif ($self->{nc} == -1) {
2187 !!!parse-error (type => 'unclosed comment');
2188 if ($self->{in_subset}) {
2189 !!!cp (154.7);
2190 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2191 } else {
2192 !!!cp (154.8);
2193 $self->{state} = DATA_STATE;
2194 $self->{s_kwd} = '';
2195 }
2196 ## Reconsume.
2197
2198 !!!emit ($self->{ct}); # comment
2199
2200 redo A;
2201 } else {
2202 !!!cp (154.9);
2203 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2204 $self->{state} = COMMENT_STATE;
2205 !!!next-input-character;
2206 redo A;
2207 }
2208 } elsif ($self->{state} == DOCTYPE_STATE) {
2209 if ($is_space->{$self->{nc}}) {
2210 !!!cp (155);
2211 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2212 !!!next-input-character;
2213 redo A;
2214 } elsif ($self->{nc} == -1) {
2215 !!!cp (155.1);
2216 !!!parse-error (type => 'unclosed DOCTYPE');
2217 $self->{ct}->{quirks} = 1;
2218
2219 $self->{state} = DATA_STATE;
2220 ## Reconsume.
2221 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2222
2223 redo A;
2224 } else {
2225 !!!cp (156);
2226 ## XML5: Switch to the bogus comment state.
2227 !!!parse-error (type => 'no space before DOCTYPE name');
2228 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2229 ## reconsume
2230 redo A;
2231 }
2232 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2233 ## XML5: "DOCTYPE root name before state".
2234
2235 if ($is_space->{$self->{nc}}) {
2236 !!!cp (157);
2237 ## Stay in the state
2238 !!!next-input-character;
2239 redo A;
2240 } elsif ($self->{nc} == 0x003E) { # >
2241 !!!cp (158);
2242 ## XML5: No parse error.
2243 !!!parse-error (type => 'no DOCTYPE name');
2244 $self->{state} = DATA_STATE;
2245 $self->{s_kwd} = '';
2246 !!!next-input-character;
2247
2248 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2249
2250 redo A;
2251 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2252 !!!cp (158.1);
2253 $self->{ct}->{name} # DOCTYPE
2254 = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2255 delete $self->{ct}->{quirks};
2256 $self->{state} = DOCTYPE_NAME_STATE;
2257 !!!next-input-character;
2258 redo A;
2259 } elsif ($self->{nc} == -1) {
2260 !!!cp (159);
2261 !!!parse-error (type => 'no DOCTYPE name');
2262 $self->{state} = DATA_STATE;
2263 $self->{s_kwd} = '';
2264 ## reconsume
2265
2266 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2267
2268 redo A;
2269 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2270 !!!cp (159.1);
2271 !!!parse-error (type => 'no DOCTYPE name');
2272 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2273 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2274 $self->{in_subset} = 1;
2275 !!!next-input-character;
2276 !!!emit ($self->{ct}); # DOCTYPE
2277 redo A;
2278 } else {
2279 !!!cp (160);
2280 $self->{ct}->{name} = chr $self->{nc};
2281 delete $self->{ct}->{quirks};
2282 $self->{state} = DOCTYPE_NAME_STATE;
2283 !!!next-input-character;
2284 redo A;
2285 }
2286 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2287 ## XML5: "DOCTYPE root name state".
2288
2289 ## ISSUE: Redundant "First," in the spec.
2290
2291 if ($is_space->{$self->{nc}}) {
2292 !!!cp (161);
2293 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2294 !!!next-input-character;
2295 redo A;
2296 } elsif ($self->{nc} == 0x003E) { # >
2297 !!!cp (162);
2298 $self->{state} = DATA_STATE;
2299 $self->{s_kwd} = '';
2300 !!!next-input-character;
2301
2302 !!!emit ($self->{ct}); # DOCTYPE
2303
2304 redo A;
2305 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2306 !!!cp (162.1);
2307 $self->{ct}->{name} # DOCTYPE
2308 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2309 delete $self->{ct}->{quirks};
2310 ## Stay in the state.
2311 !!!next-input-character;
2312 redo A;
2313 } elsif ($self->{nc} == -1) {
2314 !!!cp (163);
2315 !!!parse-error (type => 'unclosed DOCTYPE');
2316 $self->{state} = DATA_STATE;
2317 $self->{s_kwd} = '';
2318 ## reconsume
2319
2320 $self->{ct}->{quirks} = 1;
2321 !!!emit ($self->{ct}); # DOCTYPE
2322
2323 redo A;
2324 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2325 !!!cp (163.1);
2326 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2327 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2328 $self->{in_subset} = 1;
2329 !!!next-input-character;
2330 !!!emit ($self->{ct}); # DOCTYPE
2331 redo A;
2332 } else {
2333 !!!cp (164);
2334 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
2335 ## Stay in the state.
2336 !!!next-input-character;
2337 redo A;
2338 }
2339 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2340 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2341 ## state", but implemented differently.
2342
2343 if ($is_space->{$self->{nc}}) {
2344 !!!cp (165);
2345 ## Stay in the state
2346 !!!next-input-character;
2347 redo A;
2348 } elsif ($self->{nc} == 0x003E) { # >
2349 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2350 !!!cp (166);
2351 $self->{state} = DATA_STATE;
2352 $self->{s_kwd} = '';
2353 } else {
2354 !!!cp (166.1);
2355 !!!parse-error (type => 'no md def'); ## TODO: type
2356 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2357 }
2358
2359 !!!next-input-character;
2360 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2361 redo A;
2362 } elsif ($self->{nc} == -1) {
2363 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2364 !!!cp (167);
2365 !!!parse-error (type => 'unclosed DOCTYPE');
2366 $self->{state} = DATA_STATE;
2367 $self->{s_kwd} = '';
2368 $self->{ct}->{quirks} = 1;
2369 } else {
2370 !!!cp (167.12);
2371 !!!parse-error (type => 'unclosed md'); ## TODO: type
2372 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2373 }
2374
2375 ## Reconsume.
2376 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2377 redo A;
2378 } elsif ($self->{nc} == 0x0050 or # P
2379 $self->{nc} == 0x0070) { # p
2380 !!!cp (167.1);
2381 $self->{state} = PUBLIC_STATE;
2382 $self->{kwd} = chr $self->{nc};
2383 !!!next-input-character;
2384 redo A;
2385 } elsif ($self->{nc} == 0x0053 or # S
2386 $self->{nc} == 0x0073) { # s
2387 !!!cp (167.2);
2388 $self->{state} = SYSTEM_STATE;
2389 $self->{kwd} = chr $self->{nc};
2390 !!!next-input-character;
2391 redo A;
2392 } elsif ($self->{nc} == 0x0022 and # "
2393 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2394 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2395 !!!cp (167.21);
2396 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2397 $self->{ct}->{value} = ''; # ENTITY
2398 !!!next-input-character;
2399 redo A;
2400 } elsif ($self->{nc} == 0x0027 and # '
2401 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2402 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2403 !!!cp (167.22);
2404 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2405 $self->{ct}->{value} = ''; # ENTITY
2406 !!!next-input-character;
2407 redo A;
2408 } elsif ($self->{is_xml} and
2409 $self->{ct}->{type} == DOCTYPE_TOKEN and
2410 $self->{nc} == 0x005B) { # [
2411 !!!cp (167.3);
2412 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2413 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2414 $self->{in_subset} = 1;
2415 !!!next-input-character;
2416 !!!emit ($self->{ct}); # DOCTYPE
2417 redo A;
2418 } else {
2419 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2420
2421 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2422 !!!cp (180);
2423 $self->{ct}->{quirks} = 1;
2424 $self->{state} = BOGUS_DOCTYPE_STATE;
2425 } else {
2426 !!!cp (180.1);
2427 $self->{state} = BOGUS_MD_STATE;
2428 }
2429
2430 !!!next-input-character;
2431 redo A;
2432 }
2433 } elsif ($self->{state} == PUBLIC_STATE) {
2434 ## ASCII case-insensitive
2435 if ($self->{nc} == [
2436 undef,
2437 0x0055, # U
2438 0x0042, # B
2439 0x004C, # L
2440 0x0049, # I
2441 ]->[length $self->{kwd}] or
2442 $self->{nc} == [
2443 undef,
2444 0x0075, # u
2445 0x0062, # b
2446 0x006C, # l
2447 0x0069, # i
2448 ]->[length $self->{kwd}]) {
2449 !!!cp (175);
2450 ## Stay in the state.
2451 $self->{kwd} .= chr $self->{nc};
2452 !!!next-input-character;
2453 redo A;
2454 } elsif ((length $self->{kwd}) == 5 and
2455 ($self->{nc} == 0x0043 or # C
2456 $self->{nc} == 0x0063)) { # c
2457 if ($self->{is_xml} and
2458 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2459 !!!cp (168.1);
2460 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2461 text => 'PUBLIC',
2462 line => $self->{line_prev},
2463 column => $self->{column_prev} - 4);
2464 } else {
2465 !!!cp (168);
2466 }
2467 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2468 !!!next-input-character;
2469 redo A;
2470 } else {
2471 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2472 line => $self->{line_prev},
2473 column => $self->{column_prev} + 1 - length $self->{kwd});
2474 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2475 !!!cp (169);
2476 $self->{ct}->{quirks} = 1;
2477 $self->{state} = BOGUS_DOCTYPE_STATE;
2478 } else {
2479 !!!cp (169.1);
2480 $self->{state} = BOGUS_MD_STATE;
2481 }
2482 ## Reconsume.
2483 redo A;
2484 }
2485 } elsif ($self->{state} == SYSTEM_STATE) {
2486 ## ASCII case-insensitive
2487 if ($self->{nc} == [
2488 undef,
2489 0x0059, # Y
2490 0x0053, # S
2491 0x0054, # T
2492 0x0045, # E
2493 ]->[length $self->{kwd}] or
2494 $self->{nc} == [
2495 undef,
2496 0x0079, # y
2497 0x0073, # s
2498 0x0074, # t
2499 0x0065, # e
2500 ]->[length $self->{kwd}]) {
2501 !!!cp (170);
2502 ## Stay in the state.
2503 $self->{kwd} .= chr $self->{nc};
2504 !!!next-input-character;
2505 redo A;
2506 } elsif ((length $self->{kwd}) == 5 and
2507 ($self->{nc} == 0x004D or # M
2508 $self->{nc} == 0x006D)) { # m
2509 if ($self->{is_xml} and
2510 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2511 !!!cp (171.1);
2512 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2513 text => 'SYSTEM',
2514 line => $self->{line_prev},
2515 column => $self->{column_prev} - 4);
2516 } else {
2517 !!!cp (171);
2518 }
2519 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2520 !!!next-input-character;
2521 redo A;
2522 } else {
2523 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2524 line => $self->{line_prev},
2525 column => $self->{column_prev} + 1 - length $self->{kwd});
2526 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2527 !!!cp (172);
2528 $self->{ct}->{quirks} = 1;
2529 $self->{state} = BOGUS_DOCTYPE_STATE;
2530 } else {
2531 !!!cp (172.1);
2532 $self->{state} = BOGUS_MD_STATE;
2533 }
2534 ## Reconsume.
2535 redo A;
2536 }
2537 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2538 if ($is_space->{$self->{nc}}) {
2539 !!!cp (181);
2540 ## Stay in the state
2541 !!!next-input-character;
2542 redo A;
2543 } elsif ($self->{nc} eq 0x0022) { # "
2544 !!!cp (182);
2545 $self->{ct}->{pubid} = ''; # DOCTYPE
2546 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2547 !!!next-input-character;
2548 redo A;
2549 } elsif ($self->{nc} eq 0x0027) { # '
2550 !!!cp (183);
2551 $self->{ct}->{pubid} = ''; # DOCTYPE
2552 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2553 !!!next-input-character;
2554 redo A;
2555 } elsif ($self->{nc} eq 0x003E) { # >
2556 !!!parse-error (type => 'no PUBLIC literal');
2557
2558 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2559 !!!cp (184);
2560 $self->{state} = DATA_STATE;
2561 $self->{s_kwd} = '';
2562 $self->{ct}->{quirks} = 1;
2563 } else {
2564 !!!cp (184.1);
2565 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2566 }
2567
2568 !!!next-input-character;
2569 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2570 redo A;
2571 } elsif ($self->{nc} == -1) {
2572 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2573 !!!cp (185);
2574 !!!parse-error (type => 'unclosed DOCTYPE');
2575 $self->{state} = DATA_STATE;
2576 $self->{s_kwd} = '';
2577 $self->{ct}->{quirks} = 1;
2578 } else {
2579 !!!cp (185.1);
2580 !!!parse-error (type => 'unclosed md'); ## TODO: type
2581 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2582 }
2583
2584 ## reconsume
2585 !!!emit ($self->{ct}); # DOCTYPE
2586 redo A;
2587 } elsif ($self->{is_xml} and
2588 $self->{ct}->{type} == DOCTYPE_TOKEN and
2589 $self->{nc} == 0x005B) { # [
2590 !!!cp (186.1);
2591 !!!parse-error (type => 'no PUBLIC literal');
2592 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2593 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2594 $self->{in_subset} = 1;
2595 !!!next-input-character;
2596 !!!emit ($self->{ct}); # DOCTYPE
2597 redo A;
2598 } else {
2599 !!!parse-error (type => 'string after PUBLIC');
2600
2601 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2602 !!!cp (186);
2603 $self->{ct}->{quirks} = 1;
2604 $self->{state} = BOGUS_DOCTYPE_STATE;
2605 } else {
2606 !!!cp (186.2);
2607 $self->{state} = BOGUS_MD_STATE;
2608 }
2609
2610 !!!next-input-character;
2611 redo A;
2612 }
2613 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2614 if ($self->{nc} == 0x0022) { # "
2615 !!!cp (187);
2616 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2617 !!!next-input-character;
2618 redo A;
2619 } elsif ($self->{nc} == 0x003E) { # >
2620 !!!parse-error (type => 'unclosed PUBLIC literal');
2621
2622 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2623 !!!cp (188);
2624 $self->{state} = DATA_STATE;
2625 $self->{s_kwd} = '';
2626 $self->{ct}->{quirks} = 1;
2627 } else {
2628 !!!cp (188.1);
2629 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2630 }
2631
2632 !!!next-input-character;
2633 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2634 redo A;
2635 } elsif ($self->{nc} == -1) {
2636 !!!parse-error (type => 'unclosed PUBLIC literal');
2637
2638 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2639 !!!cp (189);
2640 $self->{state} = DATA_STATE;
2641 $self->{s_kwd} = '';
2642 $self->{ct}->{quirks} = 1;
2643 } else {
2644 !!!cp (189.1);
2645 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2646 }
2647
2648 ## Reconsume.
2649 !!!emit ($self->{ct}); # DOCTYPE
2650 redo A;
2651 } else {
2652 !!!cp (190);
2653 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2654 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2655 length $self->{ct}->{pubid});
2656
2657 ## Stay in the state
2658 !!!next-input-character;
2659 redo A;
2660 }
2661 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2662 if ($self->{nc} == 0x0027) { # '
2663 !!!cp (191);
2664 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2665 !!!next-input-character;
2666 redo A;
2667 } elsif ($self->{nc} == 0x003E) { # >
2668 !!!parse-error (type => 'unclosed PUBLIC literal');
2669
2670 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2671 !!!cp (192);
2672 $self->{state} = DATA_STATE;
2673 $self->{s_kwd} = '';
2674 $self->{ct}->{quirks} = 1;
2675 } else {
2676 !!!cp (192.1);
2677 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2678 }
2679
2680 !!!next-input-character;
2681 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2682 redo A;
2683 } elsif ($self->{nc} == -1) {
2684 !!!parse-error (type => 'unclosed PUBLIC literal');
2685
2686 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2687 !!!cp (193);
2688 $self->{state} = DATA_STATE;
2689 $self->{s_kwd} = '';
2690 $self->{ct}->{quirks} = 1;
2691 } else {
2692 !!!cp (193.1);
2693 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2694 }
2695
2696 ## reconsume
2697 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2698 redo A;
2699 } else {
2700 !!!cp (194);
2701 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2702 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2703 length $self->{ct}->{pubid});
2704
2705 ## Stay in the state
2706 !!!next-input-character;
2707 redo A;
2708 }
2709 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2710 if ($is_space->{$self->{nc}}) {
2711 !!!cp (195);
2712 ## Stay in the state
2713 !!!next-input-character;
2714 redo A;
2715 } elsif ($self->{nc} == 0x0022) { # "
2716 !!!cp (196);
2717 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2718 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2719 !!!next-input-character;
2720 redo A;
2721 } elsif ($self->{nc} == 0x0027) { # '
2722 !!!cp (197);
2723 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2724 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2725 !!!next-input-character;
2726 redo A;
2727 } elsif ($self->{nc} == 0x003E) { # >
2728 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2729 if ($self->{is_xml}) {
2730 !!!cp (198.1);
2731 !!!parse-error (type => 'no SYSTEM literal');
2732 } else {
2733 !!!cp (198);
2734 }
2735 $self->{state} = DATA_STATE;
2736 $self->{s_kwd} = '';
2737 } else {
2738 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2739 !!!cp (198.2);
2740 } else {
2741 !!!cp (198.3);
2742 !!!parse-error (type => 'no SYSTEM literal');
2743 }
2744 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2745 }
2746
2747 !!!next-input-character;
2748 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2749 redo A;
2750 } elsif ($self->{nc} == -1) {
2751 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2752 !!!cp (199);
2753 !!!parse-error (type => 'unclosed DOCTYPE');
2754
2755 $self->{state} = DATA_STATE;
2756 $self->{s_kwd} = '';
2757 $self->{ct}->{quirks} = 1;
2758 } else {
2759 !!!parse-error (type => 'unclosed md'); ## TODO: type
2760 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2761 }
2762
2763 ## reconsume
2764 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2765 redo A;
2766 } elsif ($self->{is_xml} and
2767 $self->{ct}->{type} == DOCTYPE_TOKEN and
2768 $self->{nc} == 0x005B) { # [
2769 !!!cp (200.1);
2770 !!!parse-error (type => 'no SYSTEM literal');
2771 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2772 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2773 $self->{in_subset} = 1;
2774 !!!next-input-character;
2775 !!!emit ($self->{ct}); # DOCTYPE
2776 redo A;
2777 } else {
2778 !!!parse-error (type => 'string after PUBLIC literal');
2779
2780 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2781 !!!cp (200);
2782 $self->{ct}->{quirks} = 1;
2783 $self->{state} = BOGUS_DOCTYPE_STATE;
2784 } else {
2785 !!!cp (200.2);
2786 $self->{state} = BOGUS_MD_STATE;
2787 }
2788
2789 !!!next-input-character;
2790 redo A;
2791 }
2792 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2793 if ($is_space->{$self->{nc}}) {
2794 !!!cp (201);
2795 ## Stay in the state
2796 !!!next-input-character;
2797 redo A;
2798 } elsif ($self->{nc} == 0x0022) { # "
2799 !!!cp (202);
2800 $self->{ct}->{sysid} = ''; # DOCTYPE
2801 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2802 !!!next-input-character;
2803 redo A;
2804 } elsif ($self->{nc} == 0x0027) { # '
2805 !!!cp (203);
2806 $self->{ct}->{sysid} = ''; # DOCTYPE
2807 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2808 !!!next-input-character;
2809 redo A;
2810 } elsif ($self->{nc} == 0x003E) { # >
2811 !!!parse-error (type => 'no SYSTEM literal');
2812 !!!next-input-character;
2813
2814 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2815 !!!cp (204);
2816 $self->{state} = DATA_STATE;
2817 $self->{s_kwd} = '';
2818 $self->{ct}->{quirks} = 1;
2819 } else {
2820 !!!cp (204.1);
2821 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822 }
2823
2824 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2825 redo A;
2826 } elsif ($self->{nc} == -1) {
2827 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2828 !!!cp (205);
2829 !!!parse-error (type => 'unclosed DOCTYPE');
2830 $self->{state} = DATA_STATE;
2831 $self->{s_kwd} = '';
2832 $self->{ct}->{quirks} = 1;
2833 } else {
2834 !!!cp (205.1);
2835 !!!parse-error (type => 'unclosed md'); ## TODO: type
2836 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2837 }
2838
2839 ## reconsume
2840 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2841 redo A;
2842 } elsif ($self->{is_xml} and
2843 $self->{ct}->{type} == DOCTYPE_TOKEN and
2844 $self->{nc} == 0x005B) { # [
2845 !!!cp (206.1);
2846 !!!parse-error (type => 'no SYSTEM literal');
2847
2848 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2849 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2850 $self->{in_subset} = 1;
2851 !!!next-input-character;
2852 !!!emit ($self->{ct}); # DOCTYPE
2853 redo A;
2854 } else {
2855 !!!parse-error (type => 'string after SYSTEM');
2856
2857 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2858 !!!cp (206);
2859 $self->{ct}->{quirks} = 1;
2860 $self->{state} = BOGUS_DOCTYPE_STATE;
2861 } else {
2862 !!!cp (206.2);
2863 $self->{state} = BOGUS_MD_STATE;
2864 }
2865
2866 !!!next-input-character;
2867 redo A;
2868 }
2869 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2870 if ($self->{nc} == 0x0022) { # "
2871 !!!cp (207);
2872 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2873 !!!next-input-character;
2874 redo A;
2875 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2876 !!!parse-error (type => 'unclosed SYSTEM literal');
2877
2878 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2879 !!!cp (208);
2880 $self->{state} = DATA_STATE;
2881 $self->{s_kwd} = '';
2882 $self->{ct}->{quirks} = 1;
2883 } else {
2884 !!!cp (208.1);
2885 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2886 }
2887
2888 !!!next-input-character;
2889 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2890 redo A;
2891 } elsif ($self->{nc} == -1) {
2892 !!!parse-error (type => 'unclosed SYSTEM literal');
2893
2894 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2895 !!!cp (209);
2896 $self->{state} = DATA_STATE;
2897 $self->{s_kwd} = '';
2898 $self->{ct}->{quirks} = 1;
2899 } else {
2900 !!!cp (209.1);
2901 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2902 }
2903
2904 ## reconsume
2905 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2906 redo A;
2907 } else {
2908 !!!cp (210);
2909 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2910 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2911 length $self->{ct}->{sysid});
2912
2913 ## Stay in the state
2914 !!!next-input-character;
2915 redo A;
2916 }
2917 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2918 if ($self->{nc} == 0x0027) { # '
2919 !!!cp (211);
2920 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2921 !!!next-input-character;
2922 redo A;
2923 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2924 !!!cp (212);
2925 !!!parse-error (type => 'unclosed SYSTEM literal');
2926
2927 $self->{state} = DATA_STATE;
2928 $self->{s_kwd} = '';
2929 !!!next-input-character;
2930
2931 $self->{ct}->{quirks} = 1;
2932 !!!emit ($self->{ct}); # DOCTYPE
2933
2934 redo A;
2935 } elsif ($self->{nc} == -1) {
2936 !!!parse-error (type => 'unclosed SYSTEM literal');
2937
2938 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2939 !!!cp (213);
2940 $self->{state} = DATA_STATE;
2941 $self->{s_kwd} = '';
2942 $self->{ct}->{quirks} = 1;
2943 } else {
2944 !!!cp (213.1);
2945 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2946 }
2947
2948 ## reconsume
2949 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2950 redo A;
2951 } else {
2952 !!!cp (214);
2953 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2954 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2955 length $self->{ct}->{sysid});
2956
2957 ## Stay in the state
2958 !!!next-input-character;
2959 redo A;
2960 }
2961 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2962 if ($is_space->{$self->{nc}}) {
2963 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2964 !!!cp (215.1);
2965 $self->{state} = BEFORE_NDATA_STATE;
2966 } else {
2967 !!!cp (215);
2968 ## Stay in the state
2969 }
2970 !!!next-input-character;
2971 redo A;
2972 } elsif ($self->{nc} == 0x003E) { # >
2973 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2974 !!!cp (216);
2975 $self->{state} = DATA_STATE;
2976 $self->{s_kwd} = '';
2977 } else {
2978 !!!cp (216.1);
2979 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2980 }
2981
2982 !!!next-input-character;
2983 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2984 redo A;
2985 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2986 ($self->{nc} == 0x004E or # N
2987 $self->{nc} == 0x006E)) { # n
2988 !!!cp (216.2);
2989 !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2990 $self->{state} = NDATA_STATE;
2991 $self->{kwd} = chr $self->{nc};
2992 !!!next-input-character;
2993 redo A;
2994 } elsif ($self->{nc} == -1) {
2995 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2996 !!!cp (217);
2997 !!!parse-error (type => 'unclosed DOCTYPE');
2998 $self->{state} = DATA_STATE;
2999 $self->{s_kwd} = '';
3000 $self->{ct}->{quirks} = 1;
3001 } else {
3002 !!!cp (217.1);
3003 !!!parse-error (type => 'unclosed md'); ## TODO: type
3004 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3005 }
3006
3007 ## reconsume
3008 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3009 redo A;
3010 } elsif ($self->{is_xml} and
3011 $self->{ct}->{type} == DOCTYPE_TOKEN and
3012 $self->{nc} == 0x005B) { # [
3013 !!!cp (218.1);
3014 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3015 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3016 $self->{in_subset} = 1;
3017 !!!next-input-character;
3018 !!!emit ($self->{ct}); # DOCTYPE
3019 redo A;
3020 } else {
3021 !!!parse-error (type => 'string after SYSTEM literal');
3022
3023 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3024 !!!cp (218);
3025 #$self->{ct}->{quirks} = 1;
3026 $self->{state} = BOGUS_DOCTYPE_STATE;
3027 } else {
3028 !!!cp (218.2);
3029 $self->{state} = BOGUS_MD_STATE;
3030 }
3031
3032 !!!next-input-character;
3033 redo A;
3034 }
3035 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
3036 if ($is_space->{$self->{nc}}) {
3037 !!!cp (218.3);
3038 ## Stay in the state.
3039 !!!next-input-character;
3040 redo A;
3041 } elsif ($self->{nc} == 0x003E) { # >
3042 !!!cp (218.4);
3043 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3044 !!!next-input-character;
3045 !!!emit ($self->{ct}); # ENTITY
3046 redo A;
3047 } elsif ($self->{nc} == 0x004E or # N
3048 $self->{nc} == 0x006E) { # n
3049 !!!cp (218.5);
3050 $self->{state} = NDATA_STATE;
3051 $self->{kwd} = chr $self->{nc};
3052 !!!next-input-character;
3053 redo A;
3054 } elsif ($self->{nc} == -1) {
3055 !!!cp (218.6);
3056 !!!parse-error (type => 'unclosed md'); ## TODO: type
3057 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3058 ## reconsume
3059 !!!emit ($self->{ct}); # ENTITY
3060 redo A;
3061 } else {
3062 !!!cp (218.7);
3063 !!!parse-error (type => 'string after SYSTEM literal');
3064 $self->{state} = BOGUS_MD_STATE;
3065 !!!next-input-character;
3066 redo A;
3067 }
3068 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3069 if ($self->{nc} == 0x003E) { # >
3070 !!!cp (219);
3071 $self->{state} = DATA_STATE;
3072 $self->{s_kwd} = '';
3073 !!!next-input-character;
3074
3075 !!!emit ($self->{ct}); # DOCTYPE
3076
3077 redo A;
3078 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3079 !!!cp (220.1);
3080 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3081 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3082 $self->{in_subset} = 1;
3083 !!!next-input-character;
3084 !!!emit ($self->{ct}); # DOCTYPE
3085 redo A;
3086 } elsif ($self->{nc} == -1) {
3087 !!!cp (220);
3088 $self->{state} = DATA_STATE;
3089 $self->{s_kwd} = '';
3090 ## reconsume
3091
3092 !!!emit ($self->{ct}); # DOCTYPE
3093
3094 redo A;
3095 } else {
3096 !!!cp (221);
3097 my $s = '';
3098 $self->{read_until}->($s, q{>[}, 0);
3099
3100 ## Stay in the state
3101 !!!next-input-character;
3102 redo A;
3103 }
3104 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3105 ## NOTE: "CDATA section state" in the spec is jointly implemented
3106 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3107 ## and |CDATA_SECTION_MSE2_STATE|.
3108
3109 ## XML5: "CDATA state".
3110
3111 if ($self->{nc} == 0x005D) { # ]
3112 !!!cp (221.1);
3113 $self->{state} = CDATA_SECTION_MSE1_STATE;
3114 !!!next-input-character;
3115 redo A;
3116 } elsif ($self->{nc} == -1) {
3117 if ($self->{is_xml}) {
3118 !!!cp (221.11);
3119 !!!parse-error (type => 'no mse'); ## TODO: type
3120 } else {
3121 !!!cp (221.12);
3122 }
3123
3124 $self->{state} = DATA_STATE;
3125 $self->{s_kwd} = '';
3126 ## Reconsume.
3127 if (length $self->{ct}->{data}) { # character
3128 !!!cp (221.2);
3129 !!!emit ($self->{ct}); # character
3130 } else {
3131 !!!cp (221.3);
3132 ## No token to emit. $self->{ct} is discarded.
3133 }
3134 redo A;
3135 } else {
3136 !!!cp (221.4);
3137 $self->{ct}->{data} .= chr $self->{nc};
3138 $self->{read_until}->($self->{ct}->{data},
3139 q<]>,
3140 length $self->{ct}->{data});
3141
3142 ## Stay in the state.
3143 !!!next-input-character;
3144 redo A;
3145 }
3146
3147 ## ISSUE: "text tokens" in spec.
3148 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3149 ## XML5: "CDATA bracket state".
3150
3151 if ($self->{nc} == 0x005D) { # ]
3152 !!!cp (221.5);
3153 $self->{state} = CDATA_SECTION_MSE2_STATE;
3154 !!!next-input-character;
3155 redo A;
3156 } else {
3157 !!!cp (221.6);
3158 ## XML5: If EOF, "]" is not appended and changed to the data state.
3159 $self->{ct}->{data} .= ']';
3160 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3161 ## Reconsume.
3162 redo A;
3163 }
3164 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3165 ## XML5: "CDATA end state".
3166
3167 if ($self->{nc} == 0x003E) { # >
3168 $self->{state} = DATA_STATE;
3169 $self->{s_kwd} = '';
3170 !!!next-input-character;
3171 if (length $self->{ct}->{data}) { # character
3172 !!!cp (221.7);
3173 !!!emit ($self->{ct}); # character
3174 } else {
3175 !!!cp (221.8);
3176 ## No token to emit. $self->{ct} is discarded.
3177 }
3178 redo A;
3179 } elsif ($self->{nc} == 0x005D) { # ]
3180 !!!cp (221.9); # character
3181 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3182 ## Stay in the state.
3183 !!!next-input-character;
3184 redo A;
3185 } else {
3186 !!!cp (221.11);
3187 $self->{ct}->{data} .= ']]'; # character
3188 $self->{state} = CDATA_SECTION_STATE;
3189 ## Reconsume. ## XML5: Emit.
3190 redo A;
3191 }
3192 } elsif ($self->{state} == ENTITY_STATE) {
3193 if ($is_space->{$self->{nc}} or
3194 {
3195 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3196 $self->{entity_add} => 1,
3197 }->{$self->{nc}}) {
3198 if ($self->{is_xml}) {
3199 !!!cp (1001.1);
3200 !!!parse-error (type => 'bare ero',
3201 line => $self->{line_prev},
3202 column => $self->{column_prev}
3203 + ($self->{nc} == -1 ? 1 : 0));
3204 } else {
3205 !!!cp (1001);
3206 ## No error
3207 }
3208 ## Don't consume
3209 ## Return nothing.
3210 #
3211 } elsif ($self->{nc} == 0x0023) { # #
3212 !!!cp (999);
3213 $self->{state} = ENTITY_HASH_STATE;
3214 $self->{kwd} = '#';
3215 !!!next-input-character;
3216 redo A;
3217 } elsif ($self->{is_xml} or
3218 (0x0041 <= $self->{nc} and
3219 $self->{nc} <= 0x005A) or # A..Z
3220 (0x0061 <= $self->{nc} and
3221 $self->{nc} <= 0x007A)) { # a..z
3222 !!!cp (998);
3223 require Whatpm::_NamedEntityList;
3224 $self->{state} = ENTITY_NAME_STATE;
3225 $self->{kwd} = chr $self->{nc};
3226 $self->{entity__value} = $self->{kwd};
3227 $self->{entity__match} = 0;
3228 !!!next-input-character;
3229 redo A;
3230 } else {
3231 !!!cp (1027);
3232 !!!parse-error (type => 'bare ero');
3233 ## Return nothing.
3234 #
3235 }
3236
3237 ## NOTE: No character is consumed by the "consume a character
3238 ## reference" algorithm. In other words, there is an "&" character
3239 ## that does not introduce a character reference, which would be
3240 ## appended to the parent element or the attribute value in later
3241 ## processing by the tokenizer.
3242
3243 if ($self->{prev_state} == DATA_STATE) {
3244 !!!cp (997);
3245 $self->{state} = $self->{prev_state};
3246 $self->{s_kwd} = '';
3247 ## Reconsume.
3248 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3249 line => $self->{line_prev},
3250 column => $self->{column_prev},
3251 });
3252 redo A;
3253 } else {
3254 !!!cp (996);
3255 $self->{ca}->{value} .= '&';
3256 $self->{state} = $self->{prev_state};
3257 $self->{s_kwd} = '';
3258 ## Reconsume.
3259 redo A;
3260 }
3261 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3262 if ($self->{nc} == 0x0078) { # x
3263 !!!cp (995);
3264 $self->{state} = HEXREF_X_STATE;
3265 $self->{kwd} .= chr $self->{nc};
3266 !!!next-input-character;
3267 redo A;
3268 } elsif ($self->{nc} == 0x0058) { # X
3269 !!!cp (995.1);
3270 if ($self->{is_xml}) {
3271 !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3272 }
3273 $self->{state} = HEXREF_X_STATE;
3274 $self->{kwd} .= chr $self->{nc};
3275 !!!next-input-character;
3276 redo A;
3277 } elsif (0x0030 <= $self->{nc} and
3278 $self->{nc} <= 0x0039) { # 0..9
3279 !!!cp (994);
3280 $self->{state} = NCR_NUM_STATE;
3281 $self->{kwd} = $self->{nc} - 0x0030;
3282 !!!next-input-character;
3283 redo A;
3284 } else {
3285 !!!parse-error (type => 'bare nero',
3286 line => $self->{line_prev},
3287 column => $self->{column_prev} - 1);
3288
3289 ## NOTE: According to the spec algorithm, nothing is returned,
3290 ## and then "&#" is appended to the parent element or the attribute
3291 ## value in the later processing.
3292
3293 if ($self->{prev_state} == DATA_STATE) {
3294 !!!cp (1019);
3295 $self->{state} = $self->{prev_state};
3296 $self->{s_kwd} = '';
3297 ## Reconsume.
3298 !!!emit ({type => CHARACTER_TOKEN,
3299 data => '&#',
3300 line => $self->{line_prev},
3301 column => $self->{column_prev} - 1,
3302 });
3303 redo A;
3304 } else {
3305 !!!cp (993);
3306 $self->{ca}->{value} .= '&#';
3307 $self->{state} = $self->{prev_state};
3308 $self->{s_kwd} = '';
3309 ## Reconsume.
3310 redo A;
3311 }
3312 }
3313 } elsif ($self->{state} == NCR_NUM_STATE) {
3314 if (0x0030 <= $self->{nc} and
3315 $self->{nc} <= 0x0039) { # 0..9
3316 !!!cp (1012);
3317 $self->{kwd} *= 10;
3318 $self->{kwd} += $self->{nc} - 0x0030;
3319
3320 ## Stay in the state.
3321 !!!next-input-character;
3322 redo A;
3323 } elsif ($self->{nc} == 0x003B) { # ;
3324 !!!cp (1013);
3325 !!!next-input-character;
3326 #
3327 } else {
3328 !!!cp (1014);
3329 !!!parse-error (type => 'no refc');
3330 ## Reconsume.
3331 #
3332 }
3333
3334 my $code = $self->{kwd};
3335 my $l = $self->{line_prev};
3336 my $c = $self->{column_prev};
3337 if ((not $self->{is_xml} and $charref_map->{$code}) or
3338 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3339 ($self->{is_xml} and $code == 0x0000)) {
3340 !!!cp (1015);
3341 !!!parse-error (type => 'invalid character reference',
3342 text => (sprintf 'U+%04X', $code),
3343 line => $l, column => $c);
3344 $code = $charref_map->{$code};
3345 } elsif ($code > 0x10FFFF) {
3346 !!!cp (1016);
3347 !!!parse-error (type => 'invalid character reference',
3348 text => (sprintf 'U-%08X', $code),
3349 line => $l, column => $c);
3350 $code = 0xFFFD;
3351 }
3352
3353 if ($self->{prev_state} == DATA_STATE) {
3354 !!!cp (992);
3355 $self->{state} = $self->{prev_state};
3356 $self->{s_kwd} = '';
3357 ## Reconsume.
3358 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3359 has_reference => 1,
3360 line => $l, column => $c,
3361 });
3362 redo A;
3363 } else {
3364 !!!cp (991);
3365 $self->{ca}->{value} .= chr $code;
3366 $self->{ca}->{has_reference} = 1;
3367 $self->{state} = $self->{prev_state};
3368 $self->{s_kwd} = '';
3369 ## Reconsume.
3370 redo A;
3371 }
3372 } elsif ($self->{state} == HEXREF_X_STATE) {
3373 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3374 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3375 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3376 # 0..9, A..F, a..f
3377 !!!cp (990);
3378 $self->{state} = HEXREF_HEX_STATE;
3379 $self->{kwd} = 0;
3380 ## Reconsume.
3381 redo A;
3382 } else {
3383 !!!parse-error (type => 'bare hcro',
3384 line => $self->{line_prev},
3385 column => $self->{column_prev} - 2);
3386
3387 ## NOTE: According to the spec algorithm, nothing is returned,
3388 ## and then "&#" followed by "X" or "x" is appended to the parent
3389 ## element or the attribute value in the later processing.
3390
3391 if ($self->{prev_state} == DATA_STATE) {
3392 !!!cp (1005);
3393 $self->{state} = $self->{prev_state};
3394 $self->{s_kwd} = '';
3395 ## Reconsume.
3396 !!!emit ({type => CHARACTER_TOKEN,
3397 data => '&' . $self->{kwd},
3398 line => $self->{line_prev},
3399 column => $self->{column_prev} - length $self->{kwd},
3400 });
3401 redo A;
3402 } else {
3403 !!!cp (989);
3404 $self->{ca}->{value} .= '&' . $self->{kwd};
3405 $self->{state} = $self->{prev_state};
3406 $self->{s_kwd} = '';
3407 ## Reconsume.
3408 redo A;
3409 }
3410 }
3411 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3412 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3413 # 0..9
3414 !!!cp (1002);
3415 $self->{kwd} *= 0x10;
3416 $self->{kwd} += $self->{nc} - 0x0030;
3417 ## Stay in the state.
3418 !!!next-input-character;
3419 redo A;
3420 } elsif (0x0061 <= $self->{nc} and
3421 $self->{nc} <= 0x0066) { # a..f
3422 !!!cp (1003);
3423 $self->{kwd} *= 0x10;
3424 $self->{kwd} += $self->{nc} - 0x0060 + 9;
3425 ## Stay in the state.
3426 !!!next-input-character;
3427 redo A;
3428 } elsif (0x0041 <= $self->{nc} and
3429 $self->{nc} <= 0x0046) { # A..F
3430 !!!cp (1004);
3431 $self->{kwd} *= 0x10;
3432 $self->{kwd} += $self->{nc} - 0x0040 + 9;
3433 ## Stay in the state.
3434 !!!next-input-character;
3435 redo A;
3436 } elsif ($self->{nc} == 0x003B) { # ;
3437 !!!cp (1006);
3438 !!!next-input-character;
3439 #
3440 } else {
3441 !!!cp (1007);
3442 !!!parse-error (type => 'no refc',
3443 line => $self->{line},
3444 column => $self->{column});
3445 ## Reconsume.
3446 #
3447 }
3448
3449 my $code = $self->{kwd};
3450 my $l = $self->{line_prev};
3451 my $c = $self->{column_prev};
3452 if ((not $self->{is_xml} and $charref_map->{$code}) or
3453 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3454 ($self->{is_xml} and $code == 0x0000)) {
3455 !!!cp (1008);
3456 !!!parse-error (type => 'invalid character reference',
3457 text => (sprintf 'U+%04X', $code),
3458 line => $l, column => $c);
3459 $code = $charref_map->{$code};
3460 } elsif ($code > 0x10FFFF) {
3461 !!!cp (1009);
3462 !!!parse-error (type => 'invalid character reference',
3463 text => (sprintf 'U-%08X', $code),
3464 line => $l, column => $c);
3465 $code = 0xFFFD;
3466 }
3467
3468 if ($self->{prev_state} == DATA_STATE) {
3469 !!!cp (988);
3470 $self->{state} = $self->{prev_state};
3471 $self->{s_kwd} = '';
3472 ## Reconsume.
3473 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3474 has_reference => 1,
3475 line => $l, column => $c,
3476 });
3477 redo A;
3478 } else {
3479 !!!cp (987);
3480 $self->{ca}->{value} .= chr $code;
3481 $self->{ca}->{has_reference} = 1;
3482 $self->{state} = $self->{prev_state};
3483 $self->{s_kwd} = '';
3484 ## Reconsume.
3485 redo A;
3486 }
3487 } elsif ($self->{state} == ENTITY_NAME_STATE) {
3488 if ((0x0041 <= $self->{nc} and # A
3489 $self->{nc} <= 0x005A) or # Z
3490 (0x0061 <= $self->{nc} and # a
3491 $self->{nc} <= 0x007A) or # z
3492 (0x0030 <= $self->{nc} and # 0
3493 $self->{nc} <= 0x0039) or # 9
3494 $self->{nc} == 0x003B or # ;
3495 ($self->{is_xml} and
3496 not ($is_space->{$self->{nc}} or
3497 {
3498 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3499 $self->{entity_add} => 1,
3500 }->{$self->{nc}}))) {
3501 our $EntityChar;
3502 $self->{kwd} .= chr $self->{nc};
3503 if (defined $EntityChar->{$self->{kwd}} or
3504 $self->{ge}->{$self->{kwd}}) {
3505 if ($self->{nc} == 0x003B) { # ;
3506 if (defined $self->{ge}->{$self->{kwd}}) {
3507 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3508 !!!cp (1020.1);
3509 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3510 } else {
3511 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3512 !!!cp (1020.2);
3513 !!!parse-error (type => 'unparsed entity', ## TODO: type
3514 value => $self->{kwd});
3515 } else {
3516 !!!cp (1020.3);
3517 }
3518 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3519 }
3520 } else {
3521 if ($self->{is_xml}) {
3522 !!!cp (1020.4);
3523 !!!parse-error (type => 'entity not declared', ## TODO: type
3524 value => $self->{kwd},
3525 level => {
3526 'amp;' => $self->{level}->{warn},
3527 'quot;' => $self->{level}->{warn},
3528 'lt;' => $self->{level}->{warn},
3529 'gt;' => $self->{level}->{warn},
3530 'apos;' => $self->{level}->{warn},
3531 }->{$self->{kwd}} ||
3532 $self->{level}->{must});
3533 } else {
3534 !!!cp (1020);
3535 }
3536 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3537 }
3538 $self->{entity__match} = 1;
3539 !!!next-input-character;
3540 #
3541 } else {
3542 !!!cp (1021);
3543 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3544 $self->{entity__match} = -1;
3545 ## Stay in the state.
3546 !!!next-input-character;
3547 redo A;
3548 }
3549 } else {
3550 !!!cp (1022);
3551 $self->{entity__value} .= chr $self->{nc};
3552 $self->{entity__match} *= 2;
3553 ## Stay in the state.
3554 !!!next-input-character;
3555 redo A;
3556 }
3557 }
3558
3559 my $data;
3560 my $has_ref;
3561 if ($self->{entity__match} > 0) {
3562 !!!cp (1023);
3563 $data = $self->{entity__value};
3564 $has_ref = 1;
3565 #
3566 } elsif ($self->{entity__match} < 0) {
3567 !!!parse-error (type => 'no refc');
3568 if ($self->{prev_state} != DATA_STATE and # in attribute
3569 $self->{entity__match} < -1) {
3570 !!!cp (1024);
3571 $data = '&' . $self->{kwd};
3572 #
3573 } else {
3574 !!!cp (1025);
3575 $data = $self->{entity__value};
3576 $has_ref = 1;
3577 #
3578 }
3579 } else {
3580 !!!cp (1026);
3581 !!!parse-error (type => 'bare ero',
3582 line => $self->{line_prev},
3583 column => $self->{column_prev} - length $self->{kwd});
3584 $data = '&' . $self->{kwd};
3585 #
3586 }
3587
3588 ## NOTE: In these cases, when a character reference is found,
3589 ## it is consumed and a character token is returned, or, otherwise,
3590 ## nothing is consumed and returned, according to the spec algorithm.
3591 ## In this implementation, anything that has been examined by the
3592 ## tokenizer is appended to the parent element or the attribute value
3593 ## as string, either literal string when no character reference or
3594 ## entity-replaced string otherwise, in this stage, since any characters
3595 ## that would not be consumed are appended in the data state or in an
3596 ## appropriate attribute value state anyway.
3597
3598 if ($self->{prev_state} == DATA_STATE) {
3599 !!!cp (986);
3600 $self->{state} = $self->{prev_state};
3601 $self->{s_kwd} = '';
3602 ## Reconsume.
3603 !!!emit ({type => CHARACTER_TOKEN,
3604 data => $data,
3605 has_reference => $has_ref,
3606 line => $self->{line_prev},
3607 column => $self->{column_prev} + 1 - length $self->{kwd},
3608 });
3609 redo A;
3610 } else {
3611 !!!cp (985);
3612 $self->{ca}->{value} .= $data;
3613 $self->{ca}->{has_reference} = 1 if $has_ref;
3614 $self->{state} = $self->{prev_state};
3615 $self->{s_kwd} = '';
3616 ## Reconsume.
3617 redo A;
3618 }
3619
3620 ## XML-only states
3621
3622 } elsif ($self->{state} == PI_STATE) {
3623 ## XML5: "Pi state" and "DOCTYPE pi state".
3624
3625 if ($is_space->{$self->{nc}} or
3626 $self->{nc} == 0x003F or # ?
3627 $self->{nc} == -1) {
3628 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3629 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3630 ## "DOCTYPE pi state": Parse error, switch to the "data
3631 ## state".
3632 !!!parse-error (type => 'bare pio', ## TODO: type
3633 line => $self->{line_prev},
3634 column => $self->{column_prev}
3635 - 1 * ($self->{nc} != -1));
3636 $self->{state} = BOGUS_COMMENT_STATE;
3637 ## Reconsume.
3638 $self->{ct} = {type => COMMENT_TOKEN,
3639 data => '?',
3640 line => $self->{line_prev},
3641 column => $self->{column_prev}
3642 - 1 * ($self->{nc} != -1),
3643 };
3644 redo A;
3645 } else {
3646 ## XML5: "DOCTYPE pi state": Stay in the state.
3647 $self->{ct} = {type => PI_TOKEN,
3648 target => chr $self->{nc},
3649 data => '',
3650 line => $self->{line_prev},
3651 column => $self->{column_prev} - 1,
3652 };
3653 $self->{state} = PI_TARGET_STATE;
3654 !!!next-input-character;
3655 redo A;
3656 }
3657 } elsif ($self->{state} == PI_TARGET_STATE) {
3658 if ($is_space->{$self->{nc}}) {
3659 $self->{state} = PI_TARGET_AFTER_STATE;
3660 !!!next-input-character;
3661 redo A;
3662 } elsif ($self->{nc} == -1) {
3663 !!!parse-error (type => 'no pic'); ## TODO: type
3664 if ($self->{in_subset}) {
3665 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3666 } else {
3667 $self->{state} = DATA_STATE;
3668 $self->{s_kwd} = '';
3669 }
3670 ## Reconsume.
3671 !!!emit ($self->{ct}); # pi
3672 redo A;
3673 } elsif ($self->{nc} == 0x003F) { # ?
3674 $self->{state} = PI_AFTER_STATE;
3675 !!!next-input-character;
3676 redo A;
3677 } else {
3678 ## XML5: typo ("tag name" -> "target")
3679 $self->{ct}->{target} .= chr $self->{nc}; # pi
3680 !!!next-input-character;
3681 redo A;
3682 }
3683 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3684 if ($is_space->{$self->{nc}}) {
3685 ## Stay in the state.
3686 !!!next-input-character;
3687 redo A;
3688 } else {
3689 $self->{state} = PI_DATA_STATE;
3690 ## Reprocess.
3691 redo A;
3692 }
3693 } elsif ($self->{state} == PI_DATA_STATE) {
3694 if ($self->{nc} == 0x003F) { # ?
3695 $self->{state} = PI_DATA_AFTER_STATE;
3696 !!!next-input-character;
3697 redo A;
3698 } elsif ($self->{nc} == -1) {
3699 !!!parse-error (type => 'no pic'); ## TODO: type
3700 if ($self->{in_subset}) {
3701 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3702 } else {
3703 $self->{state} = DATA_STATE;
3704 $self->{s_kwd} = '';
3705 }
3706 ## Reprocess.
3707 !!!emit ($self->{ct}); # pi
3708 redo A;
3709 } else {
3710 $self->{ct}->{data} .= chr $self->{nc}; # pi
3711 $self->{read_until}->($self->{ct}->{data}, q[?],
3712 length $self->{ct}->{data});
3713 ## Stay in the state.
3714 !!!next-input-character;
3715 ## Reprocess.
3716 redo A;
3717 }
3718 } elsif ($self->{state} == PI_AFTER_STATE) {
3719 ## XML5: Part of "Pi after state".
3720
3721 if ($self->{nc} == 0x003E) { # >
3722 if ($self->{in_subset}) {
3723 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3724 } else {
3725 $self->{state} = DATA_STATE;
3726 $self->{s_kwd} = '';
3727 }
3728 !!!next-input-character;
3729 !!!emit ($self->{ct}); # pi
3730 redo A;
3731 } elsif ($self->{nc} == 0x003F) { # ?
3732 !!!parse-error (type => 'no s after target', ## TODO: type
3733 line => $self->{line_prev},
3734 column => $self->{column_prev}); ## XML5: no error
3735 $self->{ct}->{data} .= '?';
3736 $self->{state} = PI_DATA_AFTER_STATE;
3737 !!!next-input-character;
3738 redo A;
3739 } else {
3740 !!!parse-error (type => 'no s after target', ## TODO: type
3741 line => $self->{line_prev},
3742 column => $self->{column_prev}
3743 + 1 * ($self->{nc} == -1)); ## XML5: no error
3744 $self->{ct}->{data} .= '?'; ## XML5: not appended
3745 $self->{state} = PI_DATA_STATE;
3746 ## Reprocess.
3747 redo A;
3748 }
3749 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3750 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3751
3752 if ($self->{nc} == 0x003E) { # >
3753 if ($self->{in_subset}) {
3754 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3755 } else {
3756 $self->{state} = DATA_STATE;
3757 $self->{s_kwd} = '';
3758 }
3759 !!!next-input-character;
3760 !!!emit ($self->{ct}); # pi
3761 redo A;
3762 } elsif ($self->{nc} == 0x003F) { # ?
3763 $self->{ct}->{data} .= '?';
3764 ## Stay in the state.
3765 !!!next-input-character;
3766 redo A;
3767 } else {
3768 $self->{ct}->{data} .= '?'; ## XML5: not appended
3769 $self->{state} = PI_DATA_STATE;
3770 ## Reprocess.
3771 redo A;
3772 }
3773
3774 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3775 if ($self->{nc} == 0x003C) { # <
3776 $self->{state} = DOCTYPE_TAG_STATE;
3777 !!!next-input-character;
3778 redo A;
3779 } elsif ($self->{nc} == 0x0025) { # %
3780 ## XML5: Not defined yet.
3781
3782 ## TODO:
3783
3784 if (not $self->{stop_processing} and
3785 not $self->{document}->xml_standalone) {
3786 !!!parse-error (type => 'stop processing', ## TODO: type
3787 level => $self->{level}->{info});
3788 $self->{stop_processing} = 1;
3789 }
3790
3791 !!!next-input-character;
3792 redo A;
3793 } elsif ($self->{nc} == 0x005D) { # ]
3794 delete $self->{in_subset};
3795 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3796 !!!next-input-character;
3797 redo A;
3798 } elsif ($is_space->{$self->{nc}}) {
3799 ## Stay in the state.
3800 !!!next-input-character;
3801 redo A;
3802 } elsif ($self->{nc} == -1) {
3803 !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3804 delete $self->{in_subset};
3805 $self->{state} = DATA_STATE;
3806 $self->{s_kwd} = '';
3807 ## Reconsume.
3808 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3809 redo A;
3810 } else {
3811 unless ($self->{internal_subset_tainted}) {
3812 ## XML5: No parse error.
3813 !!!parse-error (type => 'string in internal subset');
3814 $self->{internal_subset_tainted} = 1;
3815 }
3816 ## Stay in the state.
3817 !!!next-input-character;
3818 redo A;
3819 }
3820 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3821 if ($self->{nc} == 0x003E) { # >
3822 $self->{state} = DATA_STATE;
3823 $self->{s_kwd} = '';
3824 !!!next-input-character;
3825 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3826 redo A;
3827 } elsif ($self->{nc} == -1) {
3828 !!!parse-error (type => 'unclosed DOCTYPE');
3829 $self->{state} = DATA_STATE;
3830 $self->{s_kwd} = '';
3831 ## Reconsume.
3832 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3833 redo A;
3834 } else {
3835 ## XML5: No parse error and stay in the state.
3836 !!!parse-error (type => 'string after internal subset'); ## TODO: type
3837
3838 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3839 !!!next-input-character;
3840 redo A;
3841 }
3842 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3843 if ($self->{nc} == 0x003E) { # >
3844 $self->{state} = DATA_STATE;
3845 $self->{s_kwd} = '';
3846 !!!next-input-character;
3847 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3848 redo A;
3849 } elsif ($self->{nc} == -1) {
3850 $self->{state} = DATA_STATE;
3851 $self->{s_kwd} = '';
3852 ## Reconsume.
3853 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3854 redo A;
3855 } else {
3856 ## Stay in the state.
3857 !!!next-input-character;
3858 redo A;
3859 }
3860 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3861 if ($self->{nc} == 0x0021) { # !
3862 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3863 !!!next-input-character;
3864 redo A;
3865 } elsif ($self->{nc} == 0x003F) { # ?
3866 $self->{state} = PI_STATE;
3867 !!!next-input-character;
3868 redo A;
3869 } elsif ($self->{nc} == -1) {
3870 !!!parse-error (type => 'bare stago');
3871 $self->{state} = DATA_STATE;
3872 $self->{s_kwd} = '';
3873 ## Reconsume.
3874 redo A;
3875 } else {
3876 !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3877 line => $self->{line_prev},
3878 column => $self->{column_prev});
3879 $self->{state} = BOGUS_COMMENT_STATE;
3880 $self->{ct} = {type => COMMENT_TOKEN,
3881 data => '',
3882 }; ## NOTE: Will be discarded.
3883 !!!next-input-character;
3884 redo A;
3885 }
3886 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3887 ## XML5: "DOCTYPE markup declaration state".
3888
3889 if ($self->{nc} == 0x002D) { # -
3890 $self->{state} = MD_HYPHEN_STATE;
3891 !!!next-input-character;
3892 redo A;
3893 } elsif ($self->{nc} == 0x0045 or # E
3894 $self->{nc} == 0x0065) { # e
3895 $self->{state} = MD_E_STATE;
3896 $self->{kwd} = chr $self->{nc};
3897 !!!next-input-character;
3898 redo A;
3899 } elsif ($self->{nc} == 0x0041 or # A
3900 $self->{nc} == 0x0061) { # a
3901 $self->{state} = MD_ATTLIST_STATE;
3902 $self->{kwd} = chr $self->{nc};
3903 !!!next-input-character;
3904 redo A;
3905 } elsif ($self->{nc} == 0x004E or # N
3906 $self->{nc} == 0x006E) { # n
3907 $self->{state} = MD_NOTATION_STATE;
3908 $self->{kwd} = chr $self->{nc};
3909 !!!next-input-character;
3910 redo A;
3911 } else {
3912 #
3913 }
3914
3915 ## XML5: No parse error.
3916 !!!parse-error (type => 'bogus comment',
3917 line => $self->{line_prev},
3918 column => $self->{column_prev} - 1);
3919 ## Reconsume.
3920 $self->{state} = BOGUS_COMMENT_STATE;
3921 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3922 redo A;
3923 } elsif ($self->{state} == MD_E_STATE) {
3924 if ($self->{nc} == 0x004E or # N
3925 $self->{nc} == 0x006E) { # n
3926 $self->{state} = MD_ENTITY_STATE;
3927 $self->{kwd} .= chr $self->{nc};
3928 !!!next-input-character;
3929 redo A;
3930 } elsif ($self->{nc} == 0x004C or # L
3931 $self->{nc} == 0x006C) { # l
3932 ## XML5: <!ELEMENT> not supported.
3933 $self->{state} = MD_ELEMENT_STATE;
3934 $self->{kwd} .= chr $self->{nc};
3935 !!!next-input-character;
3936 redo A;
3937 } else {
3938 ## XML5: No parse error.
3939 !!!parse-error (type => 'bogus comment',
3940 line => $self->{line_prev},
3941 column => $self->{column_prev} - 2
3942 + 1 * ($self->{nc} == -1));
3943 ## Reconsume.
3944 $self->{state} = BOGUS_COMMENT_STATE;
3945 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3946 redo A;
3947 }
3948 } elsif ($self->{state} == MD_ENTITY_STATE) {
3949 if ($self->{nc} == [
3950 undef,
3951 undef,
3952 0x0054, # T
3953 0x0049, # I
3954 0x0054, # T
3955 ]->[length $self->{kwd}] or
3956 $self->{nc} == [
3957 undef,
3958 undef,
3959 0x0074, # t
3960 0x0069, # i
3961 0x0074, # t
3962 ]->[length $self->{kwd}]) {
3963 ## Stay in the state.
3964 $self->{kwd} .= chr $self->{nc};
3965 !!!next-input-character;
3966 redo A;
3967 } elsif ((length $self->{kwd}) == 5 and
3968 ($self->{nc} == 0x0059 or # Y
3969 $self->{nc} == 0x0079)) { # y
3970 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3971 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3972 text => 'ENTITY',
3973 line => $self->{line_prev},
3974 column => $self->{column_prev} - 4);
3975 }
3976 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3977 line => $self->{line_prev},
3978 column => $self->{column_prev} - 6};
3979 $self->{state} = DOCTYPE_MD_STATE;
3980 !!!next-input-character;
3981 redo A;
3982 } else {
3983 !!!parse-error (type => 'bogus comment',
3984 line => $self->{line_prev},
3985 column => $self->{column_prev} - 1
3986 - (length $self->{kwd})
3987 + 1 * ($self->{nc} == -1));
3988 $self->{state} = BOGUS_COMMENT_STATE;
3989 ## Reconsume.
3990 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3991 redo A;
3992 }
3993 } elsif ($self->{state} == MD_ELEMENT_STATE) {
3994 if ($self->{nc} == [
3995 undef,
3996 undef,
3997 0x0045, # E
3998 0x004D, # M
3999 0x0045, # E
4000 0x004E, # N
4001 ]->[length $self->{kwd}] or
4002 $self->{nc} == [
4003 undef,
4004 undef,
4005 0x0065, # e
4006 0x006D, # m
4007 0x0065, # e
4008 0x006E, # n
4009 ]->[length $self->{kwd}]) {
4010 ## Stay in the state.
4011 $self->{kwd} .= chr $self->{nc};
4012 !!!next-input-character;
4013 redo A;
4014 } elsif ((length $self->{kwd}) == 6 and
4015 ($self->{nc} == 0x0054 or # T
4016 $self->{nc} == 0x0074)) { # t
4017 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
4018 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4019 text => 'ELEMENT',
4020 line => $self->{line_prev},
4021 column => $self->{column_prev} - 5);
4022 }
4023 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
4024 line => $self->{line_prev},
4025 column => $self->{column_prev} - 7};
4026 $self->{state} = DOCTYPE_MD_STATE;
4027 !!!next-input-character;
4028 redo A;
4029 } else {
4030 !!!parse-error (type => 'bogus comment',
4031 line => $self->{line_prev},
4032 column => $self->{column_prev} - 1
4033 - (length $self->{kwd})
4034 + 1 * ($self->{nc} == -1));
4035 $self->{state} = BOGUS_COMMENT_STATE;
4036 ## Reconsume.
4037 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4038 redo A;
4039 }
4040 } elsif ($self->{state} == MD_ATTLIST_STATE) {
4041 if ($self->{nc} == [
4042 undef,
4043 0x0054, # T
4044 0x0054, # T
4045 0x004C, # L
4046 0x0049, # I
4047 0x0053, # S
4048 ]->[length $self->{kwd}] or
4049 $self->{nc} == [
4050 undef,
4051 0x0074, # t
4052 0x0074, # t
4053 0x006C, # l
4054 0x0069, # i
4055 0x0073, # s
4056 ]->[length $self->{kwd}]) {
4057 ## Stay in the state.
4058 $self->{kwd} .= chr $self->{nc};
4059 !!!next-input-character;
4060 redo A;
4061 } elsif ((length $self->{kwd}) == 6 and
4062 ($self->{nc} == 0x0054 or # T
4063 $self->{nc} == 0x0074)) { # t
4064 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
4065 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4066 text => 'ATTLIST',
4067 line => $self->{line_prev},
4068 column => $self->{column_prev} - 5);
4069 }
4070 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
4071 attrdefs => [],
4072 line => $self->{line_prev},
4073 column => $self->{column_prev} - 7};
4074 $self->{state} = DOCTYPE_MD_STATE;
4075 !!!next-input-character;
4076 redo A;
4077 } else {
4078 !!!parse-error (type => 'bogus comment',
4079 line => $self->{line_prev},
4080 column => $self->{column_prev} - 1
4081 - (length $self->{kwd})
4082 + 1 * ($self->{nc} == -1));
4083 $self->{state} = BOGUS_COMMENT_STATE;
4084 ## Reconsume.
4085 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4086 redo A;
4087 }
4088 } elsif ($self->{state} == MD_NOTATION_STATE) {
4089 if ($self->{nc} == [
4090 undef,
4091 0x004F, # O
4092 0x0054, # T
4093 0x0041, # A
4094 0x0054, # T
4095 0x0049, # I
4096 0x004F, # O
4097 ]->[length $self->{kwd}] or
4098 $self->{nc} == [
4099 undef,
4100 0x006F, # o
4101 0x0074, # t
4102 0x0061, # a
4103 0x0074, # t
4104 0x0069, # i
4105 0x006F, # o
4106 ]->[length $self->{kwd}]) {
4107 ## Stay in the state.
4108 $self->{kwd} .= chr $self->{nc};
4109 !!!next-input-character;
4110 redo A;
4111 } elsif ((length $self->{kwd}) == 7 and
4112 ($self->{nc} == 0x004E or # N
4113 $self->{nc} == 0x006E)) { # n
4114 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4115 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4116 text => 'NOTATION',
4117 line => $self->{line_prev},
4118 column => $self->{column_prev} - 6);
4119 }
4120 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4121 line => $self->{line_prev},
4122 column => $self->{column_prev} - 8};
4123 $self->{state} = DOCTYPE_MD_STATE;
4124 !!!next-input-character;
4125 redo A;
4126 } else {
4127 !!!parse-error (type => 'bogus comment',
4128 line => $self->{line_prev},
4129 column => $self->{column_prev} - 1
4130 - (length $self->{kwd})
4131 + 1 * ($self->{nc} == -1));
4132 $self->{state} = BOGUS_COMMENT_STATE;
4133 ## Reconsume.
4134 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4135 redo A;
4136 }
4137 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4138 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4139 ## "DOCTYPE NOTATION state".
4140
4141 if ($is_space->{$self->{nc}}) {
4142 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4143 $self->{state} = BEFORE_MD_NAME_STATE;
4144 !!!next-input-character;
4145 redo A;
4146 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4147 $self->{nc} == 0x0025) { # %
4148 ## XML5: Switch to the "DOCTYPE bogus comment state".
4149 !!!parse-error (type => 'no space before md name'); ## TODO: type
4150 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4151 !!!next-input-character;
4152 redo A;
4153 } elsif ($self->{nc} == -1) {
4154 !!!parse-error (type => 'unclosed md'); ## TODO: type
4155 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4156 ## Reconsume.
4157 redo A;
4158 } elsif ($self->{nc} == 0x003E) { # >
4159 ## XML5: Switch to the "DOCTYPE bogus comment state".
4160 !!!parse-error (type => 'no md name'); ## TODO: type
4161 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4162 !!!next-input-character;
4163 redo A;
4164 } else {
4165 ## XML5: Switch to the "DOCTYPE bogus comment state".
4166 !!!parse-error (type => 'no space before md name'); ## TODO: type
4167 $self->{state} = BEFORE_MD_NAME_STATE;
4168 redo A;
4169 }
4170 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4171 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4172 ## before state", "DOCTYPE ATTLIST name before state".
4173
4174 if ($is_space->{$self->{nc}}) {
4175 ## Stay in the state.
4176 !!!next-input-character;
4177 redo A;
4178 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4179 $self->{nc} == 0x0025) { # %
4180 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4181 !!!next-input-character;
4182 redo A;
4183 } elsif ($self->{nc} == 0x003E) { # >
4184 ## XML5: Same as "Anything else".
4185 !!!parse-error (type => 'no md name'); ## TODO: type
4186 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4187 !!!next-input-character;
4188 redo A;
4189 } elsif ($self->{nc} == -1) {
4190 !!!parse-error (type => 'unclosed md'); ## TODO: type
4191 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4192 ## Reconsume.
4193 redo A;
4194 } else {
4195 ## XML5: [ATTLIST] Not defined yet.
4196 $self->{ct}->{name} .= chr $self->{nc};
4197 $self->{state} = MD_NAME_STATE;
4198 !!!next-input-character;
4199 redo A;
4200 }
4201 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4202 if ($is_space->{$self->{nc}}) {
4203 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4204 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4205 $self->{state} = BEFORE_MD_NAME_STATE;
4206 !!!next-input-character;
4207 redo A;
4208 } elsif ($self->{nc} == 0x003E) { # >
4209 ## XML5: Same as "Anything else".
4210 !!!parse-error (type => 'no md name'); ## TODO: type
4211 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4212 !!!next-input-character;
4213 redo A;
4214 } elsif ($self->{nc} == -1) {
4215 !!!parse-error (type => 'unclosed md');
4216 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4217 ## Reconsume.
4218 redo A;
4219 } else {
4220 ## XML5: No parse error.
4221 !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4222 $self->{state} = BOGUS_COMMENT_STATE;
4223 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4224 ## Reconsume.
4225 redo A;
4226 }
4227 } elsif ($self->{state} == MD_NAME_STATE) {
4228 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4229
4230 if ($is_space->{$self->{nc}}) {
4231 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4232 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4233 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4234 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4235 } else { # ENTITY/NOTATION
4236 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4237 }
4238 !!!next-input-character;
4239 redo A;
4240 } elsif ($self->{nc} == 0x003E) { # >
4241 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4242 #
4243 } else {
4244 !!!parse-error (type => 'no md def'); ## TODO: type
4245 }
4246 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4247 !!!next-input-character;
4248 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4249 redo A;
4250 } elsif ($self->{nc} == -1) {
4251 ## XML5: [ATTLIST] No parse error.
4252 !!!parse-error (type => 'unclosed md');
4253 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4254 ## Reconsume.
4255 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4256 redo A;
4257 } else {
4258 ## XML5: [ATTLIST] Not defined yet.
4259 $self->{ct}->{name} .= chr $self->{nc};
4260 ## Stay in the state.
4261 !!!next-input-character;
4262 redo A;
4263 }
4264 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4265 if ($is_space->{$self->{nc}}) {
4266 ## Stay in the state.
4267 !!!next-input-character;
4268 redo A;
4269 } elsif ($self->{nc} == 0x003E) { # >
4270 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4271 !!!next-input-character;
4272 !!!emit ($self->{ct}); # ATTLIST
4273 redo A;
4274 } elsif ($self->{nc} == -1) {
4275 ## XML5: No parse error.
4276 !!!parse-error (type => 'unclosed md'); ## TODO: type
4277 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4278 !!!emit ($self->{ct});
4279 redo A;
4280 } else {
4281 ## XML5: Not defined yet.
4282 $self->{ca} = {name => chr ($self->{nc}), # attrdef
4283 tokens => [],
4284 line => $self->{line}, column => $self->{column}};
4285 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4286 !!!next-input-character;
4287 redo A;
4288 }
4289 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4290 if ($is_space->{$self->{nc}}) {
4291 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4292 !!!next-input-character;
4293 redo A;
4294 } elsif ($self->{nc} == 0x003E) { # >
4295 ## XML5: Same as "anything else".
4296 !!!parse-error (type => 'no attr type'); ## TODO: type
4297 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4298 !!!next-input-character;
4299 !!!emit ($self->{ct}); # ATTLIST
4300 redo A;
4301 } elsif ($self->{nc} == 0x0028) { # (
4302 ## XML5: Same as "anything else".
4303 !!!parse-error (type => 'no space before paren'); ## TODO: type
4304 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4305 !!!next-input-character;
4306 redo A;
4307 } elsif ($self->{nc} == -1) {
4308 ## XML5: No parse error.
4309 !!!parse-error (type => 'unclosed md'); ## TODO: type
4310 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4311 !!!next-input-character;
4312 !!!emit ($self->{ct}); # ATTLIST
4313 redo A;
4314 } else {
4315 ## XML5: Not defined yet.
4316 $self->{ca}->{name} .= chr $self->{nc};
4317 ## Stay in the state.
4318 !!!next-input-character;
4319 redo A;
4320 }
4321 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4322 if ($is_space->{$self->{nc}}) {
4323 ## Stay in the state.
4324 !!!next-input-character;
4325 redo A;
4326 } elsif ($self->{nc} == 0x003E) { # >
4327 ## XML5: Same as "anything else".
4328 !!!parse-error (type => 'no attr type'); ## TODO: type
4329 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4330 !!!next-input-character;
4331 !!!emit ($self->{ct}); # ATTLIST
4332 redo A;
4333 } elsif ($self->{nc} == 0x0028) { # (
4334 ## XML5: Same as "anything else".
4335 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4336 !!!next-input-character;
4337 redo A;
4338 } elsif ($self->{nc} == -1) {
4339 ## XML5: No parse error.
4340 !!!parse-error (type => 'unclosed md'); ## TODO: type
4341 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4342 !!!next-input-character;
4343 !!!emit ($self->{ct});
4344 redo A;
4345 } else {
4346 ## XML5: Not defined yet.
4347 $self->{ca}->{type} = chr $self->{nc};
4348 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4349 !!!next-input-character;
4350 redo A;
4351 }
4352 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4353 if ($is_space->{$self->{nc}}) {
4354 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4355 !!!next-input-character;
4356 redo A;
4357 } elsif ($self->{nc} == 0x0023) { # #
4358 ## XML5: Same as "anything else".
4359 !!!parse-error (type => 'no space before default value'); ## TODO: type
4360 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4361 !!!next-input-character;
4362 redo A;
4363 } elsif ($self->{nc} == 0x0022) { # "
4364 ## XML5: Same as "anything else".
4365 !!!parse-error (type => 'no space before default value'); ## TODO: type
4366 $self->{ca}->{value} = '';
4367 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4368 !!!next-input-character;
4369 redo A;
4370 } elsif ($self->{nc} == 0x0027) { # '
4371 ## XML5: Same as "anything else".
4372 !!!parse-error (type => 'no space before default value'); ## TODO: type
4373 $self->{ca}->{value} = '';
4374 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4375 !!!next-input-character;
4376 redo A;
4377 } elsif ($self->{nc} == 0x003E) { # >
4378 ## XML5: Same as "anything else".
4379 !!!parse-error (type => 'no attr default'); ## TODO: type
4380 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4381 !!!next-input-character;
4382 !!!emit ($self->{ct}); # ATTLIST
4383 redo A;
4384 } elsif ($self->{nc} == 0x0028) { # (
4385 ## XML5: Same as "anything else".
4386 !!!parse-error (type => 'no space before paren'); ## TODO: type
4387 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4388 !!!next-input-character;
4389 redo A;
4390 } elsif ($self->{nc} == -1) {
4391 ## XML5: No parse error.
4392 !!!parse-error (type => 'unclosed md'); ## TODO: type
4393 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4394 !!!next-input-character;
4395 !!!emit ($self->{ct});
4396 redo A;
4397 } else {
4398 ## XML5: Not defined yet.
4399 $self->{ca}->{type} .= chr $self->{nc};
4400 ## Stay in the state.
4401 !!!next-input-character;
4402 redo A;
4403 }
4404 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4405 if ($is_space->{$self->{nc}}) {
4406 ## Stay in the state.
4407 !!!next-input-character;
4408 redo A;
4409 } elsif ($self->{nc} == 0x0028) { # (
4410 ## XML5: Same as "anything else".
4411 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4412 !!!next-input-character;
4413 redo A;
4414 } elsif ($self->{nc} == 0x0023) { # #
4415 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4416 !!!next-input-character;
4417 redo A;
4418 } elsif ($self->{nc} == 0x0022) { # "
4419 ## XML5: Same as "anything else".
4420 $self->{ca}->{value} = '';
4421 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4422 !!!next-input-character;
4423 redo A;
4424 } elsif ($self->{nc} == 0x0027) { # '
4425 ## XML5: Same as "anything else".
4426 $self->{ca}->{value} = '';
4427 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4428 !!!next-input-character;
4429 redo A;
4430 } elsif ($self->{nc} == 0x003E) { # >
4431 ## XML5: Same as "anything else".
4432 !!!parse-error (type => 'no attr default'); ## TODO: type
4433 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4434 !!!next-input-character;
4435 !!!emit ($self->{ct}); # ATTLIST
4436 redo A;
4437 } elsif ($self->{nc} == -1) {
4438 ## XML5: No parse error.
4439 !!!parse-error (type => 'unclosed md'); ## TODO: type
4440 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4441 !!!next-input-character;
4442 !!!emit ($self->{ct});
4443 redo A;
4444 } else {
4445 ## XML5: Switch to the "DOCTYPE bogus comment state".
4446 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4447 $self->{ca}->{value} = '';
4448 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4449 ## Reconsume.
4450 redo A;
4451 }
4452 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4453 if ($is_space->{$self->{nc}}) {
4454 ## Stay in the state.
4455 !!!next-input-character;
4456 redo A;
4457 } elsif ($self->{nc} == 0x007C) { # |
4458 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4459 ## Stay in the state.
4460 !!!next-input-character;
4461 redo A;
4462 } elsif ($self->{nc} == 0x0029) { # )
4463 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4464 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4465 !!!next-input-character;
4466 redo A;
4467 } elsif ($self->{nc} == 0x003E) { # >
4468 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4469 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4470 !!!next-input-character;
4471 !!!emit ($self->{ct}); # ATTLIST
4472 redo A;
4473 } elsif ($self->{nc} == -1) {
4474 ## XML5: No parse error.
4475 !!!parse-error (type => 'unclosed md'); ## TODO: type
4476 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4477 !!!next-input-character;
4478 !!!emit ($self->{ct});
4479 redo A;
4480 } else {
4481 push @{$self->{ca}->{tokens}}, chr $self->{nc};
4482 $self->{state} = ALLOWED_TOKEN_STATE;
4483 !!!next-input-character;
4484 redo A;
4485 }
4486 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4487 if ($is_space->{$self->{nc}}) {
4488 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4489 !!!next-input-character;
4490 redo A;
4491 } elsif ($self->{nc} == 0x007C) { # |
4492 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4493 !!!next-input-character;
4494 redo A;
4495 } elsif ($self->{nc} == 0x0029) { # )
4496 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4497 !!!next-input-character;
4498 redo A;
4499 } elsif ($self->{nc} == 0x003E) { # >
4500 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4501 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4502 !!!next-input-character;
4503 !!!emit ($self->{ct}); # ATTLIST
4504 redo A;
4505 } elsif ($self->{nc} == -1) {
4506 ## XML5: No parse error.
4507 !!!parse-error (type => 'unclosed md'); ## TODO: type
4508 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4509 !!!next-input-character;
4510 !!!emit ($self->{ct});
4511 redo A;
4512 } else {
4513 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4514 ## Stay in the state.
4515 !!!next-input-character;
4516 redo A;
4517 }
4518 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4519 if ($is_space->{$self->{nc}}) {
4520 ## Stay in the state.
4521 !!!next-input-character;
4522 redo A;
4523 } elsif ($self->{nc} == 0x007C) { # |
4524 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4525 !!!next-input-character;
4526 redo A;
4527 } elsif ($self->{nc} == 0x0029) { # )
4528 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4529 !!!next-input-character;
4530 redo A;
4531 } elsif ($self->{nc} == 0x003E) { # >
4532 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4533 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4534 !!!next-input-character;
4535 !!!emit ($self->{ct}); # ATTLIST
4536 redo A;
4537 } elsif ($self->{nc} == -1) {
4538 ## XML5: No parse error.
4539 !!!parse-error (type => 'unclosed md'); ## TODO: type
4540 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4541 !!!next-input-character;
4542 !!!emit ($self->{ct});
4543 redo A;
4544 } else {
4545 !!!parse-error (type => 'space in allowed token', ## TODO: type
4546 line => $self->{line_prev},
4547 column => $self->{column_prev});
4548 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4549 $self->{state} = ALLOWED_TOKEN_STATE;
4550 !!!next-input-character;
4551 redo A;
4552 }
4553 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4554 if ($is_space->{$self->{nc}}) {
4555 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4556 !!!next-input-character;
4557 redo A;
4558 } elsif ($self->{nc} == 0x0023) { # #
4559 !!!parse-error (type => 'no space before default value'); ## TODO: type
4560 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4561 !!!next-input-character;
4562 redo A;
4563 } elsif ($self->{nc} == 0x0022) { # "
4564 !!!parse-error (type => 'no space before default value'); ## TODO: type
4565 $self->{ca}->{value} = '';
4566 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4567 !!!next-input-character;
4568 redo A;
4569 } elsif ($self->{nc} == 0x0027) { # '
4570 !!!parse-error (type => 'no space before default value'); ## TODO: type
4571 $self->{ca}->{value} = '';
4572 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4573 !!!next-input-character;
4574 redo A;
4575 } elsif ($self->{nc} == 0x003E) { # >
4576 !!!parse-error (type => 'no attr default'); ## TODO: type
4577 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4578 !!!next-input-character;
4579 !!!emit ($self->{ct}); # ATTLIST
4580 redo A;
4581 } elsif ($self->{nc} == -1) {
4582 !!!parse-error (type => 'unclosed md'); ## TODO: type
4583 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4584 !!!next-input-character;
4585 !!!emit ($self->{ct});
4586 redo A;
4587 } else {
4588 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4589 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4590 ## Reconsume.
4591 redo A;
4592 }
4593 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4594 if ($is_space->{$self->{nc}}) {
4595 ## Stay in the state.
4596 !!!next-input-character;
4597 redo A;
4598 } elsif ($self->{nc} == 0x0023) { # #
4599 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4600 !!!next-input-character;
4601 redo A;
4602 } elsif ($self->{nc} == 0x0022) { # "
4603 $self->{ca}->{value} = '';
4604 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4605 !!!next-input-character;
4606 redo A;
4607 } elsif ($self->{nc} == 0x0027) { # '
4608 $self->{ca}->{value} = '';
4609 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4610 !!!next-input-character;
4611 redo A;
4612 } elsif ($self->{nc} == 0x003E) { # >
4613 !!!parse-error (type => 'no attr default'); ## TODO: type
4614 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4615 !!!next-input-character;
4616 !!!emit ($self->{ct}); # ATTLIST
4617 redo A;
4618 } elsif ($self->{nc} == -1) {
4619 !!!parse-error (type => 'unclosed md'); ## TODO: type
4620 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4621 !!!next-input-character;
4622 !!!emit ($self->{ct});
4623 redo A;
4624 } else {
4625 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4626 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4627 ## Reconsume.
4628 redo A;
4629 }
4630 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4631 if ($is_space->{$self->{nc}}) {
4632 ## XML5: No parse error.
4633 !!!parse-error (type => 'no default type'); ## TODO: type
4634 $self->{state} = BOGUS_MD_STATE;
4635 ## Reconsume.
4636 redo A;
4637 } elsif ($self->{nc} == 0x0022) { # "
4638 ## XML5: Same as "anything else".
4639 $self->{ca}->{value} = '';
4640 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4641 !!!next-input-character;
4642 redo A;
4643 } elsif ($self->{nc} == 0x0027) { # '
4644 ## XML5: Same as "anything else".
4645 $self->{ca}->{value} = '';
4646 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4647 !!!next-input-character;
4648 redo A;
4649 } elsif ($self->{nc} == 0x003E) { # >
4650 ## XML5: Same as "anything else".
4651 !!!parse-error (type => 'no attr default'); ## TODO: type
4652 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4653 !!!next-input-character;
4654 !!!emit ($self->{ct}); # ATTLIST
4655 redo A;
4656 } elsif ($self->{nc} == -1) {
4657 ## XML5: No parse error.
4658 !!!parse-error (type => 'unclosed md'); ## TODO: type
4659 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4660 !!!next-input-character;
4661 !!!emit ($self->{ct});
4662 redo A;
4663 } else {
4664 $self->{ca}->{default} = chr $self->{nc};
4665 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4666 !!!next-input-character;
4667 redo A;
4668 }
4669 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4670 if ($is_space->{$self->{nc}}) {
4671 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4672 !!!next-input-character;
4673 redo A;
4674 } elsif ($self->{nc} == 0x0022) { # "
4675 ## XML5: Same as "anything else".
4676 !!!parse-error (type => 'no space before default value'); ## TODO: type
4677 $self->{ca}->{value} = '';
4678 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4679 !!!next-input-character;
4680 redo A;
4681 } elsif ($self->{nc} == 0x0027) { # '
4682 ## XML5: Same as "anything else".
4683 !!!parse-error (type => 'no space before default value'); ## TODO: type
4684 $self->{ca}->{value} = '';
4685 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4686 !!!next-input-character;
4687 redo A;
4688 } elsif ($self->{nc} == 0x003E) { # >
4689 ## XML5: Same as "anything else".
4690 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4691 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4692 !!!next-input-character;
4693 !!!emit ($self->{ct}); # ATTLIST
4694 redo A;
4695 } elsif ($self->{nc} == -1) {
4696 ## XML5: No parse error.
4697 !!!parse-error (type => 'unclosed md'); ## TODO: type
4698 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4699 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4700 !!!next-input-character;
4701 !!!emit ($self->{ct});
4702 redo A;
4703 } else {
4704 $self->{ca}->{default} .= chr $self->{nc};
4705 ## Stay in the state.
4706 !!!next-input-character;
4707 redo A;
4708 }
4709 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4710 if ($is_space->{$self->{nc}}) {
4711 ## Stay in the state.
4712 !!!next-input-character;
4713 redo A;
4714 } elsif ($self->{nc} == 0x0022) { # "
4715 $self->{ca}->{value} = '';
4716 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4717 !!!next-input-character;
4718 redo A;
4719 } elsif ($self->{nc} == 0x0027) { # '
4720 $self->{ca}->{value} = '';
4721 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4722 !!!next-input-character;
4723 redo A;
4724 } elsif ($self->{nc} == 0x003E) { # >
4725 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4726 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4727 !!!next-input-character;
4728 !!!emit ($self->{ct}); # ATTLIST
4729 redo A;
4730 } elsif ($self->{nc} == -1) {
4731 ## XML5: No parse error.
4732 !!!parse-error (type => 'unclosed md'); ## TODO: type
4733 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4734 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4735 !!!next-input-character;
4736 !!!emit ($self->{ct});
4737 redo A;
4738 } else {
4739 ## XML5: Not defined yet.
4740 if ($self->{ca}->{default} eq 'FIXED') {
4741 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4742 } else {
4743 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4744 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4745 }
4746 ## Reconsume.
4747 redo A;
4748 }
4749 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4750 if ($is_space->{$self->{nc}} or
4751 $self->{nc} == -1 or
4752 $self->{nc} == 0x003E) { # >
4753 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4754 ## Reconsume.
4755 redo A;
4756 } else {
4757 !!!parse-error (type => 'no space before attr name'); ## TODO: type
4758 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4759 ## Reconsume.
4760 redo A;
4761 }
4762 } elsif ($self->{state} == NDATA_STATE) {
4763 ## ASCII case-insensitive
4764 if ($self->{nc} == [
4765 undef,
4766 0x0044, # D
4767 0x0041, # A
4768 0x0054, # T
4769 ]->[length $self->{kwd}] or
4770 $self->{nc} == [
4771 undef,
4772 0x0064, # d
4773 0x0061, # a
4774 0x0074, # t
4775 ]->[length $self->{kwd}]) {
4776 !!!cp (172.2);
4777 ## Stay in the state.
4778 $self->{kwd} .= chr $self->{nc};
4779 !!!next-input-character;
4780 redo A;
4781 } elsif ((length $self->{kwd}) == 4 and
4782 ($self->{nc} == 0x0041 or # A
4783 $self->{nc} == 0x0061)) { # a
4784 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4785 !!!cp (172.3);
4786 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4787 text => 'NDATA',
4788 line => $self->{line_prev},
4789 column => $self->{column_prev} - 4);
4790 } else {
4791 !!!cp (172.4);
4792 }
4793 $self->{state} = AFTER_NDATA_STATE;
4794 !!!next-input-character;
4795 redo A;
4796 } else {
4797 !!!parse-error (type => 'string after literal', ## TODO: type
4798 line => $self->{line_prev},
4799 column => $self->{column_prev} + 1
4800 - length $self->{kwd});
4801 !!!cp (172.5);
4802 $self->{state} = BOGUS_MD_STATE;
4803 ## Reconsume.
4804 redo A;
4805 }
4806 } elsif ($self->{state} == AFTER_NDATA_STATE) {
4807 if ($is_space->{$self->{nc}}) {
4808 $self->{state} = BEFORE_NOTATION_NAME_STATE;
4809 !!!next-input-character;
4810 redo A;
4811 } elsif ($self->{nc} == 0x003E) { # >
4812 !!!parse-error (type => 'no notation name'); ## TODO: type
4813 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4814 !!!next-input-character;
4815 !!!emit ($self->{ct}); # ENTITY
4816 redo A;
4817 } elsif ($self->{nc} == -1) {
4818 !!!parse-error (type => 'unclosed md'); ## TODO: type
4819 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4820 !!!next-input-character;
4821 !!!emit ($self->{ct}); # ENTITY
4822 redo A;
4823 } else {
4824 !!!parse-error (type => 'string after literal', ## TODO: type
4825 line => $self->{line_prev},
4826 column => $self->{column_prev} + 1
4827 - length $self->{kwd});
4828 $self->{state} = BOGUS_MD_STATE;
4829 ## Reconsume.
4830 redo A;
4831 }
4832 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4833 if ($is_space->{$self->{nc}}) {
4834 ## Stay in the state.
4835 !!!next-input-character;
4836 redo A;
4837 } elsif ($self->{nc} == 0x003E) { # >
4838 !!!parse-error (type => 'no notation name'); ## TODO: type
4839 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4840 !!!next-input-character;
4841 !!!emit ($self->{ct}); # ENTITY
4842 redo A;
4843 } elsif ($self->{nc} == -1) {
4844 !!!parse-error (type => 'unclosed md'); ## TODO: type
4845 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4846 !!!next-input-character;
4847 !!!emit ($self->{ct}); # ENTITY
4848 redo A;
4849 } else {
4850 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4851 $self->{state} = NOTATION_NAME_STATE;
4852 !!!next-input-character;
4853 redo A;
4854 }
4855 } elsif ($self->{state} == NOTATION_NAME_STATE) {
4856 if ($is_space->{$self->{nc}}) {
4857 $self->{state} = AFTER_MD_DEF_STATE;
4858 !!!next-input-character;
4859 redo A;
4860 } elsif ($self->{nc} == 0x003E) { # >
4861 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4862 !!!next-input-character;
4863 !!!emit ($self->{ct}); # ENTITY
4864 redo A;
4865 } elsif ($self->{nc} == -1) {
4866 !!!parse-error (type => 'unclosed md'); ## TODO: type
4867 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4868 !!!next-input-character;
4869 !!!emit ($self->{ct}); # ENTITY
4870 redo A;
4871 } else {
4872 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4873 ## Stay in the state.
4874 !!!next-input-character;
4875 redo A;
4876 }
4877 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4878 if ($self->{nc} == 0x0022) { # "
4879 $self->{state} = AFTER_MD_DEF_STATE;
4880 !!!next-input-character;
4881 redo A;
4882 } elsif ($self->{nc} == 0x0026) { # &
4883 $self->{prev_state} = $self->{state};
4884 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4885 $self->{entity_add} = 0x0022; # "
4886 !!!next-input-character;
4887 redo A;
4888 ## TODO: %
4889 } elsif ($self->{nc} == -1) {
4890 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4891 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4892 ## Reconsume.
4893 !!!emit ($self->{ct}); # ENTITY
4894 redo A;
4895 } else {
4896 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4897 !!!next-input-character;
4898 redo A;
4899 }
4900 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4901 if ($self->{nc} == 0x0027) { # '
4902 $self->{state} = AFTER_MD_DEF_STATE;
4903 !!!next-input-character;
4904 redo A;
4905 } elsif ($self->{nc} == 0x0026) { # &
4906 $self->{prev_state} = $self->{state};
4907 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4908 $self->{entity_add} = 0x0027; # '
4909 !!!next-input-character;
4910 redo A;
4911 ## TODO: %
4912 } elsif ($self->{nc} == -1) {
4913 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4914 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4915 ## Reconsume.
4916 !!!emit ($self->{ct}); # ENTITY
4917 redo A;
4918 } else {
4919 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4920 !!!next-input-character;
4921 redo A;
4922 }
4923 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4924 if ($is_space->{$self->{nc}} or
4925 {
4926 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4927 $self->{entity_add} => 1,
4928 }->{$self->{nc}}) {
4929 !!!parse-error (type => 'bare ero',
4930 line => $self->{line_prev},
4931 column => $self->{column_prev}
4932 + ($self->{nc} == -1 ? 1 : 0));
4933 ## Don't consume
4934 ## Return nothing.
4935 #
4936 } elsif ($self->{nc} == 0x0023) { # #
4937 $self->{ca} = $self->{ct};
4938 $self->{state} = ENTITY_HASH_STATE;
4939 $self->{kwd} = '#';
4940 !!!next-input-character;
4941 redo A;
4942 } else {
4943 #
4944 }
4945
4946 $self->{ct}->{value} .= '&';
4947 $self->{state} = $self->{prev_state};
4948 ## Reconsume.
4949 redo A;
4950 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4951 if ($is_space->{$self->{nc}}) {
4952 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4953 !!!next-input-character;
4954 redo A;
4955 } elsif ($self->{nc} == 0x0028) { # (
4956 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4957 $self->{ct}->{content} = ['('];
4958 $self->{group_depth} = 1;
4959 !!!next-input-character;
4960 redo A;
4961 } elsif ($self->{nc} == 0x003E) { # >
4962 !!!parse-error (type => 'no md def'); ## TODO: type
4963 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4964 !!!next-input-character;
4965 !!!emit ($self->{ct}); # ELEMENT
4966 redo A;
4967 } elsif ($self->{nc} == -1) {
4968 !!!parse-error (type => 'unclosed md'); ## TODO: type
4969 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4970 !!!next-input-character;
4971 !!!emit ($self->{ct}); # ELEMENT
4972 redo A;
4973 } else {
4974 $self->{ct}->{content} = [chr $self->{nc}];
4975 $self->{state} = CONTENT_KEYWORD_STATE;
4976 !!!next-input-character;
4977 redo A;
4978 }
4979 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4980 if ($is_space->{$self->{nc}}) {
4981 $self->{state} = AFTER_MD_DEF_STATE;
4982 !!!next-input-character;
4983 redo A;
4984 } elsif ($self->{nc} == 0x003E) { # >
4985 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4986 !!!next-input-character;
4987 !!!emit ($self->{ct}); # ELEMENT
4988 redo A;
4989 } elsif ($self->{nc} == -1) {
4990 !!!parse-error (type => 'unclosed md'); ## TODO: type
4991 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4992 !!!next-input-character;
4993 !!!emit ($self->{ct}); # ELEMENT
4994 redo A;
4995 } else {
4996 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4997 ## Stay in the state.
4998 !!!next-input-character;
4999 redo A;
5000 }
5001 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
5002 if ($is_space->{$self->{nc}}) {
5003 ## Stay in the state.
5004 !!!next-input-character;
5005 redo A;
5006 } elsif ($self->{nc} == 0x0028) { # (
5007 $self->{group_depth}++;
5008 push @{$self->{ct}->{content}}, chr $self->{nc};
5009 ## Stay in the state.
5010 !!!next-input-character;
5011 redo A;
5012 } elsif ($self->{nc} == 0x007C or # |
5013 $self->{nc} == 0x002C) { # ,
5014 !!!parse-error (type => 'empty element name'); ## TODO: type
5015 ## Stay in the state.
5016 !!!next-input-character;
5017 redo A;
5018 } elsif ($self->{nc} == 0x0029) { # )
5019 !!!parse-error (type => 'empty element name'); ## TODO: type
5020 push @{$self->{ct}->{content}}, chr $self->{nc};
5021 $self->{group_depth}--;
5022 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5023 !!!next-input-character;
5024 redo A;
5025 } elsif ($self->{nc} == 0x003E) { # >
5026 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5027 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5028 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5029 !!!next-input-character;
5030 !!!emit ($self->{ct}); # ELEMENT
5031 redo A;
5032 } elsif ($self->{nc} == -1) {
5033 !!!parse-error (type => 'unclosed md'); ## TODO: type
5034 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5035 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5036 !!!next-input-character;
5037 !!!emit ($self->{ct}); # ELEMENT
5038 redo A;
5039 } else {
5040 push @{$self->{ct}->{content}}, chr $self->{nc};
5041 $self->{state} = CM_ELEMENT_NAME_STATE;
5042 !!!next-input-character;
5043 redo A;
5044 }
5045 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
5046 if ($is_space->{$self->{nc}}) {
5047 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5048 !!!next-input-character;
5049 redo A;
5050 } elsif ($self->{nc} == 0x002A or # *
5051 $self->{nc} == 0x002B or # +
5052 $self->{nc} == 0x003F) { # ?
5053 push @{$self->{ct}->{content}}, chr $self->{nc};
5054 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5055 !!!next-input-character;
5056 redo A;
5057 } elsif ($self->{nc} == 0x007C or # |
5058 $self->{nc} == 0x002C) { # ,
5059 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5060 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5061 !!!next-input-character;
5062 redo A;
5063 } elsif ($self->{nc} == 0x0029) { # )
5064 $self->{group_depth}--;
5065 push @{$self->{ct}->{content}}, chr $self->{nc};
5066 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5067 !!!next-input-character;
5068 redo A;
5069 } elsif ($self->{nc} == 0x003E) { # >
5070 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5071 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5072 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5073 !!!next-input-character;
5074 !!!emit ($self->{ct}); # ELEMENT
5075 redo A;
5076 } elsif ($self->{nc} == -1) {
5077 !!!parse-error (type => 'unclosed md'); ## TODO: type
5078 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5079 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5080 !!!next-input-character;
5081 !!!emit ($self->{ct}); # ELEMENT
5082 redo A;
5083 } else {
5084 $self->{ct}->{content}->[-1] .= chr $self->{nc};
5085 ## Stay in the state.
5086 !!!next-input-character;
5087 redo A;
5088 }
5089 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
5090 if ($is_space->{$self->{nc}}) {
5091 ## Stay in the state.
5092 !!!next-input-character;
5093 redo A;
5094 } elsif ($self->{nc} == 0x007C or # |
5095 $self->{nc} == 0x002C) { # ,
5096 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5097 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5098 !!!next-input-character;
5099 redo A;
5100 } elsif ($self->{nc} == 0x0029) { # )
5101 $self->{group_depth}--;
5102 push @{$self->{ct}->{content}}, chr $self->{nc};
5103 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5104 !!!next-input-character;
5105 redo A;
5106 } elsif ($self->{nc} == 0x003E) { # >
5107 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5108 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5109 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5110 !!!next-input-character;
5111 !!!emit ($self->{ct}); # ELEMENT
5112 redo A;
5113 } elsif ($self->{nc} == -1) {
5114 !!!parse-error (type => 'unclosed md'); ## TODO: type
5115 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5116 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5117 !!!next-input-character;
5118 !!!emit ($self->{ct}); # ELEMENT
5119 redo A;
5120 } else {
5121 !!!parse-error (type => 'after element name'); ## TODO: type
5122 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5123 $self->{state} = BOGUS_MD_STATE;
5124 !!!next-input-character;
5125 redo A;
5126 }
5127 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5128 if ($is_space->{$self->{nc}}) {
5129 if ($self->{group_depth}) {
5130 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5131 } else {
5132 $self->{state} = AFTER_MD_DEF_STATE;
5133 }
5134 !!!next-input-character;
5135 redo A;
5136 } elsif ($self->{nc} == 0x002A or # *
5137 $self->{nc} == 0x002B or # +
5138 $self->{nc} == 0x003F) { # ?
5139 push @{$self->{ct}->{content}}, chr $self->{nc};
5140 if ($self->{group_depth}) {
5141 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5142 } else {
5143 $self->{state} = AFTER_MD_DEF_STATE;
5144 }
5145 !!!next-input-character;
5146 redo A;
5147 } elsif ($self->{nc} == 0x0029) { # )
5148 if ($self->{group_depth}) {
5149 $self->{group_depth}--;
5150 push @{$self->{ct}->{content}}, chr $self->{nc};
5151 ## Stay in the state.
5152 !!!next-input-character;
5153 redo A;
5154 } else {
5155 !!!parse-error (type => 'string after md def'); ## TODO: type
5156 $self->{state} = BOGUS_MD_STATE;
5157 ## Reconsume.
5158 redo A;
5159 }
5160 } elsif ($self->{nc} == 0x003E) { # >
5161 if ($self->{group_depth}) {
5162 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5163 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5164 }
5165 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5166 !!!next-input-character;
5167 !!!emit ($self->{ct}); # ELEMENT
5168 redo A;
5169 } elsif ($self->{nc} == -1) {
5170 !!!parse-error (type => 'unclosed md'); ## TODO: type
5171 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5172 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5173 !!!next-input-character;
5174 !!!emit ($self->{ct}); # ELEMENT
5175 redo A;
5176 } else {
5177 if ($self->{group_depth}) {
5178 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5179 } else {
5180 !!!parse-error (type => 'string after md def'); ## TODO: type
5181 $self->{state} = BOGUS_MD_STATE;
5182 }
5183 ## Reconsume.
5184 redo A;
5185 }
5186 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5187 if ($is_space->{$self->{nc}}) {
5188 ## Stay in the state.
5189 !!!next-input-character;
5190 redo A;
5191 } elsif ($self->{nc} == 0x003E) { # >
5192 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5193 !!!next-input-character;
5194 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5195 redo A;
5196 } elsif ($self->{nc} == -1) {
5197 !!!parse-error (type => 'unclosed md'); ## TODO: type
5198 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5199 !!!next-input-character;
5200 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5201 redo A;
5202 } else {
5203 !!!parse-error (type => 'string after md def'); ## TODO: type
5204 $self->{state} = BOGUS_MD_STATE;
5205 ## Reconsume.
5206 redo A;
5207 }
5208 } elsif ($self->{state} == BOGUS_MD_STATE) {
5209 if ($self->{nc} == 0x003E) { # >
5210 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5211 !!!next-input-character;
5212 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5213 redo A;
5214 } elsif ($self->{nc} == -1) {
5215 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5216 ## Reconsume.
5217 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5218 redo A;
5219 } else {
5220 ## Stay in the state.
5221 !!!next-input-character;
5222 redo A;
5223 }
5224 } else {
5225 die "$0: $self->{state}: Unknown state";
5226 }
5227 } # A
5228
5229 die "$0: _get_next_token: unexpected case";
5230 } # _get_next_token
5231
5232 1;
5233 ## $Date: 2009/09/05 09:26:55 $
5234

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24