/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18 - (show annotations) (download) (as text)
Sun Oct 19 06:14:57 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.17: +196 -6 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	19 Oct 2008 06:14:42 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/entities-1.dat" and "xml/entities-2.dat"
	added.  Support for the "#entities" directive.

++ whatpm/t/xml/ChangeLog	19 Oct 2008 06:11:59 -0000
	* entities-1.dat, entities-2.dat: New test data files.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	19 Oct 2008 06:12:27 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (notation_name): New attribute.

	* NanoDOM.pm (public_id, system_id): New attributes.
++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 06:13:03 -0000
	* Dumper.pm: Dump text content of Entity nodes.

	* Tokenizer.pm.src: Support for <!ENTITY ... NDATA>.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 06:14:05 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src (_tree_in_subset): General and parameter entities
	implemented.

## Package header for the tokenizer.  This package owns the token type
## constants and makes them importable by other packages through
## Exporter.
1 package Whatpm::HTML::Tokenizer;
2 use strict;
## $VERSION is built from the CVS |Revision| keyword: the first number
## is printed as-is and every following number as two digits
## (e.g. "1.17").
3 our $VERSION=do{my @r=(q$Revision: 1.17 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
## The export tables are filled in inside a BEGIN block so that they
## already exist when another package calls |import| at compile time
## (|Whatpm::HTML| does so from its own BEGIN block further down).
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
## Names that may be imported individually.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 );
25
## The |:token| tag imports all token type constants at once; its list
## is the same as |@EXPORT_OK| and has to be kept in sync with it by
## hand.
26 our %EXPORT_TAGS = (
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 }
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
## Integer codes stored in the |type| field of a token.  Each one is a
## constant subroutine with an empty prototype.  Every name defined
## here is also listed in |@EXPORT_OK| and in the |:token| export tag
## of this package.
49
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 }
52 sub START_TAG_TOKEN () { 3 }
53 sub END_TAG_TOKEN () { 4 }
54 sub END_OF_FILE_TOKEN () { 5 }
55 sub CHARACTER_TOKEN () { 6 }
56 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} flag set
71 ## to an empty string.
72
## From here on, definitions are compiled into |Whatpm::HTML|, not
## into |Whatpm::HTML::Tokenizer|.
73 package Whatpm::HTML;
74
## Pull the token type constants into this package at compile time so
## that the code below can use them as bare constant names.
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
## The three |CM_*| values are single-bit flags; the four
## |*_CONTENT_MODEL| values are the combinations stored in
## |$self->{content_model}| and are tested with bitwise AND against
## the |CM_*| flags.
78
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82
## PLAINTEXT: no flag set.  CDATA: limited "<" markup only.  RCDATA:
## "&" plus limited "<" markup.  PCDATA: "&" plus full "<" markup.
83 sub PLAINTEXT_CONTENT_MODEL () { 0 }
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87
88 ## Tokenizer states
## Integer codes stored in |$self->{state}| and compared with |==| in
## the tokenizer's main loop.  The values 1 and 12 are unused because
## the corresponding definitions are commented out below; the values
## must stay distinct from each other.
89
90 sub DATA_STATE () { 0 }
91 #sub ENTITY_DATA_STATE () { 1 }
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
108 sub COMMENT_END_DASH_STATE () { 18 }
109 sub BOGUS_COMMENT_STATE () { 19 }
110 sub DOCTYPE_STATE () { 20 }
111 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112 sub DOCTYPE_NAME_STATE () { 22 }
113 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122 sub BOGUS_DOCTYPE_STATE () { 32 }
123 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124 sub SELF_CLOSING_START_TAG_STATE () { 34 }
125 sub CDATA_SECTION_STATE () { 35 }
126 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134 ## NOTE: "Entity data state", "entity in attribute value state", and
135 ## "consume a character reference" algorithm are jointly implemented
136 ## using the following six states:
137 sub ENTITY_STATE () { 44 }
138 sub ENTITY_HASH_STATE () { 45 }
139 sub NCR_NUM_STATE () { 46 }
140 sub HEXREF_X_STATE () { 47 }
141 sub HEXREF_HEX_STATE () { 48 }
142 sub ENTITY_NAME_STATE () { 49 }
## |PCDATA_STATE| is not one of the six states above; the tokenizer
## treats it as |DATA_STATE| restricted to the |PCDATA| content model.
143 sub PCDATA_STATE () { 50 } # "data state" in the spec
144
145 ## XML-only states
146 sub PI_STATE () { 51 }
147 sub PI_TARGET_STATE () { 52 }
148 sub PI_TARGET_AFTER_STATE () { 53 }
149 sub PI_DATA_STATE () { 54 }
150 sub PI_AFTER_STATE () { 55 }
151 sub PI_DATA_AFTER_STATE () { 56 }
152 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155 sub DOCTYPE_TAG_STATE () { 60 }
156 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157 sub MD_ATTLIST_STATE () { 62 }
158 sub MD_E_STATE () { 63 }
159 sub MD_ELEMENT_STATE () { 64 }
160 sub MD_ENTITY_STATE () { 65 }
161 sub MD_NOTATION_STATE () { 66 }
162 sub DOCTYPE_MD_STATE () { 67 }
163 sub BEFORE_MD_NAME_STATE () { 68 }
164 sub MD_NAME_STATE () { 69 }
165 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172 sub ALLOWED_TOKEN_STATE () { 77 }
173 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 sub BEFORE_NDATA_STATE () { 85 }
181 sub NDATA_STATE () { 86 }
182 sub AFTER_NDATA_STATE () { 87 }
183 sub BEFORE_NOTATION_NAME_STATE () { 88 }
184 sub NOTATION_NAME_STATE () { 89 }
185 sub AFTER_NOTATION_NAME_STATE () { 90 }
186 sub BOGUS_MD_STATE () { 91 }
187
188 ## Tree constructor state constants (see Whatpm::HTML for the full
189 ## list and descriptions)
## NOTE(review): both literals below evaluate to the same number
## (2048, i.e. bit 11).  Presumably they belong to two different flag
## sets (insertion modes vs. element categories) and are never
## compared with each other -- confirm against Whatpm::HTML.
190
191 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
192 sub FOREIGN_EL () { 0b1_00000000000 }
193
194 ## Character reference mappings
## |$charref_map| is a hash reference keyed by a code point; the value
## is the code point that replaces it.  Presumably it is consulted
## when a numeric character reference is resolved -- the lookup itself
## is not in this part of the file.
##
## The literal entries map U+000D to U+000A and map 0x80..0x9F to
## other code points (these look like the Windows-1252 assignments --
## TODO confirm), with 0x81, 0x8D, 0x8F, 0x90 and 0x9D mapped to
## U+FFFD.
195
196 my $charref_map = {
197 0x0D => 0x000A,
198 0x80 => 0x20AC,
199 0x81 => 0xFFFD,
200 0x82 => 0x201A,
201 0x83 => 0x0192,
202 0x84 => 0x201E,
203 0x85 => 0x2026,
204 0x86 => 0x2020,
205 0x87 => 0x2021,
206 0x88 => 0x02C6,
207 0x89 => 0x2030,
208 0x8A => 0x0160,
209 0x8B => 0x2039,
210 0x8C => 0x0152,
211 0x8D => 0xFFFD,
212 0x8E => 0x017D,
213 0x8F => 0xFFFD,
214 0x90 => 0xFFFD,
215 0x91 => 0x2018,
216 0x92 => 0x2019,
217 0x93 => 0x201C,
218 0x94 => 0x201D,
219 0x95 => 0x2022,
220 0x96 => 0x2013,
221 0x97 => 0x2014,
222 0x98 => 0x02DC,
223 0x99 => 0x2122,
224 0x9A => 0x0161,
225 0x9B => 0x203A,
226 0x9C => 0x0153,
227 0x9D => 0xFFFD,
228 0x9E => 0x017E,
229 0x9F => 0x0178,
230 }; # $charref_map
## Everything in the list below is mapped to U+FFFD: the C0 controls
## other than U+0009, U+000A, U+000C and U+000D, then U+007F, the
## surrogate range, U+FDD0..U+FDDF, and the last two code points of
## each of the 17 planes.  The "ISSUE" marker records an open question
## about whether the range should extend to U+FDEF.
231 $charref_map->{$_} = 0xFFFD
232 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
233 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
234 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
235 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
236 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
237 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
238 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
239
240 ## Implementations MUST act as if they used the state machine in the spec.
241
## _initialize_tokenizer ($self)
##
## Puts the tokenizer fields of |$self| into their starting condition:
## data state, PCDATA content model, no current token or attribute,
## empty character buffer, and an empty queue of pending tokens.  It
## is called for its side effects on |$self| only.
##
##   $self - the parser object (a hash reference).  The fields listed
##           in the first NOTE below are expected to have been set
##           before this method runs.
##
## Lines beginning with |!!!| are macros of this .pm.src template, not
## Perl; they are replaced when the real module is generated.
242 sub _initialize_tokenizer ($) {
243 my $self = shift;
244
245 ## NOTE: Fields set by |new| constructor:
246 #$self->{level}
247 #$self->{set_nc}
248 #$self->{parse_error}
249 #$self->{is_xml} (if XML)
250
251 $self->{state} = DATA_STATE; # MUST
252 $self->{s_kwd} = ''; # Data state keyword
253 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
254 #$self->{entity__value}; # initialized when used
255 #$self->{entity__match}; # initialized when used
256 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
257 undef $self->{ct}; # current token
258 undef $self->{ca}; # current attribute
259 undef $self->{last_stag_name}; # last emitted start tag name
260 #$self->{prev_state}; # initialized when used
261 delete $self->{self_closing};
262 $self->{char_buffer} = '';
263 $self->{char_buffer_pos} = 0;
## -1 is the value the tokenizer tests for as end of input.
264 $self->{nc} = -1; # next input character
265 #$self->{next_nc}
## Presumably loads the first input character into |$self->{nc}| --
## confirm against the macro definition.
266 !!!next-input-character;
## Queue of tokens waiting to be returned; |_get_next_token| shifts
## from it before doing any tokenizing of its own.
267 $self->{token} = [];
268 # $self->{escape}
269 } # _initialize_tokenizer
270
271 ## A token has:
272 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
273 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
274 ## ->{name} (DOCTYPE_TOKEN)
275 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
276 ## ->{target} (PI_TOKEN)
277 ## ->{pubid} (DOCTYPE_TOKEN)
278 ## ->{sysid} (DOCTYPE_TOKEN)
279 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
280 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
281 ## ->{name}
282 ## ->{value}
283 ## ->{has_reference} == 1 or 0
284 ## ->{index}: Index of the attribute in a tag.
285 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
286 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
287 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
288 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
289
290 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
291 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
292 ## while the token is pushed back to the stack.
293
294 ## An emitted token MUST immediately be handled by the tree construction stage.
295
296 ## Before each step, UA MAY check to see if either one of the scripts in
297 ## "list of scripts that will execute as soon as possible" or the first
298 ## script in the "list of scripts that will execute asynchronously",
299 ## has completed loading. If one has, then it MUST be executed
300 ## and removed from the list.
301
302 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
303 ## (This requirement was dropped from HTML5 spec, unfortunately.)
304
## Lookup table of the code points the tokenizer treats as white
## space.  It is indexed with the numeric value of the next input
## character, as in |$is_space->{$self->{nc}}|.  The entries for
## U+000B and U+000D are commented out, so a lookup for either of them
## yields a false value.
305 my $is_space = {
306 0x0009 => 1, # CHARACTER TABULATION (HT)
307 0x000A => 1, # LINE FEED (LF)
308 #0x000B => 0, # LINE TABULATION (VT)
309 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
310 #0x000D => 1, # CARRIAGE RETURN (CR)
311 0x0020 => 1, # SPACE (SP)
312 };
313
314 sub _get_next_token ($) {
315 my $self = shift;
316
317 if ($self->{self_closing}) {
318 !!!parse-error (type => 'nestc', token => $self->{ct});
319 ## NOTE: The |self_closing| flag is only set by start tag token.
320 ## In addition, when a start tag token is emitted, it is always set to
321 ## |ct|.
322 delete $self->{self_closing};
323 }
324
325 if (@{$self->{token}}) {
326 $self->{self_closing} = $self->{token}->[0]->{self_closing};
327 return shift @{$self->{token}};
328 }
329
330 A: {
331 if ($self->{state} == PCDATA_STATE) {
332 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
333
334 if ($self->{nc} == 0x0026) { # &
335 !!!cp (0.1);
336 ## NOTE: In the spec, the tokenizer is switched to the
337 ## "entity data state". In this implementation, the tokenizer
338 ## is switched to the |ENTITY_STATE|, which is an implementation
339 ## of the "consume a character reference" algorithm.
340 $self->{entity_add} = -1;
341 $self->{prev_state} = DATA_STATE;
342 $self->{state} = ENTITY_STATE;
343 !!!next-input-character;
344 redo A;
345 } elsif ($self->{nc} == 0x003C) { # <
346 !!!cp (0.2);
347 $self->{state} = TAG_OPEN_STATE;
348 !!!next-input-character;
349 redo A;
350 } elsif ($self->{nc} == -1) {
351 !!!cp (0.3);
352 !!!emit ({type => END_OF_FILE_TOKEN,
353 line => $self->{line}, column => $self->{column}});
354 last A; ## TODO: ok?
355 } else {
356 !!!cp (0.4);
357 #
358 }
359
360 # Anything else
361 my $token = {type => CHARACTER_TOKEN,
362 data => chr $self->{nc},
363 line => $self->{line}, column => $self->{column},
364 };
365 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
366
367 ## Stay in the state.
368 !!!next-input-character;
369 !!!emit ($token);
370 redo A;
371 } elsif ($self->{state} == DATA_STATE) {
372 $self->{s_kwd} = '' unless defined $self->{s_kwd};
373 if ($self->{nc} == 0x0026) { # &
374 $self->{s_kwd} = '';
375 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
376 not $self->{escape}) {
377 !!!cp (1);
378 ## NOTE: In the spec, the tokenizer is switched to the
379 ## "entity data state". In this implementation, the tokenizer
380 ## is switched to the |ENTITY_STATE|, which is an implementation
381 ## of the "consume a character reference" algorithm.
382 $self->{entity_add} = -1;
383 $self->{prev_state} = DATA_STATE;
384 $self->{state} = ENTITY_STATE;
385 !!!next-input-character;
386 redo A;
387 } else {
388 !!!cp (2);
389 #
390 }
391 } elsif ($self->{nc} == 0x002D) { # -
392 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
393 if ($self->{s_kwd} eq '<!-') {
394 !!!cp (3);
395 $self->{escape} = 1; # unless $self->{escape};
396 $self->{s_kwd} = '--';
397 #
398 } elsif ($self->{s_kwd} eq '-') {
399 !!!cp (4);
400 $self->{s_kwd} = '--';
401 #
402 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
403 !!!cp (4.1);
404 $self->{s_kwd} .= '-';
405 #
406 } else {
407 !!!cp (5);
408 $self->{s_kwd} = '-';
409 #
410 }
411 }
412
413 #
414 } elsif ($self->{nc} == 0x0021) { # !
415 if (length $self->{s_kwd}) {
416 !!!cp (5.1);
417 $self->{s_kwd} .= '!';
418 #
419 } else {
420 !!!cp (5.2);
421 #$self->{s_kwd} = '';
422 #
423 }
424 #
425 } elsif ($self->{nc} == 0x003C) { # <
426 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
427 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
428 not $self->{escape})) {
429 !!!cp (6);
430 $self->{state} = TAG_OPEN_STATE;
431 !!!next-input-character;
432 redo A;
433 } else {
434 !!!cp (7);
435 $self->{s_kwd} = '';
436 #
437 }
438 } elsif ($self->{nc} == 0x003E) { # >
439 if ($self->{escape} and
440 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
441 if ($self->{s_kwd} eq '--') {
442 !!!cp (8);
443 delete $self->{escape};
444 #
445 } else {
446 !!!cp (9);
447 #
448 }
449 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
450 !!!cp (9.1);
451 !!!parse-error (type => 'unmatched mse', ## TODO: type
452 line => $self->{line_prev},
453 column => $self->{column_prev} - 1);
454 #
455 } else {
456 !!!cp (10);
457 #
458 }
459
460 $self->{s_kwd} = '';
461 #
462 } elsif ($self->{nc} == 0x005D) { # ]
463 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
464 !!!cp (10.1);
465 $self->{s_kwd} .= ']';
466 } elsif ($self->{s_kwd} eq ']]') {
467 !!!cp (10.2);
468 #
469 } else {
470 !!!cp (10.3);
471 $self->{s_kwd} = '';
472 }
473 #
474 } elsif ($self->{nc} == -1) {
475 !!!cp (11);
476 $self->{s_kwd} = '';
477 !!!emit ({type => END_OF_FILE_TOKEN,
478 line => $self->{line}, column => $self->{column}});
479 last A; ## TODO: ok?
480 } else {
481 !!!cp (12);
482 $self->{s_kwd} = '';
483 #
484 }
485
486 # Anything else
487 my $token = {type => CHARACTER_TOKEN,
488 data => chr $self->{nc},
489 line => $self->{line}, column => $self->{column},
490 };
491 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
492 length $token->{data})) {
493 $self->{s_kwd} = '';
494 }
495
496 ## Stay in the data state.
497 if (not $self->{is_xml} and
498 $self->{content_model} == PCDATA_CONTENT_MODEL) {
499 !!!cp (13);
500 $self->{state} = PCDATA_STATE;
501 } else {
502 !!!cp (14);
503 ## Stay in the state.
504 }
505 !!!next-input-character;
506 !!!emit ($token);
507 redo A;
508 } elsif ($self->{state} == TAG_OPEN_STATE) {
509 ## XML5: "tag state".
510
511 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
512 if ($self->{nc} == 0x002F) { # /
513 !!!cp (15);
514 !!!next-input-character;
515 $self->{state} = CLOSE_TAG_OPEN_STATE;
516 redo A;
517 } elsif ($self->{nc} == 0x0021) { # !
518 !!!cp (15.1);
519 $self->{s_kwd} = $self->{escaped} ? '' : '<';
520 #
521 } else {
522 !!!cp (16);
523 $self->{s_kwd} = '';
524 #
525 }
526
527 ## reconsume
528 $self->{state} = DATA_STATE;
529 !!!emit ({type => CHARACTER_TOKEN, data => '<',
530 line => $self->{line_prev},
531 column => $self->{column_prev},
532 });
533 redo A;
534 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
535 if ($self->{nc} == 0x0021) { # !
536 !!!cp (17);
537 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
538 !!!next-input-character;
539 redo A;
540 } elsif ($self->{nc} == 0x002F) { # /
541 !!!cp (18);
542 $self->{state} = CLOSE_TAG_OPEN_STATE;
543 !!!next-input-character;
544 redo A;
545 } elsif (0x0041 <= $self->{nc} and
546 $self->{nc} <= 0x005A) { # A..Z
547 !!!cp (19);
548 $self->{ct}
549 = {type => START_TAG_TOKEN,
550 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
551 line => $self->{line_prev},
552 column => $self->{column_prev}};
553 $self->{state} = TAG_NAME_STATE;
554 !!!next-input-character;
555 redo A;
556 } elsif (0x0061 <= $self->{nc} and
557 $self->{nc} <= 0x007A) { # a..z
558 !!!cp (20);
559 $self->{ct} = {type => START_TAG_TOKEN,
560 tag_name => chr ($self->{nc}),
561 line => $self->{line_prev},
562 column => $self->{column_prev}};
563 $self->{state} = TAG_NAME_STATE;
564 !!!next-input-character;
565 redo A;
566 } elsif ($self->{nc} == 0x003E) { # >
567 !!!cp (21);
568 !!!parse-error (type => 'empty start tag',
569 line => $self->{line_prev},
570 column => $self->{column_prev});
571 $self->{state} = DATA_STATE;
572 $self->{s_kwd} = '';
573 !!!next-input-character;
574
575 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
576 line => $self->{line_prev},
577 column => $self->{column_prev},
578 });
579
580 redo A;
581 } elsif ($self->{nc} == 0x003F) { # ?
582 if ($self->{is_xml}) {
583 !!!cp (22.1);
584 $self->{state} = PI_STATE;
585 !!!next-input-character;
586 redo A;
587 } else {
588 !!!cp (22);
589 !!!parse-error (type => 'pio',
590 line => $self->{line_prev},
591 column => $self->{column_prev});
592 $self->{state} = BOGUS_COMMENT_STATE;
593 $self->{ct} = {type => COMMENT_TOKEN, data => '',
594 line => $self->{line_prev},
595 column => $self->{column_prev},
596 };
597 ## $self->{nc} is intentionally left as is
598 redo A;
599 }
600 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
601 !!!cp (23);
602 !!!parse-error (type => 'bare stago',
603 line => $self->{line_prev},
604 column => $self->{column_prev});
605 $self->{state} = DATA_STATE;
606 $self->{s_kwd} = '';
607 ## reconsume
608
609 !!!emit ({type => CHARACTER_TOKEN, data => '<',
610 line => $self->{line_prev},
611 column => $self->{column_prev},
612 });
613
614 redo A;
615 } else {
616 ## XML5: "<:" is a parse error.
617 !!!cp (23.1);
618 $self->{ct} = {type => START_TAG_TOKEN,
619 tag_name => chr ($self->{nc}),
620 line => $self->{line_prev},
621 column => $self->{column_prev}};
622 $self->{state} = TAG_NAME_STATE;
623 !!!next-input-character;
624 redo A;
625 }
626 } else {
627 die "$0: $self->{content_model} in tag open";
628 }
629 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
630 ## NOTE: The "close tag open state" in the spec is implemented as
631 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
632
633 ## XML5: "end tag state".
634
635 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
636 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
637 if (defined $self->{last_stag_name}) {
638 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
639 $self->{kwd} = '';
640 ## Reconsume.
641 redo A;
642 } else {
643 ## No start tag token has ever been emitted
644 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
645 !!!cp (28);
646 $self->{state} = DATA_STATE;
647 $self->{s_kwd} = '';
648 ## Reconsume.
649 !!!emit ({type => CHARACTER_TOKEN, data => '</',
650 line => $l, column => $c,
651 });
652 redo A;
653 }
654 }
655
656 if (0x0041 <= $self->{nc} and
657 $self->{nc} <= 0x005A) { # A..Z
658 !!!cp (29);
659 $self->{ct}
660 = {type => END_TAG_TOKEN,
661 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
662 line => $l, column => $c};
663 $self->{state} = TAG_NAME_STATE;
664 !!!next-input-character;
665 redo A;
666 } elsif (0x0061 <= $self->{nc} and
667 $self->{nc} <= 0x007A) { # a..z
668 !!!cp (30);
669 $self->{ct} = {type => END_TAG_TOKEN,
670 tag_name => chr ($self->{nc}),
671 line => $l, column => $c};
672 $self->{state} = TAG_NAME_STATE;
673 !!!next-input-character;
674 redo A;
675 } elsif ($self->{nc} == 0x003E) { # >
676 !!!parse-error (type => 'empty end tag',
677 line => $self->{line_prev}, ## "<" in "</>"
678 column => $self->{column_prev} - 1);
679 $self->{state} = DATA_STATE;
680 $self->{s_kwd} = '';
681 if ($self->{is_xml}) {
682 !!!cp (31);
683 ## XML5: No parse error.
684
685 ## NOTE: This parser raises a parse error, since it supports
686 ## XML1, not XML5.
687
688 ## NOTE: A short end tag token.
689 my $ct = {type => END_TAG_TOKEN,
690 tag_name => '',
691 line => $self->{line_prev},
692 column => $self->{column_prev} - 1,
693 };
694 !!!next-input-character;
695 !!!emit ($ct);
696 } else {
697 !!!cp (31.1);
698 !!!next-input-character;
699 }
700 redo A;
701 } elsif ($self->{nc} == -1) {
702 !!!cp (32);
703 !!!parse-error (type => 'bare etago');
704 $self->{s_kwd} = '';
705 $self->{state} = DATA_STATE;
706 # reconsume
707
708 !!!emit ({type => CHARACTER_TOKEN, data => '</',
709 line => $l, column => $c,
710 });
711
712 redo A;
713 } elsif (not $self->{is_xml} or
714 $is_space->{$self->{nc}}) {
715 !!!cp (33);
716 !!!parse-error (type => 'bogus end tag',
717 line => $self->{line_prev}, # "<" of "</"
718 column => $self->{column_prev} - 1);
719 $self->{state} = BOGUS_COMMENT_STATE;
720 $self->{ct} = {type => COMMENT_TOKEN, data => '',
721 line => $self->{line_prev}, # "<" of "</"
722 column => $self->{column_prev} - 1,
723 };
724 ## NOTE: $self->{nc} is intentionally left as is.
725 ## Although the "anything else" case of the spec does not explicitly
726 ## state that the next input character is to be reconsumed,
727 ## it will be included to the |data| of the comment token
728 ## generated from the bogus end tag, as defined in the
729 ## "bogus comment state" entry.
730 redo A;
731 } else {
732 ## XML5: "</:" is a parse error.
733 !!!cp (30.1);
734 $self->{ct} = {type => END_TAG_TOKEN,
735 tag_name => chr ($self->{nc}),
736 line => $l, column => $c};
737 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
738 !!!next-input-character;
739 redo A;
740 }
741 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
742 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
743 if (length $ch) {
744 my $CH = $ch;
745 $ch =~ tr/a-z/A-Z/;
746 my $nch = chr $self->{nc};
747 if ($nch eq $ch or $nch eq $CH) {
748 !!!cp (24);
749 ## Stay in the state.
750 $self->{kwd} .= $nch;
751 !!!next-input-character;
752 redo A;
753 } else {
754 !!!cp (25);
755 $self->{state} = DATA_STATE;
756 $self->{s_kwd} = '';
757 ## Reconsume.
758 !!!emit ({type => CHARACTER_TOKEN,
759 data => '</' . $self->{kwd},
760 line => $self->{line_prev},
761 column => $self->{column_prev} - 1 - length $self->{kwd},
762 });
763 redo A;
764 }
765 } else { # after "<{tag-name}"
766 unless ($is_space->{$self->{nc}} or
767 {
768 0x003E => 1, # >
769 0x002F => 1, # /
770 -1 => 1, # EOF
771 }->{$self->{nc}}) {
772 !!!cp (26);
773 ## Reconsume.
774 $self->{state} = DATA_STATE;
775 $self->{s_kwd} = '';
776 !!!emit ({type => CHARACTER_TOKEN,
777 data => '</' . $self->{kwd},
778 line => $self->{line_prev},
779 column => $self->{column_prev} - 1 - length $self->{kwd},
780 });
781 redo A;
782 } else {
783 !!!cp (27);
784 $self->{ct}
785 = {type => END_TAG_TOKEN,
786 tag_name => $self->{last_stag_name},
787 line => $self->{line_prev},
788 column => $self->{column_prev} - 1 - length $self->{kwd}};
789 $self->{state} = TAG_NAME_STATE;
790 ## Reconsume.
791 redo A;
792 }
793 }
794 } elsif ($self->{state} == TAG_NAME_STATE) {
795 if ($is_space->{$self->{nc}}) {
796 !!!cp (34);
797 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
798 !!!next-input-character;
799 redo A;
800 } elsif ($self->{nc} == 0x003E) { # >
801 if ($self->{ct}->{type} == START_TAG_TOKEN) {
802 !!!cp (35);
803 $self->{last_stag_name} = $self->{ct}->{tag_name};
804 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
805 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
806 #if ($self->{ct}->{attributes}) {
807 # ## NOTE: This should never be reached.
808 # !!! cp (36);
809 # !!! parse-error (type => 'end tag attribute');
810 #} else {
811 !!!cp (37);
812 #}
813 } else {
814 die "$0: $self->{ct}->{type}: Unknown token type";
815 }
816 $self->{state} = DATA_STATE;
817 $self->{s_kwd} = '';
818 !!!next-input-character;
819
820 !!!emit ($self->{ct}); # start tag or end tag
821
822 redo A;
823 } elsif (0x0041 <= $self->{nc} and
824 $self->{nc} <= 0x005A) { # A..Z
825 !!!cp (38);
826 $self->{ct}->{tag_name}
827 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
828 # start tag or end tag
829 ## Stay in this state
830 !!!next-input-character;
831 redo A;
832 } elsif ($self->{nc} == -1) {
833 !!!parse-error (type => 'unclosed tag');
834 if ($self->{ct}->{type} == START_TAG_TOKEN) {
835 !!!cp (39);
836 $self->{last_stag_name} = $self->{ct}->{tag_name};
837 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
838 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
839 #if ($self->{ct}->{attributes}) {
840 # ## NOTE: This state should never be reached.
841 # !!! cp (40);
842 # !!! parse-error (type => 'end tag attribute');
843 #} else {
844 !!!cp (41);
845 #}
846 } else {
847 die "$0: $self->{ct}->{type}: Unknown token type";
848 }
849 $self->{state} = DATA_STATE;
850 $self->{s_kwd} = '';
851 # reconsume
852
853 !!!emit ($self->{ct}); # start tag or end tag
854
855 redo A;
856 } elsif ($self->{nc} == 0x002F) { # /
857 !!!cp (42);
858 $self->{state} = SELF_CLOSING_START_TAG_STATE;
859 !!!next-input-character;
860 redo A;
861 } else {
862 !!!cp (44);
863 $self->{ct}->{tag_name} .= chr $self->{nc};
864 # start tag or end tag
865 ## Stay in the state
866 !!!next-input-character;
867 redo A;
868 }
869 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
870 ## XML5: "Tag attribute name before state".
871
872 if ($is_space->{$self->{nc}}) {
873 !!!cp (45);
874 ## Stay in the state
875 !!!next-input-character;
876 redo A;
877 } elsif ($self->{nc} == 0x003E) { # >
878 if ($self->{ct}->{type} == START_TAG_TOKEN) {
879 !!!cp (46);
880 $self->{last_stag_name} = $self->{ct}->{tag_name};
881 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
882 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
883 if ($self->{ct}->{attributes}) {
884 !!!cp (47);
885 !!!parse-error (type => 'end tag attribute');
886 } else {
887 !!!cp (48);
888 }
889 } else {
890 die "$0: $self->{ct}->{type}: Unknown token type";
891 }
892 $self->{state} = DATA_STATE;
893 $self->{s_kwd} = '';
894 !!!next-input-character;
895
896 !!!emit ($self->{ct}); # start tag or end tag
897
898 redo A;
899 } elsif (0x0041 <= $self->{nc} and
900 $self->{nc} <= 0x005A) { # A..Z
901 !!!cp (49);
902 $self->{ca}
903 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
904 value => '',
905 line => $self->{line}, column => $self->{column}};
906 $self->{state} = ATTRIBUTE_NAME_STATE;
907 !!!next-input-character;
908 redo A;
909 } elsif ($self->{nc} == 0x002F) { # /
910 !!!cp (50);
911 $self->{state} = SELF_CLOSING_START_TAG_STATE;
912 !!!next-input-character;
913 redo A;
914 } elsif ($self->{nc} == -1) {
915 !!!parse-error (type => 'unclosed tag');
916 if ($self->{ct}->{type} == START_TAG_TOKEN) {
917 !!!cp (52);
918 $self->{last_stag_name} = $self->{ct}->{tag_name};
919 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
920 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
921 if ($self->{ct}->{attributes}) {
922 !!!cp (53);
923 !!!parse-error (type => 'end tag attribute');
924 } else {
925 !!!cp (54);
926 }
927 } else {
928 die "$0: $self->{ct}->{type}: Unknown token type";
929 }
930 $self->{state} = DATA_STATE;
931 $self->{s_kwd} = '';
932 # reconsume
933
934 !!!emit ($self->{ct}); # start tag or end tag
935
936 redo A;
937 } else {
938 if ({
939 0x0022 => 1, # "
940 0x0027 => 1, # '
941 0x003D => 1, # =
942 }->{$self->{nc}}) {
943 !!!cp (55);
944 ## XML5: Not a parse error.
945 !!!parse-error (type => 'bad attribute name');
946 } else {
947 !!!cp (56);
948 ## XML5: ":" raises a parse error and is ignored.
949 }
950 $self->{ca}
951 = {name => chr ($self->{nc}),
952 value => '',
953 line => $self->{line}, column => $self->{column}};
954 $self->{state} = ATTRIBUTE_NAME_STATE;
955 !!!next-input-character;
956 redo A;
957 }
958 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
959 ## XML5: "Tag attribute name state".
960
961 my $before_leave = sub {
962 if (exists $self->{ct}->{attributes} # start tag or end tag
963 ->{$self->{ca}->{name}}) { # MUST
964 !!!cp (57);
965 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
966 ## Discard $self->{ca} # MUST
967 } else {
968 !!!cp (58);
969 $self->{ct}->{attributes}->{$self->{ca}->{name}}
970 = $self->{ca};
971 $self->{ca}->{index} = ++$self->{ct}->{last_index};
972 }
973 }; # $before_leave
974
975 if ($is_space->{$self->{nc}}) {
976 !!!cp (59);
977 $before_leave->();
978 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
979 !!!next-input-character;
980 redo A;
981 } elsif ($self->{nc} == 0x003D) { # =
982 !!!cp (60);
983 $before_leave->();
984 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
985 !!!next-input-character;
986 redo A;
987 } elsif ($self->{nc} == 0x003E) { # >
988 if ($self->{is_xml}) {
989 !!!cp (60.1);
990 ## XML5: Not a parse error.
991 !!!parse-error (type => 'no attr value'); ## TODO: type
992 } else {
993 !!!cp (60.2);
994 }
995
996 $before_leave->();
997 if ($self->{ct}->{type} == START_TAG_TOKEN) {
998 !!!cp (61);
999 $self->{last_stag_name} = $self->{ct}->{tag_name};
1000 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1001 !!!cp (62);
1002 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1003 if ($self->{ct}->{attributes}) {
1004 !!!parse-error (type => 'end tag attribute');
1005 }
1006 } else {
1007 die "$0: $self->{ct}->{type}: Unknown token type";
1008 }
1009 $self->{state} = DATA_STATE;
1010 $self->{s_kwd} = '';
1011 !!!next-input-character;
1012
1013 !!!emit ($self->{ct}); # start tag or end tag
1014
1015 redo A;
1016 } elsif (0x0041 <= $self->{nc} and
1017 $self->{nc} <= 0x005A) { # A..Z
1018 !!!cp (63);
1019 $self->{ca}->{name}
1020 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1021 ## Stay in the state
1022 !!!next-input-character;
1023 redo A;
1024 } elsif ($self->{nc} == 0x002F) { # /
1025 if ($self->{is_xml}) {
1026 !!!cp (64);
1027 ## XML5: Not a parse error.
1028 !!!parse-error (type => 'no attr value'); ## TODO: type
1029 } else {
1030 !!!cp (64.1);
1031 }
1032
1033 $before_leave->();
1034 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1035 !!!next-input-character;
1036 redo A;
1037 } elsif ($self->{nc} == -1) {
1038 !!!parse-error (type => 'unclosed tag');
1039 $before_leave->();
1040 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1041 !!!cp (66);
1042 $self->{last_stag_name} = $self->{ct}->{tag_name};
1043 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1044 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1045 if ($self->{ct}->{attributes}) {
1046 !!!cp (67);
1047 !!!parse-error (type => 'end tag attribute');
1048 } else {
1049 ## NOTE: This state should never be reached.
1050 !!!cp (68);
1051 }
1052 } else {
1053 die "$0: $self->{ct}->{type}: Unknown token type";
1054 }
1055 $self->{state} = DATA_STATE;
1056 $self->{s_kwd} = '';
1057 # reconsume
1058
1059 !!!emit ($self->{ct}); # start tag or end tag
1060
1061 redo A;
1062 } else {
1063 if ($self->{nc} == 0x0022 or # "
1064 $self->{nc} == 0x0027) { # '
1065 !!!cp (69);
1066 ## XML5: Not a parse error.
1067 !!!parse-error (type => 'bad attribute name');
1068 } else {
1069 !!!cp (70);
1070 }
1071 $self->{ca}->{name} .= chr ($self->{nc});
1072 ## Stay in the state
1073 !!!next-input-character;
1074 redo A;
1075 }
1076 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1077 ## XML5: "Tag attribute name after state".
1078
1079 if ($is_space->{$self->{nc}}) {
1080 !!!cp (71);
1081 ## Stay in the state
1082 !!!next-input-character;
1083 redo A;
1084 } elsif ($self->{nc} == 0x003D) { # =
1085 !!!cp (72);
1086 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1087 !!!next-input-character;
1088 redo A;
1089 } elsif ($self->{nc} == 0x003E) { # >
1090 if ($self->{is_xml}) {
1091 !!!cp (72.1);
1092 ## XML5: Not a parse error.
1093 !!!parse-error (type => 'no attr value'); ## TODO: type
1094 } else {
1095 !!!cp (72.2);
1096 }
1097
1098 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1099 !!!cp (73);
1100 $self->{last_stag_name} = $self->{ct}->{tag_name};
1101 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1102 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1103 if ($self->{ct}->{attributes}) {
1104 !!!cp (74);
1105 !!!parse-error (type => 'end tag attribute');
1106 } else {
1107 ## NOTE: This state should never be reached.
1108 !!!cp (75);
1109 }
1110 } else {
1111 die "$0: $self->{ct}->{type}: Unknown token type";
1112 }
1113 $self->{state} = DATA_STATE;
1114 $self->{s_kwd} = '';
1115 !!!next-input-character;
1116
1117 !!!emit ($self->{ct}); # start tag or end tag
1118
1119 redo A;
1120 } elsif (0x0041 <= $self->{nc} and
1121 $self->{nc} <= 0x005A) { # A..Z
1122 !!!cp (76);
1123 $self->{ca}
1124 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1125 value => '',
1126 line => $self->{line}, column => $self->{column}};
1127 $self->{state} = ATTRIBUTE_NAME_STATE;
1128 !!!next-input-character;
1129 redo A;
1130 } elsif ($self->{nc} == 0x002F) { # /
1131 if ($self->{is_xml}) {
1132 !!!cp (77);
1133 ## XML5: Not a parse error.
1134 !!!parse-error (type => 'no attr value'); ## TODO: type
1135 } else {
1136 !!!cp (77.1);
1137 }
1138
1139 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1140 !!!next-input-character;
1141 redo A;
1142 } elsif ($self->{nc} == -1) {
1143 !!!parse-error (type => 'unclosed tag');
1144 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1145 !!!cp (79);
1146 $self->{last_stag_name} = $self->{ct}->{tag_name};
1147 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1148 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1149 if ($self->{ct}->{attributes}) {
1150 !!!cp (80);
1151 !!!parse-error (type => 'end tag attribute');
1152 } else {
1153 ## NOTE: This state should never be reached.
1154 !!!cp (81);
1155 }
1156 } else {
1157 die "$0: $self->{ct}->{type}: Unknown token type";
1158 }
1159 $self->{s_kwd} = '';
1160 $self->{state} = DATA_STATE;
1161 # reconsume
1162
1163 !!!emit ($self->{ct}); # start tag or end tag
1164
1165 redo A;
1166 } else {
1167 if ($self->{is_xml}) {
1168 !!!cp (78.1);
1169 ## XML5: Not a parse error.
1170 !!!parse-error (type => 'no attr value'); ## TODO: type
1171 } else {
1172 !!!cp (78.2);
1173 }
1174
1175 if ($self->{nc} == 0x0022 or # "
1176 $self->{nc} == 0x0027) { # '
1177 !!!cp (78);
1178 ## XML5: Not a parse error.
1179 !!!parse-error (type => 'bad attribute name');
1180 } else {
1181 !!!cp (82);
1182 }
1183 $self->{ca}
1184 = {name => chr ($self->{nc}),
1185 value => '',
1186 line => $self->{line}, column => $self->{column}};
1187 $self->{state} = ATTRIBUTE_NAME_STATE;
1188 !!!next-input-character;
1189 redo A;
1190 }
1191 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1192 ## XML5: "Tag attribute value before state".
1193
1194 if ($is_space->{$self->{nc}}) {
1195 !!!cp (83);
1196 ## Stay in the state
1197 !!!next-input-character;
1198 redo A;
1199 } elsif ($self->{nc} == 0x0022) { # "
1200 !!!cp (84);
1201 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1202 !!!next-input-character;
1203 redo A;
1204 } elsif ($self->{nc} == 0x0026) { # &
1205 !!!cp (85);
1206 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1207 ## reconsume
1208 redo A;
1209 } elsif ($self->{nc} == 0x0027) { # '
1210 !!!cp (86);
1211 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1212 !!!next-input-character;
1213 redo A;
1214 } elsif ($self->{nc} == 0x003E) { # >
1215 !!!parse-error (type => 'empty unquoted attribute value');
1216 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1217 !!!cp (87);
1218 $self->{last_stag_name} = $self->{ct}->{tag_name};
1219 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1220 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1221 if ($self->{ct}->{attributes}) {
1222 !!!cp (88);
1223 !!!parse-error (type => 'end tag attribute');
1224 } else {
1225 ## NOTE: This state should never be reached.
1226 !!!cp (89);
1227 }
1228 } else {
1229 die "$0: $self->{ct}->{type}: Unknown token type";
1230 }
1231 $self->{state} = DATA_STATE;
1232 $self->{s_kwd} = '';
1233 !!!next-input-character;
1234
1235 !!!emit ($self->{ct}); # start tag or end tag
1236
1237 redo A;
1238 } elsif ($self->{nc} == -1) {
1239 !!!parse-error (type => 'unclosed tag');
1240 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1241 !!!cp (90);
1242 $self->{last_stag_name} = $self->{ct}->{tag_name};
1243 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1244 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1245 if ($self->{ct}->{attributes}) {
1246 !!!cp (91);
1247 !!!parse-error (type => 'end tag attribute');
1248 } else {
1249 ## NOTE: This state should never be reached.
1250 !!!cp (92);
1251 }
1252 } else {
1253 die "$0: $self->{ct}->{type}: Unknown token type";
1254 }
1255 $self->{state} = DATA_STATE;
1256 $self->{s_kwd} = '';
1257 ## reconsume
1258
1259 !!!emit ($self->{ct}); # start tag or end tag
1260
1261 redo A;
1262 } else {
1263 if ($self->{nc} == 0x003D) { # =
1264 !!!cp (93);
1265 ## XML5: Not a parse error.
1266 !!!parse-error (type => 'bad attribute value');
1267 } elsif ($self->{is_xml}) {
1268 !!!cp (93.1);
1269 ## XML5: No parse error.
1270 !!!parse-error (type => 'unquoted attr value'); ## TODO
1271 } else {
1272 !!!cp (94);
1273 }
1274 $self->{ca}->{value} .= chr ($self->{nc});
1275 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1276 !!!next-input-character;
1277 redo A;
1278 }
1279 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1280 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1281 ## ATTLIST attribute value double quoted state".
1282
1283 if ($self->{nc} == 0x0022) { # "
1284 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1285 !!!cp (95.1);
1286 ## XML5: "DOCTYPE ATTLIST name after state".
1287 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1288 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1289 } else {
1290 !!!cp (95);
1291 ## XML5: "Tag attribute name before state".
1292 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1293 }
1294 !!!next-input-character;
1295 redo A;
1296 } elsif ($self->{nc} == 0x0026) { # &
1297 !!!cp (96);
1298 ## XML5: Not defined yet.
1299
1300 ## NOTE: In the spec, the tokenizer is switched to the
1301 ## "entity in attribute value state". In this implementation, the
1302 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1303 ## implementation of the "consume a character reference" algorithm.
1304 $self->{prev_state} = $self->{state};
1305 $self->{entity_add} = 0x0022; # "
1306 $self->{state} = ENTITY_STATE;
1307 !!!next-input-character;
1308 redo A;
1309 } elsif ($self->{nc} == -1) {
1310 !!!parse-error (type => 'unclosed attribute value');
1311 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1312 !!!cp (97);
1313 $self->{last_stag_name} = $self->{ct}->{tag_name};
1314
1315 $self->{state} = DATA_STATE;
1316 $self->{s_kwd} = '';
1317 ## reconsume
1318 !!!emit ($self->{ct}); # start tag
1319 redo A;
1320 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1321 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1322 if ($self->{ct}->{attributes}) {
1323 !!!cp (98);
1324 !!!parse-error (type => 'end tag attribute');
1325 } else {
1326 ## NOTE: This state should never be reached.
1327 !!!cp (99);
1328 }
1329
1330 $self->{state} = DATA_STATE;
1331 $self->{s_kwd} = '';
1332 ## reconsume
1333 !!!emit ($self->{ct}); # end tag
1334 redo A;
1335 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1336 ## XML5: No parse error above; not defined yet.
1337 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1338 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1339 ## Reconsume.
1340 !!!emit ($self->{ct}); # ATTLIST
1341 redo A;
1342 } else {
1343 die "$0: $self->{ct}->{type}: Unknown token type";
1344 }
1345 } else {
1346 ## XML5 [ATTLIST]: Not defined yet.
1347 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1348 !!!cp (100);
1349 ## XML5: Not a parse error.
1350 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1351 } else {
1352 !!!cp (100.1);
1353 }
1354 $self->{ca}->{value} .= chr ($self->{nc});
1355 $self->{read_until}->($self->{ca}->{value},
1356 q["&<],
1357 length $self->{ca}->{value});
1358
1359 ## Stay in the state
1360 !!!next-input-character;
1361 redo A;
1362 }
1363 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1364 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1365 ## ATTLIST attribute value single quoted state".
1366
1367 if ($self->{nc} == 0x0027) { # '
1368 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1369 !!!cp (101.1);
1370 ## XML5: "DOCTYPE ATTLIST name after state".
1371 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1372 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1373 } else {
1374 !!!cp (101);
1375 ## XML5: "Before attribute name state" (sic).
1376 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1377 }
1378 !!!next-input-character;
1379 redo A;
1380 } elsif ($self->{nc} == 0x0026) { # &
1381 !!!cp (102);
1382 ## XML5: Not defined yet.
1383
1384 ## NOTE: In the spec, the tokenizer is switched to the
1385 ## "entity in attribute value state". In this implementation, the
1386 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1387 ## implementation of the "consume a character reference" algorithm.
1388 $self->{entity_add} = 0x0027; # '
1389 $self->{prev_state} = $self->{state};
1390 $self->{state} = ENTITY_STATE;
1391 !!!next-input-character;
1392 redo A;
1393 } elsif ($self->{nc} == -1) {
1394 !!!parse-error (type => 'unclosed attribute value');
1395 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1396 !!!cp (103);
1397 $self->{last_stag_name} = $self->{ct}->{tag_name};
1398
1399 $self->{state} = DATA_STATE;
1400 $self->{s_kwd} = '';
1401 ## reconsume
1402 !!!emit ($self->{ct}); # start tag
1403 redo A;
1404 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1405 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1406 if ($self->{ct}->{attributes}) {
1407 !!!cp (104);
1408 !!!parse-error (type => 'end tag attribute');
1409 } else {
1410 ## NOTE: This state should never be reached.
1411 !!!cp (105);
1412 }
1413
1414 $self->{state} = DATA_STATE;
1415 $self->{s_kwd} = '';
1416 ## reconsume
1417 !!!emit ($self->{ct}); # end tag
1418 redo A;
1419 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1420 ## XML5: No parse error above; not defined yet.
1421 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1422 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1423 ## Reconsume.
1424 !!!emit ($self->{ct}); # ATTLIST
1425 redo A;
1426 } else {
1427 die "$0: $self->{ct}->{type}: Unknown token type";
1428 }
1429 } else {
1430 ## XML5 [ATTLIST]: Not defined yet.
1431 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1432 !!!cp (106);
1433 ## XML5: Not a parse error.
1434 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1435 } else {
1436 !!!cp (106.1);
1437 }
1438 $self->{ca}->{value} .= chr ($self->{nc});
1439 $self->{read_until}->($self->{ca}->{value},
1440 q['&<],
1441 length $self->{ca}->{value});
1442
1443 ## Stay in the state
1444 !!!next-input-character;
1445 redo A;
1446 }
1447 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1448 ## XML5: "Tag attribute value unquoted state".
1449
1450 if ($is_space->{$self->{nc}}) {
1451 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1452 !!!cp (107.1);
1453 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1454 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1455 } else {
1456 !!!cp (107);
1457 ## XML5: "Tag attribute name before state".
1458 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1459 }
1460 !!!next-input-character;
1461 redo A;
1462 } elsif ($self->{nc} == 0x0026) { # &
1463 !!!cp (108);
1464
1465 ## XML5: Not defined yet.
1466
1467 ## NOTE: In the spec, the tokenizer is switched to the
1468 ## "entity in attribute value state". In this implementation, the
1469 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1470 ## implementation of the "consume a character reference" algorithm.
1471 $self->{entity_add} = -1;
1472 $self->{prev_state} = $self->{state};
1473 $self->{state} = ENTITY_STATE;
1474 !!!next-input-character;
1475 redo A;
1476 } elsif ($self->{nc} == 0x003E) { # >
1477 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1478 !!!cp (109);
1479 $self->{last_stag_name} = $self->{ct}->{tag_name};
1480
1481 $self->{state} = DATA_STATE;
1482 $self->{s_kwd} = '';
1483 !!!next-input-character;
1484 !!!emit ($self->{ct}); # start tag
1485 redo A;
1486 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1487 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1488 if ($self->{ct}->{attributes}) {
1489 !!!cp (110);
1490 !!!parse-error (type => 'end tag attribute');
1491 } else {
1492 ## NOTE: This state should never be reached.
1493 !!!cp (111);
1494 }
1495
1496 $self->{state} = DATA_STATE;
1497 $self->{s_kwd} = '';
1498 !!!next-input-character;
1499 !!!emit ($self->{ct}); # end tag
1500 redo A;
1501 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1502 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1503 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1504 !!!next-input-character;
1505 !!!emit ($self->{ct}); # ATTLIST
1506 redo A;
1507 } else {
1508 die "$0: $self->{ct}->{type}: Unknown token type";
1509 }
1510 } elsif ($self->{nc} == -1) {
1511 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1512 !!!cp (112);
1513 !!!parse-error (type => 'unclosed tag');
1514 $self->{last_stag_name} = $self->{ct}->{tag_name};
1515
1516 $self->{state} = DATA_STATE;
1517 $self->{s_kwd} = '';
1518 ## reconsume
1519 !!!emit ($self->{ct}); # start tag
1520 redo A;
1521 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1522 !!!parse-error (type => 'unclosed tag');
1523 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1524 if ($self->{ct}->{attributes}) {
1525 !!!cp (113);
1526 !!!parse-error (type => 'end tag attribute');
1527 } else {
1528 ## NOTE: This state should never be reached.
1529 !!!cp (114);
1530 }
1531
1532 $self->{state} = DATA_STATE;
1533 $self->{s_kwd} = '';
1534 ## reconsume
1535 !!!emit ($self->{ct}); # end tag
1536 redo A;
1537 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1538 !!!parse-error (type => 'unclosed md'); ## TODO: type
1539 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1540 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1541 ## Reconsume.
1542 !!!emit ($self->{ct}); # ATTLIST
1543 redo A;
1544 } else {
1545 die "$0: $self->{ct}->{type}: Unknown token type";
1546 }
1547 } else {
1548 if ({
1549 0x0022 => 1, # "
1550 0x0027 => 1, # '
1551 0x003D => 1, # =
1552 }->{$self->{nc}}) {
1553 !!!cp (115);
1554 ## XML5: Not a parse error.
1555 !!!parse-error (type => 'bad attribute value');
1556 } else {
1557 !!!cp (116);
1558 }
1559 $self->{ca}->{value} .= chr ($self->{nc});
1560 $self->{read_until}->($self->{ca}->{value},
1561 q["'=& >],
1562 length $self->{ca}->{value});
1563
1564 ## Stay in the state
1565 !!!next-input-character;
1566 redo A;
1567 }
1568 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1569 if ($is_space->{$self->{nc}}) {
1570 !!!cp (118);
1571 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1572 !!!next-input-character;
1573 redo A;
1574 } elsif ($self->{nc} == 0x003E) { # >
1575 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1576 !!!cp (119);
1577 $self->{last_stag_name} = $self->{ct}->{tag_name};
1578 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1579 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1580 if ($self->{ct}->{attributes}) {
1581 !!!cp (120);
1582 !!!parse-error (type => 'end tag attribute');
1583 } else {
1584 ## NOTE: This state should never be reached.
1585 !!!cp (121);
1586 }
1587 } else {
1588 die "$0: $self->{ct}->{type}: Unknown token type";
1589 }
1590 $self->{state} = DATA_STATE;
1591 $self->{s_kwd} = '';
1592 !!!next-input-character;
1593
1594 !!!emit ($self->{ct}); # start tag or end tag
1595
1596 redo A;
1597 } elsif ($self->{nc} == 0x002F) { # /
1598 !!!cp (122);
1599 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1600 !!!next-input-character;
1601 redo A;
1602 } elsif ($self->{nc} == -1) {
1603 !!!parse-error (type => 'unclosed tag');
1604 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1605 !!!cp (122.3);
1606 $self->{last_stag_name} = $self->{ct}->{tag_name};
1607 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1608 if ($self->{ct}->{attributes}) {
1609 !!!cp (122.1);
1610 !!!parse-error (type => 'end tag attribute');
1611 } else {
1612 ## NOTE: This state should never be reached.
1613 !!!cp (122.2);
1614 }
1615 } else {
1616 die "$0: $self->{ct}->{type}: Unknown token type";
1617 }
1618 $self->{state} = DATA_STATE;
1619 $self->{s_kwd} = '';
1620 ## Reconsume.
1621 !!!emit ($self->{ct}); # start tag or end tag
1622 redo A;
1623 } else {
1624 !!!cp ('124.1');
1625 !!!parse-error (type => 'no space between attributes');
1626 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1627 ## reconsume
1628 redo A;
1629 }
1630 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1631 ## XML5: "Empty tag state".
1632
1633 if ($self->{nc} == 0x003E) { # >
1634 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1635 !!!cp ('124.2');
1636 !!!parse-error (type => 'nestc', token => $self->{ct});
1637 ## TODO: Different type than slash in start tag
1638 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1639 if ($self->{ct}->{attributes}) {
1640 !!!cp ('124.4');
1641 !!!parse-error (type => 'end tag attribute');
1642 } else {
1643 !!!cp ('124.5');
1644 }
1645 ## TODO: Test |<title></title/>|
1646 } else {
1647 !!!cp ('124.3');
1648 $self->{self_closing} = 1;
1649 }
1650
1651 $self->{state} = DATA_STATE;
1652 $self->{s_kwd} = '';
1653 !!!next-input-character;
1654
1655 !!!emit ($self->{ct}); # start tag or end tag
1656
1657 redo A;
1658 } elsif ($self->{nc} == -1) {
1659 !!!parse-error (type => 'unclosed tag');
1660 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1661 !!!cp (124.7);
1662 $self->{last_stag_name} = $self->{ct}->{tag_name};
1663 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1664 if ($self->{ct}->{attributes}) {
1665 !!!cp (124.5);
1666 !!!parse-error (type => 'end tag attribute');
1667 } else {
1668 ## NOTE: This state should never be reached.
1669 !!!cp (124.6);
1670 }
1671 } else {
1672 die "$0: $self->{ct}->{type}: Unknown token type";
1673 }
1674 ## XML5: "Tag attribute name before state".
1675 $self->{state} = DATA_STATE;
1676 $self->{s_kwd} = '';
1677 ## Reconsume.
1678 !!!emit ($self->{ct}); # start tag or end tag
1679 redo A;
1680 } else {
1681 !!!cp ('124.4');
1682 !!!parse-error (type => 'nestc');
1683 ## TODO: This error type is wrong.
1684 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1685 ## Reconsume.
1686 redo A;
1687 }
1688 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1689 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1690
1691 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1692 ## consumes characters on a one-by-one basis.
1693
1694 if ($self->{nc} == 0x003E) { # >
1695 if ($self->{in_subset}) {
1696 !!!cp (123);
1697 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1698 } else {
1699 !!!cp (124);
1700 $self->{state} = DATA_STATE;
1701 $self->{s_kwd} = '';
1702 }
1703 !!!next-input-character;
1704
1705 !!!emit ($self->{ct}); # comment
1706 redo A;
1707 } elsif ($self->{nc} == -1) {
1708 if ($self->{in_subset}) {
1709 !!!cp (125.1);
1710 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1711 } else {
1712 !!!cp (125);
1713 $self->{state} = DATA_STATE;
1714 $self->{s_kwd} = '';
1715 }
1716 ## reconsume
1717
1718 !!!emit ($self->{ct}); # comment
1719 redo A;
1720 } else {
1721 !!!cp (126);
1722 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1723 $self->{read_until}->($self->{ct}->{data},
1724 q[>],
1725 length $self->{ct}->{data});
1726
1727 ## Stay in the state.
1728 !!!next-input-character;
1729 redo A;
1730 }
1731 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1732 ## XML5: "Markup declaration state".
1733
1734 if ($self->{nc} == 0x002D) { # -
1735 !!!cp (133);
1736 $self->{state} = MD_HYPHEN_STATE;
1737 !!!next-input-character;
1738 redo A;
1739 } elsif ($self->{nc} == 0x0044 or # D
1740 $self->{nc} == 0x0064) { # d
1741 ## ASCII case-insensitive.
1742 !!!cp (130);
1743 $self->{state} = MD_DOCTYPE_STATE;
1744 $self->{kwd} = chr $self->{nc};
1745 !!!next-input-character;
1746 redo A;
1747 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1748 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1749 $self->{is_xml}) and
1750 $self->{nc} == 0x005B) { # [
1751 !!!cp (135.4);
1752 $self->{state} = MD_CDATA_STATE;
1753 $self->{kwd} = '[';
1754 !!!next-input-character;
1755 redo A;
1756 } else {
1757 !!!cp (136);
1758 }
1759
1760 !!!parse-error (type => 'bogus comment',
1761 line => $self->{line_prev},
1762 column => $self->{column_prev} - 1);
1763 ## Reconsume.
1764 $self->{state} = BOGUS_COMMENT_STATE;
1765 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1766 line => $self->{line_prev},
1767 column => $self->{column_prev} - 1,
1768 };
1769 redo A;
1770 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1771 if ($self->{nc} == 0x002D) { # -
1772 !!!cp (127);
1773 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1774 line => $self->{line_prev},
1775 column => $self->{column_prev} - 2,
1776 };
1777 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1778 !!!next-input-character;
1779 redo A;
1780 } else {
1781 !!!cp (128);
1782 !!!parse-error (type => 'bogus comment',
1783 line => $self->{line_prev},
1784 column => $self->{column_prev} - 2);
1785 $self->{state} = BOGUS_COMMENT_STATE;
1786 ## Reconsume.
1787 $self->{ct} = {type => COMMENT_TOKEN,
1788 data => '-',
1789 line => $self->{line_prev},
1790 column => $self->{column_prev} - 2,
1791 };
1792 redo A;
1793 }
1794 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1795 ## ASCII case-insensitive.
1796 if ($self->{nc} == [
1797 undef,
1798 0x004F, # O
1799 0x0043, # C
1800 0x0054, # T
1801 0x0059, # Y
1802 0x0050, # P
1803 ]->[length $self->{kwd}] or
1804 $self->{nc} == [
1805 undef,
1806 0x006F, # o
1807 0x0063, # c
1808 0x0074, # t
1809 0x0079, # y
1810 0x0070, # p
1811 ]->[length $self->{kwd}]) {
1812 !!!cp (131);
1813 ## Stay in the state.
1814 $self->{kwd} .= chr $self->{nc};
1815 !!!next-input-character;
1816 redo A;
1817 } elsif ((length $self->{kwd}) == 6 and
1818 ($self->{nc} == 0x0045 or # E
1819 $self->{nc} == 0x0065)) { # e
1820 if ($self->{is_xml} and
1821 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1822 !!!cp (129);
1823 ## XML5: case-sensitive.
1824 !!!parse-error (type => 'lowercase keyword', ## TODO
1825 text => 'DOCTYPE',
1826 line => $self->{line_prev},
1827 column => $self->{column_prev} - 5);
1828 } else {
1829 !!!cp (129.1);
1830 }
1831 $self->{state} = DOCTYPE_STATE;
1832 $self->{ct} = {type => DOCTYPE_TOKEN,
1833 quirks => 1,
1834 line => $self->{line_prev},
1835 column => $self->{column_prev} - 7,
1836 };
1837 !!!next-input-character;
1838 redo A;
1839 } else {
1840 !!!cp (132);
1841 !!!parse-error (type => 'bogus comment',
1842 line => $self->{line_prev},
1843 column => $self->{column_prev} - 1 - length $self->{kwd});
1844 $self->{state} = BOGUS_COMMENT_STATE;
1845 ## Reconsume.
1846 $self->{ct} = {type => COMMENT_TOKEN,
1847 data => $self->{kwd},
1848 line => $self->{line_prev},
1849 column => $self->{column_prev} - 1 - length $self->{kwd},
1850 };
1851 redo A;
1852 }
1853 } elsif ($self->{state} == MD_CDATA_STATE) {
1854 if ($self->{nc} == {
1855 '[' => 0x0043, # C
1856 '[C' => 0x0044, # D
1857 '[CD' => 0x0041, # A
1858 '[CDA' => 0x0054, # T
1859 '[CDAT' => 0x0041, # A
1860 }->{$self->{kwd}}) {
1861 !!!cp (135.1);
1862 ## Stay in the state.
1863 $self->{kwd} .= chr $self->{nc};
1864 !!!next-input-character;
1865 redo A;
1866 } elsif ($self->{kwd} eq '[CDATA' and
1867 $self->{nc} == 0x005B) { # [
1868 if ($self->{is_xml} and
1869 not $self->{tainted} and
1870 @{$self->{open_elements} or []} == 0) {
1871 !!!cp (135.2);
1872 !!!parse-error (type => 'cdata outside of root element',
1873 line => $self->{line_prev},
1874 column => $self->{column_prev} - 7);
1875 $self->{tainted} = 1;
1876 } else {
1877 !!!cp (135.21);
1878 }
1879
1880 $self->{ct} = {type => CHARACTER_TOKEN,
1881 data => '',
1882 line => $self->{line_prev},
1883 column => $self->{column_prev} - 7};
1884 $self->{state} = CDATA_SECTION_STATE;
1885 !!!next-input-character;
1886 redo A;
1887 } else {
1888 !!!cp (135.3);
1889 !!!parse-error (type => 'bogus comment',
1890 line => $self->{line_prev},
1891 column => $self->{column_prev} - 1 - length $self->{kwd});
1892 $self->{state} = BOGUS_COMMENT_STATE;
1893 ## Reconsume.
1894 $self->{ct} = {type => COMMENT_TOKEN,
1895 data => $self->{kwd},
1896 line => $self->{line_prev},
1897 column => $self->{column_prev} - 1 - length $self->{kwd},
1898 };
1899 redo A;
1900 }
1901 } elsif ($self->{state} == COMMENT_START_STATE) {
1902 if ($self->{nc} == 0x002D) { # -
1903 !!!cp (137);
1904 $self->{state} = COMMENT_START_DASH_STATE;
1905 !!!next-input-character;
1906 redo A;
1907 } elsif ($self->{nc} == 0x003E) { # >
1908 !!!parse-error (type => 'bogus comment');
1909 if ($self->{in_subset}) {
1910 !!!cp (138.1);
1911 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1912 } else {
1913 !!!cp (138);
1914 $self->{state} = DATA_STATE;
1915 $self->{s_kwd} = '';
1916 }
1917 !!!next-input-character;
1918
1919 !!!emit ($self->{ct}); # comment
1920
1921 redo A;
1922 } elsif ($self->{nc} == -1) {
1923 !!!parse-error (type => 'unclosed comment');
1924 if ($self->{in_subset}) {
1925 !!!cp (139.1);
1926 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1927 } else {
1928 !!!cp (139);
1929 $self->{state} = DATA_STATE;
1930 $self->{s_kwd} = '';
1931 }
1932 ## reconsume
1933
1934 !!!emit ($self->{ct}); # comment
1935
1936 redo A;
1937 } else {
1938 !!!cp (140);
1939 $self->{ct}->{data} # comment
1940 .= chr ($self->{nc});
1941 $self->{state} = COMMENT_STATE;
1942 !!!next-input-character;
1943 redo A;
1944 }
1945 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1946 if ($self->{nc} == 0x002D) { # -
1947 !!!cp (141);
1948 $self->{state} = COMMENT_END_STATE;
1949 !!!next-input-character;
1950 redo A;
1951 } elsif ($self->{nc} == 0x003E) { # >
1952 !!!parse-error (type => 'bogus comment');
1953 if ($self->{in_subset}) {
1954 !!!cp (142.1);
1955 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1956 } else {
1957 !!!cp (142);
1958 $self->{state} = DATA_STATE;
1959 $self->{s_kwd} = '';
1960 }
1961 !!!next-input-character;
1962
1963 !!!emit ($self->{ct}); # comment
1964
1965 redo A;
1966 } elsif ($self->{nc} == -1) {
1967 !!!parse-error (type => 'unclosed comment');
1968 if ($self->{in_subset}) {
1969 !!!cp (143.1);
1970 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1971 } else {
1972 !!!cp (143);
1973 $self->{state} = DATA_STATE;
1974 $self->{s_kwd} = '';
1975 }
1976 ## reconsume
1977
1978 !!!emit ($self->{ct}); # comment
1979
1980 redo A;
1981 } else {
1982 !!!cp (144);
1983 $self->{ct}->{data} # comment
1984 .= '-' . chr ($self->{nc});
1985 $self->{state} = COMMENT_STATE;
1986 !!!next-input-character;
1987 redo A;
1988 }
1989 } elsif ($self->{state} == COMMENT_STATE) {
1990 ## XML5: "Comment state" and "DOCTYPE comment state".
1991
1992 if ($self->{nc} == 0x002D) { # -
1993 !!!cp (145);
1994 $self->{state} = COMMENT_END_DASH_STATE;
1995 !!!next-input-character;
1996 redo A;
1997 } elsif ($self->{nc} == -1) {
1998 !!!parse-error (type => 'unclosed comment');
1999 if ($self->{in_subset}) {
2000 !!!cp (146.1);
2001 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2002 } else {
2003 !!!cp (146);
2004 $self->{state} = DATA_STATE;
2005 $self->{s_kwd} = '';
2006 }
2007 ## reconsume
2008
2009 !!!emit ($self->{ct}); # comment
2010
2011 redo A;
2012 } else {
2013 !!!cp (147);
2014 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2015 $self->{read_until}->($self->{ct}->{data},
2016 q[-],
2017 length $self->{ct}->{data});
2018
2019 ## Stay in the state
2020 !!!next-input-character;
2021 redo A;
2022 }
2023 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2024 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2025
2026 if ($self->{nc} == 0x002D) { # -
2027 !!!cp (148);
2028 $self->{state} = COMMENT_END_STATE;
2029 !!!next-input-character;
2030 redo A;
2031 } elsif ($self->{nc} == -1) {
2032 !!!parse-error (type => 'unclosed comment');
2033 if ($self->{in_subset}) {
2034 !!!cp (149.1);
2035 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2036 } else {
2037 !!!cp (149);
2038 $self->{state} = DATA_STATE;
2039 $self->{s_kwd} = '';
2040 }
2041 ## reconsume
2042
2043 !!!emit ($self->{ct}); # comment
2044
2045 redo A;
2046 } else {
2047 !!!cp (150);
2048 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2049 $self->{state} = COMMENT_STATE;
2050 !!!next-input-character;
2051 redo A;
2052 }
2053 } elsif ($self->{state} == COMMENT_END_STATE) {
2054 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2055
2056 if ($self->{nc} == 0x003E) { # >
2057 if ($self->{in_subset}) {
2058 !!!cp (151.1);
2059 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2060 } else {
2061 !!!cp (151);
2062 $self->{state} = DATA_STATE;
2063 $self->{s_kwd} = '';
2064 }
2065 !!!next-input-character;
2066
2067 !!!emit ($self->{ct}); # comment
2068
2069 redo A;
2070 } elsif ($self->{nc} == 0x002D) { # -
2071 !!!cp (152);
2072 ## XML5: Not a parse error.
2073 !!!parse-error (type => 'dash in comment',
2074 line => $self->{line_prev},
2075 column => $self->{column_prev});
2076 $self->{ct}->{data} .= '-'; # comment
2077 ## Stay in the state
2078 !!!next-input-character;
2079 redo A;
2080 } elsif ($self->{nc} == -1) {
2081 !!!parse-error (type => 'unclosed comment');
2082 if ($self->{in_subset}) {
2083 !!!cp (153.1);
2084 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2085 } else {
2086 !!!cp (153);
2087 $self->{state} = DATA_STATE;
2088 $self->{s_kwd} = '';
2089 }
2090 ## reconsume
2091
2092 !!!emit ($self->{ct}); # comment
2093
2094 redo A;
2095 } else {
2096 !!!cp (154);
2097 ## XML5: Not a parse error.
2098 !!!parse-error (type => 'dash in comment',
2099 line => $self->{line_prev},
2100 column => $self->{column_prev});
2101 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2102 $self->{state} = COMMENT_STATE;
2103 !!!next-input-character;
2104 redo A;
2105 }
2106 } elsif ($self->{state} == DOCTYPE_STATE) {
2107 if ($is_space->{$self->{nc}}) {
2108 !!!cp (155);
2109 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2110 !!!next-input-character;
2111 redo A;
2112 } else {
2113 !!!cp (156);
2114 ## XML5: Unless EOF, switch to the bogus comment state.
2115 !!!parse-error (type => 'no space before DOCTYPE name');
2116 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2117 ## reconsume
2118 redo A;
2119 }
2120 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2121 ## XML5: "DOCTYPE root name before state".
2122
2123 if ($is_space->{$self->{nc}}) {
2124 !!!cp (157);
2125 ## Stay in the state
2126 !!!next-input-character;
2127 redo A;
2128 } elsif ($self->{nc} == 0x003E) { # >
2129 !!!cp (158);
2130 ## XML5: No parse error.
2131 !!!parse-error (type => 'no DOCTYPE name');
2132 $self->{state} = DATA_STATE;
2133 $self->{s_kwd} = '';
2134 !!!next-input-character;
2135
2136 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2137
2138 redo A;
2139 } elsif ($self->{nc} == -1) {
2140 !!!cp (159);
2141 !!!parse-error (type => 'no DOCTYPE name');
2142 $self->{state} = DATA_STATE;
2143 $self->{s_kwd} = '';
2144 ## reconsume
2145
2146 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2147
2148 redo A;
2149 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2150 !!!cp (159.1);
2151 !!!parse-error (type => 'no DOCTYPE name');
2152 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2153 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2154 $self->{in_subset} = 1;
2155 !!!next-input-character;
2156 !!!emit ($self->{ct}); # DOCTYPE
2157 redo A;
2158 } else {
2159 !!!cp (160);
2160 $self->{ct}->{name} = chr $self->{nc};
2161 delete $self->{ct}->{quirks};
2162 $self->{state} = DOCTYPE_NAME_STATE;
2163 !!!next-input-character;
2164 redo A;
2165 }
2166 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2167 ## XML5: "DOCTYPE root name state".
2168
2169 ## ISSUE: Redundant "First," in the spec.
2170
2171 if ($is_space->{$self->{nc}}) {
2172 !!!cp (161);
2173 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2174 !!!next-input-character;
2175 redo A;
2176 } elsif ($self->{nc} == 0x003E) { # >
2177 !!!cp (162);
2178 $self->{state} = DATA_STATE;
2179 $self->{s_kwd} = '';
2180 !!!next-input-character;
2181
2182 !!!emit ($self->{ct}); # DOCTYPE
2183
2184 redo A;
2185 } elsif ($self->{nc} == -1) {
2186 !!!cp (163);
2187 !!!parse-error (type => 'unclosed DOCTYPE');
2188 $self->{state} = DATA_STATE;
2189 $self->{s_kwd} = '';
2190 ## reconsume
2191
2192 $self->{ct}->{quirks} = 1;
2193 !!!emit ($self->{ct}); # DOCTYPE
2194
2195 redo A;
2196 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2197 !!!cp (163.1);
2198 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2199 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2200 $self->{in_subset} = 1;
2201 !!!next-input-character;
2202 !!!emit ($self->{ct}); # DOCTYPE
2203 redo A;
2204 } else {
2205 !!!cp (164);
2206 $self->{ct}->{name}
2207 .= chr ($self->{nc}); # DOCTYPE
2208 ## Stay in the state
2209 !!!next-input-character;
2210 redo A;
2211 }
2212 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2213 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2214 ## state", but implemented differently.
2215
2216 if ($is_space->{$self->{nc}}) {
2217 !!!cp (165);
2218 ## Stay in the state
2219 !!!next-input-character;
2220 redo A;
2221 } elsif ($self->{nc} == 0x003E) { # >
2222 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2223 !!!cp (166);
2224 $self->{state} = DATA_STATE;
2225 $self->{s_kwd} = '';
2226 } else {
2227 !!!cp (166.1);
2228 !!!parse-error (type => 'no md def'); ## TODO: type
2229 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2230 }
2231
2232 !!!next-input-character;
2233 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2234 redo A;
2235 } elsif ($self->{nc} == -1) {
2236 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2237 !!!cp (167);
2238 !!!parse-error (type => 'unclosed DOCTYPE');
2239 $self->{state} = DATA_STATE;
2240 $self->{s_kwd} = '';
2241 $self->{ct}->{quirks} = 1;
2242 } else {
2243 !!!cp (167.12);
2244 !!!parse-error (type => 'unclosed md'); ## TODO: type
2245 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2246 }
2247
2248 ## Reconsume.
2249 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2250 redo A;
2251 } elsif ($self->{nc} == 0x0050 or # P
2252 $self->{nc} == 0x0070) { # p
2253 !!!cp (167.1);
2254 $self->{state} = PUBLIC_STATE;
2255 $self->{kwd} = chr $self->{nc};
2256 !!!next-input-character;
2257 redo A;
2258 } elsif ($self->{nc} == 0x0053 or # S
2259 $self->{nc} == 0x0073) { # s
2260 !!!cp (167.2);
2261 $self->{state} = SYSTEM_STATE;
2262 $self->{kwd} = chr $self->{nc};
2263 !!!next-input-character;
2264 redo A;
2265 ## TODO: " and ' for ENTITY
2266 } elsif ($self->{is_xml} and
2267 $self->{ct}->{type} == DOCTYPE_TOKEN and
2268 $self->{nc} == 0x005B) { # [
2269 !!!cp (167.3);
2270 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2271 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2272 $self->{in_subset} = 1;
2273 !!!next-input-character;
2274 !!!emit ($self->{ct}); # DOCTYPE
2275 redo A;
2276 } else {
2277 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2278
2279 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2280 !!!cp (180);
2281 $self->{ct}->{quirks} = 1;
2282 $self->{state} = BOGUS_DOCTYPE_STATE;
2283 } else {
2284 !!!cp (180.1);
2285 $self->{state} = BOGUS_MD_STATE;
2286 }
2287
2288 !!!next-input-character;
2289 redo A;
2290 }
2291 } elsif ($self->{state} == PUBLIC_STATE) {
2292 ## ASCII case-insensitive
2293 if ($self->{nc} == [
2294 undef,
2295 0x0055, # U
2296 0x0042, # B
2297 0x004C, # L
2298 0x0049, # I
2299 ]->[length $self->{kwd}] or
2300 $self->{nc} == [
2301 undef,
2302 0x0075, # u
2303 0x0062, # b
2304 0x006C, # l
2305 0x0069, # i
2306 ]->[length $self->{kwd}]) {
2307 !!!cp (175);
2308 ## Stay in the state.
2309 $self->{kwd} .= chr $self->{nc};
2310 !!!next-input-character;
2311 redo A;
2312 } elsif ((length $self->{kwd}) == 5 and
2313 ($self->{nc} == 0x0043 or # C
2314 $self->{nc} == 0x0063)) { # c
2315 if ($self->{is_xml} and
2316 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2317 !!!cp (168.1);
2318 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2319 text => 'PUBLIC',
2320 line => $self->{line_prev},
2321 column => $self->{column_prev} - 4);
2322 } else {
2323 !!!cp (168);
2324 }
2325 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2326 !!!next-input-character;
2327 redo A;
2328 } else {
2329 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2330 line => $self->{line_prev},
2331 column => $self->{column_prev} + 1 - length $self->{kwd});
2332 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2333 !!!cp (169);
2334 $self->{ct}->{quirks} = 1;
2335 $self->{state} = BOGUS_DOCTYPE_STATE;
2336 } else {
2337 !!!cp (169.1);
2338 $self->{state} = BOGUS_MD_STATE;
2339 }
2340 ## Reconsume.
2341 redo A;
2342 }
2343 } elsif ($self->{state} == SYSTEM_STATE) {
2344 ## ASCII case-insensitive
2345 if ($self->{nc} == [
2346 undef,
2347 0x0059, # Y
2348 0x0053, # S
2349 0x0054, # T
2350 0x0045, # E
2351 ]->[length $self->{kwd}] or
2352 $self->{nc} == [
2353 undef,
2354 0x0079, # y
2355 0x0073, # s
2356 0x0074, # t
2357 0x0065, # e
2358 ]->[length $self->{kwd}]) {
2359 !!!cp (170);
2360 ## Stay in the state.
2361 $self->{kwd} .= chr $self->{nc};
2362 !!!next-input-character;
2363 redo A;
2364 } elsif ((length $self->{kwd}) == 5 and
2365 ($self->{nc} == 0x004D or # M
2366 $self->{nc} == 0x006D)) { # m
2367 if ($self->{is_xml} and
2368 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2369 !!!cp (171.1);
2370 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2371 text => 'SYSTEM',
2372 line => $self->{line_prev},
2373 column => $self->{column_prev} - 4);
2374 } else {
2375 !!!cp (171);
2376 }
2377 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2378 !!!next-input-character;
2379 redo A;
2380 } else {
2381 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2382 line => $self->{line_prev},
2383 column => $self->{column_prev} + 1 - length $self->{kwd});
2384 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2385 !!!cp (172);
2386 $self->{ct}->{quirks} = 1;
2387 $self->{state} = BOGUS_DOCTYPE_STATE;
2388 } else {
2389 !!!cp (172.1);
2390 $self->{state} = BOGUS_MD_STATE;
2391 }
2392 ## Reconsume.
2393 redo A;
2394 }
2395 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2396 if ($is_space->{$self->{nc}}) {
2397 !!!cp (181);
2398 ## Stay in the state
2399 !!!next-input-character;
2400 redo A;
2401 } elsif ($self->{nc} eq 0x0022) { # "
2402 !!!cp (182);
2403 $self->{ct}->{pubid} = ''; # DOCTYPE
2404 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2405 !!!next-input-character;
2406 redo A;
2407 } elsif ($self->{nc} eq 0x0027) { # '
2408 !!!cp (183);
2409 $self->{ct}->{pubid} = ''; # DOCTYPE
2410 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2411 !!!next-input-character;
2412 redo A;
2413 } elsif ($self->{nc} eq 0x003E) { # >
2414 !!!parse-error (type => 'no PUBLIC literal');
2415
2416 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2417 !!!cp (184);
2418 $self->{state} = DATA_STATE;
2419 $self->{s_kwd} = '';
2420 $self->{ct}->{quirks} = 1;
2421 } else {
2422 !!!cp (184.1);
2423 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2424 }
2425
2426 !!!next-input-character;
2427 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2428 redo A;
2429 } elsif ($self->{nc} == -1) {
2430 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2431 !!!cp (185);
2432 !!!parse-error (type => 'unclosed DOCTYPE');
2433 $self->{state} = DATA_STATE;
2434 $self->{s_kwd} = '';
2435 $self->{ct}->{quirks} = 1;
2436 } else {
2437 !!!cp (185.1);
2438 !!!parse-error (type => 'unclosed md'); ## TODO: type
2439 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2440 }
2441
2442 ## reconsume
2443 !!!emit ($self->{ct}); # DOCTYPE
2444 redo A;
2445 } elsif ($self->{is_xml} and
2446 $self->{ct}->{type} == DOCTYPE_TOKEN and
2447 $self->{nc} == 0x005B) { # [
2448 !!!cp (186.1);
2449 !!!parse-error (type => 'no PUBLIC literal');
2450 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2451 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2452 $self->{in_subset} = 1;
2453 !!!next-input-character;
2454 !!!emit ($self->{ct}); # DOCTYPE
2455 redo A;
2456 } else {
2457 !!!parse-error (type => 'string after PUBLIC');
2458
2459 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2460 !!!cp (186);
2461 $self->{ct}->{quirks} = 1;
2462 $self->{state} = BOGUS_DOCTYPE_STATE;
2463 } else {
2464 !!!cp (186.2);
2465 $self->{state} = BOGUS_MD_STATE;
2466 }
2467
2468 !!!next-input-character;
2469 redo A;
2470 }
2471 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2472 if ($self->{nc} == 0x0022) { # "
2473 !!!cp (187);
2474 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2475 !!!next-input-character;
2476 redo A;
2477 } elsif ($self->{nc} == 0x003E) { # >
2478 !!!parse-error (type => 'unclosed PUBLIC literal');
2479
2480 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2481 !!!cp (188);
2482 $self->{state} = DATA_STATE;
2483 $self->{s_kwd} = '';
2484 $self->{ct}->{quirks} = 1;
2485 } else {
2486 !!!cp (188.1);
2487 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2488 }
2489
2490 !!!next-input-character;
2491 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2492 redo A;
2493 } elsif ($self->{nc} == -1) {
2494 !!!parse-error (type => 'unclosed PUBLIC literal');
2495
2496 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2497 !!!cp (189);
2498 $self->{state} = DATA_STATE;
2499 $self->{s_kwd} = '';
2500 $self->{ct}->{quirks} = 1;
2501 } else {
2502 !!!cp (189.1);
2503 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2504 }
2505
2506 ## Reconsume.
2507 !!!emit ($self->{ct}); # DOCTYPE
2508 redo A;
2509 } else {
2510 !!!cp (190);
2511 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2512 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2513 length $self->{ct}->{pubid});
2514
2515 ## Stay in the state
2516 !!!next-input-character;
2517 redo A;
2518 }
2519 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2520 if ($self->{nc} == 0x0027) { # '
2521 !!!cp (191);
2522 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2523 !!!next-input-character;
2524 redo A;
2525 } elsif ($self->{nc} == 0x003E) { # >
2526 !!!parse-error (type => 'unclosed PUBLIC literal');
2527
2528 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2529 !!!cp (192);
2530 $self->{state} = DATA_STATE;
2531 $self->{s_kwd} = '';
2532 $self->{ct}->{quirks} = 1;
2533 } else {
2534 !!!cp (192.1);
2535 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2536 }
2537
2538 !!!next-input-character;
2539 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2540 redo A;
2541 } elsif ($self->{nc} == -1) {
2542 !!!parse-error (type => 'unclosed PUBLIC literal');
2543
2544 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2545 !!!cp (193);
2546 $self->{state} = DATA_STATE;
2547 $self->{s_kwd} = '';
2548 $self->{ct}->{quirks} = 1;
2549 } else {
2550 !!!cp (193.1);
2551 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2552 }
2553
2554 ## reconsume
2555 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2556 redo A;
2557 } else {
2558 !!!cp (194);
2559 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2560 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2561 length $self->{ct}->{pubid});
2562
2563 ## Stay in the state
2564 !!!next-input-character;
2565 redo A;
2566 }
2567 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2568 if ($is_space->{$self->{nc}}) {
2569 !!!cp (195);
2570 ## Stay in the state
2571 !!!next-input-character;
2572 redo A;
2573 } elsif ($self->{nc} == 0x0022) { # "
2574 !!!cp (196);
2575 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2576 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2577 !!!next-input-character;
2578 redo A;
2579 } elsif ($self->{nc} == 0x0027) { # '
2580 !!!cp (197);
2581 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2582 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2583 !!!next-input-character;
2584 redo A;
2585 } elsif ($self->{nc} == 0x003E) { # >
2586 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2587 if ($self->{is_xml}) {
2588 !!!cp (198.1);
2589 !!!parse-error (type => 'no SYSTEM literal');
2590 } else {
2591 !!!cp (198);
2592 }
2593 $self->{state} = DATA_STATE;
2594 $self->{s_kwd} = '';
2595 } else {
2596 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2597 !!!cp (198.2);
2598 } else {
2599 !!!cp (198.3);
2600 !!!parse-error (type => 'no SYSTEM literal');
2601 }
2602 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2603 }
2604
2605 !!!next-input-character;
2606 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2607 redo A;
2608 } elsif ($self->{nc} == -1) {
2609 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2610 !!!cp (199);
2611 !!!parse-error (type => 'unclosed DOCTYPE');
2612
2613 $self->{state} = DATA_STATE;
2614 $self->{s_kwd} = '';
2615 $self->{ct}->{quirks} = 1;
2616 } else {
2617 !!!parse-error (type => 'unclosed md'); ## TODO: type
2618 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2619 }
2620
2621 ## reconsume
2622 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2623 redo A;
2624 } elsif ($self->{is_xml} and
2625 $self->{ct}->{type} == DOCTYPE_TOKEN and
2626 $self->{nc} == 0x005B) { # [
2627 !!!cp (200.1);
2628 !!!parse-error (type => 'no SYSTEM literal');
2629 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2630 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2631 $self->{in_subset} = 1;
2632 !!!next-input-character;
2633 !!!emit ($self->{ct}); # DOCTYPE
2634 redo A;
2635 } else {
2636 !!!parse-error (type => 'string after PUBLIC literal');
2637
2638 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2639 !!!cp (200);
2640 $self->{ct}->{quirks} = 1;
2641 $self->{state} = BOGUS_DOCTYPE_STATE;
2642 } else {
2643 !!!cp (200.2);
2644 $self->{state} = BOGUS_MD_STATE;
2645 }
2646
2647 !!!next-input-character;
2648 redo A;
2649 }
2650 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2651 if ($is_space->{$self->{nc}}) {
2652 !!!cp (201);
2653 ## Stay in the state
2654 !!!next-input-character;
2655 redo A;
2656 } elsif ($self->{nc} == 0x0022) { # "
2657 !!!cp (202);
2658 $self->{ct}->{sysid} = ''; # DOCTYPE
2659 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2660 !!!next-input-character;
2661 redo A;
2662 } elsif ($self->{nc} == 0x0027) { # '
2663 !!!cp (203);
2664 $self->{ct}->{sysid} = ''; # DOCTYPE
2665 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2666 !!!next-input-character;
2667 redo A;
2668 } elsif ($self->{nc} == 0x003E) { # >
2669 !!!parse-error (type => 'no SYSTEM literal');
2670 !!!next-input-character;
2671
2672 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2673 !!!cp (204);
2674 $self->{state} = DATA_STATE;
2675 $self->{s_kwd} = '';
2676 $self->{ct}->{quirks} = 1;
2677 } else {
2678 !!!cp (204.1);
2679 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2680 }
2681
2682 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2683 redo A;
2684 } elsif ($self->{nc} == -1) {
2685 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2686 !!!cp (205);
2687 !!!parse-error (type => 'unclosed DOCTYPE');
2688 $self->{state} = DATA_STATE;
2689 $self->{s_kwd} = '';
2690 $self->{ct}->{quirks} = 1;
2691 } else {
2692 !!!cp (205.1);
2693 !!!parse-error (type => 'unclosed md'); ## TODO: type
2694 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2695 }
2696
2697 ## reconsume
2698 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2699 redo A;
2700 } elsif ($self->{is_xml} and
2701 $self->{ct}->{type} == DOCTYPE_TOKEN and
2702 $self->{nc} == 0x005B) { # [
2703 !!!cp (206.1);
2704 !!!parse-error (type => 'no SYSTEM literal');
2705
2706 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2707 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2708 $self->{in_subset} = 1;
2709 !!!next-input-character;
2710 !!!emit ($self->{ct}); # DOCTYPE
2711 redo A;
2712 } else {
2713 !!!parse-error (type => 'string after SYSTEM');
2714
2715 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2716 !!!cp (206);
2717 $self->{ct}->{quirks} = 1;
2718 $self->{state} = BOGUS_DOCTYPE_STATE;
2719 } else {
2720 !!!cp (206.2);
2721 $self->{state} = BOGUS_MD_STATE;
2722 }
2723
2724 !!!next-input-character;
2725 redo A;
2726 }
2727 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2728 if ($self->{nc} == 0x0022) { # "
2729 !!!cp (207);
2730 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2731 !!!next-input-character;
2732 redo A;
2733 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2734 !!!parse-error (type => 'unclosed SYSTEM literal');
2735
2736 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2737 !!!cp (208);
2738 $self->{state} = DATA_STATE;
2739 $self->{s_kwd} = '';
2740 $self->{ct}->{quirks} = 1;
2741 } else {
2742 !!!cp (208.1);
2743 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2744 }
2745
2746 !!!next-input-character;
2747 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2748 redo A;
2749 } elsif ($self->{nc} == -1) {
2750 !!!parse-error (type => 'unclosed SYSTEM literal');
2751
2752 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2753 !!!cp (209);
2754 $self->{state} = DATA_STATE;
2755 $self->{s_kwd} = '';
2756 $self->{ct}->{quirks} = 1;
2757 } else {
2758 !!!cp (209.1);
2759 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2760 }
2761
2762 ## reconsume
2763 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2764 redo A;
2765 } else {
2766 !!!cp (210);
2767 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2768 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2769 length $self->{ct}->{sysid});
2770
2771 ## Stay in the state
2772 !!!next-input-character;
2773 redo A;
2774 }
2775 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2776 if ($self->{nc} == 0x0027) { # '
2777 !!!cp (211);
2778 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2779 !!!next-input-character;
2780 redo A;
2781 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2782 !!!cp (212);
2783 !!!parse-error (type => 'unclosed SYSTEM literal');
2784
2785 $self->{state} = DATA_STATE;
2786 $self->{s_kwd} = '';
2787 !!!next-input-character;
2788
2789 $self->{ct}->{quirks} = 1;
2790 !!!emit ($self->{ct}); # DOCTYPE
2791
2792 redo A;
2793 } elsif ($self->{nc} == -1) {
2794 !!!parse-error (type => 'unclosed SYSTEM literal');
2795
2796 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2797 !!!cp (213);
2798 $self->{state} = DATA_STATE;
2799 $self->{s_kwd} = '';
2800 $self->{ct}->{quirks} = 1;
2801 } else {
2802 !!!cp (213.1);
2803 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2804 }
2805
2806 ## reconsume
2807 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2808 redo A;
2809 } else {
2810 !!!cp (214);
2811 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2812 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2813 length $self->{ct}->{sysid});
2814
2815 ## Stay in the state
2816 !!!next-input-character;
2817 redo A;
2818 }
2819 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2820 if ($is_space->{$self->{nc}}) {
2821 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2822 !!!cp (215.1);
2823 $self->{state} = BEFORE_NDATA_STATE;
2824 } else {
2825 !!!cp (215);
2826 ## Stay in the state
2827 }
2828 !!!next-input-character;
2829 redo A;
2830 } elsif ($self->{nc} == 0x003E) { # >
2831 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2832 !!!cp (216);
2833 $self->{state} = DATA_STATE;
2834 $self->{s_kwd} = '';
2835 } else {
2836 !!!cp (216.1);
2837 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2838 }
2839
2840 !!!next-input-character;
2841 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2842 redo A;
2843 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2844 ($self->{nc} == 0x004E or # N
2845 $self->{nc} == 0x006E)) { # n
2846 !!!cp (216.2);
2847 !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2848 $self->{state} = NDATA_STATE;
2849 $self->{kwd} = chr $self->{nc};
2850 !!!next-input-character;
2851 redo A;
2852 } elsif ($self->{nc} == -1) {
2853 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2854 !!!cp (217);
2855 !!!parse-error (type => 'unclosed DOCTYPE');
2856 $self->{state} = DATA_STATE;
2857 $self->{s_kwd} = '';
2858 $self->{ct}->{quirks} = 1;
2859 } else {
2860 !!!cp (217.1);
2861 !!!parse-error (type => 'unclosed md'); ## TODO: type
2862 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2863 }
2864
2865 ## reconsume
2866 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2867 redo A;
2868 } elsif ($self->{is_xml} and
2869 $self->{ct}->{type} == DOCTYPE_TOKEN and
2870 $self->{nc} == 0x005B) { # [
2871 !!!cp (218.1);
2872 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2873 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2874 $self->{in_subset} = 1;
2875 !!!next-input-character;
2876 !!!emit ($self->{ct}); # DOCTYPE
2877 redo A;
2878 } else {
2879 !!!parse-error (type => 'string after SYSTEM literal');
2880
2881 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2882 !!!cp (218);
2883 #$self->{ct}->{quirks} = 1;
2884 $self->{state} = BOGUS_DOCTYPE_STATE;
2885 } else {
2886 !!!cp (218.2);
2887 $self->{state} = BOGUS_MD_STATE;
2888 }
2889
2890 !!!next-input-character;
2891 redo A;
2892 }
2893 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2894 if ($is_space->{$self->{nc}}) {
2895 !!!cp (218.3);
2896 ## Stay in the state.
2897 !!!next-input-character;
2898 redo A;
2899 } elsif ($self->{nc} == 0x003E) { # >
2900 !!!cp (218.4);
2901 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2902 !!!next-input-character;
2903 !!!emit ($self->{ct}); # ENTITY
2904 redo A;
2905 } elsif ($self->{nc} == 0x004E or # N
2906 $self->{nc} == 0x006E) { # n
2907 !!!cp (218.5);
2908 $self->{state} = NDATA_STATE;
2909 $self->{kwd} = chr $self->{nc};
2910 !!!next-input-character;
2911 redo A;
2912 } elsif ($self->{nc} == -1) {
2913 !!!cp (218.6);
2914 !!!parse-error (type => 'unclosed md'); ## TODO: type
2915 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2916 ## reconsume
2917 !!!emit ($self->{ct}); # ENTITY
2918 redo A;
2919 } else {
2920 !!!cp (218.7);
2921 !!!parse-error (type => 'string after SYSTEM literal');
2922 $self->{state} = BOGUS_MD_STATE;
2923 !!!next-input-character;
2924 redo A;
2925 }
2926 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2927 if ($self->{nc} == 0x003E) { # >
2928 !!!cp (219);
2929 $self->{state} = DATA_STATE;
2930 $self->{s_kwd} = '';
2931 !!!next-input-character;
2932
2933 !!!emit ($self->{ct}); # DOCTYPE
2934
2935 redo A;
2936 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2937 !!!cp (220.1);
2938 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2939 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2940 $self->{in_subset} = 1;
2941 !!!next-input-character;
2942 !!!emit ($self->{ct}); # DOCTYPE
2943 redo A;
2944 } elsif ($self->{nc} == -1) {
2945 !!!cp (220);
2946 $self->{state} = DATA_STATE;
2947 $self->{s_kwd} = '';
2948 ## reconsume
2949
2950 !!!emit ($self->{ct}); # DOCTYPE
2951
2952 redo A;
2953 } else {
2954 !!!cp (221);
2955 my $s = '';
2956 $self->{read_until}->($s, q{>[}, 0);
2957
2958 ## Stay in the state
2959 !!!next-input-character;
2960 redo A;
2961 }
2962 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2963 ## NOTE: "CDATA section state" in the spec is jointly implemented
2964 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2965 ## and |CDATA_SECTION_MSE2_STATE|.
2966
2967 ## XML5: "CDATA state".
2968
2969 if ($self->{nc} == 0x005D) { # ]
2970 !!!cp (221.1);
2971 $self->{state} = CDATA_SECTION_MSE1_STATE;
2972 !!!next-input-character;
2973 redo A;
2974 } elsif ($self->{nc} == -1) {
2975 if ($self->{is_xml}) {
2976 !!!cp (221.11);
2977 !!!parse-error (type => 'no mse'); ## TODO: type
2978 } else {
2979 !!!cp (221.12);
2980 }
2981
2982 $self->{state} = DATA_STATE;
2983 $self->{s_kwd} = '';
2984 ## Reconsume.
2985 if (length $self->{ct}->{data}) { # character
2986 !!!cp (221.2);
2987 !!!emit ($self->{ct}); # character
2988 } else {
2989 !!!cp (221.3);
2990 ## No token to emit. $self->{ct} is discarded.
2991 }
2992 redo A;
2993 } else {
2994 !!!cp (221.4);
2995 $self->{ct}->{data} .= chr $self->{nc};
2996 $self->{read_until}->($self->{ct}->{data},
2997 q<]>,
2998 length $self->{ct}->{data});
2999
3000 ## Stay in the state.
3001 !!!next-input-character;
3002 redo A;
3003 }
3004
3005 ## ISSUE: "text tokens" in spec.
3006 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3007 ## XML5: "CDATA bracket state".
3008
3009 if ($self->{nc} == 0x005D) { # ]
3010 !!!cp (221.5);
3011 $self->{state} = CDATA_SECTION_MSE2_STATE;
3012 !!!next-input-character;
3013 redo A;
3014 } else {
3015 !!!cp (221.6);
3016 ## XML5: If EOF, "]" is not appended and changed to the data state.
3017 $self->{ct}->{data} .= ']';
3018 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3019 ## Reconsume.
3020 redo A;
3021 }
3022 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3023 ## XML5: "CDATA end state".
3024
3025 if ($self->{nc} == 0x003E) { # >
3026 $self->{state} = DATA_STATE;
3027 $self->{s_kwd} = '';
3028 !!!next-input-character;
3029 if (length $self->{ct}->{data}) { # character
3030 !!!cp (221.7);
3031 !!!emit ($self->{ct}); # character
3032 } else {
3033 !!!cp (221.8);
3034 ## No token to emit. $self->{ct} is discarded.
3035 }
3036 redo A;
3037 } elsif ($self->{nc} == 0x005D) { # ]
3038 !!!cp (221.9); # character
3039 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3040 ## Stay in the state.
3041 !!!next-input-character;
3042 redo A;
3043 } else {
3044 !!!cp (221.11);
3045 $self->{ct}->{data} .= ']]'; # character
3046 $self->{state} = CDATA_SECTION_STATE;
3047 ## Reconsume. ## XML5: Emit.
3048 redo A;
3049 }
3050 } elsif ($self->{state} == ENTITY_STATE) {
3051 if ($is_space->{$self->{nc}} or
3052 {
3053 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3054 $self->{entity_add} => 1,
3055 }->{$self->{nc}}) {
3056 !!!cp (1001);
3057 ## Don't consume
3058 ## No error
3059 ## Return nothing.
3060 #
3061 } elsif ($self->{nc} == 0x0023) { # #
3062 !!!cp (999);
3063 $self->{state} = ENTITY_HASH_STATE;
3064 $self->{kwd} = '#';
3065 !!!next-input-character;
3066 redo A;
3067 } elsif ((0x0041 <= $self->{nc} and
3068 $self->{nc} <= 0x005A) or # A..Z
3069 (0x0061 <= $self->{nc} and
3070 $self->{nc} <= 0x007A)) { # a..z
3071 !!!cp (998);
3072 require Whatpm::_NamedEntityList;
3073 $self->{state} = ENTITY_NAME_STATE;
3074 $self->{kwd} = chr $self->{nc};
3075 $self->{entity__value} = $self->{kwd};
3076 $self->{entity__match} = 0;
3077 !!!next-input-character;
3078 redo A;
3079 } else {
3080 !!!cp (1027);
3081 !!!parse-error (type => 'bare ero');
3082 ## Return nothing.
3083 #
3084 }
3085
3086 ## NOTE: No character is consumed by the "consume a character
3087 ## reference" algorithm. In other words, there is an "&" character
3088 ## that does not introduce a character reference, which would be
3089 ## appended to the parent element or the attribute value in later
3090 ## processing by the tokenizer.
3091
3092 if ($self->{prev_state} == DATA_STATE) {
3093 !!!cp (997);
3094 $self->{state} = $self->{prev_state};
3095 $self->{s_kwd} = '';
3096 ## Reconsume.
3097 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3098 line => $self->{line_prev},
3099 column => $self->{column_prev},
3100 });
3101 redo A;
3102 } else {
3103 !!!cp (996);
3104 $self->{ca}->{value} .= '&';
3105 $self->{state} = $self->{prev_state};
3106 $self->{s_kwd} = '';
3107 ## Reconsume.
3108 redo A;
3109 }
3110 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3111 if ($self->{nc} == 0x0078 or # x
3112 $self->{nc} == 0x0058) { # X
3113 !!!cp (995);
3114 $self->{state} = HEXREF_X_STATE;
3115 $self->{kwd} .= chr $self->{nc};
3116 !!!next-input-character;
3117 redo A;
3118 } elsif (0x0030 <= $self->{nc} and
3119 $self->{nc} <= 0x0039) { # 0..9
3120 !!!cp (994);
3121 $self->{state} = NCR_NUM_STATE;
3122 $self->{kwd} = $self->{nc} - 0x0030;
3123 !!!next-input-character;
3124 redo A;
3125 } else {
3126 !!!parse-error (type => 'bare nero',
3127 line => $self->{line_prev},
3128 column => $self->{column_prev} - 1);
3129
3130 ## NOTE: According to the spec algorithm, nothing is returned,
3131 ## and then "&#" is appended to the parent element or the attribute
3132 ## value in the later processing.
3133
3134 if ($self->{prev_state} == DATA_STATE) {
3135 !!!cp (1019);
3136 $self->{state} = $self->{prev_state};
3137 $self->{s_kwd} = '';
3138 ## Reconsume.
3139 !!!emit ({type => CHARACTER_TOKEN,
3140 data => '&#',
3141 line => $self->{line_prev},
3142 column => $self->{column_prev} - 1,
3143 });
3144 redo A;
3145 } else {
3146 !!!cp (993);
3147 $self->{ca}->{value} .= '&#';
3148 $self->{state} = $self->{prev_state};
3149 $self->{s_kwd} = '';
3150 ## Reconsume.
3151 redo A;
3152 }
3153 }
3154 } elsif ($self->{state} == NCR_NUM_STATE) {
3155 if (0x0030 <= $self->{nc} and
3156 $self->{nc} <= 0x0039) { # 0..9
3157 !!!cp (1012);
3158 $self->{kwd} *= 10;
3159 $self->{kwd} += $self->{nc} - 0x0030;
3160
3161 ## Stay in the state.
3162 !!!next-input-character;
3163 redo A;
3164 } elsif ($self->{nc} == 0x003B) { # ;
3165 !!!cp (1013);
3166 !!!next-input-character;
3167 #
3168 } else {
3169 !!!cp (1014);
3170 !!!parse-error (type => 'no refc');
3171 ## Reconsume.
3172 #
3173 }
3174
3175 my $code = $self->{kwd};
3176 my $l = $self->{line_prev};
3177 my $c = $self->{column_prev};
3178 if ($charref_map->{$code}) {
3179 !!!cp (1015);
3180 !!!parse-error (type => 'invalid character reference',
3181 text => (sprintf 'U+%04X', $code),
3182 line => $l, column => $c);
3183 $code = $charref_map->{$code};
3184 } elsif ($code > 0x10FFFF) {
3185 !!!cp (1016);
3186 !!!parse-error (type => 'invalid character reference',
3187 text => (sprintf 'U-%08X', $code),
3188 line => $l, column => $c);
3189 $code = 0xFFFD;
3190 }
3191
3192 if ($self->{prev_state} == DATA_STATE) {
3193 !!!cp (992);
3194 $self->{state} = $self->{prev_state};
3195 $self->{s_kwd} = '';
3196 ## Reconsume.
3197 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3198 has_reference => 1,
3199 line => $l, column => $c,
3200 });
3201 redo A;
3202 } else {
3203 !!!cp (991);
3204 $self->{ca}->{value} .= chr $code;
3205 $self->{ca}->{has_reference} = 1;
3206 $self->{state} = $self->{prev_state};
3207 $self->{s_kwd} = '';
3208 ## Reconsume.
3209 redo A;
3210 }
3211 } elsif ($self->{state} == HEXREF_X_STATE) {
3212 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3213 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3214 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3215 # 0..9, A..F, a..f
3216 !!!cp (990);
3217 $self->{state} = HEXREF_HEX_STATE;
3218 $self->{kwd} = 0;
3219 ## Reconsume.
3220 redo A;
3221 } else {
3222 !!!parse-error (type => 'bare hcro',
3223 line => $self->{line_prev},
3224 column => $self->{column_prev} - 2);
3225
3226 ## NOTE: According to the spec algorithm, nothing is returned,
3227 ## and then "&#" followed by "X" or "x" is appended to the parent
3228 ## element or the attribute value in the later processing.
3229
3230 if ($self->{prev_state} == DATA_STATE) {
3231 !!!cp (1005);
3232 $self->{state} = $self->{prev_state};
3233 $self->{s_kwd} = '';
3234 ## Reconsume.
3235 !!!emit ({type => CHARACTER_TOKEN,
3236 data => '&' . $self->{kwd},
3237 line => $self->{line_prev},
3238 column => $self->{column_prev} - length $self->{kwd},
3239 });
3240 redo A;
3241 } else {
3242 !!!cp (989);
3243 $self->{ca}->{value} .= '&' . $self->{kwd};
3244 $self->{state} = $self->{prev_state};
3245 $self->{s_kwd} = '';
3246 ## Reconsume.
3247 redo A;
3248 }
3249 }
3250 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3251 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3252 # 0..9
3253 !!!cp (1002);
3254 $self->{kwd} *= 0x10;
3255 $self->{kwd} += $self->{nc} - 0x0030;
3256 ## Stay in the state.
3257 !!!next-input-character;
3258 redo A;
3259 } elsif (0x0061 <= $self->{nc} and
3260 $self->{nc} <= 0x0066) { # a..f
3261 !!!cp (1003);
3262 $self->{kwd} *= 0x10;
3263 $self->{kwd} += $self->{nc} - 0x0060 + 9;
3264 ## Stay in the state.
3265 !!!next-input-character;
3266 redo A;
3267 } elsif (0x0041 <= $self->{nc} and
3268 $self->{nc} <= 0x0046) { # A..F
3269 !!!cp (1004);
3270 $self->{kwd} *= 0x10;
3271 $self->{kwd} += $self->{nc} - 0x0040 + 9;
3272 ## Stay in the state.
3273 !!!next-input-character;
3274 redo A;
3275 } elsif ($self->{nc} == 0x003B) { # ;
3276 !!!cp (1006);
3277 !!!next-input-character;
3278 #
3279 } else {
3280 !!!cp (1007);
3281 !!!parse-error (type => 'no refc',
3282 line => $self->{line},
3283 column => $self->{column});
3284 ## Reconsume.
3285 #
3286 }
3287
3288 my $code = $self->{kwd};
3289 my $l = $self->{line_prev};
3290 my $c = $self->{column_prev};
3291 if ($charref_map->{$code}) {
3292 !!!cp (1008);
3293 !!!parse-error (type => 'invalid character reference',
3294 text => (sprintf 'U+%04X', $code),
3295 line => $l, column => $c);
3296 $code = $charref_map->{$code};
3297 } elsif ($code > 0x10FFFF) {
3298 !!!cp (1009);
3299 !!!parse-error (type => 'invalid character reference',
3300 text => (sprintf 'U-%08X', $code),
3301 line => $l, column => $c);
3302 $code = 0xFFFD;
3303 }
3304
3305 if ($self->{prev_state} == DATA_STATE) {
3306 !!!cp (988);
3307 $self->{state} = $self->{prev_state};
3308 $self->{s_kwd} = '';
3309 ## Reconsume.
3310 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3311 has_reference => 1,
3312 line => $l, column => $c,
3313 });
3314 redo A;
3315 } else {
3316 !!!cp (987);
3317 $self->{ca}->{value} .= chr $code;
3318 $self->{ca}->{has_reference} = 1;
3319 $self->{state} = $self->{prev_state};
3320 $self->{s_kwd} = '';
3321 ## Reconsume.
3322 redo A;
3323 }
3324 } elsif ($self->{state} == ENTITY_NAME_STATE) {
3325 if (length $self->{kwd} < 30 and
3326 ## NOTE: Some number greater than the maximum length of entity name
3327 ((0x0041 <= $self->{nc} and # A
3328 $self->{nc} <= 0x005A) or # Z
3329 (0x0061 <= $self->{nc} and # a
3330 $self->{nc} <= 0x007A) or # z
3331 (0x0030 <= $self->{nc} and # 0
3332 $self->{nc} <= 0x0039) or # 9
3333 $self->{nc} == 0x003B)) { # ;
3334 our $EntityChar;
3335 $self->{kwd} .= chr $self->{nc};
3336 if (defined $EntityChar->{$self->{kwd}}) {
3337 if ($self->{nc} == 0x003B) { # ;
3338 !!!cp (1020);
3339 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3340 $self->{entity__match} = 1;
3341 !!!next-input-character;
3342 #
3343 } else {
3344 !!!cp (1021);
3345 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3346 $self->{entity__match} = -1;
3347 ## Stay in the state.
3348 !!!next-input-character;
3349 redo A;
3350 }
3351 } else {
3352 !!!cp (1022);
3353 $self->{entity__value} .= chr $self->{nc};
3354 $self->{entity__match} *= 2;
3355 ## Stay in the state.
3356 !!!next-input-character;
3357 redo A;
3358 }
3359 }
3360
3361 my $data;
3362 my $has_ref;
3363 if ($self->{entity__match} > 0) {
3364 !!!cp (1023);
3365 $data = $self->{entity__value};
3366 $has_ref = 1;
3367 #
3368 } elsif ($self->{entity__match} < 0) {
3369 !!!parse-error (type => 'no refc');
3370 if ($self->{prev_state} != DATA_STATE and # in attribute
3371 $self->{entity__match} < -1) {
3372 !!!cp (1024);
3373 $data = '&' . $self->{kwd};
3374 #
3375 } else {
3376 !!!cp (1025);
3377 $data = $self->{entity__value};
3378 $has_ref = 1;
3379 #
3380 }
3381 } else {
3382 !!!cp (1026);
3383 !!!parse-error (type => 'bare ero',
3384 line => $self->{line_prev},
3385 column => $self->{column_prev} - length $self->{kwd});
3386 $data = '&' . $self->{kwd};
3387 #
3388 }
3389
3390 ## NOTE: In these cases, when a character reference is found,
3391 ## it is consumed and a character token is returned, or, otherwise,
3392 ## nothing is consumed and returned, according to the spec algorithm.
3393 ## In this implementation, anything that has been examined by the
3394 ## tokenizer is appended to the parent element or the attribute value
3395 ## as string, either literal string when no character reference or
3396 ## entity-replaced string otherwise, in this stage, since any characters
3397 ## that would not be consumed are appended in the data state or in an
3398 ## appropriate attribute value state anyway.
3399
3400 if ($self->{prev_state} == DATA_STATE) {
3401 !!!cp (986);
3402 $self->{state} = $self->{prev_state};
3403 $self->{s_kwd} = '';
3404 ## Reconsume.
3405 !!!emit ({type => CHARACTER_TOKEN,
3406 data => $data,
3407 has_reference => $has_ref,
3408 line => $self->{line_prev},
3409 column => $self->{column_prev} + 1 - length $self->{kwd},
3410 });
3411 redo A;
3412 } else {
3413 !!!cp (985);
3414 $self->{ca}->{value} .= $data;
3415 $self->{ca}->{has_reference} = 1 if $has_ref;
3416 $self->{state} = $self->{prev_state};
3417 $self->{s_kwd} = '';
3418 ## Reconsume.
3419 redo A;
3420 }
3421
3422 ## XML-only states
3423
3424 } elsif ($self->{state} == PI_STATE) {
3425 ## XML5: "Pi state" and "DOCTYPE pi state".
3426
3427 if ($is_space->{$self->{nc}} or
3428 $self->{nc} == 0x003F or # ?
3429 $self->{nc} == -1) {
3430 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3431 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3432 ## "DOCTYPE pi state": Parse error, switch to the "data
3433 ## state".
3434 !!!parse-error (type => 'bare pio', ## TODO: type
3435 line => $self->{line_prev},
3436 column => $self->{column_prev}
3437 - 1 * ($self->{nc} != -1));
3438 $self->{state} = BOGUS_COMMENT_STATE;
3439 ## Reconsume.
3440 $self->{ct} = {type => COMMENT_TOKEN,
3441 data => '?',
3442 line => $self->{line_prev},
3443 column => $self->{column_prev}
3444 - 1 * ($self->{nc} != -1),
3445 };
3446 redo A;
3447 } else {
3448 ## XML5: "DOCTYPE pi state": Stay in the state.
3449 $self->{ct} = {type => PI_TOKEN,
3450 target => chr $self->{nc},
3451 data => '',
3452 line => $self->{line_prev},
3453 column => $self->{column_prev} - 1,
3454 };
3455 $self->{state} = PI_TARGET_STATE;
3456 !!!next-input-character;
3457 redo A;
3458 }
3459 } elsif ($self->{state} == PI_TARGET_STATE) {
3460 if ($is_space->{$self->{nc}}) {
3461 $self->{state} = PI_TARGET_AFTER_STATE;
3462 !!!next-input-character;
3463 redo A;
3464 } elsif ($self->{nc} == -1) {
3465 !!!parse-error (type => 'no pic'); ## TODO: type
3466 if ($self->{in_subset}) {
3467 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3468 } else {
3469 $self->{state} = DATA_STATE;
3470 $self->{s_kwd} = '';
3471 }
3472 ## Reconsume.
3473 !!!emit ($self->{ct}); # pi
3474 redo A;
3475 } elsif ($self->{nc} == 0x003F) { # ?
3476 $self->{state} = PI_AFTER_STATE;
3477 !!!next-input-character;
3478 redo A;
3479 } else {
3480 ## XML5: typo ("tag name" -> "target")
3481 $self->{ct}->{target} .= chr $self->{nc}; # pi
3482 !!!next-input-character;
3483 redo A;
3484 }
3485 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3486 if ($is_space->{$self->{nc}}) {
3487 ## Stay in the state.
3488 !!!next-input-character;
3489 redo A;
3490 } else {
3491 $self->{state} = PI_DATA_STATE;
3492 ## Reprocess.
3493 redo A;
3494 }
3495 } elsif ($self->{state} == PI_DATA_STATE) {
3496 if ($self->{nc} == 0x003F) { # ?
3497 $self->{state} = PI_DATA_AFTER_STATE;
3498 !!!next-input-character;
3499 redo A;
3500 } elsif ($self->{nc} == -1) {
3501 !!!parse-error (type => 'no pic'); ## TODO: type
3502 if ($self->{in_subset}) {
3503 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3504 } else {
3505 $self->{state} = DATA_STATE;
3506 $self->{s_kwd} = '';
3507 }
3508 ## Reprocess.
3509 !!!emit ($self->{ct}); # pi
3510 redo A;
3511 } else {
3512 $self->{ct}->{data} .= chr $self->{nc}; # pi
3513 $self->{read_until}->($self->{ct}->{data}, q[?],
3514 length $self->{ct}->{data});
3515 ## Stay in the state.
3516 !!!next-input-character;
3517 ## Reprocess.
3518 redo A;
3519 }
3520 } elsif ($self->{state} == PI_AFTER_STATE) {
3521 ## XML5: Part of "Pi after state".
3522
3523 if ($self->{nc} == 0x003E) { # >
3524 if ($self->{in_subset}) {
3525 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3526 } else {
3527 $self->{state} = DATA_STATE;
3528 $self->{s_kwd} = '';
3529 }
3530 !!!next-input-character;
3531 !!!emit ($self->{ct}); # pi
3532 redo A;
3533 } elsif ($self->{nc} == 0x003F) { # ?
3534 !!!parse-error (type => 'no s after target', ## TODO: type
3535 line => $self->{line_prev},
3536 column => $self->{column_prev}); ## XML5: no error
3537 $self->{ct}->{data} .= '?';
3538 $self->{state} = PI_DATA_AFTER_STATE;
3539 !!!next-input-character;
3540 redo A;
3541 } else {
3542 !!!parse-error (type => 'no s after target', ## TODO: type
3543 line => $self->{line_prev},
3544 column => $self->{column_prev}
3545 + 1 * ($self->{nc} == -1)); ## XML5: no error
3546 $self->{ct}->{data} .= '?'; ## XML5: not appended
3547 $self->{state} = PI_DATA_STATE;
3548 ## Reprocess.
3549 redo A;
3550 }
3551 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3552 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3553
3554 if ($self->{nc} == 0x003E) { # >
3555 if ($self->{in_subset}) {
3556 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3557 } else {
3558 $self->{state} = DATA_STATE;
3559 $self->{s_kwd} = '';
3560 }
3561 !!!next-input-character;
3562 !!!emit ($self->{ct}); # pi
3563 redo A;
3564 } elsif ($self->{nc} == 0x003F) { # ?
3565 $self->{ct}->{data} .= '?';
3566 ## Stay in the state.
3567 !!!next-input-character;
3568 redo A;
3569 } else {
3570 $self->{ct}->{data} .= '?'; ## XML5: not appended
3571 $self->{state} = PI_DATA_STATE;
3572 ## Reprocess.
3573 redo A;
3574 }
3575
3576 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3577 if ($self->{nc} == 0x003C) { # <
3578 $self->{state} = DOCTYPE_TAG_STATE;
3579 !!!next-input-character;
3580 redo A;
3581 } elsif ($self->{nc} == 0x0025) { # %
3582 ## XML5: Not defined yet.
3583
3584 ## TODO:
3585 !!!next-input-character;
3586 redo A;
3587 } elsif ($self->{nc} == 0x005D) { # ]
3588 delete $self->{in_subset};
3589 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3590 !!!next-input-character;
3591 redo A;
3592 } elsif ($is_space->{$self->{nc}}) {
3593 ## Stay in the state.
3594 !!!next-input-character;
3595 redo A;
3596 } elsif ($self->{nc} == -1) {
3597 !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3598 delete $self->{in_subset};
3599 $self->{state} = DATA_STATE;
3600 $self->{s_kwd} = '';
3601 ## Reconsume.
3602 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3603 redo A;
3604 } else {
3605 unless ($self->{internal_subset_tainted}) {
3606 ## XML5: No parse error.
3607 !!!parse-error (type => 'string in internal subset');
3608 $self->{internal_subset_tainted} = 1;
3609 }
3610 ## Stay in the state.
3611 !!!next-input-character;
3612 redo A;
3613 }
3614 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3615 if ($self->{nc} == 0x003E) { # >
3616 $self->{state} = DATA_STATE;
3617 $self->{s_kwd} = '';
3618 !!!next-input-character;
3619 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3620 redo A;
3621 } elsif ($self->{nc} == -1) {
3622 !!!parse-error (type => 'unclosed DOCTYPE');
3623 $self->{state} = DATA_STATE;
3624 $self->{s_kwd} = '';
3625 ## Reconsume.
3626 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3627 redo A;
3628 } else {
3629 ## XML5: No parse error and stay in the state.
3630 !!!parse-error (type => 'string after internal subset'); ## TODO: type
3631
3632 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3633 !!!next-input-character;
3634 redo A;
3635 }
3636 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3637 if ($self->{nc} == 0x003E) { # >
3638 $self->{state} = DATA_STATE;
3639 $self->{s_kwd} = '';
3640 !!!next-input-character;
3641 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3642 redo A;
3643 } elsif ($self->{nc} == -1) {
3644 $self->{state} = DATA_STATE;
3645 $self->{s_kwd} = '';
3646 ## Reconsume.
3647 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3648 redo A;
3649 } else {
3650 ## Stay in the state.
3651 !!!next-input-character;
3652 redo A;
3653 }
3654 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3655 if ($self->{nc} == 0x0021) { # !
3656 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3657 !!!next-input-character;
3658 redo A;
3659 } elsif ($self->{nc} == 0x003F) { # ?
3660 $self->{state} = PI_STATE;
3661 !!!next-input-character;
3662 redo A;
3663 } elsif ($self->{nc} == -1) {
3664 !!!parse-error (type => 'bare stago');
3665 $self->{state} = DATA_STATE;
3666 $self->{s_kwd} = '';
3667 ## Reconsume.
3668 redo A;
3669 } else {
3670 !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3671 line => $self->{line_prev},
3672 column => $self->{column_prev});
3673 $self->{state} = BOGUS_COMMENT_STATE;
3674 $self->{ct} = {type => COMMENT_TOKEN,
3675 data => '',
3676 }; ## NOTE: Will be discarded.
3677 !!!next-input-character;
3678 redo A;
3679 }
3680 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3681 ## XML5: "DOCTYPE markup declaration state".
3682
3683 if ($self->{nc} == 0x002D) { # -
3684 $self->{state} = MD_HYPHEN_STATE;
3685 !!!next-input-character;
3686 redo A;
3687 } elsif ($self->{nc} == 0x0045 or # E
3688 $self->{nc} == 0x0065) { # e
3689 $self->{state} = MD_E_STATE;
3690 $self->{kwd} = chr $self->{nc};
3691 !!!next-input-character;
3692 redo A;
3693 } elsif ($self->{nc} == 0x0041 or # A
3694 $self->{nc} == 0x0061) { # a
3695 $self->{state} = MD_ATTLIST_STATE;
3696 $self->{kwd} = chr $self->{nc};
3697 !!!next-input-character;
3698 redo A;
3699 } elsif ($self->{nc} == 0x004E or # N
3700 $self->{nc} == 0x006E) { # n
3701 $self->{state} = MD_NOTATION_STATE;
3702 $self->{kwd} = chr $self->{nc};
3703 !!!next-input-character;
3704 redo A;
3705 } else {
3706 #
3707 }
3708
3709 ## XML5: No parse error.
3710 !!!parse-error (type => 'bogus comment',
3711 line => $self->{line_prev},
3712 column => $self->{column_prev} - 1);
3713 ## Reconsume.
3714 $self->{state} = BOGUS_COMMENT_STATE;
3715 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3716 redo A;
3717 } elsif ($self->{state} == MD_E_STATE) {
3718 if ($self->{nc} == 0x004E or # N
3719 $self->{nc} == 0x006E) { # n
3720 $self->{state} = MD_ENTITY_STATE;
3721 $self->{kwd} .= chr $self->{nc};
3722 !!!next-input-character;
3723 redo A;
3724 } elsif ($self->{nc} == 0x004C or # L
3725 $self->{nc} == 0x006C) { # l
3726 ## XML5: <!ELEMENT> not supported.
3727 $self->{state} = MD_ELEMENT_STATE;
3728 $self->{kwd} .= chr $self->{nc};
3729 !!!next-input-character;
3730 redo A;
3731 } else {
3732 ## XML5: No parse error.
3733 !!!parse-error (type => 'bogus comment',
3734 line => $self->{line_prev},
3735 column => $self->{column_prev} - 2
3736 + 1 * ($self->{nc} == -1));
3737 ## Reconsume.
3738 $self->{state} = BOGUS_COMMENT_STATE;
3739 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3740 redo A;
3741 }
3742 } elsif ($self->{state} == MD_ENTITY_STATE) {
3743 if ($self->{nc} == [
3744 undef,
3745 undef,
3746 0x0054, # T
3747 0x0049, # I
3748 0x0054, # T
3749 ]->[length $self->{kwd}] or
3750 $self->{nc} == [
3751 undef,
3752 undef,
3753 0x0074, # t
3754 0x0069, # i
3755 0x0074, # t
3756 ]->[length $self->{kwd}]) {
3757 ## Stay in the state.
3758 $self->{kwd} .= chr $self->{nc};
3759 !!!next-input-character;
3760 redo A;
3761 } elsif ((length $self->{kwd}) == 5 and
3762 ($self->{nc} == 0x0059 or # Y
3763 $self->{nc} == 0x0079)) { # y
3764 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3765 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3766 text => 'ENTITY',
3767 line => $self->{line_prev},
3768 column => $self->{column_prev} - 4);
3769 }
3770 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3771 line => $self->{line_prev},
3772 column => $self->{column_prev} - 6};
3773 $self->{state} = DOCTYPE_MD_STATE;
3774 !!!next-input-character;
3775 redo A;
3776 } else {
3777 !!!parse-error (type => 'bogus comment',
3778 line => $self->{line_prev},
3779 column => $self->{column_prev} - 1
3780 - (length $self->{kwd})
3781 + 1 * ($self->{nc} == -1));
3782 $self->{state} = BOGUS_COMMENT_STATE;
3783 ## Reconsume.
3784 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3785 redo A;
3786 }
3787 } elsif ($self->{state} == MD_ELEMENT_STATE) { ## Matching the rest of the "<!ELEMENT" keyword; kwd holds the letters seen so far.
3788 if ($self->{nc} == [
3789 undef,
3790 undef,
3791 0x0045, # E
3792 0x004D, # M
3793 0x0045, # E
3794 0x004E, # N
3795 ]->[length $self->{kwd}] or
3796 $self->{nc} == [
3797 undef,
3798 undef,
3799 0x0065, # e
3800 0x006D, # m
3801 0x0065, # e
3802 0x006E, # n
3803 ]->[length $self->{kwd}]) {
3804 ## Stay in the state: nc is the next expected keyword letter (either case).
3805 $self->{kwd} .= chr $self->{nc};
3806 !!!next-input-character;
3807 redo A;
3808 } elsif ((length $self->{kwd}) == 6 and
3809 ($self->{nc} == 0x0054 or # T
3810 $self->{nc} == 0x0074)) { # t
3811 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) { ## Keyword is not spelled entirely in uppercase.
3812 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3813 text => 'ELEMENT',
3814 line => $self->{line_prev},
3815 column => $self->{column_prev} - 5); ## Column of the first keyword letter.
3816 }
3817 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3818 line => $self->{line_prev},
3819 column => $self->{column_prev} - 7}; ## Column of "<": column_prev is the 6th keyword letter, preceded by "<!" (was - 6, i.e. the "!").
3820 $self->{state} = DOCTYPE_MD_STATE;
3821 !!!next-input-character;
3822 redo A;
3823 } else {
3824 !!!parse-error (type => 'bogus comment',
3825 line => $self->{line_prev},
3826 column => $self->{column_prev} - 1
3827 - (length $self->{kwd})
3828 + 1 * ($self->{nc} == -1));
3829 $self->{state} = BOGUS_COMMENT_STATE;
3830 ## Reconsume.
3831 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3832 redo A;
3833 }
3834 } elsif ($self->{state} == MD_ATTLIST_STATE) { ## Matching the rest of the "<!ATTLIST" keyword; kwd holds the letters seen so far.
3835 if ($self->{nc} == [
3836 undef,
3837 0x0054, # T
3838 0x0054, # T
3839 0x004C, # L
3840 0x0049, # I
3841 0x0053, # S
3842 ]->[length $self->{kwd}] or
3843 $self->{nc} == [
3844 undef,
3845 0x0074, # t
3846 0x0074, # t
3847 0x006C, # l
3848 0x0069, # i
3849 0x0073, # s
3850 ]->[length $self->{kwd}]) {
3851 ## Stay in the state: nc is the next expected keyword letter (either case).
3852 $self->{kwd} .= chr $self->{nc};
3853 !!!next-input-character;
3854 redo A;
3855 } elsif ((length $self->{kwd}) == 6 and
3856 ($self->{nc} == 0x0054 or # T
3857 $self->{nc} == 0x0074)) { # t
3858 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) { ## Keyword is not spelled entirely in uppercase.
3859 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3860 text => 'ATTLIST',
3861 line => $self->{line_prev},
3862 column => $self->{column_prev} - 5); ## Column of the first keyword letter.
3863 }
3864 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3865 attrdefs => [],
3866 line => $self->{line_prev},
3867 column => $self->{column_prev} - 7}; ## Column of "<": column_prev is the 6th keyword letter, preceded by "<!" (was - 6, i.e. the "!").
3868 $self->{state} = DOCTYPE_MD_STATE;
3869 !!!next-input-character;
3870 redo A;
3871 } else {
3872 !!!parse-error (type => 'bogus comment',
3873 line => $self->{line_prev},
3874 column => $self->{column_prev} - 1
3875 - (length $self->{kwd})
3876 + 1 * ($self->{nc} == -1));
3877 $self->{state} = BOGUS_COMMENT_STATE;
3878 ## Reconsume.
3879 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3880 redo A;
3881 }
3882 } elsif ($self->{state} == MD_NOTATION_STATE) { ## Matching the rest of the "<!NOTATION" keyword; kwd holds the letters seen so far.
3883 if ($self->{nc} == [
3884 undef,
3885 0x004F, # O
3886 0x0054, # T
3887 0x0041, # A
3888 0x0054, # T
3889 0x0049, # I
3890 0x004F, # O
3891 ]->[length $self->{kwd}] or
3892 $self->{nc} == [
3893 undef,
3894 0x006F, # o
3895 0x0074, # t
3896 0x0061, # a
3897 0x0074, # t
3898 0x0069, # i
3899 0x006F, # o
3900 ]->[length $self->{kwd}]) {
3901 ## Stay in the state: nc is the next expected keyword letter (either case).
3902 $self->{kwd} .= chr $self->{nc};
3903 !!!next-input-character;
3904 redo A;
3905 } elsif ((length $self->{kwd}) == 7 and
3906 ($self->{nc} == 0x004E or # N
3907 $self->{nc} == 0x006E)) { # n
3908 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) { ## Keyword is not spelled entirely in uppercase.
3909 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3910 text => 'NOTATION',
3911 line => $self->{line_prev},
3912 column => $self->{column_prev} - 6); ## Column of the first keyword letter.
3913 }
3914 $self->{ct} = {type => NOTATION_TOKEN, name => '',
3915 line => $self->{line_prev},
3916 column => $self->{column_prev} - 8}; ## Column of "<": column_prev is the 7th keyword letter, preceded by "<!" (was - 6, i.e. the first letter).
3917 $self->{state} = DOCTYPE_MD_STATE;
3918 !!!next-input-character;
3919 redo A;
3920 } else {
3921 !!!parse-error (type => 'bogus comment',
3922 line => $self->{line_prev},
3923 column => $self->{column_prev} - 1
3924 - (length $self->{kwd})
3925 + 1 * ($self->{nc} == -1));
3926 $self->{state} = BOGUS_COMMENT_STATE;
3927 ## Reconsume.
3928 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3929 redo A;
3930 }
3931 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3932 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3933 ## "DOCTYPE NOTATION state".
3934
3935 if ($is_space->{$self->{nc}}) {
3936 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3937 $self->{state} = BEFORE_MD_NAME_STATE;
3938 !!!next-input-character;
3939 redo A;
3940 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3941 $self->{nc} == 0x0025) { # %
3942 ## XML5: Switch to the "DOCTYPE bogus comment state".
3943 !!!parse-error (type => 'no space before md name'); ## TODO: type
3944 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3945 !!!next-input-character;
3946 redo A;
3947 } elsif ($self->{nc} == -1) {
3948 !!!parse-error (type => 'unclosed md'); ## TODO: type
3949 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3950 ## Reconsume.
3951 redo A;
3952 } elsif ($self->{nc} == 0x003E) { # >
3953 ## XML5: Switch to the "DOCTYPE bogus comment state".
3954 !!!parse-error (type => 'no md name'); ## TODO: type
3955 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3956 !!!next-input-character;
3957 redo A;
3958 } else {
3959 ## XML5: Switch to the "DOCTYPE bogus comment state".
3960 !!!parse-error (type => 'no space before md name'); ## TODO: type
3961 $self->{state} = BEFORE_MD_NAME_STATE;
3962 redo A;
3963 }
3964 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3965 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3966 ## before state", "DOCTYPE ATTLIST name before state".
3967
3968 if ($is_space->{$self->{nc}}) {
3969 ## Stay in the state.
3970 !!!next-input-character;
3971 redo A;
3972 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3973 $self->{nc} == 0x0025) { # %
3974 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3975 !!!next-input-character;
3976 redo A;
3977 } elsif ($self->{nc} == 0x003E) { # >
3978 ## XML5: Same as "Anything else".
3979 !!!parse-error (type => 'no md name'); ## TODO: type
3980 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3981 !!!next-input-character;
3982 redo A;
3983 } elsif ($self->{nc} == -1) {
3984 !!!parse-error (type => 'unclosed md'); ## TODO: type
3985 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3986 ## Reconsume.
3987 redo A;
3988 } else {
3989 ## XML5: [ATTLIST] Not defined yet.
3990 $self->{ct}->{name} .= chr $self->{nc};
3991 $self->{state} = MD_NAME_STATE;
3992 !!!next-input-character;
3993 redo A;
3994 }
3995 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
3996 if ($is_space->{$self->{nc}}) {
3997 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3998 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3999 $self->{state} = BEFORE_MD_NAME_STATE;
4000 !!!next-input-character;
4001 redo A;
4002 } elsif ($self->{nc} == 0x003E) { # >
4003 ## XML5: Same as "Anything else".
4004 !!!parse-error (type => 'no md name'); ## TODO: type
4005 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4006 !!!next-input-character;
4007 redo A;
4008 } elsif ($self->{nc} == -1) {
4009 !!!parse-error (type => 'unclosed md');
4010 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4011 ## Reconsume.
4012 redo A;
4013 } else {
4014 ## XML5: No parse error.
4015 !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4016 $self->{state} = BOGUS_COMMENT_STATE;
4017 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4018 ## Reconsume.
4019 redo A;
4020 }
4021 } elsif ($self->{state} == MD_NAME_STATE) {
4022 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4023
4024 if ($is_space->{$self->{nc}}) {
4025 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4026 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4027 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4028 ## TODO: ...
4029 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4030 } else { # ENTITY/NOTATION
4031 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4032 }
4033 !!!next-input-character;
4034 redo A;
4035 } elsif ($self->{nc} == 0x003E) { # >
4036 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4037 #
4038 } else {
4039 !!!parse-error (type => 'no md def'); ## TODO: type
4040 }
4041 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4042 !!!next-input-character;
4043 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4044 redo A;
4045 } elsif ($self->{nc} == -1) {
4046 ## XML5: [ATTLIST] No parse error.
4047 !!!parse-error (type => 'unclosed md');
4048 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4049 ## Reconsume.
4050 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4051 redo A;
4052 } else {
4053 ## XML5: [ATTLIST] Not defined yet.
4054 $self->{ct}->{name} .= chr $self->{nc};
4055 ## Stay in the state.
4056 !!!next-input-character;
4057 redo A;
4058 }
4059 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4060 if ($is_space->{$self->{nc}}) {
4061 ## Stay in the state.
4062 !!!next-input-character;
4063 redo A;
4064 } elsif ($self->{nc} == 0x003E) { # >
4065 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4066 !!!next-input-character;
4067 !!!emit ($self->{ct}); # ATTLIST
4068 redo A;
4069 } elsif ($self->{nc} == -1) {
4070 ## XML5: No parse error.
4071 !!!parse-error (type => 'unclosed md'); ## TODO: type
4072 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4073 !!!emit ($self->{ct});
4074 redo A;
4075 } else {
4076 ## XML5: Not defined yet.
4077 $self->{ca} = {name => chr ($self->{nc}), # attrdef
4078 tokens => [],
4079 line => $self->{line}, column => $self->{column}};
4080 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4081 !!!next-input-character;
4082 redo A;
4083 }
4084 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4085 if ($is_space->{$self->{nc}}) {
4086 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4087 !!!next-input-character;
4088 redo A;
4089 } elsif ($self->{nc} == 0x003E) { # >
4090 ## XML5: Same as "anything else".
4091 !!!parse-error (type => 'no attr type'); ## TODO: type
4092 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4093 !!!next-input-character;
4094 !!!emit ($self->{ct}); # ATTLIST
4095 redo A;
4096 } elsif ($self->{nc} == 0x0028) { # (
4097 ## XML5: Same as "anything else".
4098 !!!parse-error (type => 'no space before paren'); ## TODO: type
4099 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4100 !!!next-input-character;
4101 redo A;
4102 } elsif ($self->{nc} == -1) {
4103 ## XML5: No parse error.
4104 !!!parse-error (type => 'unclosed md'); ## TODO: type
4105 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4106 !!!next-input-character;
4107 !!!emit ($self->{ct}); # ATTLIST
4108 redo A;
4109 } else {
4110 ## XML5: Not defined yet.
4111 $self->{ca}->{name} .= chr $self->{nc};
4112 ## Stay in the state.
4113 !!!next-input-character;
4114 redo A;
4115 }
4116 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4117 if ($is_space->{$self->{nc}}) {
4118 ## Stay in the state.
4119 !!!next-input-character;
4120 redo A;
4121 } elsif ($self->{nc} == 0x003E) { # >
4122 ## XML5: Same as "anything else".
4123 !!!parse-error (type => 'no attr type'); ## TODO: type
4124 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4125 !!!next-input-character;
4126 !!!emit ($self->{ct}); # ATTLIST
4127 redo A;
4128 } elsif ($self->{nc} == 0x0028) { # (
4129 ## XML5: Same as "anything else".
4130 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4131 !!!next-input-character;
4132 redo A;
4133 } elsif ($self->{nc} == -1) {
4134 ## XML5: No parse error.
4135 !!!parse-error (type => 'unclosed md'); ## TODO: type
4136 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4137 !!!next-input-character;
4138 !!!emit ($self->{ct});
4139 redo A;
4140 } else {
4141 ## XML5: Not defined yet.
4142 $self->{ca}->{type} = chr $self->{nc};
4143 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4144 !!!next-input-character;
4145 redo A;
4146 }
4147 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4148 if ($is_space->{$self->{nc}}) {
4149 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4150 !!!next-input-character;
4151 redo A;
4152 } elsif ($self->{nc} == 0x0023) { # #
4153 ## XML5: Same as "anything else".
4154 !!!parse-error (type => 'no space before default value'); ## TODO: type
4155 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4156 !!!next-input-character;
4157 redo A;
4158 } elsif ($self->{nc} == 0x0022) { # "
4159 ## XML5: Same as "anything else".
4160 !!!parse-error (type => 'no space before default value'); ## TODO: type
4161 $self->{ca}->{value} = '';
4162 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4163 !!!next-input-character;
4164 redo A;
4165 } elsif ($self->{nc} == 0x0027) { # '
4166 ## XML5: Same as "anything else".
4167 !!!parse-error (type => 'no space before default value'); ## TODO: type
4168 $self->{ca}->{value} = '';
4169 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4170 !!!next-input-character;
4171 redo A;
4172 } elsif ($self->{nc} == 0x003E) { # >
4173 ## XML5: Same as "anything else".
4174 !!!parse-error (type => 'no attr default'); ## TODO: type
4175 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4176 !!!next-input-character;
4177 !!!emit ($self->{ct}); # ATTLIST
4178 redo A;
4179 } elsif ($self->{nc} == 0x0028) { # (
4180 ## XML5: Same as "anything else".
4181 !!!parse-error (type => 'no space before paren'); ## TODO: type
4182 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4183 !!!next-input-character;
4184 redo A;
4185 } elsif ($self->{nc} == -1) {
4186 ## XML5: No parse error.
4187 !!!parse-error (type => 'unclosed md'); ## TODO: type
4188 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4189 !!!next-input-character;
4190 !!!emit ($self->{ct});
4191 redo A;
4192 } else {
4193 ## XML5: Not defined yet.
4194 $self->{ca}->{type} .= chr $self->{nc};
4195 ## Stay in the state.
4196 !!!next-input-character;
4197 redo A;
4198 }
4199 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4200 if ($is_space->{$self->{nc}}) {
4201 ## Stay in the state.
4202 !!!next-input-character;
4203 redo A;
4204 } elsif ($self->{nc} == 0x0028) { # (
4205 ## XML5: Same as "anything else".
4206 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4207 !!!next-input-character;
4208 redo A;
4209 } elsif ($self->{nc} == 0x0023) { # #
4210 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4211 !!!next-input-character;
4212 redo A;
4213 } elsif ($self->{nc} == 0x0022) { # "
4214 ## XML5: Same as "anything else".
4215 $self->{ca}->{value} = '';
4216 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4217 !!!next-input-character;
4218 redo A;
4219 } elsif ($self->{nc} == 0x0027) { # '
4220 ## XML5: Same as "anything else".
4221 $self->{ca}->{value} = '';
4222 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4223 !!!next-input-character;
4224 redo A;
4225 } elsif ($self->{nc} == 0x003E) { # >
4226 ## XML5: Same as "anything else".
4227 !!!parse-error (type => 'no attr default'); ## TODO: type
4228 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4229 !!!next-input-character;
4230 !!!emit ($self->{ct}); # ATTLIST
4231 redo A;
4232 } elsif ($self->{nc} == -1) {
4233 ## XML5: No parse error.
4234 !!!parse-error (type => 'unclosed md'); ## TODO: type
4235 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4236 !!!next-input-character;
4237 !!!emit ($self->{ct});
4238 redo A;
4239 } else {
4240 ## XML5: Switch to the "DOCTYPE bogus comment state".
4241 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4242 $self->{ca}->{value} = '';
4243 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4244 ## Reconsume.
4245 redo A;
4246 }
4247 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4248 if ($is_space->{$self->{nc}}) {
4249 ## Stay in the state.
4250 !!!next-input-character;
4251 redo A;
4252 } elsif ($self->{nc} == 0x007C) { # |
4253 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4254 ## Stay in the state.
4255 !!!next-input-character;
4256 redo A;
4257 } elsif ($self->{nc} == 0x0029) { # )
4258 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4259 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4260 !!!next-input-character;
4261 redo A;
4262 } elsif ($self->{nc} == 0x003E) { # >
4263 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4264 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4265 !!!next-input-character;
4266 !!!emit ($self->{ct}); # ATTLIST
4267 redo A;
4268 } elsif ($self->{nc} == -1) {
4269 ## XML5: No parse error.
4270 !!!parse-error (type => 'unclosed md'); ## TODO: type
4271 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4272 !!!next-input-character;
4273 !!!emit ($self->{ct});
4274 redo A;
4275 } else {
4276 push @{$self->{ca}->{tokens}}, chr $self->{nc};
4277 $self->{state} = ALLOWED_TOKEN_STATE;
4278 !!!next-input-character;
4279 redo A;
4280 }
4281 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4282 if ($is_space->{$self->{nc}}) {
4283 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4284 !!!next-input-character;
4285 redo A;
4286 } elsif ($self->{nc} == 0x007C) { # |
4287 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4288 !!!next-input-character;
4289 redo A;
4290 } elsif ($self->{nc} == 0x0029) { # )
4291 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4292 !!!next-input-character;
4293 redo A;
4294 } elsif ($self->{nc} == 0x003E) { # >
4295 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4296 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4297 !!!next-input-character;
4298 !!!emit ($self->{ct}); # ATTLIST
4299 redo A;
4300 } elsif ($self->{nc} == -1) {
4301 ## XML5: No parse error.
4302 !!!parse-error (type => 'unclosed md'); ## TODO: type
4303 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4304 !!!next-input-character;
4305 !!!emit ($self->{ct});
4306 redo A;
4307 } else {
4308 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4309 ## Stay in the state.
4310 !!!next-input-character;
4311 redo A;
4312 }
4313 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4314 if ($is_space->{$self->{nc}}) {
4315 ## Stay in the state.
4316 !!!next-input-character;
4317 redo A;
4318 } elsif ($self->{nc} == 0x007C) { # |
4319 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4320 !!!next-input-character;
4321 redo A;
4322 } elsif ($self->{nc} == 0x0029) { # )
4323 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4324 !!!next-input-character;
4325 redo A;
4326 } elsif ($self->{nc} == 0x003E) { # >
4327 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4328 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4329 !!!next-input-character;
4330 !!!emit ($self->{ct}); # ATTLIST
4331 redo A;
4332 } elsif ($self->{nc} == -1) {
4333 ## XML5: No parse error.
4334 !!!parse-error (type => 'unclosed md'); ## TODO: type
4335 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4336 !!!next-input-character;
4337 !!!emit ($self->{ct});
4338 redo A;
4339 } else {
4340 !!!parse-error (type => 'space in allowed token', ## TODO: type
4341 line => $self->{line_prev},
4342 column => $self->{column_prev});
4343 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4344 $self->{state} = ALLOWED_TOKEN_STATE;
4345 !!!next-input-character;
4346 redo A;
4347 }
4348 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4349 if ($is_space->{$self->{nc}}) {
4350 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4351 !!!next-input-character;
4352 redo A;
4353 } elsif ($self->{nc} == 0x0023) { # #
4354 !!!parse-error (type => 'no space before default value'); ## TODO: type
4355 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4356 !!!next-input-character;
4357 redo A;
4358 } elsif ($self->{nc} == 0x0022) { # "
4359 !!!parse-error (type => 'no space before default value'); ## TODO: type
4360 $self->{ca}->{value} = '';
4361 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4362 !!!next-input-character;
4363 redo A;
4364 } elsif ($self->{nc} == 0x0027) { # '
4365 !!!parse-error (type => 'no space before default value'); ## TODO: type
4366 $self->{ca}->{value} = '';
4367 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4368 !!!next-input-character;
4369 redo A;
4370 } elsif ($self->{nc} == 0x003E) { # >
4371 !!!parse-error (type => 'no attr default'); ## TODO: type
4372 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4373 !!!next-input-character;
4374 !!!emit ($self->{ct}); # ATTLIST
4375 redo A;
4376 } elsif ($self->{nc} == -1) {
4377 !!!parse-error (type => 'unclosed md'); ## TODO: type
4378 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4379 !!!next-input-character;
4380 !!!emit ($self->{ct});
4381 redo A;
4382 } else {
4383 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4384 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4385 ## Reconsume.
4386 redo A;
4387 }
4388 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4389 if ($is_space->{$self->{nc}}) {
4390 ## Stay in the state.
4391 !!!next-input-character;
4392 redo A;
4393 } elsif ($self->{nc} == 0x0023) { # #
4394 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4395 !!!next-input-character;
4396 redo A;
4397 } elsif ($self->{nc} == 0x0022) { # "
4398 $self->{ca}->{value} = '';
4399 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4400 !!!next-input-character;
4401 redo A;
4402 } elsif ($self->{nc} == 0x0027) { # '
4403 $self->{ca}->{value} = '';
4404 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4405 !!!next-input-character;
4406 redo A;
4407 } elsif ($self->{nc} == 0x003E) { # >
4408 !!!parse-error (type => 'no attr default'); ## TODO: type
4409 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4410 !!!next-input-character;
4411 !!!emit ($self->{ct}); # ATTLIST
4412 redo A;
4413 } elsif ($self->{nc} == -1) {
4414 !!!parse-error (type => 'unclosed md'); ## TODO: type
4415 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4416 !!!next-input-character;
4417 !!!emit ($self->{ct});
4418 redo A;
4419 } else {
4420 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4421 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4422 ## Reconsume.
4423 redo A;
4424 }
4425 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4426 if ($is_space->{$self->{nc}}) {
4427 ## XML5: No parse error.
4428 !!!parse-error (type => 'no default type'); ## TODO: type
4429 $self->{state} = BOGUS_MD_STATE;
4430 ## Reconsume.
4431 redo A;
4432 } elsif ($self->{nc} == 0x0022) { # "
4433 ## XML5: Same as "anything else".
4434 $self->{ca}->{value} = '';
4435 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4436 !!!next-input-character;
4437 redo A;
4438 } elsif ($self->{nc} == 0x0027) { # '
4439 ## XML5: Same as "anything else".
4440 $self->{ca}->{value} = '';
4441 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4442 !!!next-input-character;
4443 redo A;
4444 } elsif ($self->{nc} == 0x003E) { # >
4445 ## XML5: Same as "anything else".
4446 !!!parse-error (type => 'no attr default'); ## TODO: type
4447 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4448 !!!next-input-character;
4449 !!!emit ($self->{ct}); # ATTLIST
4450 redo A;
4451 } elsif ($self->{nc} == -1) {
4452 ## XML5: No parse error.
4453 !!!parse-error (type => 'unclosed md'); ## TODO: type
4454 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4455 !!!next-input-character;
4456 !!!emit ($self->{ct});
4457 redo A;
4458 } else {
4459 $self->{ca}->{default} = chr $self->{nc};
4460 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4461 !!!next-input-character;
4462 redo A;
4463 }
4464 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4465 if ($is_space->{$self->{nc}}) {
4466 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4467 !!!next-input-character;
4468 redo A;
4469 } elsif ($self->{nc} == 0x0022) { # "
4470 ## XML5: Same as "anything else".
4471 !!!parse-error (type => 'no space before default value'); ## TODO: type
4472 $self->{ca}->{value} = '';
4473 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4474 !!!next-input-character;
4475 redo A;
4476 } elsif ($self->{nc} == 0x0027) { # '
4477 ## XML5: Same as "anything else".
4478 !!!parse-error (type => 'no space before default value'); ## TODO: type
4479 $self->{ca}->{value} = '';
4480 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4481 !!!next-input-character;
4482 redo A;
4483 } elsif ($self->{nc} == 0x003E) { # >
4484 ## XML5: Same as "anything else".
4485 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4486 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4487 !!!next-input-character;
4488 !!!emit ($self->{ct}); # ATTLIST
4489 redo A;
4490 } elsif ($self->{nc} == -1) {
4491 ## XML5: No parse error.
4492 !!!parse-error (type => 'unclosed md'); ## TODO: type
4493 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4494 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4495 !!!next-input-character;
4496 !!!emit ($self->{ct});
4497 redo A;
4498 } else {
4499 $self->{ca}->{default} .= chr $self->{nc};
4500 ## Stay in the state.
4501 !!!next-input-character;
4502 redo A;
4503 }
4504 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4505 if ($is_space->{$self->{nc}}) {
4506 ## Stay in the state.
4507 !!!next-input-character;
4508 redo A;
4509 } elsif ($self->{nc} == 0x0022) { # "
4510 $self->{ca}->{value} = '';
4511 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4512 !!!next-input-character;
4513 redo A;
4514 } elsif ($self->{nc} == 0x0027) { # '
4515 $self->{ca}->{value} = '';
4516 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4517 !!!next-input-character;
4518 redo A;
4519 } elsif ($self->{nc} == 0x003E) { # >
4520 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4521 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4522 !!!next-input-character;
4523 !!!emit ($self->{ct}); # ATTLIST
4524 redo A;
4525 } elsif ($self->{nc} == -1) {
4526 ## XML5: No parse error.
4527 !!!parse-error (type => 'unclosed md'); ## TODO: type
4528 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4529 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4530 !!!next-input-character;
4531 !!!emit ($self->{ct});
4532 redo A;
4533 } else {
4534 ## XML5: Not defined yet.
4535 if ($self->{ca}->{default} eq 'FIXED') {
4536 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4537 } else {
4538 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4539 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4540 }
4541 ## Reconsume.
4542 redo A;
4543 }
4544 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4545 if ($is_space->{$self->{nc}} or
4546 $self->{nc} == -1 or
4547 $self->{nc} == 0x003E) { # >
4548 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4549 ## Reconsume.
4550 redo A;
4551 } else {
4552 !!!parse-error (type => 'no space before attr name'); ## TODO: type
4553 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4554 ## Reconsume.
4555 redo A;
4556 }
4557 } elsif ($self->{state} == NDATA_STATE) {
4558 ## ASCII case-insensitive
4559 if ($self->{nc} == [
4560 undef,
4561 0x0044, # D
4562 0x0041, # A
4563 0x0054, # T
4564 ]->[length $self->{kwd}] or
4565 $self->{nc} == [
4566 undef,
4567 0x0064, # d
4568 0x0061, # a
4569 0x0074, # t
4570 ]->[length $self->{kwd}]) {
4571 !!!cp (172.2);
4572 ## Stay in the state.
4573 $self->{kwd} .= chr $self->{nc};
4574 !!!next-input-character;
4575 redo A;
4576 } elsif ((length $self->{kwd}) == 4 and
4577 ($self->{nc} == 0x0041 or # A
4578 $self->{nc} == 0x0061)) { # a
4579 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4580 !!!cp (172.3);
4581 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4582 text => 'NDATA',
4583 line => $self->{line_prev},
4584 column => $self->{column_prev} - 4);
4585 } else {
4586 !!!cp (172.4);
4587 }
4588 $self->{state} = AFTER_NDATA_STATE;
4589 !!!next-input-character;
4590 redo A;
4591 } else {
4592 !!!parse-error (type => 'string after literal', ## TODO: type
4593 line => $self->{line_prev},
4594 column => $self->{column_prev} + 1
4595 - length $self->{kwd});
4596 !!!cp (172.5);
4597 $self->{state} = BOGUS_MD_STATE;
4598 ## Reconsume.
4599 redo A;
4600 }
4601 } elsif ($self->{state} == AFTER_NDATA_STATE) {
4602 if ($is_space->{$self->{nc}}) {
4603 $self->{state} = BEFORE_NOTATION_NAME_STATE;
4604 !!!next-input-character;
4605 redo A;
4606 } elsif ($self->{nc} == 0x003E) { # >
4607 !!!parse-error (type => 'no notation name'); ## TODO: type
4608 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4609 !!!next-input-character;
4610 !!!emit ($self->{ct}); # ENTITY
4611 redo A;
4612 } elsif ($self->{nc} == -1) {
4613 !!!parse-error (type => 'unclosed md'); ## TODO: type
4614 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4615 !!!next-input-character;
4616 !!!emit ($self->{ct}); # ENTITY
4617 redo A;
4618 } else {
4619 !!!parse-error (type => 'string after literal', ## TODO: type
4620 line => $self->{line_prev},
4621 column => $self->{column_prev} + 1
4622 - length $self->{kwd});
4623 $self->{state} = BOGUS_MD_STATE;
4624 ## Reconsume.
4625 redo A;
4626 }
4627 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4628 if ($is_space->{$self->{nc}}) {
4629 ## Stay in the state.
4630 !!!next-input-character;
4631 redo A;
4632 } elsif ($self->{nc} == 0x003E) { # >
4633 !!!parse-error (type => 'no notation name'); ## TODO: type
4634 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4635 !!!next-input-character;
4636 !!!emit ($self->{ct}); # ENTITY
4637 redo A;
4638 } elsif ($self->{nc} == -1) {
4639 !!!parse-error (type => 'unclosed md'); ## TODO: type
4640 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4641 !!!next-input-character;
4642 !!!emit ($self->{ct}); # ENTITY
4643 redo A;
4644 } else {
4645 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4646 $self->{state} = NOTATION_NAME_STATE;
4647 !!!next-input-character;
4648 redo A;
4649 }
4650 } elsif ($self->{state} == NOTATION_NAME_STATE) {
4651 if ($is_space->{$self->{nc}}) {
4652 $self->{state} = AFTER_NOTATION_NAME_STATE;
4653 !!!next-input-character;
4654 redo A;
4655 } elsif ($self->{nc} == 0x003E) { # >
4656 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4657 !!!next-input-character;
4658 !!!emit ($self->{ct}); # ENTITY
4659 redo A;
4660 } elsif ($self->{nc} == -1) {
4661 !!!parse-error (type => 'unclosed md'); ## TODO: type
4662 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4663 !!!next-input-character;
4664 !!!emit ($self->{ct}); # ENTITY
4665 redo A;
4666 } else {
4667 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4668 ## Stay in the state.
4669 !!!next-input-character;
4670 redo A;
4671 }
4672 } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
4673 if ($is_space->{$self->{nc}}) {
4674 ## Stay in the state.
4675 !!!next-input-character;
4676 redo A;
4677 } elsif ($self->{nc} == 0x003E) { # >
4678 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4679 !!!next-input-character;
4680 !!!emit ($self->{ct}); # ENTITY
4681 redo A;
4682 } elsif ($self->{nc} == -1) {
4683 !!!parse-error (type => 'unclosed md'); ## TODO: type
4684 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4685 !!!next-input-character;
4686 !!!emit ($self->{ct}); # ENTITY
4687 redo A;
4688 } else {
4689 !!!parse-error (type => 'string after notation name'); ## TODO: type
4690 $self->{state} = BOGUS_MD_STATE;
4691 ## Reconsume.
4692 redo A;
4693 }
4694
4695
4696 } elsif ($self->{state} == BOGUS_MD_STATE) {
4697 if ($self->{nc} == 0x003E) { # >
4698 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4699 !!!next-input-character;
4700 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4701 redo A;
4702 } elsif ($self->{nc} == -1) {
4703 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4704 ## Reconsume.
4705 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4706 redo A;
4707 } else {
4708 ## Stay in the state.
4709 !!!next-input-character;
4710 redo A;
4711 }
4712 } else {
4713 die "$0: $self->{state}: Unknown state";
4714 }
4715 } # A
4716
4717 die "$0: _get_next_token: unexpected case";
4718 } # _get_next_token
4719
4720 1;
4721 ## $Date: 2008/10/19 04:39:25 $
4722

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24