/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.19 - (show annotations) (download) (as text)
Sun Oct 19 07:19:00 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.18: +101 -6 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	19 Oct 2008 07:18:24 -0000
	* XML-Parser.t: Typo fixed.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	19 Oct 2008 07:18:52 -0000
	* entities-1.dat, entities-2.dat: EntityValue tests added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	19 Oct 2008 07:17:36 -0000
	* NanoDOM.pm (Entity->new): Initialize ->child_nodes as an empty
	array.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 07:18:01 -0000
	* Tokenizer.pm.src: Support for EntityValue.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.18 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN { ## Runs at compile time, so the export lists exist before the |import (':token')| call made from |Whatpm::HTML| later in this file.
6 require Exporter;
7 push our @ISA, 'Exporter'; ## Inherit |import| from Exporter.
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 ); ## Names importable on request; no @EXPORT (default export list) is set in this block.
25
26 our %EXPORT_TAGS = ( ## The |:token| tag names the same fourteen constants as @EXPORT_OK.
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 }
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types: integer codes stored in a token's |type| field. The empty prototype lets Perl inline each sub as a constant.
49
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 } ## Carries |data|.
52 sub START_TAG_TOKEN () { 3 } ## Carries |tag_name| and |attributes|.
53 sub END_TAG_TOKEN () { 4 } ## Carries |tag_name| and |attributes|.
54 sub END_OF_FILE_TOKEN () { 5 } ## Emitted when the next input character is -1.
55 sub CHARACTER_TOKEN () { 6 } ## Carries |data|.
56 sub PI_TOKEN () { 7 } ## NOTE: XML only. Carries |target| and |data|.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') } ## Import the token type constants into this package at compile time.
76
77 ## Content model flags: bits tested against $self->{content_model}.
78
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82
83 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set: neither & nor < markup
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup and limited < markup
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup and any < markup
87
88 ## Tokenizer states: values of $self->{state}, dispatched on in |_get_next_token|.
89
90 sub DATA_STATE () { 0 } # initial state set by |_initialize_tokenizer|
91 #sub ENTITY_DATA_STATE () { 1 } # disabled, value 1 unused; see the NOTE above |ENTITY_STATE|
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } # disabled, value 12 unused; see the NOTE above |ENTITY_STATE|
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
108 sub COMMENT_END_DASH_STATE () { 18 }
109 sub BOGUS_COMMENT_STATE () { 19 }
110 sub DOCTYPE_STATE () { 20 }
111 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112 sub DOCTYPE_NAME_STATE () { 22 }
113 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122 sub BOGUS_DOCTYPE_STATE () { 32 }
123 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124 sub SELF_CLOSING_START_TAG_STATE () { 34 }
125 sub CDATA_SECTION_STATE () { 35 }
126 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134 ## NOTE: "Entity data state", "entity in attribute value state", and
135 ## "consume a character reference" algorithm are jointly implemented
136 ## using the following six states:
137 sub ENTITY_STATE () { 44 } # entered on "&"; the state to resume is kept in $self->{prev_state}
138 sub ENTITY_HASH_STATE () { 45 }
139 sub NCR_NUM_STATE () { 46 }
140 sub HEXREF_X_STATE () { 47 }
141 sub HEXREF_HEX_STATE () { 48 }
142 sub ENTITY_NAME_STATE () { 49 }
143 sub PCDATA_STATE () { 50 } # "data state" in the spec; same as |DATA_STATE| but only for the PCDATA content model
144
145 ## XML-only states (more values of $self->{state}; numbering continues from the states above)
146 sub PI_STATE () { 51 } # entered from the tag open state on "<?" when |is_xml| is set
147 sub PI_TARGET_STATE () { 52 }
148 sub PI_TARGET_AFTER_STATE () { 53 }
149 sub PI_DATA_STATE () { 54 }
150 sub PI_AFTER_STATE () { 55 }
151 sub PI_DATA_AFTER_STATE () { 56 }
152 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155 sub DOCTYPE_TAG_STATE () { 60 }
156 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157 sub MD_ATTLIST_STATE () { 62 }
158 sub MD_E_STATE () { 63 }
159 sub MD_ELEMENT_STATE () { 64 }
160 sub MD_ENTITY_STATE () { 65 }
161 sub MD_NOTATION_STATE () { 66 }
162 sub DOCTYPE_MD_STATE () { 67 }
163 sub BEFORE_MD_NAME_STATE () { 68 }
164 sub MD_NAME_STATE () { 69 }
165 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172 sub ALLOWED_TOKEN_STATE () { 77 }
173 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 sub BEFORE_NDATA_STATE () { 85 }
181 sub NDATA_STATE () { 86 }
182 sub AFTER_NDATA_STATE () { 87 }
183 sub BEFORE_NOTATION_NAME_STATE () { 88 }
184 sub NOTATION_NAME_STATE () { 89 }
185 sub AFTER_NOTATION_NAME_STATE () { 90 }
186 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }
187 sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }
188 sub ENTITY_VALUE_ENTITY_STATE () { 93 }
189 sub BOGUS_MD_STATE () { 94 } # highest state number defined in this list
190
191 ## Tree constructor state constants (see Whatpm::HTML for the full
192 ## list and descriptions)
193
194 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # bit 11 (2048)
195 sub FOREIGN_EL () { 0b1_00000000000 } # also bit 11 (2048); NOTE(review): same value as |IN_FOREIGN_CONTENT_IM| -- presumably the two live in separate flag spaces; confirm against Whatpm::HTML
196
197 ## Character reference mappings: code point => replacement code point. NOTE(review): presumably consulted when resolving numeric character references; the lookup is not visible in this part of the file.
198
199 my $charref_map = {
200 0x0D => 0x000A, # CR is replaced by LF
201 0x80 => 0x20AC, # 0x80..0x9F: NOTE(review): looks like the Windows-1252 table, with U+FFFD for unassigned bytes -- confirm against the spec
202 0x81 => 0xFFFD,
203 0x82 => 0x201A,
204 0x83 => 0x0192,
205 0x84 => 0x201E,
206 0x85 => 0x2026,
207 0x86 => 0x2020,
208 0x87 => 0x2021,
209 0x88 => 0x02C6,
210 0x89 => 0x2030,
211 0x8A => 0x0160,
212 0x8B => 0x2039,
213 0x8C => 0x0152,
214 0x8D => 0xFFFD,
215 0x8E => 0x017D,
216 0x8F => 0xFFFD,
217 0x90 => 0xFFFD,
218 0x91 => 0x2018,
219 0x92 => 0x2019,
220 0x93 => 0x201C,
221 0x94 => 0x201D,
222 0x95 => 0x2022,
223 0x96 => 0x2013,
224 0x97 => 0x2014,
225 0x98 => 0x02DC,
226 0x99 => 0x2122,
227 0x9A => 0x0161,
228 0x9B => 0x203A,
229 0x9C => 0x0153,
230 0x9D => 0xFFFD,
231 0x9E => 0x017E,
232 0x9F => 0x0178,
233 }; # $charref_map
234 $charref_map->{$_} = 0xFFFD # every code point listed below maps to U+FFFD
235 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
236 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF -- the range here stops at U+FDDF; NOTE(review): Unicode noncharacters run through U+FDEF, so the upper bound presumably should be 0xFDEF -- confirm against the spec before changing
237 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
238 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
239 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
240 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
241 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
242
243 ## Implementations MUST act as if they used the state machine in the spec.
244
245 sub _initialize_tokenizer ($) { ## Resets the tokenizer fields of the object passed as the only argument.
246 my $self = shift;
247
248 ## NOTE: Fields set by |new| constructor:
249 #$self->{level}
250 #$self->{set_nc}
251 #$self->{parse_error}
252 #$self->{is_xml} (if XML)
253
254 $self->{state} = DATA_STATE; # MUST
255 $self->{s_kwd} = ''; # Data state keyword
256 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
257 #$self->{entity__value}; # initialized when used
258 #$self->{entity__match}; # initialized when used
259 $self->{content_model} = PCDATA_CONTENT_MODEL; # start in the PCDATA content model
260 undef $self->{ct}; # current token
261 undef $self->{ca}; # current attribute
262 undef $self->{last_stag_name}; # last emitted start tag name
263 #$self->{prev_state}; # initialized when used
264 delete $self->{self_closing}; # the "self-closing flag"
265 $self->{char_buffer} = '';
266 $self->{char_buffer_pos} = 0;
267 $self->{nc} = -1; # next input character; -1 is also the value tested for end of file
268 #$self->{next_nc}
269 !!!next-input-character;
270 $self->{token} = []; # queue of pending tokens; |_get_next_token| returns from it first when non-empty
271 # $self->{escape} -- not reset here
272 } # _initialize_tokenizer
273
274 ## A token has:
275 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
276 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
277 ## ->{name} (DOCTYPE_TOKEN)
278 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
279 ## ->{target} (PI_TOKEN)
280 ## ->{pubid} (DOCTYPE_TOKEN)
281 ## ->{sysid} (DOCTYPE_TOKEN)
282 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
283 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
284 ## ->{name}
285 ## ->{value}
286 ## ->{has_reference} == 1 or 0
287 ## ->{index}: Index of the attribute in a tag.
288 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
289 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
290 ## ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Index of the last attribute added (next attribute's index - 1).
291 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
292
293 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
294 ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
295 ## while the token is pushed back to the stack.
296
297 ## Emitted token MUST immediately be handled by the tree construction state.
298
299 ## Before each step, UA MAY check to see if either one of the scripts in
300 ## "list of scripts that will execute as soon as possible" or the first
301 ## script in the "list of scripts that will execute asynchronously",
302 ## has completed loading. If one has, then it MUST be executed
303 ## and removed from the list.
304
305 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
306 ## (This requirement was dropped from HTML5 spec, unfortunately.)
307
308 my $is_space = { ## Code points the tokenizer treats as white space; looked up as $is_space->{$self->{nc}}.
309 0x0009 => 1, # CHARACTER TABULATION (HT)
310 0x000A => 1, # LINE FEED (LF)
311 #0x000B => 0, # LINE TABULATION (VT) -- disabled entry, so VT is not a space character
312 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
313 #0x000D => 1, # CARRIAGE RETURN (CR) -- disabled entry; NOTE(review): presumably CR is normalized before it reaches the tokenizer -- confirm
314 0x0020 => 1, # SPACE (SP)
315 };
316
317 sub _get_next_token ($) {
318 my $self = shift;
319
320 if ($self->{self_closing}) {
321 !!!parse-error (type => 'nestc', token => $self->{ct});
322 ## NOTE: The |self_closing| flag is only set by start tag token.
323 ## In addition, when a start tag token is emitted, it is always set to
324 ## |ct|.
325 delete $self->{self_closing};
326 }
327
328 if (@{$self->{token}}) {
329 $self->{self_closing} = $self->{token}->[0]->{self_closing};
330 return shift @{$self->{token}};
331 }
332
333 A: {
334 if ($self->{state} == PCDATA_STATE) {
335 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
336
337 if ($self->{nc} == 0x0026) { # &
338 !!!cp (0.1);
339 ## NOTE: In the spec, the tokenizer is switched to the
340 ## "entity data state". In this implementation, the tokenizer
341 ## is switched to the |ENTITY_STATE|, which is an implementation
342 ## of the "consume a character reference" algorithm.
343 $self->{entity_add} = -1;
344 $self->{prev_state} = DATA_STATE;
345 $self->{state} = ENTITY_STATE;
346 !!!next-input-character;
347 redo A;
348 } elsif ($self->{nc} == 0x003C) { # <
349 !!!cp (0.2);
350 $self->{state} = TAG_OPEN_STATE;
351 !!!next-input-character;
352 redo A;
353 } elsif ($self->{nc} == -1) {
354 !!!cp (0.3);
355 !!!emit ({type => END_OF_FILE_TOKEN,
356 line => $self->{line}, column => $self->{column}});
357 last A; ## TODO: ok?
358 } else {
359 !!!cp (0.4);
360 #
361 }
362
363 # Anything else
364 my $token = {type => CHARACTER_TOKEN,
365 data => chr $self->{nc},
366 line => $self->{line}, column => $self->{column},
367 };
368 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
369
370 ## Stay in the state.
371 !!!next-input-character;
372 !!!emit ($token);
373 redo A;
374 } elsif ($self->{state} == DATA_STATE) {
375 $self->{s_kwd} = '' unless defined $self->{s_kwd};
376 if ($self->{nc} == 0x0026) { # &
377 $self->{s_kwd} = '';
378 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
379 not $self->{escape}) {
380 !!!cp (1);
381 ## NOTE: In the spec, the tokenizer is switched to the
382 ## "entity data state". In this implementation, the tokenizer
383 ## is switched to the |ENTITY_STATE|, which is an implementation
384 ## of the "consume a character reference" algorithm.
385 $self->{entity_add} = -1;
386 $self->{prev_state} = DATA_STATE;
387 $self->{state} = ENTITY_STATE;
388 !!!next-input-character;
389 redo A;
390 } else {
391 !!!cp (2);
392 #
393 }
394 } elsif ($self->{nc} == 0x002D) { # -
395 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
396 if ($self->{s_kwd} eq '<!-') {
397 !!!cp (3);
398 $self->{escape} = 1; # unless $self->{escape};
399 $self->{s_kwd} = '--';
400 #
401 } elsif ($self->{s_kwd} eq '-') {
402 !!!cp (4);
403 $self->{s_kwd} = '--';
404 #
405 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
406 !!!cp (4.1);
407 $self->{s_kwd} .= '-';
408 #
409 } else {
410 !!!cp (5);
411 $self->{s_kwd} = '-';
412 #
413 }
414 }
415
416 #
417 } elsif ($self->{nc} == 0x0021) { # !
418 if (length $self->{s_kwd}) {
419 !!!cp (5.1);
420 $self->{s_kwd} .= '!';
421 #
422 } else {
423 !!!cp (5.2);
424 #$self->{s_kwd} = '';
425 #
426 }
427 #
428 } elsif ($self->{nc} == 0x003C) { # <
429 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
430 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
431 not $self->{escape})) {
432 !!!cp (6);
433 $self->{state} = TAG_OPEN_STATE;
434 !!!next-input-character;
435 redo A;
436 } else {
437 !!!cp (7);
438 $self->{s_kwd} = '';
439 #
440 }
441 } elsif ($self->{nc} == 0x003E) { # >
442 if ($self->{escape} and
443 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
444 if ($self->{s_kwd} eq '--') {
445 !!!cp (8);
446 delete $self->{escape};
447 #
448 } else {
449 !!!cp (9);
450 #
451 }
452 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
453 !!!cp (9.1);
454 !!!parse-error (type => 'unmatched mse', ## TODO: type
455 line => $self->{line_prev},
456 column => $self->{column_prev} - 1);
457 #
458 } else {
459 !!!cp (10);
460 #
461 }
462
463 $self->{s_kwd} = '';
464 #
465 } elsif ($self->{nc} == 0x005D) { # ]
466 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
467 !!!cp (10.1);
468 $self->{s_kwd} .= ']';
469 } elsif ($self->{s_kwd} eq ']]') {
470 !!!cp (10.2);
471 #
472 } else {
473 !!!cp (10.3);
474 $self->{s_kwd} = '';
475 }
476 #
477 } elsif ($self->{nc} == -1) {
478 !!!cp (11);
479 $self->{s_kwd} = '';
480 !!!emit ({type => END_OF_FILE_TOKEN,
481 line => $self->{line}, column => $self->{column}});
482 last A; ## TODO: ok?
483 } else {
484 !!!cp (12);
485 $self->{s_kwd} = '';
486 #
487 }
488
489 # Anything else
490 my $token = {type => CHARACTER_TOKEN,
491 data => chr $self->{nc},
492 line => $self->{line}, column => $self->{column},
493 };
494 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
495 length $token->{data})) {
496 $self->{s_kwd} = '';
497 }
498
499 ## Stay in the data state.
500 if (not $self->{is_xml} and
501 $self->{content_model} == PCDATA_CONTENT_MODEL) {
502 !!!cp (13);
503 $self->{state} = PCDATA_STATE;
504 } else {
505 !!!cp (14);
506 ## Stay in the state.
507 }
508 !!!next-input-character;
509 !!!emit ($token);
510 redo A;
511 } elsif ($self->{state} == TAG_OPEN_STATE) {
512 ## XML5: "tag state".
513
514 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
515 if ($self->{nc} == 0x002F) { # /
516 !!!cp (15);
517 !!!next-input-character;
518 $self->{state} = CLOSE_TAG_OPEN_STATE;
519 redo A;
520 } elsif ($self->{nc} == 0x0021) { # !
521 !!!cp (15.1);
522 $self->{s_kwd} = $self->{escaped} ? '' : '<';
523 #
524 } else {
525 !!!cp (16);
526 $self->{s_kwd} = '';
527 #
528 }
529
530 ## reconsume
531 $self->{state} = DATA_STATE;
532 !!!emit ({type => CHARACTER_TOKEN, data => '<',
533 line => $self->{line_prev},
534 column => $self->{column_prev},
535 });
536 redo A;
537 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
538 if ($self->{nc} == 0x0021) { # !
539 !!!cp (17);
540 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
541 !!!next-input-character;
542 redo A;
543 } elsif ($self->{nc} == 0x002F) { # /
544 !!!cp (18);
545 $self->{state} = CLOSE_TAG_OPEN_STATE;
546 !!!next-input-character;
547 redo A;
548 } elsif (0x0041 <= $self->{nc} and
549 $self->{nc} <= 0x005A) { # A..Z
550 !!!cp (19);
551 $self->{ct}
552 = {type => START_TAG_TOKEN,
553 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
554 line => $self->{line_prev},
555 column => $self->{column_prev}};
556 $self->{state} = TAG_NAME_STATE;
557 !!!next-input-character;
558 redo A;
559 } elsif (0x0061 <= $self->{nc} and
560 $self->{nc} <= 0x007A) { # a..z
561 !!!cp (20);
562 $self->{ct} = {type => START_TAG_TOKEN,
563 tag_name => chr ($self->{nc}),
564 line => $self->{line_prev},
565 column => $self->{column_prev}};
566 $self->{state} = TAG_NAME_STATE;
567 !!!next-input-character;
568 redo A;
569 } elsif ($self->{nc} == 0x003E) { # >
570 !!!cp (21);
571 !!!parse-error (type => 'empty start tag',
572 line => $self->{line_prev},
573 column => $self->{column_prev});
574 $self->{state} = DATA_STATE;
575 $self->{s_kwd} = '';
576 !!!next-input-character;
577
578 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
579 line => $self->{line_prev},
580 column => $self->{column_prev},
581 });
582
583 redo A;
584 } elsif ($self->{nc} == 0x003F) { # ?
585 if ($self->{is_xml}) {
586 !!!cp (22.1);
587 $self->{state} = PI_STATE;
588 !!!next-input-character;
589 redo A;
590 } else {
591 !!!cp (22);
592 !!!parse-error (type => 'pio',
593 line => $self->{line_prev},
594 column => $self->{column_prev});
595 $self->{state} = BOGUS_COMMENT_STATE;
596 $self->{ct} = {type => COMMENT_TOKEN, data => '',
597 line => $self->{line_prev},
598 column => $self->{column_prev},
599 };
600 ## $self->{nc} is intentionally left as is
601 redo A;
602 }
603 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
604 !!!cp (23);
605 !!!parse-error (type => 'bare stago',
606 line => $self->{line_prev},
607 column => $self->{column_prev});
608 $self->{state} = DATA_STATE;
609 $self->{s_kwd} = '';
610 ## reconsume
611
612 !!!emit ({type => CHARACTER_TOKEN, data => '<',
613 line => $self->{line_prev},
614 column => $self->{column_prev},
615 });
616
617 redo A;
618 } else {
619 ## XML5: "<:" is a parse error.
620 !!!cp (23.1);
621 $self->{ct} = {type => START_TAG_TOKEN,
622 tag_name => chr ($self->{nc}),
623 line => $self->{line_prev},
624 column => $self->{column_prev}};
625 $self->{state} = TAG_NAME_STATE;
626 !!!next-input-character;
627 redo A;
628 }
629 } else {
630 die "$0: $self->{content_model} in tag open";
631 }
632 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
633 ## NOTE: The "close tag open state" in the spec is implemented as
634 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
635
636 ## XML5: "end tag state".
637
638 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
639 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
640 if (defined $self->{last_stag_name}) {
641 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
642 $self->{kwd} = '';
643 ## Reconsume.
644 redo A;
645 } else {
646 ## No start tag token has ever been emitted
647 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
648 !!!cp (28);
649 $self->{state} = DATA_STATE;
650 $self->{s_kwd} = '';
651 ## Reconsume.
652 !!!emit ({type => CHARACTER_TOKEN, data => '</',
653 line => $l, column => $c,
654 });
655 redo A;
656 }
657 }
658
659 if (0x0041 <= $self->{nc} and
660 $self->{nc} <= 0x005A) { # A..Z
661 !!!cp (29);
662 $self->{ct}
663 = {type => END_TAG_TOKEN,
664 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
665 line => $l, column => $c};
666 $self->{state} = TAG_NAME_STATE;
667 !!!next-input-character;
668 redo A;
669 } elsif (0x0061 <= $self->{nc} and
670 $self->{nc} <= 0x007A) { # a..z
671 !!!cp (30);
672 $self->{ct} = {type => END_TAG_TOKEN,
673 tag_name => chr ($self->{nc}),
674 line => $l, column => $c};
675 $self->{state} = TAG_NAME_STATE;
676 !!!next-input-character;
677 redo A;
678 } elsif ($self->{nc} == 0x003E) { # >
679 !!!parse-error (type => 'empty end tag',
680 line => $self->{line_prev}, ## "<" in "</>"
681 column => $self->{column_prev} - 1);
682 $self->{state} = DATA_STATE;
683 $self->{s_kwd} = '';
684 if ($self->{is_xml}) {
685 !!!cp (31);
686 ## XML5: No parse error.
687
688 ## NOTE: This parser raises a parse error, since it supports
689 ## XML1, not XML5.
690
691 ## NOTE: A short end tag token.
692 my $ct = {type => END_TAG_TOKEN,
693 tag_name => '',
694 line => $self->{line_prev},
695 column => $self->{column_prev} - 1,
696 };
697 !!!next-input-character;
698 !!!emit ($ct);
699 } else {
700 !!!cp (31.1);
701 !!!next-input-character;
702 }
703 redo A;
704 } elsif ($self->{nc} == -1) {
705 !!!cp (32);
706 !!!parse-error (type => 'bare etago');
707 $self->{s_kwd} = '';
708 $self->{state} = DATA_STATE;
709 # reconsume
710
711 !!!emit ({type => CHARACTER_TOKEN, data => '</',
712 line => $l, column => $c,
713 });
714
715 redo A;
716 } elsif (not $self->{is_xml} or
717 $is_space->{$self->{nc}}) {
718 !!!cp (33);
719 !!!parse-error (type => 'bogus end tag',
720 line => $self->{line_prev}, # "<" of "</"
721 column => $self->{column_prev} - 1);
722 $self->{state} = BOGUS_COMMENT_STATE;
723 $self->{ct} = {type => COMMENT_TOKEN, data => '',
724 line => $self->{line_prev}, # "<" of "</"
725 column => $self->{column_prev} - 1,
726 };
727 ## NOTE: $self->{nc} is intentionally left as is.
728 ## Although the "anything else" case of the spec not explicitly
729 ## states that the next input character is to be reconsumed,
730 ## it will be included to the |data| of the comment token
731 ## generated from the bogus end tag, as defined in the
732 ## "bogus comment state" entry.
733 redo A;
734 } else {
735 ## XML5: "</:" is a parse error.
736 !!!cp (30.1);
737 $self->{ct} = {type => END_TAG_TOKEN,
738 tag_name => chr ($self->{nc}),
739 line => $l, column => $c};
740 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
741 !!!next-input-character;
742 redo A;
743 }
744 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
745 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
746 if (length $ch) {
747 my $CH = $ch;
748 $ch =~ tr/a-z/A-Z/;
749 my $nch = chr $self->{nc};
750 if ($nch eq $ch or $nch eq $CH) {
751 !!!cp (24);
752 ## Stay in the state.
753 $self->{kwd} .= $nch;
754 !!!next-input-character;
755 redo A;
756 } else {
757 !!!cp (25);
758 $self->{state} = DATA_STATE;
759 $self->{s_kwd} = '';
760 ## Reconsume.
761 !!!emit ({type => CHARACTER_TOKEN,
762 data => '</' . $self->{kwd},
763 line => $self->{line_prev},
764 column => $self->{column_prev} - 1 - length $self->{kwd},
765 });
766 redo A;
767 }
768 } else { # after "<{tag-name}"
769 unless ($is_space->{$self->{nc}} or
770 {
771 0x003E => 1, # >
772 0x002F => 1, # /
773 -1 => 1, # EOF
774 }->{$self->{nc}}) {
775 !!!cp (26);
776 ## Reconsume.
777 $self->{state} = DATA_STATE;
778 $self->{s_kwd} = '';
779 !!!emit ({type => CHARACTER_TOKEN,
780 data => '</' . $self->{kwd},
781 line => $self->{line_prev},
782 column => $self->{column_prev} - 1 - length $self->{kwd},
783 });
784 redo A;
785 } else {
786 !!!cp (27);
787 $self->{ct}
788 = {type => END_TAG_TOKEN,
789 tag_name => $self->{last_stag_name},
790 line => $self->{line_prev},
791 column => $self->{column_prev} - 1 - length $self->{kwd}};
792 $self->{state} = TAG_NAME_STATE;
793 ## Reconsume.
794 redo A;
795 }
796 }
797 } elsif ($self->{state} == TAG_NAME_STATE) {
798 if ($is_space->{$self->{nc}}) {
799 !!!cp (34);
800 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
801 !!!next-input-character;
802 redo A;
803 } elsif ($self->{nc} == 0x003E) { # >
804 if ($self->{ct}->{type} == START_TAG_TOKEN) {
805 !!!cp (35);
806 $self->{last_stag_name} = $self->{ct}->{tag_name};
807 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
808 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
809 #if ($self->{ct}->{attributes}) {
810 # ## NOTE: This should never be reached.
811 # !!! cp (36);
812 # !!! parse-error (type => 'end tag attribute');
813 #} else {
814 !!!cp (37);
815 #}
816 } else {
817 die "$0: $self->{ct}->{type}: Unknown token type";
818 }
819 $self->{state} = DATA_STATE;
820 $self->{s_kwd} = '';
821 !!!next-input-character;
822
823 !!!emit ($self->{ct}); # start tag or end tag
824
825 redo A;
826 } elsif (0x0041 <= $self->{nc} and
827 $self->{nc} <= 0x005A) { # A..Z
828 !!!cp (38);
829 $self->{ct}->{tag_name}
830 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
831 # start tag or end tag
832 ## Stay in this state
833 !!!next-input-character;
834 redo A;
835 } elsif ($self->{nc} == -1) {
836 !!!parse-error (type => 'unclosed tag');
837 if ($self->{ct}->{type} == START_TAG_TOKEN) {
838 !!!cp (39);
839 $self->{last_stag_name} = $self->{ct}->{tag_name};
840 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
841 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
842 #if ($self->{ct}->{attributes}) {
843 # ## NOTE: This state should never be reached.
844 # !!! cp (40);
845 # !!! parse-error (type => 'end tag attribute');
846 #} else {
847 !!!cp (41);
848 #}
849 } else {
850 die "$0: $self->{ct}->{type}: Unknown token type";
851 }
852 $self->{state} = DATA_STATE;
853 $self->{s_kwd} = '';
854 # reconsume
855
856 !!!emit ($self->{ct}); # start tag or end tag
857
858 redo A;
859 } elsif ($self->{nc} == 0x002F) { # /
860 !!!cp (42);
861 $self->{state} = SELF_CLOSING_START_TAG_STATE;
862 !!!next-input-character;
863 redo A;
864 } else {
865 !!!cp (44);
866 $self->{ct}->{tag_name} .= chr $self->{nc};
867 # start tag or end tag
868 ## Stay in the state
869 !!!next-input-character;
870 redo A;
871 }
872 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
873 ## XML5: "Tag attribute name before state".
874
875 if ($is_space->{$self->{nc}}) {
876 !!!cp (45);
877 ## Stay in the state
878 !!!next-input-character;
879 redo A;
880 } elsif ($self->{nc} == 0x003E) { # >
881 if ($self->{ct}->{type} == START_TAG_TOKEN) {
882 !!!cp (46);
883 $self->{last_stag_name} = $self->{ct}->{tag_name};
884 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
885 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
886 if ($self->{ct}->{attributes}) {
887 !!!cp (47);
888 !!!parse-error (type => 'end tag attribute');
889 } else {
890 !!!cp (48);
891 }
892 } else {
893 die "$0: $self->{ct}->{type}: Unknown token type";
894 }
895 $self->{state} = DATA_STATE;
896 $self->{s_kwd} = '';
897 !!!next-input-character;
898
899 !!!emit ($self->{ct}); # start tag or end tag
900
901 redo A;
902 } elsif (0x0041 <= $self->{nc} and
903 $self->{nc} <= 0x005A) { # A..Z
904 !!!cp (49);
905 $self->{ca}
906 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
907 value => '',
908 line => $self->{line}, column => $self->{column}};
909 $self->{state} = ATTRIBUTE_NAME_STATE;
910 !!!next-input-character;
911 redo A;
912 } elsif ($self->{nc} == 0x002F) { # /
913 !!!cp (50);
914 $self->{state} = SELF_CLOSING_START_TAG_STATE;
915 !!!next-input-character;
916 redo A;
917 } elsif ($self->{nc} == -1) {
918 !!!parse-error (type => 'unclosed tag');
919 if ($self->{ct}->{type} == START_TAG_TOKEN) {
920 !!!cp (52);
921 $self->{last_stag_name} = $self->{ct}->{tag_name};
922 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
923 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
924 if ($self->{ct}->{attributes}) {
925 !!!cp (53);
926 !!!parse-error (type => 'end tag attribute');
927 } else {
928 !!!cp (54);
929 }
930 } else {
931 die "$0: $self->{ct}->{type}: Unknown token type";
932 }
933 $self->{state} = DATA_STATE;
934 $self->{s_kwd} = '';
935 # reconsume
936
937 !!!emit ($self->{ct}); # start tag or end tag
938
939 redo A;
940 } else {
941 if ({
942 0x0022 => 1, # "
943 0x0027 => 1, # '
944 0x003D => 1, # =
945 }->{$self->{nc}}) {
946 !!!cp (55);
947 ## XML5: Not a parse error.
948 !!!parse-error (type => 'bad attribute name');
949 } else {
950 !!!cp (56);
951 ## XML5: ":" raises a parse error and is ignored.
952 }
953 $self->{ca}
954 = {name => chr ($self->{nc}),
955 value => '',
956 line => $self->{line}, column => $self->{column}};
957 $self->{state} = ATTRIBUTE_NAME_STATE;
958 !!!next-input-character;
959 redo A;
960 }
961 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
962 ## XML5: "Tag attribute name state".
963
964 my $before_leave = sub {
965 if (exists $self->{ct}->{attributes} # start tag or end tag
966 ->{$self->{ca}->{name}}) { # MUST
967 !!!cp (57);
968 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
969 ## Discard $self->{ca} # MUST
970 } else {
971 !!!cp (58);
972 $self->{ct}->{attributes}->{$self->{ca}->{name}}
973 = $self->{ca};
974 $self->{ca}->{index} = ++$self->{ct}->{last_index};
975 }
976 }; # $before_leave
977
978 if ($is_space->{$self->{nc}}) {
979 !!!cp (59);
980 $before_leave->();
981 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
982 !!!next-input-character;
983 redo A;
984 } elsif ($self->{nc} == 0x003D) { # =
985 !!!cp (60);
986 $before_leave->();
987 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
988 !!!next-input-character;
989 redo A;
990 } elsif ($self->{nc} == 0x003E) { # >
991 if ($self->{is_xml}) {
992 !!!cp (60.1);
993 ## XML5: Not a parse error.
994 !!!parse-error (type => 'no attr value'); ## TODO: type
995 } else {
996 !!!cp (60.2);
997 }
998
999 $before_leave->();
1000 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1001 !!!cp (61);
1002 $self->{last_stag_name} = $self->{ct}->{tag_name};
1003 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1004 !!!cp (62);
1005 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1006 if ($self->{ct}->{attributes}) {
1007 !!!parse-error (type => 'end tag attribute');
1008 }
1009 } else {
1010 die "$0: $self->{ct}->{type}: Unknown token type";
1011 }
1012 $self->{state} = DATA_STATE;
1013 $self->{s_kwd} = '';
1014 !!!next-input-character;
1015
1016 !!!emit ($self->{ct}); # start tag or end tag
1017
1018 redo A;
1019 } elsif (0x0041 <= $self->{nc} and
1020 $self->{nc} <= 0x005A) { # A..Z
1021 !!!cp (63);
1022 $self->{ca}->{name}
1023 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1024 ## Stay in the state
1025 !!!next-input-character;
1026 redo A;
1027 } elsif ($self->{nc} == 0x002F) { # /
1028 if ($self->{is_xml}) {
1029 !!!cp (64);
1030 ## XML5: Not a parse error.
1031 !!!parse-error (type => 'no attr value'); ## TODO: type
1032 } else {
1033 !!!cp (64.1);
1034 }
1035
1036 $before_leave->();
1037 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1038 !!!next-input-character;
1039 redo A;
1040 } elsif ($self->{nc} == -1) {
1041 !!!parse-error (type => 'unclosed tag');
1042 $before_leave->();
1043 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1044 !!!cp (66);
1045 $self->{last_stag_name} = $self->{ct}->{tag_name};
1046 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1047 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1048 if ($self->{ct}->{attributes}) {
1049 !!!cp (67);
1050 !!!parse-error (type => 'end tag attribute');
1051 } else {
1052 ## NOTE: This state should never be reached.
1053 !!!cp (68);
1054 }
1055 } else {
1056 die "$0: $self->{ct}->{type}: Unknown token type";
1057 }
1058 $self->{state} = DATA_STATE;
1059 $self->{s_kwd} = '';
1060 # reconsume
1061
1062 !!!emit ($self->{ct}); # start tag or end tag
1063
1064 redo A;
1065 } else {
1066 if ($self->{nc} == 0x0022 or # "
1067 $self->{nc} == 0x0027) { # '
1068 !!!cp (69);
1069 ## XML5: Not a parse error.
1070 !!!parse-error (type => 'bad attribute name');
1071 } else {
1072 !!!cp (70);
1073 }
1074 $self->{ca}->{name} .= chr ($self->{nc});
1075 ## Stay in the state
1076 !!!next-input-character;
1077 redo A;
1078 }
1079 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1080 ## XML5: "Tag attribute name after state".
1081
1082 if ($is_space->{$self->{nc}}) {
1083 !!!cp (71);
1084 ## Stay in the state
1085 !!!next-input-character;
1086 redo A;
1087 } elsif ($self->{nc} == 0x003D) { # =
1088 !!!cp (72);
1089 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1090 !!!next-input-character;
1091 redo A;
1092 } elsif ($self->{nc} == 0x003E) { # >
1093 if ($self->{is_xml}) {
1094 !!!cp (72.1);
1095 ## XML5: Not a parse error.
1096 !!!parse-error (type => 'no attr value'); ## TODO: type
1097 } else {
1098 !!!cp (72.2);
1099 }
1100
1101 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1102 !!!cp (73);
1103 $self->{last_stag_name} = $self->{ct}->{tag_name};
1104 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1105 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1106 if ($self->{ct}->{attributes}) {
1107 !!!cp (74);
1108 !!!parse-error (type => 'end tag attribute');
1109 } else {
1110 ## NOTE: This state should never be reached.
1111 !!!cp (75);
1112 }
1113 } else {
1114 die "$0: $self->{ct}->{type}: Unknown token type";
1115 }
1116 $self->{state} = DATA_STATE;
1117 $self->{s_kwd} = '';
1118 !!!next-input-character;
1119
1120 !!!emit ($self->{ct}); # start tag or end tag
1121
1122 redo A;
1123 } elsif (0x0041 <= $self->{nc} and
1124 $self->{nc} <= 0x005A) { # A..Z
1125 !!!cp (76);
1126 $self->{ca}
1127 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1128 value => '',
1129 line => $self->{line}, column => $self->{column}};
1130 $self->{state} = ATTRIBUTE_NAME_STATE;
1131 !!!next-input-character;
1132 redo A;
1133 } elsif ($self->{nc} == 0x002F) { # /
1134 if ($self->{is_xml}) {
1135 !!!cp (77);
1136 ## XML5: Not a parse error.
1137 !!!parse-error (type => 'no attr value'); ## TODO: type
1138 } else {
1139 !!!cp (77.1);
1140 }
1141
1142 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1143 !!!next-input-character;
1144 redo A;
1145 } elsif ($self->{nc} == -1) {
1146 !!!parse-error (type => 'unclosed tag');
1147 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1148 !!!cp (79);
1149 $self->{last_stag_name} = $self->{ct}->{tag_name};
1150 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1151 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1152 if ($self->{ct}->{attributes}) {
1153 !!!cp (80);
1154 !!!parse-error (type => 'end tag attribute');
1155 } else {
1156 ## NOTE: This state should never be reached.
1157 !!!cp (81);
1158 }
1159 } else {
1160 die "$0: $self->{ct}->{type}: Unknown token type";
1161 }
1162 $self->{s_kwd} = '';
1163 $self->{state} = DATA_STATE;
1164 # reconsume
1165
1166 !!!emit ($self->{ct}); # start tag or end tag
1167
1168 redo A;
1169 } else {
1170 if ($self->{is_xml}) {
1171 !!!cp (78.1);
1172 ## XML5: Not a parse error.
1173 !!!parse-error (type => 'no attr value'); ## TODO: type
1174 } else {
1175 !!!cp (78.2);
1176 }
1177
1178 if ($self->{nc} == 0x0022 or # "
1179 $self->{nc} == 0x0027) { # '
1180 !!!cp (78);
1181 ## XML5: Not a parse error.
1182 !!!parse-error (type => 'bad attribute name');
1183 } else {
1184 !!!cp (82);
1185 }
1186 $self->{ca}
1187 = {name => chr ($self->{nc}),
1188 value => '',
1189 line => $self->{line}, column => $self->{column}};
1190 $self->{state} = ATTRIBUTE_NAME_STATE;
1191 !!!next-input-character;
1192 redo A;
1193 }
1194 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1195 ## XML5: "Tag attribute value before state".
1196
1197 if ($is_space->{$self->{nc}}) {
1198 !!!cp (83);
1199 ## Stay in the state
1200 !!!next-input-character;
1201 redo A;
1202 } elsif ($self->{nc} == 0x0022) { # "
1203 !!!cp (84);
1204 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1205 !!!next-input-character;
1206 redo A;
1207 } elsif ($self->{nc} == 0x0026) { # &
1208 !!!cp (85);
1209 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1210 ## reconsume
1211 redo A;
1212 } elsif ($self->{nc} == 0x0027) { # '
1213 !!!cp (86);
1214 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1215 !!!next-input-character;
1216 redo A;
1217 } elsif ($self->{nc} == 0x003E) { # >
1218 !!!parse-error (type => 'empty unquoted attribute value');
1219 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1220 !!!cp (87);
1221 $self->{last_stag_name} = $self->{ct}->{tag_name};
1222 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1223 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1224 if ($self->{ct}->{attributes}) {
1225 !!!cp (88);
1226 !!!parse-error (type => 'end tag attribute');
1227 } else {
1228 ## NOTE: This state should never be reached.
1229 !!!cp (89);
1230 }
1231 } else {
1232 die "$0: $self->{ct}->{type}: Unknown token type";
1233 }
1234 $self->{state} = DATA_STATE;
1235 $self->{s_kwd} = '';
1236 !!!next-input-character;
1237
1238 !!!emit ($self->{ct}); # start tag or end tag
1239
1240 redo A;
1241 } elsif ($self->{nc} == -1) {
1242 !!!parse-error (type => 'unclosed tag');
1243 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1244 !!!cp (90);
1245 $self->{last_stag_name} = $self->{ct}->{tag_name};
1246 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1247 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1248 if ($self->{ct}->{attributes}) {
1249 !!!cp (91);
1250 !!!parse-error (type => 'end tag attribute');
1251 } else {
1252 ## NOTE: This state should never be reached.
1253 !!!cp (92);
1254 }
1255 } else {
1256 die "$0: $self->{ct}->{type}: Unknown token type";
1257 }
1258 $self->{state} = DATA_STATE;
1259 $self->{s_kwd} = '';
1260 ## reconsume
1261
1262 !!!emit ($self->{ct}); # start tag or end tag
1263
1264 redo A;
1265 } else {
1266 if ($self->{nc} == 0x003D) { # =
1267 !!!cp (93);
1268 ## XML5: Not a parse error.
1269 !!!parse-error (type => 'bad attribute value');
1270 } elsif ($self->{is_xml}) {
1271 !!!cp (93.1);
1272 ## XML5: No parse error.
1273 !!!parse-error (type => 'unquoted attr value'); ## TODO
1274 } else {
1275 !!!cp (94);
1276 }
1277 $self->{ca}->{value} .= chr ($self->{nc});
1278 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1279 !!!next-input-character;
1280 redo A;
1281 }
1282 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1283 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1284 ## ATTLIST attribute value double quoted state".
1285
1286 if ($self->{nc} == 0x0022) { # "
1287 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1288 !!!cp (95.1);
1289 ## XML5: "DOCTYPE ATTLIST name after state".
1290 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1291 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1292 } else {
1293 !!!cp (95);
1294 ## XML5: "Tag attribute name before state".
1295 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1296 }
1297 !!!next-input-character;
1298 redo A;
1299 } elsif ($self->{nc} == 0x0026) { # &
1300 !!!cp (96);
1301 ## XML5: Not defined yet.
1302
1303 ## NOTE: In the spec, the tokenizer is switched to the
1304 ## "entity in attribute value state". In this implementation, the
1305 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1306 ## implementation of the "consume a character reference" algorithm.
1307 $self->{prev_state} = $self->{state};
1308 $self->{entity_add} = 0x0022; # "
1309 $self->{state} = ENTITY_STATE;
1310 !!!next-input-character;
1311 redo A;
1312 } elsif ($self->{nc} == -1) {
1313 !!!parse-error (type => 'unclosed attribute value');
1314 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1315 !!!cp (97);
1316 $self->{last_stag_name} = $self->{ct}->{tag_name};
1317
1318 $self->{state} = DATA_STATE;
1319 $self->{s_kwd} = '';
1320 ## reconsume
1321 !!!emit ($self->{ct}); # start tag
1322 redo A;
1323 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1324 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1325 if ($self->{ct}->{attributes}) {
1326 !!!cp (98);
1327 !!!parse-error (type => 'end tag attribute');
1328 } else {
1329 ## NOTE: This state should never be reached.
1330 !!!cp (99);
1331 }
1332
1333 $self->{state} = DATA_STATE;
1334 $self->{s_kwd} = '';
1335 ## reconsume
1336 !!!emit ($self->{ct}); # end tag
1337 redo A;
1338 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1339 ## XML5: No parse error above; not defined yet.
1340 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1341 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1342 ## Reconsume.
1343 !!!emit ($self->{ct}); # ATTLIST
1344 redo A;
1345 } else {
1346 die "$0: $self->{ct}->{type}: Unknown token type";
1347 }
1348 } else {
1349 ## XML5 [ATTLIST]: Not defined yet.
1350 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1351 !!!cp (100);
1352 ## XML5: Not a parse error.
1353 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1354 } else {
1355 !!!cp (100.1);
1356 }
1357 $self->{ca}->{value} .= chr ($self->{nc});
1358 $self->{read_until}->($self->{ca}->{value},
1359 q["&<],
1360 length $self->{ca}->{value});
1361
1362 ## Stay in the state
1363 !!!next-input-character;
1364 redo A;
1365 }
1366 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1367 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1368 ## ATTLIST attribute value single quoted state".
1369
1370 if ($self->{nc} == 0x0027) { # '
1371 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1372 !!!cp (101.1);
1373 ## XML5: "DOCTYPE ATTLIST name after state".
1374 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1375 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1376 } else {
1377 !!!cp (101);
1378 ## XML5: "Before attribute name state" (sic).
1379 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1380 }
1381 !!!next-input-character;
1382 redo A;
1383 } elsif ($self->{nc} == 0x0026) { # &
1384 !!!cp (102);
1385 ## XML5: Not defined yet.
1386
1387 ## NOTE: In the spec, the tokenizer is switched to the
1388 ## "entity in attribute value state". In this implementation, the
1389 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1390 ## implementation of the "consume a character reference" algorithm.
1391 $self->{entity_add} = 0x0027; # '
1392 $self->{prev_state} = $self->{state};
1393 $self->{state} = ENTITY_STATE;
1394 !!!next-input-character;
1395 redo A;
1396 } elsif ($self->{nc} == -1) {
1397 !!!parse-error (type => 'unclosed attribute value');
1398 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1399 !!!cp (103);
1400 $self->{last_stag_name} = $self->{ct}->{tag_name};
1401
1402 $self->{state} = DATA_STATE;
1403 $self->{s_kwd} = '';
1404 ## reconsume
1405 !!!emit ($self->{ct}); # start tag
1406 redo A;
1407 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1408 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1409 if ($self->{ct}->{attributes}) {
1410 !!!cp (104);
1411 !!!parse-error (type => 'end tag attribute');
1412 } else {
1413 ## NOTE: This state should never be reached.
1414 !!!cp (105);
1415 }
1416
1417 $self->{state} = DATA_STATE;
1418 $self->{s_kwd} = '';
1419 ## reconsume
1420 !!!emit ($self->{ct}); # end tag
1421 redo A;
1422 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1423 ## XML5: No parse error above; not defined yet.
1424 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1425 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1426 ## Reconsume.
1427 !!!emit ($self->{ct}); # ATTLIST
1428 redo A;
1429 } else {
1430 die "$0: $self->{ct}->{type}: Unknown token type";
1431 }
1432 } else {
1433 ## XML5 [ATTLIST]: Not defined yet.
1434 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1435 !!!cp (106);
1436 ## XML5: Not a parse error.
1437 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1438 } else {
1439 !!!cp (106.1);
1440 }
1441 $self->{ca}->{value} .= chr ($self->{nc});
1442 $self->{read_until}->($self->{ca}->{value},
1443 q['&<],
1444 length $self->{ca}->{value});
1445
1446 ## Stay in the state
1447 !!!next-input-character;
1448 redo A;
1449 }
1450 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1451 ## XML5: "Tag attribute value unquoted state".
1452
1453 if ($is_space->{$self->{nc}}) {
1454 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1455 !!!cp (107.1);
1456 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1457 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1458 } else {
1459 !!!cp (107);
1460 ## XML5: "Tag attribute name before state".
1461 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1462 }
1463 !!!next-input-character;
1464 redo A;
1465 } elsif ($self->{nc} == 0x0026) { # &
1466 !!!cp (108);
1467
1468 ## XML5: Not defined yet.
1469
1470 ## NOTE: In the spec, the tokenizer is switched to the
1471 ## "entity in attribute value state". In this implementation, the
1472 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1473 ## implementation of the "consume a character reference" algorithm.
1474 $self->{entity_add} = -1;
1475 $self->{prev_state} = $self->{state};
1476 $self->{state} = ENTITY_STATE;
1477 !!!next-input-character;
1478 redo A;
1479 } elsif ($self->{nc} == 0x003E) { # >
1480 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1481 !!!cp (109);
1482 $self->{last_stag_name} = $self->{ct}->{tag_name};
1483
1484 $self->{state} = DATA_STATE;
1485 $self->{s_kwd} = '';
1486 !!!next-input-character;
1487 !!!emit ($self->{ct}); # start tag
1488 redo A;
1489 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1490 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1491 if ($self->{ct}->{attributes}) {
1492 !!!cp (110);
1493 !!!parse-error (type => 'end tag attribute');
1494 } else {
1495 ## NOTE: This state should never be reached.
1496 !!!cp (111);
1497 }
1498
1499 $self->{state} = DATA_STATE;
1500 $self->{s_kwd} = '';
1501 !!!next-input-character;
1502 !!!emit ($self->{ct}); # end tag
1503 redo A;
1504 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1505 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1506 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1507 !!!next-input-character;
1508 !!!emit ($self->{ct}); # ATTLIST
1509 redo A;
1510 } else {
1511 die "$0: $self->{ct}->{type}: Unknown token type";
1512 }
1513 } elsif ($self->{nc} == -1) {
1514 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1515 !!!cp (112);
1516 !!!parse-error (type => 'unclosed tag');
1517 $self->{last_stag_name} = $self->{ct}->{tag_name};
1518
1519 $self->{state} = DATA_STATE;
1520 $self->{s_kwd} = '';
1521 ## reconsume
1522 !!!emit ($self->{ct}); # start tag
1523 redo A;
1524 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1525 !!!parse-error (type => 'unclosed tag');
1526 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1527 if ($self->{ct}->{attributes}) {
1528 !!!cp (113);
1529 !!!parse-error (type => 'end tag attribute');
1530 } else {
1531 ## NOTE: This state should never be reached.
1532 !!!cp (114);
1533 }
1534
1535 $self->{state} = DATA_STATE;
1536 $self->{s_kwd} = '';
1537 ## reconsume
1538 !!!emit ($self->{ct}); # end tag
1539 redo A;
1540 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1541 !!!parse-error (type => 'unclosed md'); ## TODO: type
1542 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1543 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1544 ## Reconsume.
1545 !!!emit ($self->{ct}); # ATTLIST
1546 redo A;
1547 } else {
1548 die "$0: $self->{ct}->{type}: Unknown token type";
1549 }
1550 } else {
1551 if ({
1552 0x0022 => 1, # "
1553 0x0027 => 1, # '
1554 0x003D => 1, # =
1555 }->{$self->{nc}}) {
1556 !!!cp (115);
1557 ## XML5: Not a parse error.
1558 !!!parse-error (type => 'bad attribute value');
1559 } else {
1560 !!!cp (116);
1561 }
1562 $self->{ca}->{value} .= chr ($self->{nc});
1563 $self->{read_until}->($self->{ca}->{value},
1564 q["'=& >],
1565 length $self->{ca}->{value});
1566
1567 ## Stay in the state
1568 !!!next-input-character;
1569 redo A;
1570 }
1571 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1572 if ($is_space->{$self->{nc}}) {
1573 !!!cp (118);
1574 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1575 !!!next-input-character;
1576 redo A;
1577 } elsif ($self->{nc} == 0x003E) { # >
1578 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1579 !!!cp (119);
1580 $self->{last_stag_name} = $self->{ct}->{tag_name};
1581 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1582 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1583 if ($self->{ct}->{attributes}) {
1584 !!!cp (120);
1585 !!!parse-error (type => 'end tag attribute');
1586 } else {
1587 ## NOTE: This state should never be reached.
1588 !!!cp (121);
1589 }
1590 } else {
1591 die "$0: $self->{ct}->{type}: Unknown token type";
1592 }
1593 $self->{state} = DATA_STATE;
1594 $self->{s_kwd} = '';
1595 !!!next-input-character;
1596
1597 !!!emit ($self->{ct}); # start tag or end tag
1598
1599 redo A;
1600 } elsif ($self->{nc} == 0x002F) { # /
1601 !!!cp (122);
1602 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1603 !!!next-input-character;
1604 redo A;
1605 } elsif ($self->{nc} == -1) {
1606 !!!parse-error (type => 'unclosed tag');
1607 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1608 !!!cp (122.3);
1609 $self->{last_stag_name} = $self->{ct}->{tag_name};
1610 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1611 if ($self->{ct}->{attributes}) {
1612 !!!cp (122.1);
1613 !!!parse-error (type => 'end tag attribute');
1614 } else {
1615 ## NOTE: This state should never be reached.
1616 !!!cp (122.2);
1617 }
1618 } else {
1619 die "$0: $self->{ct}->{type}: Unknown token type";
1620 }
1621 $self->{state} = DATA_STATE;
1622 $self->{s_kwd} = '';
1623 ## Reconsume.
1624 !!!emit ($self->{ct}); # start tag or end tag
1625 redo A;
1626 } else {
1627 !!!cp ('124.1');
1628 !!!parse-error (type => 'no space between attributes');
1629 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1630 ## reconsume
1631 redo A;
1632 }
1633 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1634 ## XML5: "Empty tag state".
1635
1636 if ($self->{nc} == 0x003E) { # >
1637 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1638 !!!cp ('124.2');
1639 !!!parse-error (type => 'nestc', token => $self->{ct});
1640 ## TODO: Different type than slash in start tag
1641 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1642 if ($self->{ct}->{attributes}) {
1643 !!!cp ('124.4');
1644 !!!parse-error (type => 'end tag attribute');
1645 } else {
1646 !!!cp ('124.5');
1647 }
1648 ## TODO: Test |<title></title/>|
1649 } else {
1650 !!!cp ('124.3');
1651 $self->{self_closing} = 1;
1652 }
1653
1654 $self->{state} = DATA_STATE;
1655 $self->{s_kwd} = '';
1656 !!!next-input-character;
1657
1658 !!!emit ($self->{ct}); # start tag or end tag
1659
1660 redo A;
1661 } elsif ($self->{nc} == -1) {
1662 !!!parse-error (type => 'unclosed tag');
1663 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1664 !!!cp (124.7);
1665 $self->{last_stag_name} = $self->{ct}->{tag_name};
1666 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1667 if ($self->{ct}->{attributes}) {
1668 !!!cp (124.5);
1669 !!!parse-error (type => 'end tag attribute');
1670 } else {
1671 ## NOTE: This state should never be reached.
1672 !!!cp (124.6);
1673 }
1674 } else {
1675 die "$0: $self->{ct}->{type}: Unknown token type";
1676 }
1677 ## XML5: "Tag attribute name before state".
1678 $self->{state} = DATA_STATE;
1679 $self->{s_kwd} = '';
1680 ## Reconsume.
1681 !!!emit ($self->{ct}); # start tag or end tag
1682 redo A;
1683 } else {
1684 !!!cp ('124.4');
1685 !!!parse-error (type => 'nestc');
1686 ## TODO: This error type is wrong.
1687 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1688 ## Reconsume.
1689 redo A;
1690 }
1691 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1692 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1693
1694 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1695 ## consumes characters on a one-by-one basis.
1696
1697 if ($self->{nc} == 0x003E) { # >
1698 if ($self->{in_subset}) {
1699 !!!cp (123);
1700 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1701 } else {
1702 !!!cp (124);
1703 $self->{state} = DATA_STATE;
1704 $self->{s_kwd} = '';
1705 }
1706 !!!next-input-character;
1707
1708 !!!emit ($self->{ct}); # comment
1709 redo A;
1710 } elsif ($self->{nc} == -1) {
1711 if ($self->{in_subset}) {
1712 !!!cp (125.1);
1713 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1714 } else {
1715 !!!cp (125);
1716 $self->{state} = DATA_STATE;
1717 $self->{s_kwd} = '';
1718 }
1719 ## reconsume
1720
1721 !!!emit ($self->{ct}); # comment
1722 redo A;
1723 } else {
1724 !!!cp (126);
1725 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1726 $self->{read_until}->($self->{ct}->{data},
1727 q[>],
1728 length $self->{ct}->{data});
1729
1730 ## Stay in the state.
1731 !!!next-input-character;
1732 redo A;
1733 }
1734 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1735 ## XML5: "Markup declaration state".
1736
1737 if ($self->{nc} == 0x002D) { # -
1738 !!!cp (133);
1739 $self->{state} = MD_HYPHEN_STATE;
1740 !!!next-input-character;
1741 redo A;
1742 } elsif ($self->{nc} == 0x0044 or # D
1743 $self->{nc} == 0x0064) { # d
1744 ## ASCII case-insensitive.
1745 !!!cp (130);
1746 $self->{state} = MD_DOCTYPE_STATE;
1747 $self->{kwd} = chr $self->{nc};
1748 !!!next-input-character;
1749 redo A;
1750 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1751 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1752 $self->{is_xml}) and
1753 $self->{nc} == 0x005B) { # [
1754 !!!cp (135.4);
1755 $self->{state} = MD_CDATA_STATE;
1756 $self->{kwd} = '[';
1757 !!!next-input-character;
1758 redo A;
1759 } else {
1760 !!!cp (136);
1761 }
1762
1763 !!!parse-error (type => 'bogus comment',
1764 line => $self->{line_prev},
1765 column => $self->{column_prev} - 1);
1766 ## Reconsume.
1767 $self->{state} = BOGUS_COMMENT_STATE;
1768 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1769 line => $self->{line_prev},
1770 column => $self->{column_prev} - 1,
1771 };
1772 redo A;
1773 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1774 if ($self->{nc} == 0x002D) { # -
1775 !!!cp (127);
1776 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1777 line => $self->{line_prev},
1778 column => $self->{column_prev} - 2,
1779 };
1780 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1781 !!!next-input-character;
1782 redo A;
1783 } else {
1784 !!!cp (128);
1785 !!!parse-error (type => 'bogus comment',
1786 line => $self->{line_prev},
1787 column => $self->{column_prev} - 2);
1788 $self->{state} = BOGUS_COMMENT_STATE;
1789 ## Reconsume.
1790 $self->{ct} = {type => COMMENT_TOKEN,
1791 data => '-',
1792 line => $self->{line_prev},
1793 column => $self->{column_prev} - 2,
1794 };
1795 redo A;
1796 }
1797 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1798 ## ASCII case-insensitive.
1799 if ($self->{nc} == [
1800 undef,
1801 0x004F, # O
1802 0x0043, # C
1803 0x0054, # T
1804 0x0059, # Y
1805 0x0050, # P
1806 ]->[length $self->{kwd}] or
1807 $self->{nc} == [
1808 undef,
1809 0x006F, # o
1810 0x0063, # c
1811 0x0074, # t
1812 0x0079, # y
1813 0x0070, # p
1814 ]->[length $self->{kwd}]) {
1815 !!!cp (131);
1816 ## Stay in the state.
1817 $self->{kwd} .= chr $self->{nc};
1818 !!!next-input-character;
1819 redo A;
1820 } elsif ((length $self->{kwd}) == 6 and
1821 ($self->{nc} == 0x0045 or # E
1822 $self->{nc} == 0x0065)) { # e
1823 if ($self->{is_xml} and
1824 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1825 !!!cp (129);
1826 ## XML5: case-sensitive.
1827 !!!parse-error (type => 'lowercase keyword', ## TODO
1828 text => 'DOCTYPE',
1829 line => $self->{line_prev},
1830 column => $self->{column_prev} - 5);
1831 } else {
1832 !!!cp (129.1);
1833 }
1834 $self->{state} = DOCTYPE_STATE;
1835 $self->{ct} = {type => DOCTYPE_TOKEN,
1836 quirks => 1,
1837 line => $self->{line_prev},
1838 column => $self->{column_prev} - 7,
1839 };
1840 !!!next-input-character;
1841 redo A;
1842 } else {
1843 !!!cp (132);
1844 !!!parse-error (type => 'bogus comment',
1845 line => $self->{line_prev},
1846 column => $self->{column_prev} - 1 - length $self->{kwd});
1847 $self->{state} = BOGUS_COMMENT_STATE;
1848 ## Reconsume.
1849 $self->{ct} = {type => COMMENT_TOKEN,
1850 data => $self->{kwd},
1851 line => $self->{line_prev},
1852 column => $self->{column_prev} - 1 - length $self->{kwd},
1853 };
1854 redo A;
1855 }
1856 } elsif ($self->{state} == MD_CDATA_STATE) {
1857 if ($self->{nc} == {
1858 '[' => 0x0043, # C
1859 '[C' => 0x0044, # D
1860 '[CD' => 0x0041, # A
1861 '[CDA' => 0x0054, # T
1862 '[CDAT' => 0x0041, # A
1863 }->{$self->{kwd}}) {
1864 !!!cp (135.1);
1865 ## Stay in the state.
1866 $self->{kwd} .= chr $self->{nc};
1867 !!!next-input-character;
1868 redo A;
1869 } elsif ($self->{kwd} eq '[CDATA' and
1870 $self->{nc} == 0x005B) { # [
1871 if ($self->{is_xml} and
1872 not $self->{tainted} and
1873 @{$self->{open_elements} or []} == 0) {
1874 !!!cp (135.2);
1875 !!!parse-error (type => 'cdata outside of root element',
1876 line => $self->{line_prev},
1877 column => $self->{column_prev} - 7);
1878 $self->{tainted} = 1;
1879 } else {
1880 !!!cp (135.21);
1881 }
1882
1883 $self->{ct} = {type => CHARACTER_TOKEN,
1884 data => '',
1885 line => $self->{line_prev},
1886 column => $self->{column_prev} - 7};
1887 $self->{state} = CDATA_SECTION_STATE;
1888 !!!next-input-character;
1889 redo A;
1890 } else {
1891 !!!cp (135.3);
1892 !!!parse-error (type => 'bogus comment',
1893 line => $self->{line_prev},
1894 column => $self->{column_prev} - 1 - length $self->{kwd});
1895 $self->{state} = BOGUS_COMMENT_STATE;
1896 ## Reconsume.
1897 $self->{ct} = {type => COMMENT_TOKEN,
1898 data => $self->{kwd},
1899 line => $self->{line_prev},
1900 column => $self->{column_prev} - 1 - length $self->{kwd},
1901 };
1902 redo A;
1903 }
1904 } elsif ($self->{state} == COMMENT_START_STATE) {
1905 if ($self->{nc} == 0x002D) { # -
1906 !!!cp (137);
1907 $self->{state} = COMMENT_START_DASH_STATE;
1908 !!!next-input-character;
1909 redo A;
1910 } elsif ($self->{nc} == 0x003E) { # >
1911 !!!parse-error (type => 'bogus comment');
1912 if ($self->{in_subset}) {
1913 !!!cp (138.1);
1914 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1915 } else {
1916 !!!cp (138);
1917 $self->{state} = DATA_STATE;
1918 $self->{s_kwd} = '';
1919 }
1920 !!!next-input-character;
1921
1922 !!!emit ($self->{ct}); # comment
1923
1924 redo A;
1925 } elsif ($self->{nc} == -1) {
1926 !!!parse-error (type => 'unclosed comment');
1927 if ($self->{in_subset}) {
1928 !!!cp (139.1);
1929 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1930 } else {
1931 !!!cp (139);
1932 $self->{state} = DATA_STATE;
1933 $self->{s_kwd} = '';
1934 }
1935 ## reconsume
1936
1937 !!!emit ($self->{ct}); # comment
1938
1939 redo A;
1940 } else {
1941 !!!cp (140);
1942 $self->{ct}->{data} # comment
1943 .= chr ($self->{nc});
1944 $self->{state} = COMMENT_STATE;
1945 !!!next-input-character;
1946 redo A;
1947 }
1948 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1949 if ($self->{nc} == 0x002D) { # -
1950 !!!cp (141);
1951 $self->{state} = COMMENT_END_STATE;
1952 !!!next-input-character;
1953 redo A;
1954 } elsif ($self->{nc} == 0x003E) { # >
1955 !!!parse-error (type => 'bogus comment');
1956 if ($self->{in_subset}) {
1957 !!!cp (142.1);
1958 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1959 } else {
1960 !!!cp (142);
1961 $self->{state} = DATA_STATE;
1962 $self->{s_kwd} = '';
1963 }
1964 !!!next-input-character;
1965
1966 !!!emit ($self->{ct}); # comment
1967
1968 redo A;
1969 } elsif ($self->{nc} == -1) {
1970 !!!parse-error (type => 'unclosed comment');
1971 if ($self->{in_subset}) {
1972 !!!cp (143.1);
1973 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1974 } else {
1975 !!!cp (143);
1976 $self->{state} = DATA_STATE;
1977 $self->{s_kwd} = '';
1978 }
1979 ## reconsume
1980
1981 !!!emit ($self->{ct}); # comment
1982
1983 redo A;
1984 } else {
1985 !!!cp (144);
1986 $self->{ct}->{data} # comment
1987 .= '-' . chr ($self->{nc});
1988 $self->{state} = COMMENT_STATE;
1989 !!!next-input-character;
1990 redo A;
1991 }
1992 } elsif ($self->{state} == COMMENT_STATE) {
1993 ## XML5: "Comment state" and "DOCTYPE comment state".
1994
1995 if ($self->{nc} == 0x002D) { # -
1996 !!!cp (145);
1997 $self->{state} = COMMENT_END_DASH_STATE;
1998 !!!next-input-character;
1999 redo A;
2000 } elsif ($self->{nc} == -1) {
2001 !!!parse-error (type => 'unclosed comment');
2002 if ($self->{in_subset}) {
2003 !!!cp (146.1);
2004 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2005 } else {
2006 !!!cp (146);
2007 $self->{state} = DATA_STATE;
2008 $self->{s_kwd} = '';
2009 }
2010 ## reconsume
2011
2012 !!!emit ($self->{ct}); # comment
2013
2014 redo A;
2015 } else {
2016 !!!cp (147);
2017 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2018 $self->{read_until}->($self->{ct}->{data},
2019 q[-],
2020 length $self->{ct}->{data});
2021
2022 ## Stay in the state
2023 !!!next-input-character;
2024 redo A;
2025 }
2026 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2027 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2028
2029 if ($self->{nc} == 0x002D) { # -
2030 !!!cp (148);
2031 $self->{state} = COMMENT_END_STATE;
2032 !!!next-input-character;
2033 redo A;
2034 } elsif ($self->{nc} == -1) {
2035 !!!parse-error (type => 'unclosed comment');
2036 if ($self->{in_subset}) {
2037 !!!cp (149.1);
2038 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2039 } else {
2040 !!!cp (149);
2041 $self->{state} = DATA_STATE;
2042 $self->{s_kwd} = '';
2043 }
2044 ## reconsume
2045
2046 !!!emit ($self->{ct}); # comment
2047
2048 redo A;
2049 } else {
2050 !!!cp (150);
2051 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2052 $self->{state} = COMMENT_STATE;
2053 !!!next-input-character;
2054 redo A;
2055 }
2056 } elsif ($self->{state} == COMMENT_END_STATE) {
2057 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2058
2059 if ($self->{nc} == 0x003E) { # >
2060 if ($self->{in_subset}) {
2061 !!!cp (151.1);
2062 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2063 } else {
2064 !!!cp (151);
2065 $self->{state} = DATA_STATE;
2066 $self->{s_kwd} = '';
2067 }
2068 !!!next-input-character;
2069
2070 !!!emit ($self->{ct}); # comment
2071
2072 redo A;
2073 } elsif ($self->{nc} == 0x002D) { # -
2074 !!!cp (152);
2075 ## XML5: Not a parse error.
2076 !!!parse-error (type => 'dash in comment',
2077 line => $self->{line_prev},
2078 column => $self->{column_prev});
2079 $self->{ct}->{data} .= '-'; # comment
2080 ## Stay in the state
2081 !!!next-input-character;
2082 redo A;
2083 } elsif ($self->{nc} == -1) {
2084 !!!parse-error (type => 'unclosed comment');
2085 if ($self->{in_subset}) {
2086 !!!cp (153.1);
2087 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2088 } else {
2089 !!!cp (153);
2090 $self->{state} = DATA_STATE;
2091 $self->{s_kwd} = '';
2092 }
2093 ## reconsume
2094
2095 !!!emit ($self->{ct}); # comment
2096
2097 redo A;
2098 } else {
2099 !!!cp (154);
2100 ## XML5: Not a parse error.
2101 !!!parse-error (type => 'dash in comment',
2102 line => $self->{line_prev},
2103 column => $self->{column_prev});
2104 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2105 $self->{state} = COMMENT_STATE;
2106 !!!next-input-character;
2107 redo A;
2108 }
2109 } elsif ($self->{state} == DOCTYPE_STATE) {
2110 if ($is_space->{$self->{nc}}) {
2111 !!!cp (155);
2112 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2113 !!!next-input-character;
2114 redo A;
2115 } else {
2116 !!!cp (156);
2117 ## XML5: Unless EOF, switch to the bogus comment state.
2118 !!!parse-error (type => 'no space before DOCTYPE name');
2119 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2120 ## reconsume
2121 redo A;
2122 }
2123 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2124 ## XML5: "DOCTYPE root name before state".
2125
2126 if ($is_space->{$self->{nc}}) {
2127 !!!cp (157);
2128 ## Stay in the state
2129 !!!next-input-character;
2130 redo A;
2131 } elsif ($self->{nc} == 0x003E) { # >
2132 !!!cp (158);
2133 ## XML5: No parse error.
2134 !!!parse-error (type => 'no DOCTYPE name');
2135 $self->{state} = DATA_STATE;
2136 $self->{s_kwd} = '';
2137 !!!next-input-character;
2138
2139 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2140
2141 redo A;
2142 } elsif ($self->{nc} == -1) {
2143 !!!cp (159);
2144 !!!parse-error (type => 'no DOCTYPE name');
2145 $self->{state} = DATA_STATE;
2146 $self->{s_kwd} = '';
2147 ## reconsume
2148
2149 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2150
2151 redo A;
2152 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2153 !!!cp (159.1);
2154 !!!parse-error (type => 'no DOCTYPE name');
2155 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2156 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2157 $self->{in_subset} = 1;
2158 !!!next-input-character;
2159 !!!emit ($self->{ct}); # DOCTYPE
2160 redo A;
2161 } else {
2162 !!!cp (160);
2163 $self->{ct}->{name} = chr $self->{nc};
2164 delete $self->{ct}->{quirks};
2165 $self->{state} = DOCTYPE_NAME_STATE;
2166 !!!next-input-character;
2167 redo A;
2168 }
2169 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2170 ## XML5: "DOCTYPE root name state".
2171
2172 ## ISSUE: Redundant "First," in the spec.
2173
2174 if ($is_space->{$self->{nc}}) {
2175 !!!cp (161);
2176 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2177 !!!next-input-character;
2178 redo A;
2179 } elsif ($self->{nc} == 0x003E) { # >
2180 !!!cp (162);
2181 $self->{state} = DATA_STATE;
2182 $self->{s_kwd} = '';
2183 !!!next-input-character;
2184
2185 !!!emit ($self->{ct}); # DOCTYPE
2186
2187 redo A;
2188 } elsif ($self->{nc} == -1) {
2189 !!!cp (163);
2190 !!!parse-error (type => 'unclosed DOCTYPE');
2191 $self->{state} = DATA_STATE;
2192 $self->{s_kwd} = '';
2193 ## reconsume
2194
2195 $self->{ct}->{quirks} = 1;
2196 !!!emit ($self->{ct}); # DOCTYPE
2197
2198 redo A;
2199 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2200 !!!cp (163.1);
2201 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2202 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2203 $self->{in_subset} = 1;
2204 !!!next-input-character;
2205 !!!emit ($self->{ct}); # DOCTYPE
2206 redo A;
2207 } else {
2208 !!!cp (164);
2209 $self->{ct}->{name}
2210 .= chr ($self->{nc}); # DOCTYPE
2211 ## Stay in the state
2212 !!!next-input-character;
2213 redo A;
2214 }
2215 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2216 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2217 ## state", but implemented differently.
2218
2219 if ($is_space->{$self->{nc}}) {
2220 !!!cp (165);
2221 ## Stay in the state
2222 !!!next-input-character;
2223 redo A;
2224 } elsif ($self->{nc} == 0x003E) { # >
2225 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2226 !!!cp (166);
2227 $self->{state} = DATA_STATE;
2228 $self->{s_kwd} = '';
2229 } else {
2230 !!!cp (166.1);
2231 !!!parse-error (type => 'no md def'); ## TODO: type
2232 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2233 }
2234
2235 !!!next-input-character;
2236 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2237 redo A;
2238 } elsif ($self->{nc} == -1) {
2239 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2240 !!!cp (167);
2241 !!!parse-error (type => 'unclosed DOCTYPE');
2242 $self->{state} = DATA_STATE;
2243 $self->{s_kwd} = '';
2244 $self->{ct}->{quirks} = 1;
2245 } else {
2246 !!!cp (167.12);
2247 !!!parse-error (type => 'unclosed md'); ## TODO: type
2248 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2249 }
2250
2251 ## Reconsume.
2252 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2253 redo A;
2254 } elsif ($self->{nc} == 0x0050 or # P
2255 $self->{nc} == 0x0070) { # p
2256 !!!cp (167.1);
2257 $self->{state} = PUBLIC_STATE;
2258 $self->{kwd} = chr $self->{nc};
2259 !!!next-input-character;
2260 redo A;
2261 } elsif ($self->{nc} == 0x0053 or # S
2262 $self->{nc} == 0x0073) { # s
2263 !!!cp (167.2);
2264 $self->{state} = SYSTEM_STATE;
2265 $self->{kwd} = chr $self->{nc};
2266 !!!next-input-character;
2267 redo A;
2268 } elsif ($self->{nc} == 0x0022 and # "
2269 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2270 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2271 !!!cp (167.21);
2272 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2273 $self->{ct}->{value} = ''; # ENTITY
2274 !!!next-input-character;
2275 redo A;
2276 } elsif ($self->{nc} == 0x0027 and # '
2277 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2278 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2279 !!!cp (167.22);
2280 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2281 $self->{ct}->{value} = ''; # ENTITY
2282 !!!next-input-character;
2283 redo A;
2284 } elsif ($self->{is_xml} and
2285 $self->{ct}->{type} == DOCTYPE_TOKEN and
2286 $self->{nc} == 0x005B) { # [
2287 !!!cp (167.3);
2288 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2289 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2290 $self->{in_subset} = 1;
2291 !!!next-input-character;
2292 !!!emit ($self->{ct}); # DOCTYPE
2293 redo A;
2294 } else {
2295 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2296
2297 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2298 !!!cp (180);
2299 $self->{ct}->{quirks} = 1;
2300 $self->{state} = BOGUS_DOCTYPE_STATE;
2301 } else {
2302 !!!cp (180.1);
2303 $self->{state} = BOGUS_MD_STATE;
2304 }
2305
2306 !!!next-input-character;
2307 redo A;
2308 }
2309 } elsif ($self->{state} == PUBLIC_STATE) {
2310 ## ASCII case-insensitive
2311 if ($self->{nc} == [
2312 undef,
2313 0x0055, # U
2314 0x0042, # B
2315 0x004C, # L
2316 0x0049, # I
2317 ]->[length $self->{kwd}] or
2318 $self->{nc} == [
2319 undef,
2320 0x0075, # u
2321 0x0062, # b
2322 0x006C, # l
2323 0x0069, # i
2324 ]->[length $self->{kwd}]) {
2325 !!!cp (175);
2326 ## Stay in the state.
2327 $self->{kwd} .= chr $self->{nc};
2328 !!!next-input-character;
2329 redo A;
2330 } elsif ((length $self->{kwd}) == 5 and
2331 ($self->{nc} == 0x0043 or # C
2332 $self->{nc} == 0x0063)) { # c
2333 if ($self->{is_xml} and
2334 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2335 !!!cp (168.1);
2336 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2337 text => 'PUBLIC',
2338 line => $self->{line_prev},
2339 column => $self->{column_prev} - 4);
2340 } else {
2341 !!!cp (168);
2342 }
2343 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2344 !!!next-input-character;
2345 redo A;
2346 } else {
2347 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2348 line => $self->{line_prev},
2349 column => $self->{column_prev} + 1 - length $self->{kwd});
2350 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2351 !!!cp (169);
2352 $self->{ct}->{quirks} = 1;
2353 $self->{state} = BOGUS_DOCTYPE_STATE;
2354 } else {
2355 !!!cp (169.1);
2356 $self->{state} = BOGUS_MD_STATE;
2357 }
2358 ## Reconsume.
2359 redo A;
2360 }
2361 } elsif ($self->{state} == SYSTEM_STATE) {
2362 ## ASCII case-insensitive
2363 if ($self->{nc} == [
2364 undef,
2365 0x0059, # Y
2366 0x0053, # S
2367 0x0054, # T
2368 0x0045, # E
2369 ]->[length $self->{kwd}] or
2370 $self->{nc} == [
2371 undef,
2372 0x0079, # y
2373 0x0073, # s
2374 0x0074, # t
2375 0x0065, # e
2376 ]->[length $self->{kwd}]) {
2377 !!!cp (170);
2378 ## Stay in the state.
2379 $self->{kwd} .= chr $self->{nc};
2380 !!!next-input-character;
2381 redo A;
2382 } elsif ((length $self->{kwd}) == 5 and
2383 ($self->{nc} == 0x004D or # M
2384 $self->{nc} == 0x006D)) { # m
2385 if ($self->{is_xml} and
2386 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2387 !!!cp (171.1);
2388 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2389 text => 'SYSTEM',
2390 line => $self->{line_prev},
2391 column => $self->{column_prev} - 4);
2392 } else {
2393 !!!cp (171);
2394 }
2395 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2396 !!!next-input-character;
2397 redo A;
2398 } else {
2399 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2400 line => $self->{line_prev},
2401 column => $self->{column_prev} + 1 - length $self->{kwd});
2402 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2403 !!!cp (172);
2404 $self->{ct}->{quirks} = 1;
2405 $self->{state} = BOGUS_DOCTYPE_STATE;
2406 } else {
2407 !!!cp (172.1);
2408 $self->{state} = BOGUS_MD_STATE;
2409 }
2410 ## Reconsume.
2411 redo A;
2412 }
2413 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2414 if ($is_space->{$self->{nc}}) {
2415 !!!cp (181);
2416 ## Stay in the state
2417 !!!next-input-character;
2418 redo A;
2419 } elsif ($self->{nc} eq 0x0022) { # "
2420 !!!cp (182);
2421 $self->{ct}->{pubid} = ''; # DOCTYPE
2422 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2423 !!!next-input-character;
2424 redo A;
2425 } elsif ($self->{nc} eq 0x0027) { # '
2426 !!!cp (183);
2427 $self->{ct}->{pubid} = ''; # DOCTYPE
2428 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2429 !!!next-input-character;
2430 redo A;
2431 } elsif ($self->{nc} eq 0x003E) { # >
2432 !!!parse-error (type => 'no PUBLIC literal');
2433
2434 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2435 !!!cp (184);
2436 $self->{state} = DATA_STATE;
2437 $self->{s_kwd} = '';
2438 $self->{ct}->{quirks} = 1;
2439 } else {
2440 !!!cp (184.1);
2441 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2442 }
2443
2444 !!!next-input-character;
2445 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2446 redo A;
2447 } elsif ($self->{nc} == -1) {
2448 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2449 !!!cp (185);
2450 !!!parse-error (type => 'unclosed DOCTYPE');
2451 $self->{state} = DATA_STATE;
2452 $self->{s_kwd} = '';
2453 $self->{ct}->{quirks} = 1;
2454 } else {
2455 !!!cp (185.1);
2456 !!!parse-error (type => 'unclosed md'); ## TODO: type
2457 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2458 }
2459
2460 ## reconsume
2461 !!!emit ($self->{ct}); # DOCTYPE
2462 redo A;
2463 } elsif ($self->{is_xml} and
2464 $self->{ct}->{type} == DOCTYPE_TOKEN and
2465 $self->{nc} == 0x005B) { # [
2466 !!!cp (186.1);
2467 !!!parse-error (type => 'no PUBLIC literal');
2468 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2469 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2470 $self->{in_subset} = 1;
2471 !!!next-input-character;
2472 !!!emit ($self->{ct}); # DOCTYPE
2473 redo A;
2474 } else {
2475 !!!parse-error (type => 'string after PUBLIC');
2476
2477 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2478 !!!cp (186);
2479 $self->{ct}->{quirks} = 1;
2480 $self->{state} = BOGUS_DOCTYPE_STATE;
2481 } else {
2482 !!!cp (186.2);
2483 $self->{state} = BOGUS_MD_STATE;
2484 }
2485
2486 !!!next-input-character;
2487 redo A;
2488 }
2489 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2490 if ($self->{nc} == 0x0022) { # "
2491 !!!cp (187);
2492 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2493 !!!next-input-character;
2494 redo A;
2495 } elsif ($self->{nc} == 0x003E) { # >
2496 !!!parse-error (type => 'unclosed PUBLIC literal');
2497
2498 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2499 !!!cp (188);
2500 $self->{state} = DATA_STATE;
2501 $self->{s_kwd} = '';
2502 $self->{ct}->{quirks} = 1;
2503 } else {
2504 !!!cp (188.1);
2505 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2506 }
2507
2508 !!!next-input-character;
2509 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2510 redo A;
2511 } elsif ($self->{nc} == -1) {
2512 !!!parse-error (type => 'unclosed PUBLIC literal');
2513
2514 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2515 !!!cp (189);
2516 $self->{state} = DATA_STATE;
2517 $self->{s_kwd} = '';
2518 $self->{ct}->{quirks} = 1;
2519 } else {
2520 !!!cp (189.1);
2521 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2522 }
2523
2524 ## Reconsume.
2525 !!!emit ($self->{ct}); # DOCTYPE
2526 redo A;
2527 } else {
2528 !!!cp (190);
2529 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2530 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2531 length $self->{ct}->{pubid});
2532
2533 ## Stay in the state
2534 !!!next-input-character;
2535 redo A;
2536 }
2537 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2538 if ($self->{nc} == 0x0027) { # '
2539 !!!cp (191);
2540 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2541 !!!next-input-character;
2542 redo A;
2543 } elsif ($self->{nc} == 0x003E) { # >
2544 !!!parse-error (type => 'unclosed PUBLIC literal');
2545
2546 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2547 !!!cp (192);
2548 $self->{state} = DATA_STATE;
2549 $self->{s_kwd} = '';
2550 $self->{ct}->{quirks} = 1;
2551 } else {
2552 !!!cp (192.1);
2553 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2554 }
2555
2556 !!!next-input-character;
2557 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2558 redo A;
2559 } elsif ($self->{nc} == -1) {
2560 !!!parse-error (type => 'unclosed PUBLIC literal');
2561
2562 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2563 !!!cp (193);
2564 $self->{state} = DATA_STATE;
2565 $self->{s_kwd} = '';
2566 $self->{ct}->{quirks} = 1;
2567 } else {
2568 !!!cp (193.1);
2569 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2570 }
2571
2572 ## reconsume
2573 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2574 redo A;
2575 } else {
2576 !!!cp (194);
2577 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2578 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2579 length $self->{ct}->{pubid});
2580
2581 ## Stay in the state
2582 !!!next-input-character;
2583 redo A;
2584 }
2585 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2586 if ($is_space->{$self->{nc}}) {
2587 !!!cp (195);
2588 ## Stay in the state
2589 !!!next-input-character;
2590 redo A;
2591 } elsif ($self->{nc} == 0x0022) { # "
2592 !!!cp (196);
2593 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2594 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2595 !!!next-input-character;
2596 redo A;
2597 } elsif ($self->{nc} == 0x0027) { # '
2598 !!!cp (197);
2599 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2600 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2601 !!!next-input-character;
2602 redo A;
2603 } elsif ($self->{nc} == 0x003E) { # >
2604 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2605 if ($self->{is_xml}) {
2606 !!!cp (198.1);
2607 !!!parse-error (type => 'no SYSTEM literal');
2608 } else {
2609 !!!cp (198);
2610 }
2611 $self->{state} = DATA_STATE;
2612 $self->{s_kwd} = '';
2613 } else {
2614 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2615 !!!cp (198.2);
2616 } else {
2617 !!!cp (198.3);
2618 !!!parse-error (type => 'no SYSTEM literal');
2619 }
2620 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2621 }
2622
2623 !!!next-input-character;
2624 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2625 redo A;
2626 } elsif ($self->{nc} == -1) {
2627 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2628 !!!cp (199);
2629 !!!parse-error (type => 'unclosed DOCTYPE');
2630
2631 $self->{state} = DATA_STATE;
2632 $self->{s_kwd} = '';
2633 $self->{ct}->{quirks} = 1;
2634 } else {
2635 !!!parse-error (type => 'unclosed md'); ## TODO: type
2636 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2637 }
2638
2639 ## reconsume
2640 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2641 redo A;
2642 } elsif ($self->{is_xml} and
2643 $self->{ct}->{type} == DOCTYPE_TOKEN and
2644 $self->{nc} == 0x005B) { # [
2645 !!!cp (200.1);
2646 !!!parse-error (type => 'no SYSTEM literal');
2647 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2648 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2649 $self->{in_subset} = 1;
2650 !!!next-input-character;
2651 !!!emit ($self->{ct}); # DOCTYPE
2652 redo A;
2653 } else {
2654 !!!parse-error (type => 'string after PUBLIC literal');
2655
2656 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2657 !!!cp (200);
2658 $self->{ct}->{quirks} = 1;
2659 $self->{state} = BOGUS_DOCTYPE_STATE;
2660 } else {
2661 !!!cp (200.2);
2662 $self->{state} = BOGUS_MD_STATE;
2663 }
2664
2665 !!!next-input-character;
2666 redo A;
2667 }
2668 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2669 if ($is_space->{$self->{nc}}) {
2670 !!!cp (201);
2671 ## Stay in the state
2672 !!!next-input-character;
2673 redo A;
2674 } elsif ($self->{nc} == 0x0022) { # "
2675 !!!cp (202);
2676 $self->{ct}->{sysid} = ''; # DOCTYPE
2677 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2678 !!!next-input-character;
2679 redo A;
2680 } elsif ($self->{nc} == 0x0027) { # '
2681 !!!cp (203);
2682 $self->{ct}->{sysid} = ''; # DOCTYPE
2683 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2684 !!!next-input-character;
2685 redo A;
2686 } elsif ($self->{nc} == 0x003E) { # >
2687 !!!parse-error (type => 'no SYSTEM literal');
2688 !!!next-input-character;
2689
2690 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2691 !!!cp (204);
2692 $self->{state} = DATA_STATE;
2693 $self->{s_kwd} = '';
2694 $self->{ct}->{quirks} = 1;
2695 } else {
2696 !!!cp (204.1);
2697 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2698 }
2699
2700 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2701 redo A;
2702 } elsif ($self->{nc} == -1) {
2703 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2704 !!!cp (205);
2705 !!!parse-error (type => 'unclosed DOCTYPE');
2706 $self->{state} = DATA_STATE;
2707 $self->{s_kwd} = '';
2708 $self->{ct}->{quirks} = 1;
2709 } else {
2710 !!!cp (205.1);
2711 !!!parse-error (type => 'unclosed md'); ## TODO: type
2712 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2713 }
2714
2715 ## reconsume
2716 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2717 redo A;
2718 } elsif ($self->{is_xml} and
2719 $self->{ct}->{type} == DOCTYPE_TOKEN and
2720 $self->{nc} == 0x005B) { # [
2721 !!!cp (206.1);
2722 !!!parse-error (type => 'no SYSTEM literal');
2723
2724 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2725 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2726 $self->{in_subset} = 1;
2727 !!!next-input-character;
2728 !!!emit ($self->{ct}); # DOCTYPE
2729 redo A;
2730 } else {
2731 !!!parse-error (type => 'string after SYSTEM');
2732
2733 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2734 !!!cp (206);
2735 $self->{ct}->{quirks} = 1;
2736 $self->{state} = BOGUS_DOCTYPE_STATE;
2737 } else {
2738 !!!cp (206.2);
2739 $self->{state} = BOGUS_MD_STATE;
2740 }
2741
2742 !!!next-input-character;
2743 redo A;
2744 }
2745 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2746 if ($self->{nc} == 0x0022) { # "
2747 !!!cp (207);
2748 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2749 !!!next-input-character;
2750 redo A;
2751 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2752 !!!parse-error (type => 'unclosed SYSTEM literal');
2753
2754 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2755 !!!cp (208);
2756 $self->{state} = DATA_STATE;
2757 $self->{s_kwd} = '';
2758 $self->{ct}->{quirks} = 1;
2759 } else {
2760 !!!cp (208.1);
2761 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2762 }
2763
2764 !!!next-input-character;
2765 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2766 redo A;
2767 } elsif ($self->{nc} == -1) {
2768 !!!parse-error (type => 'unclosed SYSTEM literal');
2769
2770 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2771 !!!cp (209);
2772 $self->{state} = DATA_STATE;
2773 $self->{s_kwd} = '';
2774 $self->{ct}->{quirks} = 1;
2775 } else {
2776 !!!cp (209.1);
2777 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2778 }
2779
2780 ## reconsume
2781 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2782 redo A;
2783 } else {
2784 !!!cp (210);
2785 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2786 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2787 length $self->{ct}->{sysid});
2788
2789 ## Stay in the state
2790 !!!next-input-character;
2791 redo A;
2792 }
2793 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2794 if ($self->{nc} == 0x0027) { # '
2795 !!!cp (211);
2796 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2797 !!!next-input-character;
2798 redo A;
2799 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2800 !!!cp (212);
2801 !!!parse-error (type => 'unclosed SYSTEM literal');
2802
2803 $self->{state} = DATA_STATE;
2804 $self->{s_kwd} = '';
2805 !!!next-input-character;
2806
2807 $self->{ct}->{quirks} = 1;
2808 !!!emit ($self->{ct}); # DOCTYPE
2809
2810 redo A;
2811 } elsif ($self->{nc} == -1) {
2812 !!!parse-error (type => 'unclosed SYSTEM literal');
2813
2814 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2815 !!!cp (213);
2816 $self->{state} = DATA_STATE;
2817 $self->{s_kwd} = '';
2818 $self->{ct}->{quirks} = 1;
2819 } else {
2820 !!!cp (213.1);
2821 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822 }
2823
2824 ## reconsume
2825 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2826 redo A;
2827 } else {
2828 !!!cp (214);
2829 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2830 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2831 length $self->{ct}->{sysid});
2832
2833 ## Stay in the state
2834 !!!next-input-character;
2835 redo A;
2836 }
2837 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2838 if ($is_space->{$self->{nc}}) {
2839 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2840 !!!cp (215.1);
2841 $self->{state} = BEFORE_NDATA_STATE;
2842 } else {
2843 !!!cp (215);
2844 ## Stay in the state
2845 }
2846 !!!next-input-character;
2847 redo A;
2848 } elsif ($self->{nc} == 0x003E) { # >
2849 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2850 !!!cp (216);
2851 $self->{state} = DATA_STATE;
2852 $self->{s_kwd} = '';
2853 } else {
2854 !!!cp (216.1);
2855 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2856 }
2857
2858 !!!next-input-character;
2859 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2860 redo A;
2861 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2862 ($self->{nc} == 0x004E or # N
2863 $self->{nc} == 0x006E)) { # n
2864 !!!cp (216.2);
2865 !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2866 $self->{state} = NDATA_STATE;
2867 $self->{kwd} = chr $self->{nc};
2868 !!!next-input-character;
2869 redo A;
2870 } elsif ($self->{nc} == -1) {
2871 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2872 !!!cp (217);
2873 !!!parse-error (type => 'unclosed DOCTYPE');
2874 $self->{state} = DATA_STATE;
2875 $self->{s_kwd} = '';
2876 $self->{ct}->{quirks} = 1;
2877 } else {
2878 !!!cp (217.1);
2879 !!!parse-error (type => 'unclosed md'); ## TODO: type
2880 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2881 }
2882
2883 ## reconsume
2884 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2885 redo A;
2886 } elsif ($self->{is_xml} and
2887 $self->{ct}->{type} == DOCTYPE_TOKEN and
2888 $self->{nc} == 0x005B) { # [
2889 !!!cp (218.1);
2890 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2891 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2892 $self->{in_subset} = 1;
2893 !!!next-input-character;
2894 !!!emit ($self->{ct}); # DOCTYPE
2895 redo A;
2896 } else {
2897 !!!parse-error (type => 'string after SYSTEM literal');
2898
2899 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2900 !!!cp (218);
2901 #$self->{ct}->{quirks} = 1;
2902 $self->{state} = BOGUS_DOCTYPE_STATE;
2903 } else {
2904 !!!cp (218.2);
2905 $self->{state} = BOGUS_MD_STATE;
2906 }
2907
2908 !!!next-input-character;
2909 redo A;
2910 }
2911 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2912 if ($is_space->{$self->{nc}}) {
2913 !!!cp (218.3);
2914 ## Stay in the state.
2915 !!!next-input-character;
2916 redo A;
2917 } elsif ($self->{nc} == 0x003E) { # >
2918 !!!cp (218.4);
2919 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2920 !!!next-input-character;
2921 !!!emit ($self->{ct}); # ENTITY
2922 redo A;
2923 } elsif ($self->{nc} == 0x004E or # N
2924 $self->{nc} == 0x006E) { # n
2925 !!!cp (218.5);
2926 $self->{state} = NDATA_STATE;
2927 $self->{kwd} = chr $self->{nc};
2928 !!!next-input-character;
2929 redo A;
2930 } elsif ($self->{nc} == -1) {
2931 !!!cp (218.6);
2932 !!!parse-error (type => 'unclosed md'); ## TODO: type
2933 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2934 ## reconsume
2935 !!!emit ($self->{ct}); # ENTITY
2936 redo A;
2937 } else {
2938 !!!cp (218.7);
2939 !!!parse-error (type => 'string after SYSTEM literal');
2940 $self->{state} = BOGUS_MD_STATE;
2941 !!!next-input-character;
2942 redo A;
2943 }
2944 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2945 if ($self->{nc} == 0x003E) { # >
2946 !!!cp (219);
2947 $self->{state} = DATA_STATE;
2948 $self->{s_kwd} = '';
2949 !!!next-input-character;
2950
2951 !!!emit ($self->{ct}); # DOCTYPE
2952
2953 redo A;
2954 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2955 !!!cp (220.1);
2956 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2957 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2958 $self->{in_subset} = 1;
2959 !!!next-input-character;
2960 !!!emit ($self->{ct}); # DOCTYPE
2961 redo A;
2962 } elsif ($self->{nc} == -1) {
2963 !!!cp (220);
2964 $self->{state} = DATA_STATE;
2965 $self->{s_kwd} = '';
2966 ## reconsume
2967
2968 !!!emit ($self->{ct}); # DOCTYPE
2969
2970 redo A;
2971 } else {
2972 !!!cp (221);
2973 my $s = '';
2974 $self->{read_until}->($s, q{>[}, 0);
2975
2976 ## Stay in the state
2977 !!!next-input-character;
2978 redo A;
2979 }
2980 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2981 ## NOTE: "CDATA section state" in the spec is jointly implemented
2982 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2983 ## and |CDATA_SECTION_MSE2_STATE|.
2984
2985 ## XML5: "CDATA state".
2986
2987 if ($self->{nc} == 0x005D) { # ]
2988 !!!cp (221.1);
2989 $self->{state} = CDATA_SECTION_MSE1_STATE;
2990 !!!next-input-character;
2991 redo A;
2992 } elsif ($self->{nc} == -1) {
2993 if ($self->{is_xml}) {
2994 !!!cp (221.11);
2995 !!!parse-error (type => 'no mse'); ## TODO: type
2996 } else {
2997 !!!cp (221.12);
2998 }
2999
3000 $self->{state} = DATA_STATE;
3001 $self->{s_kwd} = '';
3002 ## Reconsume.
3003 if (length $self->{ct}->{data}) { # character
3004 !!!cp (221.2);
3005 !!!emit ($self->{ct}); # character
3006 } else {
3007 !!!cp (221.3);
3008 ## No token to emit. $self->{ct} is discarded.
3009 }
3010 redo A;
3011 } else {
3012 !!!cp (221.4);
3013 $self->{ct}->{data} .= chr $self->{nc};
3014 $self->{read_until}->($self->{ct}->{data},
3015 q<]>,
3016 length $self->{ct}->{data});
3017
3018 ## Stay in the state.
3019 !!!next-input-character;
3020 redo A;
3021 }
3022
3023 ## ISSUE: "text tokens" in spec.
3024 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3025 ## XML5: "CDATA bracket state".
3026
3027 if ($self->{nc} == 0x005D) { # ]
3028 !!!cp (221.5);
3029 $self->{state} = CDATA_SECTION_MSE2_STATE;
3030 !!!next-input-character;
3031 redo A;
3032 } else {
3033 !!!cp (221.6);
3034 ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
3035 $self->{ct}->{data} .= ']';
3036 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3037 ## Reconsume.
3038 redo A;
3039 }
3040 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3041 ## XML5: "CDATA end state".
3042
3043 if ($self->{nc} == 0x003E) { # >
3044 $self->{state} = DATA_STATE;
3045 $self->{s_kwd} = '';
3046 !!!next-input-character;
3047 if (length $self->{ct}->{data}) { # character
3048 !!!cp (221.7);
3049 !!!emit ($self->{ct}); # character
3050 } else {
3051 !!!cp (221.8);
3052 ## No token to emit. $self->{ct} is discarded.
3053 }
3054 redo A;
3055 } elsif ($self->{nc} == 0x005D) { # ]
3056 !!!cp (221.9); # character
3057 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3058 ## Stay in the state.
3059 !!!next-input-character;
3060 redo A;
3061 } else {
3062 !!!cp (221.11);
3063 $self->{ct}->{data} .= ']]'; # character
3064 $self->{state} = CDATA_SECTION_STATE;
3065 ## Reconsume. ## XML5: Emit.
3066 redo A;
3067 }
3068 } elsif ($self->{state} == ENTITY_STATE) {
3069 if ($is_space->{$self->{nc}} or
3070 {
3071 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3072 $self->{entity_add} => 1,
3073 }->{$self->{nc}}) {
3074 !!!cp (1001);
3075 ## Don't consume
3076 ## No error
3077 ## Return nothing.
3078 #
3079 } elsif ($self->{nc} == 0x0023) { # #
3080 !!!cp (999);
3081 $self->{state} = ENTITY_HASH_STATE;
3082 $self->{kwd} = '#';
3083 !!!next-input-character;
3084 redo A;
3085 } elsif ((0x0041 <= $self->{nc} and
3086 $self->{nc} <= 0x005A) or # A..Z
3087 (0x0061 <= $self->{nc} and
3088 $self->{nc} <= 0x007A)) { # a..z
3089 !!!cp (998);
3090 require Whatpm::_NamedEntityList;
3091 $self->{state} = ENTITY_NAME_STATE;
3092 $self->{kwd} = chr $self->{nc};
3093 $self->{entity__value} = $self->{kwd};
3094 $self->{entity__match} = 0;
3095 !!!next-input-character;
3096 redo A;
3097 } else {
3098 !!!cp (1027);
3099 !!!parse-error (type => 'bare ero');
3100 ## Return nothing.
3101 #
3102 }
3103
3104 ## NOTE: No character is consumed by the "consume a character
3105 ## reference" algorithm. In other words, there is an "&" character
3106 ## that does not introduce a character reference, which would be
3107 ## appended to the parent element or the attribute value in later
3108 ## processing by the tokenizer.
3109
3110 if ($self->{prev_state} == DATA_STATE) {
3111 !!!cp (997);
3112 $self->{state} = $self->{prev_state};
3113 $self->{s_kwd} = '';
3114 ## Reconsume.
3115 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3116 line => $self->{line_prev},
3117 column => $self->{column_prev},
3118 });
3119 redo A;
3120 } else {
3121 !!!cp (996);
3122 $self->{ca}->{value} .= '&';
3123 $self->{state} = $self->{prev_state};
3124 $self->{s_kwd} = '';
3125 ## Reconsume.
3126 redo A;
3127 }
3128 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3129 if ($self->{nc} == 0x0078 or # x
3130 $self->{nc} == 0x0058) { # X
3131 !!!cp (995);
3132 $self->{state} = HEXREF_X_STATE;
3133 $self->{kwd} .= chr $self->{nc};
3134 !!!next-input-character;
3135 redo A;
3136 } elsif (0x0030 <= $self->{nc} and
3137 $self->{nc} <= 0x0039) { # 0..9
3138 !!!cp (994);
3139 $self->{state} = NCR_NUM_STATE;
3140 $self->{kwd} = $self->{nc} - 0x0030;
3141 !!!next-input-character;
3142 redo A;
3143 } else {
3144 !!!parse-error (type => 'bare nero',
3145 line => $self->{line_prev},
3146 column => $self->{column_prev} - 1);
3147
3148 ## NOTE: According to the spec algorithm, nothing is returned,
3149 ## and then "&#" is appended to the parent element or the attribute
3150 ## value in the later processing.
3151
3152 if ($self->{prev_state} == DATA_STATE) {
3153 !!!cp (1019);
3154 $self->{state} = $self->{prev_state};
3155 $self->{s_kwd} = '';
3156 ## Reconsume.
3157 !!!emit ({type => CHARACTER_TOKEN,
3158 data => '&#',
3159 line => $self->{line_prev},
3160 column => $self->{column_prev} - 1,
3161 });
3162 redo A;
3163 } else {
3164 !!!cp (993);
3165 $self->{ca}->{value} .= '&#';
3166 $self->{state} = $self->{prev_state};
3167 $self->{s_kwd} = '';
3168 ## Reconsume.
3169 redo A;
3170 }
3171 }
3172 } elsif ($self->{state} == NCR_NUM_STATE) {
3173 if (0x0030 <= $self->{nc} and
3174 $self->{nc} <= 0x0039) { # 0..9
3175 !!!cp (1012);
3176 $self->{kwd} *= 10;
3177 $self->{kwd} += $self->{nc} - 0x0030;
3178
3179 ## Stay in the state.
3180 !!!next-input-character;
3181 redo A;
3182 } elsif ($self->{nc} == 0x003B) { # ;
3183 !!!cp (1013);
3184 !!!next-input-character;
3185 #
3186 } else {
3187 !!!cp (1014);
3188 !!!parse-error (type => 'no refc');
3189 ## Reconsume.
3190 #
3191 }
3192
3193 my $code = $self->{kwd};
3194 my $l = $self->{line_prev};
3195 my $c = $self->{column_prev};
3196 if ($charref_map->{$code}) {
3197 !!!cp (1015);
3198 !!!parse-error (type => 'invalid character reference',
3199 text => (sprintf 'U+%04X', $code),
3200 line => $l, column => $c);
3201 $code = $charref_map->{$code};
3202 } elsif ($code > 0x10FFFF) {
3203 !!!cp (1016);
3204 !!!parse-error (type => 'invalid character reference',
3205 text => (sprintf 'U-%08X', $code),
3206 line => $l, column => $c);
3207 $code = 0xFFFD;
3208 }
3209
3210 if ($self->{prev_state} == DATA_STATE) {
3211 !!!cp (992);
3212 $self->{state} = $self->{prev_state};
3213 $self->{s_kwd} = '';
3214 ## Reconsume.
3215 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3216 has_reference => 1,
3217 line => $l, column => $c,
3218 });
3219 redo A;
3220 } else {
3221 !!!cp (991);
3222 $self->{ca}->{value} .= chr $code;
3223 $self->{ca}->{has_reference} = 1;
3224 $self->{state} = $self->{prev_state};
3225 $self->{s_kwd} = '';
3226 ## Reconsume.
3227 redo A;
3228 }
3229 } elsif ($self->{state} == HEXREF_X_STATE) {
3230 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3231 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3232 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3233 # 0..9, A..F, a..f
3234 !!!cp (990);
3235 $self->{state} = HEXREF_HEX_STATE;
3236 $self->{kwd} = 0;
3237 ## Reconsume.
3238 redo A;
3239 } else {
3240 !!!parse-error (type => 'bare hcro',
3241 line => $self->{line_prev},
3242 column => $self->{column_prev} - 2);
3243
3244 ## NOTE: According to the spec algorithm, nothing is returned,
3245 ## and then "&#" followed by "X" or "x" is appended to the parent
3246 ## element or the attribute value in the later processing.
3247
3248 if ($self->{prev_state} == DATA_STATE) {
3249 !!!cp (1005);
3250 $self->{state} = $self->{prev_state};
3251 $self->{s_kwd} = '';
3252 ## Reconsume.
3253 !!!emit ({type => CHARACTER_TOKEN,
3254 data => '&' . $self->{kwd},
3255 line => $self->{line_prev},
3256 column => $self->{column_prev} - length $self->{kwd},
3257 });
3258 redo A;
3259 } else {
3260 !!!cp (989);
3261 $self->{ca}->{value} .= '&' . $self->{kwd};
3262 $self->{state} = $self->{prev_state};
3263 $self->{s_kwd} = '';
3264 ## Reconsume.
3265 redo A;
3266 }
3267 }
3268 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3269 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3270 # 0..9
3271 !!!cp (1002);
3272 $self->{kwd} *= 0x10;
3273 $self->{kwd} += $self->{nc} - 0x0030;
3274 ## Stay in the state.
3275 !!!next-input-character;
3276 redo A;
3277 } elsif (0x0061 <= $self->{nc} and
3278 $self->{nc} <= 0x0066) { # a..f
3279 !!!cp (1003);
3280 $self->{kwd} *= 0x10;
3281 $self->{kwd} += $self->{nc} - 0x0060 + 9;
3282 ## Stay in the state.
3283 !!!next-input-character;
3284 redo A;
3285 } elsif (0x0041 <= $self->{nc} and
3286 $self->{nc} <= 0x0046) { # A..F
3287 !!!cp (1004);
3288 $self->{kwd} *= 0x10;
3289 $self->{kwd} += $self->{nc} - 0x0040 + 9;
3290 ## Stay in the state.
3291 !!!next-input-character;
3292 redo A;
3293 } elsif ($self->{nc} == 0x003B) { # ;
3294 !!!cp (1006);
3295 !!!next-input-character;
3296 #
3297 } else {
3298 !!!cp (1007);
3299 !!!parse-error (type => 'no refc',
3300 line => $self->{line},
3301 column => $self->{column});
3302 ## Reconsume.
3303 #
3304 }
3305
3306 my $code = $self->{kwd};
3307 my $l = $self->{line_prev};
3308 my $c = $self->{column_prev};
3309 if ($charref_map->{$code}) {
3310 !!!cp (1008);
3311 !!!parse-error (type => 'invalid character reference',
3312 text => (sprintf 'U+%04X', $code),
3313 line => $l, column => $c);
3314 $code = $charref_map->{$code};
3315 } elsif ($code > 0x10FFFF) {
3316 !!!cp (1009);
3317 !!!parse-error (type => 'invalid character reference',
3318 text => (sprintf 'U-%08X', $code),
3319 line => $l, column => $c);
3320 $code = 0xFFFD;
3321 }
3322
3323 if ($self->{prev_state} == DATA_STATE) {
3324 !!!cp (988);
3325 $self->{state} = $self->{prev_state};
3326 $self->{s_kwd} = '';
3327 ## Reconsume.
3328 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3329 has_reference => 1,
3330 line => $l, column => $c,
3331 });
3332 redo A;
3333 } else {
3334 !!!cp (987);
3335 $self->{ca}->{value} .= chr $code;
3336 $self->{ca}->{has_reference} = 1;
3337 $self->{state} = $self->{prev_state};
3338 $self->{s_kwd} = '';
3339 ## Reconsume.
3340 redo A;
3341 }
3342 } elsif ($self->{state} == ENTITY_NAME_STATE) { ## Accumulates a named character reference in |kwd|.
3343 if (length $self->{kwd} < 30 and
3344 ## NOTE: Some number greater than the maximum length of entity name
3345 ((0x0041 <= $self->{nc} and # A
3346 $self->{nc} <= 0x005A) or # Z
3347 (0x0061 <= $self->{nc} and # a
3348 $self->{nc} <= 0x007A) or # z
3349 (0x0030 <= $self->{nc} and # 0
3350 $self->{nc} <= 0x0039) or # 9
3351 $self->{nc} == 0x003B)) { # ;
3352 our $EntityChar;
3353 $self->{kwd} .= chr $self->{nc};
3354 if (defined $EntityChar->{$self->{kwd}}) {
3355 if ($self->{nc} == 0x003B) { # ;
3356 !!!cp (1020);
3357 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3358 $self->{entity__match} = 1; ## Positive: a name terminated by ";" matched.
3359 !!!next-input-character;
3360 #
3361 } else {
3362 !!!cp (1021);
3363 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3364 $self->{entity__match} = -1; ## Exactly -1: a name matched, no ";" seen yet.
3365 ## Stay in the state.
3366 !!!next-input-character;
3367 redo A;
3368 }
3369 } else {
3370 !!!cp (1022);
3371 $self->{entity__value} .= chr $self->{nc};
3372 $self->{entity__match} *= 2; ## 0 stays 0; -1 becomes -2, -4, ... (chars follow the last match).
3373 ## Stay in the state.
3374 !!!next-input-character;
3375 redo A;
3376 }
3377 }
3378
3379 my $data;
3380 my $has_ref;
3381 if ($self->{entity__match} > 0) { ## Matched with ";".
3382 !!!cp (1023);
3383 $data = $self->{entity__value};
3384 $has_ref = 1;
3385 #
3386 } elsif ($self->{entity__match} < 0) { ## Matched a name without ";".
3387 !!!parse-error (type => 'no refc');
3388 if ($self->{prev_state} != DATA_STATE and # in attribute
3389 $self->{entity__match} < -1) { ## Less than -1: more characters followed the matched name.
3390 !!!cp (1024);
3391 $data = '&' . $self->{kwd}; ## Keep the literal text, not the replacement.
3392 #
3393 } else {
3394 !!!cp (1025);
3395 $data = $self->{entity__value};
3396 $has_ref = 1;
3397 #
3398 }
3399 } else { ## Zero: no known name matched at all.
3400 !!!cp (1026);
3401 !!!parse-error (type => 'bare ero',
3402 line => $self->{line_prev},
3403 column => $self->{column_prev} - length $self->{kwd});
3404 $data = '&' . $self->{kwd};
3405 #
3406 }
3407
3408 ## NOTE: In these cases, when a character reference is found,
3409 ## it is consumed and a character token is returned, or, otherwise,
3410 ## nothing is consumed and returned, according to the spec algorithm.
3411 ## In this implementation, anything that has been examined by the
3412 ## tokenizer is appended to the parent element or the attribute value
3413 ## as string, either literal string when no character reference or
3414 ## entity-replaced string otherwise, in this stage, since any characters
3415 ## that would not be consumed are appended in the data state or in an
3416 ## appropriate attribute value state anyway.
3417
3418 if ($self->{prev_state} == DATA_STATE) {
3419 !!!cp (986);
3420 $self->{state} = $self->{prev_state};
3421 $self->{s_kwd} = '';
3422 ## Reconsume.
3423 !!!emit ({type => CHARACTER_TOKEN,
3424 data => $data,
3425 has_reference => $has_ref,
3426 line => $self->{line_prev},
3427 column => $self->{column_prev} + 1 - length $self->{kwd},
3428 });
3429 redo A;
3430 } else { ## Not in the data state: append to the current attribute value.
3431 !!!cp (985);
3432 $self->{ca}->{value} .= $data;
3433 $self->{ca}->{has_reference} = 1 if $has_ref;
3434 $self->{state} = $self->{prev_state};
3435 $self->{s_kwd} = '';
3436 ## Reconsume.
3437 redo A;
3438 }
3439
3440 ## XML-only states
3441
3442 } elsif ($self->{state} == PI_STATE) {
3443 ## XML5: "Pi state" and "DOCTYPE pi state".
3444
3445 if ($is_space->{$self->{nc}} or
3446 $self->{nc} == 0x003F or # ?
3447 $self->{nc} == -1) {
3448 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3449 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3450 ## "DOCTYPE pi state": Parse error, switch to the "data
3451 ## state".
3452 !!!parse-error (type => 'bare pio', ## TODO: type
3453 line => $self->{line_prev},
3454 column => $self->{column_prev}
3455 - 1 * ($self->{nc} != -1));
3456 $self->{state} = BOGUS_COMMENT_STATE;
3457 ## Reconsume.
3458 $self->{ct} = {type => COMMENT_TOKEN,
3459 data => '?',
3460 line => $self->{line_prev},
3461 column => $self->{column_prev}
3462 - 1 * ($self->{nc} != -1),
3463 };
3464 redo A;
3465 } else {
3466 ## XML5: "DOCTYPE pi state": Stay in the state.
3467 $self->{ct} = {type => PI_TOKEN,
3468 target => chr $self->{nc},
3469 data => '',
3470 line => $self->{line_prev},
3471 column => $self->{column_prev} - 1,
3472 };
3473 $self->{state} = PI_TARGET_STATE;
3474 !!!next-input-character;
3475 redo A;
3476 }
3477 } elsif ($self->{state} == PI_TARGET_STATE) {
3478 if ($is_space->{$self->{nc}}) {
3479 $self->{state} = PI_TARGET_AFTER_STATE;
3480 !!!next-input-character;
3481 redo A;
3482 } elsif ($self->{nc} == -1) {
3483 !!!parse-error (type => 'no pic'); ## TODO: type
3484 if ($self->{in_subset}) {
3485 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3486 } else {
3487 $self->{state} = DATA_STATE;
3488 $self->{s_kwd} = '';
3489 }
3490 ## Reconsume.
3491 !!!emit ($self->{ct}); # pi
3492 redo A;
3493 } elsif ($self->{nc} == 0x003F) { # ?
3494 $self->{state} = PI_AFTER_STATE;
3495 !!!next-input-character;
3496 redo A;
3497 } else {
3498 ## XML5: typo ("tag name" -> "target")
3499 $self->{ct}->{target} .= chr $self->{nc}; # pi
3500 !!!next-input-character;
3501 redo A;
3502 }
3503 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3504 if ($is_space->{$self->{nc}}) {
3505 ## Stay in the state.
3506 !!!next-input-character;
3507 redo A;
3508 } else {
3509 $self->{state} = PI_DATA_STATE;
3510 ## Reprocess.
3511 redo A;
3512 }
3513 } elsif ($self->{state} == PI_DATA_STATE) {
3514 if ($self->{nc} == 0x003F) { # ?
3515 $self->{state} = PI_DATA_AFTER_STATE;
3516 !!!next-input-character;
3517 redo A;
3518 } elsif ($self->{nc} == -1) {
3519 !!!parse-error (type => 'no pic'); ## TODO: type
3520 if ($self->{in_subset}) {
3521 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3522 } else {
3523 $self->{state} = DATA_STATE;
3524 $self->{s_kwd} = '';
3525 }
3526 ## Reprocess.
3527 !!!emit ($self->{ct}); # pi
3528 redo A;
3529 } else {
3530 $self->{ct}->{data} .= chr $self->{nc}; # pi
3531 $self->{read_until}->($self->{ct}->{data}, q[?],
3532 length $self->{ct}->{data});
3533 ## Stay in the state.
3534 !!!next-input-character;
3535 ## Reprocess.
3536 redo A;
3537 }
3538 } elsif ($self->{state} == PI_AFTER_STATE) {
3539 ## XML5: Part of "Pi after state".
3540
3541 if ($self->{nc} == 0x003E) { # >
3542 if ($self->{in_subset}) {
3543 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3544 } else {
3545 $self->{state} = DATA_STATE;
3546 $self->{s_kwd} = '';
3547 }
3548 !!!next-input-character;
3549 !!!emit ($self->{ct}); # pi
3550 redo A;
3551 } elsif ($self->{nc} == 0x003F) { # ?
3552 !!!parse-error (type => 'no s after target', ## TODO: type
3553 line => $self->{line_prev},
3554 column => $self->{column_prev}); ## XML5: no error
3555 $self->{ct}->{data} .= '?';
3556 $self->{state} = PI_DATA_AFTER_STATE;
3557 !!!next-input-character;
3558 redo A;
3559 } else {
3560 !!!parse-error (type => 'no s after target', ## TODO: type
3561 line => $self->{line_prev},
3562 column => $self->{column_prev}
3563 + 1 * ($self->{nc} == -1)); ## XML5: no error
3564 $self->{ct}->{data} .= '?'; ## XML5: not appended
3565 $self->{state} = PI_DATA_STATE;
3566 ## Reprocess.
3567 redo A;
3568 }
3569 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3570 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3571
3572 if ($self->{nc} == 0x003E) { # >
3573 if ($self->{in_subset}) {
3574 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3575 } else {
3576 $self->{state} = DATA_STATE;
3577 $self->{s_kwd} = '';
3578 }
3579 !!!next-input-character;
3580 !!!emit ($self->{ct}); # pi
3581 redo A;
3582 } elsif ($self->{nc} == 0x003F) { # ?
3583 $self->{ct}->{data} .= '?';
3584 ## Stay in the state.
3585 !!!next-input-character;
3586 redo A;
3587 } else {
3588 $self->{ct}->{data} .= '?'; ## XML5: not appended
3589 $self->{state} = PI_DATA_STATE;
3590 ## Reprocess.
3591 redo A;
3592 }
3593
3594 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3595 if ($self->{nc} == 0x003C) { # <
3596 $self->{state} = DOCTYPE_TAG_STATE;
3597 !!!next-input-character;
3598 redo A;
3599 } elsif ($self->{nc} == 0x0025) { # %
3600 ## XML5: Not defined yet.
3601
3602 ## TODO:
3603 !!!next-input-character;
3604 redo A;
3605 } elsif ($self->{nc} == 0x005D) { # ]
3606 delete $self->{in_subset};
3607 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3608 !!!next-input-character;
3609 redo A;
3610 } elsif ($is_space->{$self->{nc}}) {
3611 ## Stay in the state.
3612 !!!next-input-character;
3613 redo A;
3614 } elsif ($self->{nc} == -1) {
3615 !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3616 delete $self->{in_subset};
3617 $self->{state} = DATA_STATE;
3618 $self->{s_kwd} = '';
3619 ## Reconsume.
3620 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3621 redo A;
3622 } else {
3623 unless ($self->{internal_subset_tainted}) {
3624 ## XML5: No parse error.
3625 !!!parse-error (type => 'string in internal subset');
3626 $self->{internal_subset_tainted} = 1;
3627 }
3628 ## Stay in the state.
3629 !!!next-input-character;
3630 redo A;
3631 }
3632 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3633 if ($self->{nc} == 0x003E) { # >
3634 $self->{state} = DATA_STATE;
3635 $self->{s_kwd} = '';
3636 !!!next-input-character;
3637 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3638 redo A;
3639 } elsif ($self->{nc} == -1) {
3640 !!!parse-error (type => 'unclosed DOCTYPE');
3641 $self->{state} = DATA_STATE;
3642 $self->{s_kwd} = '';
3643 ## Reconsume.
3644 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3645 redo A;
3646 } else {
3647 ## XML5: No parse error and stay in the state.
3648 !!!parse-error (type => 'string after internal subset'); ## TODO: type
3649
3650 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3651 !!!next-input-character;
3652 redo A;
3653 }
3654 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3655 if ($self->{nc} == 0x003E) { # >
3656 $self->{state} = DATA_STATE;
3657 $self->{s_kwd} = '';
3658 !!!next-input-character;
3659 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3660 redo A;
3661 } elsif ($self->{nc} == -1) {
3662 $self->{state} = DATA_STATE;
3663 $self->{s_kwd} = '';
3664 ## Reconsume.
3665 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3666 redo A;
3667 } else {
3668 ## Stay in the state.
3669 !!!next-input-character;
3670 redo A;
3671 }
3672 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3673 if ($self->{nc} == 0x0021) { # !
3674 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3675 !!!next-input-character;
3676 redo A;
3677 } elsif ($self->{nc} == 0x003F) { # ?
3678 $self->{state} = PI_STATE;
3679 !!!next-input-character;
3680 redo A;
3681 } elsif ($self->{nc} == -1) {
3682 !!!parse-error (type => 'bare stago');
3683 $self->{state} = DATA_STATE;
3684 $self->{s_kwd} = '';
3685 ## Reconsume.
3686 redo A;
3687 } else {
3688 !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3689 line => $self->{line_prev},
3690 column => $self->{column_prev});
3691 $self->{state} = BOGUS_COMMENT_STATE;
3692 $self->{ct} = {type => COMMENT_TOKEN,
3693 data => '',
3694 }; ## NOTE: Will be discarded.
3695 !!!next-input-character;
3696 redo A;
3697 }
3698 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3699 ## XML5: "DOCTYPE markup declaration state".
3700
3701 if ($self->{nc} == 0x002D) { # -
3702 $self->{state} = MD_HYPHEN_STATE;
3703 !!!next-input-character;
3704 redo A;
3705 } elsif ($self->{nc} == 0x0045 or # E
3706 $self->{nc} == 0x0065) { # e
3707 $self->{state} = MD_E_STATE;
3708 $self->{kwd} = chr $self->{nc};
3709 !!!next-input-character;
3710 redo A;
3711 } elsif ($self->{nc} == 0x0041 or # A
3712 $self->{nc} == 0x0061) { # a
3713 $self->{state} = MD_ATTLIST_STATE;
3714 $self->{kwd} = chr $self->{nc};
3715 !!!next-input-character;
3716 redo A;
3717 } elsif ($self->{nc} == 0x004E or # N
3718 $self->{nc} == 0x006E) { # n
3719 $self->{state} = MD_NOTATION_STATE;
3720 $self->{kwd} = chr $self->{nc};
3721 !!!next-input-character;
3722 redo A;
3723 } else {
3724 #
3725 }
3726
3727 ## XML5: No parse error.
3728 !!!parse-error (type => 'bogus comment',
3729 line => $self->{line_prev},
3730 column => $self->{column_prev} - 1);
3731 ## Reconsume.
3732 $self->{state} = BOGUS_COMMENT_STATE;
3733 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3734 redo A;
3735 } elsif ($self->{state} == MD_E_STATE) {
3736 if ($self->{nc} == 0x004E or # N
3737 $self->{nc} == 0x006E) { # n
3738 $self->{state} = MD_ENTITY_STATE;
3739 $self->{kwd} .= chr $self->{nc};
3740 !!!next-input-character;
3741 redo A;
3742 } elsif ($self->{nc} == 0x004C or # L
3743 $self->{nc} == 0x006C) { # l
3744 ## XML5: <!ELEMENT> not supported.
3745 $self->{state} = MD_ELEMENT_STATE;
3746 $self->{kwd} .= chr $self->{nc};
3747 !!!next-input-character;
3748 redo A;
3749 } else {
3750 ## XML5: No parse error.
3751 !!!parse-error (type => 'bogus comment',
3752 line => $self->{line_prev},
3753 column => $self->{column_prev} - 2
3754 + 1 * ($self->{nc} == -1));
3755 ## Reconsume.
3756 $self->{state} = BOGUS_COMMENT_STATE;
3757 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3758 redo A;
3759 }
3760 } elsif ($self->{state} == MD_ENTITY_STATE) {
3761 if ($self->{nc} == [
3762 undef,
3763 undef,
3764 0x0054, # T
3765 0x0049, # I
3766 0x0054, # T
3767 ]->[length $self->{kwd}] or
3768 $self->{nc} == [
3769 undef,
3770 undef,
3771 0x0074, # t
3772 0x0069, # i
3773 0x0074, # t
3774 ]->[length $self->{kwd}]) {
3775 ## Stay in the state.
3776 $self->{kwd} .= chr $self->{nc};
3777 !!!next-input-character;
3778 redo A;
3779 } elsif ((length $self->{kwd}) == 5 and
3780 ($self->{nc} == 0x0059 or # Y
3781 $self->{nc} == 0x0079)) { # y
3782 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3783 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3784 text => 'ENTITY',
3785 line => $self->{line_prev},
3786 column => $self->{column_prev} - 4);
3787 }
3788 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3789 line => $self->{line_prev},
3790 column => $self->{column_prev} - 6};
3791 $self->{state} = DOCTYPE_MD_STATE;
3792 !!!next-input-character;
3793 redo A;
3794 } else {
3795 !!!parse-error (type => 'bogus comment',
3796 line => $self->{line_prev},
3797 column => $self->{column_prev} - 1
3798 - (length $self->{kwd})
3799 + 1 * ($self->{nc} == -1));
3800 $self->{state} = BOGUS_COMMENT_STATE;
3801 ## Reconsume.
3802 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3803 redo A;
3804 }
3805 } elsif ($self->{state} == MD_ELEMENT_STATE) { ## Matches the rest of the "ELEMENT" keyword, case-insensitively.
3806 if ($self->{nc} == [
3807 undef,
3808 undef,
3809 0x0045, # E
3810 0x004D, # M
3811 0x0045, # E
3812 0x004E, # N
3813 ]->[length $self->{kwd}] or
3814 $self->{nc} == [
3815 undef,
3816 undef,
3817 0x0065, # e
3818 0x006D, # m
3819 0x0065, # e
3820 0x006E, # n
3821 ]->[length $self->{kwd}]) {
3822 ## Stay in the state.
3823 $self->{kwd} .= chr $self->{nc};
3824 !!!next-input-character;
3825 redo A;
3826 } elsif ((length $self->{kwd}) == 6 and
3827 ($self->{nc} == 0x0054 or # T
3828 $self->{nc} == 0x0074)) { # t
3829 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3830 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3831 text => 'ELEMENT',
3832 line => $self->{line_prev},
3833 column => $self->{column_prev} - 5);
3834 }
3835 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3836 line => $self->{line_prev},
3837 column => $self->{column_prev} - 7}; ## Two columns before the keyword position used above (as ENTITY: -4 / -6); was - 6.
3838 $self->{state} = DOCTYPE_MD_STATE;
3839 !!!next-input-character;
3840 redo A;
3841 } else {
3842 !!!parse-error (type => 'bogus comment',
3843 line => $self->{line_prev},
3844 column => $self->{column_prev} - 1
3845 - (length $self->{kwd})
3846 + 1 * ($self->{nc} == -1));
3847 $self->{state} = BOGUS_COMMENT_STATE;
3848 ## Reconsume.
3849 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3850 redo A;
3851 }
3852 } elsif ($self->{state} == MD_ATTLIST_STATE) { ## Matches the rest of the "ATTLIST" keyword, case-insensitively.
3853 if ($self->{nc} == [
3854 undef,
3855 0x0054, # T
3856 0x0054, # T
3857 0x004C, # L
3858 0x0049, # I
3859 0x0053, # S
3860 ]->[length $self->{kwd}] or
3861 $self->{nc} == [
3862 undef,
3863 0x0074, # t
3864 0x0074, # t
3865 0x006C, # l
3866 0x0069, # i
3867 0x0073, # s
3868 ]->[length $self->{kwd}]) {
3869 ## Stay in the state.
3870 $self->{kwd} .= chr $self->{nc};
3871 !!!next-input-character;
3872 redo A;
3873 } elsif ((length $self->{kwd}) == 6 and
3874 ($self->{nc} == 0x0054 or # T
3875 $self->{nc} == 0x0074)) { # t
3876 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3877 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3878 text => 'ATTLIST',
3879 line => $self->{line_prev},
3880 column => $self->{column_prev} - 5);
3881 }
3882 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3883 attrdefs => [],
3884 line => $self->{line_prev},
3885 column => $self->{column_prev} - 7}; ## Two columns before the keyword position used above (as ENTITY: -4 / -6); was - 6.
3886 $self->{state} = DOCTYPE_MD_STATE;
3887 !!!next-input-character;
3888 redo A;
3889 } else {
3890 !!!parse-error (type => 'bogus comment',
3891 line => $self->{line_prev},
3892 column => $self->{column_prev} - 1
3893 - (length $self->{kwd})
3894 + 1 * ($self->{nc} == -1));
3895 $self->{state} = BOGUS_COMMENT_STATE;
3896 ## Reconsume.
3897 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3898 redo A;
3899 }
3900 } elsif ($self->{state} == MD_NOTATION_STATE) { ## Matches the rest of the "NOTATION" keyword, case-insensitively.
3901 if ($self->{nc} == [
3902 undef,
3903 0x004F, # O
3904 0x0054, # T
3905 0x0041, # A
3906 0x0054, # T
3907 0x0049, # I
3908 0x004F, # O
3909 ]->[length $self->{kwd}] or
3910 $self->{nc} == [
3911 undef,
3912 0x006F, # o
3913 0x0074, # t
3914 0x0061, # a
3915 0x0074, # t
3916 0x0069, # i
3917 0x006F, # o
3918 ]->[length $self->{kwd}]) {
3919 ## Stay in the state.
3920 $self->{kwd} .= chr $self->{nc};
3921 !!!next-input-character;
3922 redo A;
3923 } elsif ((length $self->{kwd}) == 7 and
3924 ($self->{nc} == 0x004E or # N
3925 $self->{nc} == 0x006E)) { # n
3926 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3927 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3928 text => 'NOTATION',
3929 line => $self->{line_prev},
3930 column => $self->{column_prev} - 6);
3931 }
3932 $self->{ct} = {type => NOTATION_TOKEN, name => '',
3933 line => $self->{line_prev},
3934 column => $self->{column_prev} - 8}; ## Two columns before the keyword position used above (as ENTITY: -4 / -6); was - 6.
3935 $self->{state} = DOCTYPE_MD_STATE;
3936 !!!next-input-character;
3937 redo A;
3938 } else {
3939 !!!parse-error (type => 'bogus comment',
3940 line => $self->{line_prev},
3941 column => $self->{column_prev} - 1
3942 - (length $self->{kwd})
3943 + 1 * ($self->{nc} == -1));
3944 $self->{state} = BOGUS_COMMENT_STATE;
3945 ## Reconsume.
3946 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3947 redo A;
3948 }
3949 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3950 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3951 ## "DOCTYPE NOTATION state".
3952
3953 if ($is_space->{$self->{nc}}) {
3954 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3955 $self->{state} = BEFORE_MD_NAME_STATE;
3956 !!!next-input-character;
3957 redo A;
3958 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3959 $self->{nc} == 0x0025) { # %
3960 ## XML5: Switch to the "DOCTYPE bogus comment state".
3961 !!!parse-error (type => 'no space before md name'); ## TODO: type
3962 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3963 !!!next-input-character;
3964 redo A;
3965 } elsif ($self->{nc} == -1) {
3966 !!!parse-error (type => 'unclosed md'); ## TODO: type
3967 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3968 ## Reconsume.
3969 redo A;
3970 } elsif ($self->{nc} == 0x003E) { # >
3971 ## XML5: Switch to the "DOCTYPE bogus comment state".
3972 !!!parse-error (type => 'no md name'); ## TODO: type
3973 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3974 !!!next-input-character;
3975 redo A;
3976 } else {
3977 ## XML5: Switch to the "DOCTYPE bogus comment state".
3978 !!!parse-error (type => 'no space before md name'); ## TODO: type
3979 $self->{state} = BEFORE_MD_NAME_STATE;
3980 redo A;
3981 }
3982 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3983 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3984 ## before state", "DOCTYPE ATTLIST name before state".
3985
3986 if ($is_space->{$self->{nc}}) {
3987 ## Stay in the state.
3988 !!!next-input-character;
3989 redo A;
3990 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3991 $self->{nc} == 0x0025) { # %
3992 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3993 !!!next-input-character;
3994 redo A;
3995 } elsif ($self->{nc} == 0x003E) { # >
3996 ## XML5: Same as "Anything else".
3997 !!!parse-error (type => 'no md name'); ## TODO: type
3998 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3999 !!!next-input-character;
4000 redo A;
4001 } elsif ($self->{nc} == -1) {
4002 !!!parse-error (type => 'unclosed md'); ## TODO: type
4003 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4004 ## Reconsume.
4005 redo A;
4006 } else {
4007 ## XML5: [ATTLIST] Not defined yet.
4008 $self->{ct}->{name} .= chr $self->{nc};
4009 $self->{state} = MD_NAME_STATE;
4010 !!!next-input-character;
4011 redo A;
4012 }
4013 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4014 if ($is_space->{$self->{nc}}) {
4015 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4016 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4017 $self->{state} = BEFORE_MD_NAME_STATE;
4018 !!!next-input-character;
4019 redo A;
4020 } elsif ($self->{nc} == 0x003E) { # >
4021 ## XML5: Same as "Anything else".
4022 !!!parse-error (type => 'no md name'); ## TODO: type
4023 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4024 !!!next-input-character;
4025 redo A;
4026 } elsif ($self->{nc} == -1) {
4027 !!!parse-error (type => 'unclosed md');
4028 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4029 ## Reconsume.
4030 redo A;
4031 } else {
4032 ## XML5: No parse error.
4033 !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4034 $self->{state} = BOGUS_COMMENT_STATE;
4035 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4036 ## Reconsume.
4037 redo A;
4038 }
4039 } elsif ($self->{state} == MD_NAME_STATE) {
4040 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4041
4042 if ($is_space->{$self->{nc}}) {
4043 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4044 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4045 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4046 ## TODO: ...
4047 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4048 } else { # ENTITY/NOTATION
4049 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4050 }
4051 !!!next-input-character;
4052 redo A;
4053 } elsif ($self->{nc} == 0x003E) { # >
4054 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4055 #
4056 } else {
4057 !!!parse-error (type => 'no md def'); ## TODO: type
4058 }
4059 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4060 !!!next-input-character;
4061 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4062 redo A;
4063 } elsif ($self->{nc} == -1) {
4064 ## XML5: [ATTLIST] No parse error.
4065 !!!parse-error (type => 'unclosed md');
4066 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4067 ## Reconsume.
4068 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4069 redo A;
4070 } else {
4071 ## XML5: [ATTLIST] Not defined yet.
4072 $self->{ct}->{name} .= chr $self->{nc};
4073 ## Stay in the state.
4074 !!!next-input-character;
4075 redo A;
4076 }
4077 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4078 if ($is_space->{$self->{nc}}) {
4079 ## Stay in the state.
4080 !!!next-input-character;
4081 redo A;
4082 } elsif ($self->{nc} == 0x003E) { # >
4083 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4084 !!!next-input-character;
4085 !!!emit ($self->{ct}); # ATTLIST
4086 redo A;
4087 } elsif ($self->{nc} == -1) {
4088 ## XML5: No parse error.
4089 !!!parse-error (type => 'unclosed md'); ## TODO: type
4090 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4091 !!!emit ($self->{ct});
4092 redo A;
4093 } else {
4094 ## XML5: Not defined yet.
4095 $self->{ca} = {name => chr ($self->{nc}), # attrdef
4096 tokens => [],
4097 line => $self->{line}, column => $self->{column}};
4098 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4099 !!!next-input-character;
4100 redo A;
4101 }
4102 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4103 if ($is_space->{$self->{nc}}) {
4104 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4105 !!!next-input-character;
4106 redo A;
4107 } elsif ($self->{nc} == 0x003E) { # >
4108 ## XML5: Same as "anything else".
4109 !!!parse-error (type => 'no attr type'); ## TODO: type
4110 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4111 !!!next-input-character;
4112 !!!emit ($self->{ct}); # ATTLIST
4113 redo A;
4114 } elsif ($self->{nc} == 0x0028) { # (
4115 ## XML5: Same as "anything else".
4116 !!!parse-error (type => 'no space before paren'); ## TODO: type
4117 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4118 !!!next-input-character;
4119 redo A;
4120 } elsif ($self->{nc} == -1) {
4121 ## XML5: No parse error.
4122 !!!parse-error (type => 'unclosed md'); ## TODO: type
4123 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4124 !!!next-input-character;
4125 !!!emit ($self->{ct}); # ATTLIST
4126 redo A;
4127 } else {
4128 ## XML5: Not defined yet.
4129 $self->{ca}->{name} .= chr $self->{nc};
4130 ## Stay in the state.
4131 !!!next-input-character;
4132 redo A;
4133 }
4134 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4135 if ($is_space->{$self->{nc}}) {
4136 ## Stay in the state.
4137 !!!next-input-character;
4138 redo A;
4139 } elsif ($self->{nc} == 0x003E) { # >
4140 ## XML5: Same as "anything else".
4141 !!!parse-error (type => 'no attr type'); ## TODO: type
4142 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4143 !!!next-input-character;
4144 !!!emit ($self->{ct}); # ATTLIST
4145 redo A;
4146 } elsif ($self->{nc} == 0x0028) { # (
4147 ## XML5: Same as "anything else".
4148 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4149 !!!next-input-character;
4150 redo A;
4151 } elsif ($self->{nc} == -1) {
4152 ## XML5: No parse error.
4153 !!!parse-error (type => 'unclosed md'); ## TODO: type
4154 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4155 !!!next-input-character;
4156 !!!emit ($self->{ct});
4157 redo A;
4158 } else {
4159 ## XML5: Not defined yet.
4160 $self->{ca}->{type} = chr $self->{nc};
4161 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4162 !!!next-input-character;
4163 redo A;
4164 }
4165 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4166 if ($is_space->{$self->{nc}}) {
4167 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4168 !!!next-input-character;
4169 redo A;
4170 } elsif ($self->{nc} == 0x0023) { # #
4171 ## XML5: Same as "anything else".
4172 !!!parse-error (type => 'no space before default value'); ## TODO: type
4173 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4174 !!!next-input-character;
4175 redo A;
4176 } elsif ($self->{nc} == 0x0022) { # "
4177 ## XML5: Same as "anything else".
4178 !!!parse-error (type => 'no space before default value'); ## TODO: type
4179 $self->{ca}->{value} = '';
4180 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4181 !!!next-input-character;
4182 redo A;
4183 } elsif ($self->{nc} == 0x0027) { # '
4184 ## XML5: Same as "anything else".
4185 !!!parse-error (type => 'no space before default value'); ## TODO: type
4186 $self->{ca}->{value} = '';
4187 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4188 !!!next-input-character;
4189 redo A;
4190 } elsif ($self->{nc} == 0x003E) { # >
4191 ## XML5: Same as "anything else".
4192 !!!parse-error (type => 'no attr default'); ## TODO: type
4193 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4194 !!!next-input-character;
4195 !!!emit ($self->{ct}); # ATTLIST
4196 redo A;
4197 } elsif ($self->{nc} == 0x0028) { # (
4198 ## XML5: Same as "anything else".
4199 !!!parse-error (type => 'no space before paren'); ## TODO: type
4200 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4201 !!!next-input-character;
4202 redo A;
4203 } elsif ($self->{nc} == -1) {
4204 ## XML5: No parse error.
4205 !!!parse-error (type => 'unclosed md'); ## TODO: type
4206 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4207 !!!next-input-character;
4208 !!!emit ($self->{ct});
4209 redo A;
4210 } else {
4211 ## XML5: Not defined yet.
4212 $self->{ca}->{type} .= chr $self->{nc};
4213 ## Stay in the state.
4214 !!!next-input-character;
4215 redo A;
4216 }
4217 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4218 if ($is_space->{$self->{nc}}) {
4219 ## Stay in the state.
4220 !!!next-input-character;
4221 redo A;
4222 } elsif ($self->{nc} == 0x0028) { # (
4223 ## XML5: Same as "anything else".
4224 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4225 !!!next-input-character;
4226 redo A;
4227 } elsif ($self->{nc} == 0x0023) { # #
4228 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4229 !!!next-input-character;
4230 redo A;
4231 } elsif ($self->{nc} == 0x0022) { # "
4232 ## XML5: Same as "anything else".
4233 $self->{ca}->{value} = '';
4234 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4235 !!!next-input-character;
4236 redo A;
4237 } elsif ($self->{nc} == 0x0027) { # '
4238 ## XML5: Same as "anything else".
4239 $self->{ca}->{value} = '';
4240 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4241 !!!next-input-character;
4242 redo A;
4243 } elsif ($self->{nc} == 0x003E) { # >
4244 ## XML5: Same as "anything else".
4245 !!!parse-error (type => 'no attr default'); ## TODO: type
4246 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4247 !!!next-input-character;
4248 !!!emit ($self->{ct}); # ATTLIST
4249 redo A;
4250 } elsif ($self->{nc} == -1) {
4251 ## XML5: No parse error.
4252 !!!parse-error (type => 'unclosed md'); ## TODO: type
4253 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4254 !!!next-input-character;
4255 !!!emit ($self->{ct});
4256 redo A;
4257 } else {
4258 ## XML5: Switch to the "DOCTYPE bogus comment state".
4259 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4260 $self->{ca}->{value} = '';
4261 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4262 ## Reconsume.
4263 redo A;
4264 }
4265 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4266 if ($is_space->{$self->{nc}}) {
4267 ## Stay in the state.
4268 !!!next-input-character;
4269 redo A;
4270 } elsif ($self->{nc} == 0x007C) { # |
4271 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4272 ## Stay in the state.
4273 !!!next-input-character;
4274 redo A;
4275 } elsif ($self->{nc} == 0x0029) { # )
4276 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4277 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4278 !!!next-input-character;
4279 redo A;
4280 } elsif ($self->{nc} == 0x003E) { # >
4281 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4282 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4283 !!!next-input-character;
4284 !!!emit ($self->{ct}); # ATTLIST
4285 redo A;
4286 } elsif ($self->{nc} == -1) {
4287 ## XML5: No parse error.
4288 !!!parse-error (type => 'unclosed md'); ## TODO: type
4289 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4290 !!!next-input-character;
4291 !!!emit ($self->{ct});
4292 redo A;
4293 } else {
4294 push @{$self->{ca}->{tokens}}, chr $self->{nc};
4295 $self->{state} = ALLOWED_TOKEN_STATE;
4296 !!!next-input-character;
4297 redo A;
4298 }
4299 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4300 if ($is_space->{$self->{nc}}) {
4301 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4302 !!!next-input-character;
4303 redo A;
4304 } elsif ($self->{nc} == 0x007C) { # |
4305 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4306 !!!next-input-character;
4307 redo A;
4308 } elsif ($self->{nc} == 0x0029) { # )
4309 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4310 !!!next-input-character;
4311 redo A;
4312 } elsif ($self->{nc} == 0x003E) { # >
4313 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4314 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4315 !!!next-input-character;
4316 !!!emit ($self->{ct}); # ATTLIST
4317 redo A;
4318 } elsif ($self->{nc} == -1) {
4319 ## XML5: No parse error.
4320 !!!parse-error (type => 'unclosed md'); ## TODO: type
4321 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4322 !!!next-input-character;
4323 !!!emit ($self->{ct});
4324 redo A;
4325 } else {
4326 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4327 ## Stay in the state.
4328 !!!next-input-character;
4329 redo A;
4330 }
4331 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4332 if ($is_space->{$self->{nc}}) {
4333 ## Stay in the state.
4334 !!!next-input-character;
4335 redo A;
4336 } elsif ($self->{nc} == 0x007C) { # |
4337 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4338 !!!next-input-character;
4339 redo A;
4340 } elsif ($self->{nc} == 0x0029) { # )
4341 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4342 !!!next-input-character;
4343 redo A;
4344 } elsif ($self->{nc} == 0x003E) { # >
4345 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4346 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4347 !!!next-input-character;
4348 !!!emit ($self->{ct}); # ATTLIST
4349 redo A;
4350 } elsif ($self->{nc} == -1) {
4351 ## XML5: No parse error.
4352 !!!parse-error (type => 'unclosed md'); ## TODO: type
4353 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4354 !!!next-input-character;
4355 !!!emit ($self->{ct});
4356 redo A;
4357 } else {
4358 !!!parse-error (type => 'space in allowed token', ## TODO: type
4359 line => $self->{line_prev},
4360 column => $self->{column_prev});
4361 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4362 $self->{state} = ALLOWED_TOKEN_STATE;
4363 !!!next-input-character;
4364 redo A;
4365 }
4366 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4367 if ($is_space->{$self->{nc}}) {
4368 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4369 !!!next-input-character;
4370 redo A;
4371 } elsif ($self->{nc} == 0x0023) { # #
4372 !!!parse-error (type => 'no space before default value'); ## TODO: type
4373 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4374 !!!next-input-character;
4375 redo A;
4376 } elsif ($self->{nc} == 0x0022) { # "
4377 !!!parse-error (type => 'no space before default value'); ## TODO: type
4378 $self->{ca}->{value} = '';
4379 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4380 !!!next-input-character;
4381 redo A;
4382 } elsif ($self->{nc} == 0x0027) { # '
4383 !!!parse-error (type => 'no space before default value'); ## TODO: type
4384 $self->{ca}->{value} = '';
4385 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4386 !!!next-input-character;
4387 redo A;
4388 } elsif ($self->{nc} == 0x003E) { # >
4389 !!!parse-error (type => 'no attr default'); ## TODO: type
4390 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4391 !!!next-input-character;
4392 !!!emit ($self->{ct}); # ATTLIST
4393 redo A;
4394 } elsif ($self->{nc} == -1) {
4395 !!!parse-error (type => 'unclosed md'); ## TODO: type
4396 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4397 !!!next-input-character;
4398 !!!emit ($self->{ct});
4399 redo A;
4400 } else {
4401 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4402 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4403 ## Reconsume.
4404 redo A;
4405 }
4406 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4407 if ($is_space->{$self->{nc}}) {
4408 ## Stay in the state.
4409 !!!next-input-character;
4410 redo A;
4411 } elsif ($self->{nc} == 0x0023) { # #
4412 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4413 !!!next-input-character;
4414 redo A;
4415 } elsif ($self->{nc} == 0x0022) { # "
4416 $self->{ca}->{value} = '';
4417 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4418 !!!next-input-character;
4419 redo A;
4420 } elsif ($self->{nc} == 0x0027) { # '
4421 $self->{ca}->{value} = '';
4422 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4423 !!!next-input-character;
4424 redo A;
4425 } elsif ($self->{nc} == 0x003E) { # >
4426 !!!parse-error (type => 'no attr default'); ## TODO: type
4427 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4428 !!!next-input-character;
4429 !!!emit ($self->{ct}); # ATTLIST
4430 redo A;
4431 } elsif ($self->{nc} == -1) {
4432 !!!parse-error (type => 'unclosed md'); ## TODO: type
4433 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4434 !!!next-input-character;
4435 !!!emit ($self->{ct});
4436 redo A;
4437 } else {
4438 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4439 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4440 ## Reconsume.
4441 redo A;
4442 }
4443 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4444 if ($is_space->{$self->{nc}}) {
4445 ## XML5: No parse error.
4446 !!!parse-error (type => 'no default type'); ## TODO: type
4447 $self->{state} = BOGUS_MD_STATE;
4448 ## Reconsume.
4449 redo A;
4450 } elsif ($self->{nc} == 0x0022) { # "
4451 ## XML5: Same as "anything else".
4452 $self->{ca}->{value} = '';
4453 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4454 !!!next-input-character;
4455 redo A;
4456 } elsif ($self->{nc} == 0x0027) { # '
4457 ## XML5: Same as "anything else".
4458 $self->{ca}->{value} = '';
4459 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4460 !!!next-input-character;
4461 redo A;
4462 } elsif ($self->{nc} == 0x003E) { # >
4463 ## XML5: Same as "anything else".
4464 !!!parse-error (type => 'no attr default'); ## TODO: type
4465 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4466 !!!next-input-character;
4467 !!!emit ($self->{ct}); # ATTLIST
4468 redo A;
4469 } elsif ($self->{nc} == -1) {
4470 ## XML5: No parse error.
4471 !!!parse-error (type => 'unclosed md'); ## TODO: type
4472 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4473 !!!next-input-character;
4474 !!!emit ($self->{ct});
4475 redo A;
4476 } else {
4477 $self->{ca}->{default} = chr $self->{nc};
4478 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4479 !!!next-input-character;
4480 redo A;
4481 }
4482 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4483 if ($is_space->{$self->{nc}}) {
4484 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4485 !!!next-input-character;
4486 redo A;
4487 } elsif ($self->{nc} == 0x0022) { # "
4488 ## XML5: Same as "anything else".
4489 !!!parse-error (type => 'no space before default value'); ## TODO: type
4490 $self->{ca}->{value} = '';
4491 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4492 !!!next-input-character;
4493 redo A;
4494 } elsif ($self->{nc} == 0x0027) { # '
4495 ## XML5: Same as "anything else".
4496 !!!parse-error (type => 'no space before default value'); ## TODO: type
4497 $self->{ca}->{value} = '';
4498 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4499 !!!next-input-character;
4500 redo A;
4501 } elsif ($self->{nc} == 0x003E) { # >
4502 ## XML5: Same as "anything else".
4503 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4504 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4505 !!!next-input-character;
4506 !!!emit ($self->{ct}); # ATTLIST
4507 redo A;
4508 } elsif ($self->{nc} == -1) {
4509 ## XML5: No parse error.
4510 !!!parse-error (type => 'unclosed md'); ## TODO: type
4511 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4512 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4513 !!!next-input-character;
4514 !!!emit ($self->{ct});
4515 redo A;
4516 } else {
4517 $self->{ca}->{default} .= chr $self->{nc};
4518 ## Stay in the state.
4519 !!!next-input-character;
4520 redo A;
4521 }
4522 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4523 if ($is_space->{$self->{nc}}) {
4524 ## Stay in the state.
4525 !!!next-input-character;
4526 redo A;
4527 } elsif ($self->{nc} == 0x0022) { # "
4528 $self->{ca}->{value} = '';
4529 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4530 !!!next-input-character;
4531 redo A;
4532 } elsif ($self->{nc} == 0x0027) { # '
4533 $self->{ca}->{value} = '';
4534 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4535 !!!next-input-character;
4536 redo A;
4537 } elsif ($self->{nc} == 0x003E) { # >
4538 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4539 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4540 !!!next-input-character;
4541 !!!emit ($self->{ct}); # ATTLIST
4542 redo A;
4543 } elsif ($self->{nc} == -1) {
4544 ## XML5: No parse error.
4545 !!!parse-error (type => 'unclosed md'); ## TODO: type
4546 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4547 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4548 !!!next-input-character;
4549 !!!emit ($self->{ct});
4550 redo A;
4551 } else {
4552 ## XML5: Not defined yet.
4553 if ($self->{ca}->{default} eq 'FIXED') {
4554 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4555 } else {
4556 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4557 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4558 }
4559 ## Reconsume.
4560 redo A;
4561 }
4562 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4563 if ($is_space->{$self->{nc}} or
4564 $self->{nc} == -1 or
4565 $self->{nc} == 0x003E) { # >
4566 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4567 ## Reconsume.
4568 redo A;
4569 } else {
4570 !!!parse-error (type => 'no space before attr name'); ## TODO: type
4571 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4572 ## Reconsume.
4573 redo A;
4574 }
4575 } elsif ($self->{state} == NDATA_STATE) {
4576 ## ASCII case-insensitive
4577 if ($self->{nc} == [
4578 undef,
4579 0x0044, # D
4580 0x0041, # A
4581 0x0054, # T
4582 ]->[length $self->{kwd}] or
4583 $self->{nc} == [
4584 undef,
4585 0x0064, # d
4586 0x0061, # a
4587 0x0074, # t
4588 ]->[length $self->{kwd}]) {
4589 !!!cp (172.2);
4590 ## Stay in the state.
4591 $self->{kwd} .= chr $self->{nc};
4592 !!!next-input-character;
4593 redo A;
4594 } elsif ((length $self->{kwd}) == 4 and
4595 ($self->{nc} == 0x0041 or # A
4596 $self->{nc} == 0x0061)) { # a
4597 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4598 !!!cp (172.3);
4599 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4600 text => 'NDATA',
4601 line => $self->{line_prev},
4602 column => $self->{column_prev} - 4);
4603 } else {
4604 !!!cp (172.4);
4605 }
4606 $self->{state} = AFTER_NDATA_STATE;
4607 !!!next-input-character;
4608 redo A;
4609 } else {
4610 !!!parse-error (type => 'string after literal', ## TODO: type
4611 line => $self->{line_prev},
4612 column => $self->{column_prev} + 1
4613 - length $self->{kwd});
4614 !!!cp (172.5);
4615 $self->{state} = BOGUS_MD_STATE;
4616 ## Reconsume.
4617 redo A;
4618 }
4619 } elsif ($self->{state} == AFTER_NDATA_STATE) {
4620 if ($is_space->{$self->{nc}}) {
4621 $self->{state} = BEFORE_NOTATION_NAME_STATE;
4622 !!!next-input-character;
4623 redo A;
4624 } elsif ($self->{nc} == 0x003E) { # >
4625 !!!parse-error (type => 'no notation name'); ## TODO: type
4626 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4627 !!!next-input-character;
4628 !!!emit ($self->{ct}); # ENTITY
4629 redo A;
4630 } elsif ($self->{nc} == -1) {
4631 !!!parse-error (type => 'unclosed md'); ## TODO: type
4632 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4633 !!!next-input-character;
4634 !!!emit ($self->{ct}); # ENTITY
4635 redo A;
4636 } else {
4637 !!!parse-error (type => 'string after literal', ## TODO: type
4638 line => $self->{line_prev},
4639 column => $self->{column_prev} + 1
4640 - length $self->{kwd});
4641 $self->{state} = BOGUS_MD_STATE;
4642 ## Reconsume.
4643 redo A;
4644 }
4645 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4646 if ($is_space->{$self->{nc}}) {
4647 ## Stay in the state.
4648 !!!next-input-character;
4649 redo A;
4650 } elsif ($self->{nc} == 0x003E) { # >
4651 !!!parse-error (type => 'no notation name'); ## TODO: type
4652 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4653 !!!next-input-character;
4654 !!!emit ($self->{ct}); # ENTITY
4655 redo A;
4656 } elsif ($self->{nc} == -1) {
4657 !!!parse-error (type => 'unclosed md'); ## TODO: type
4658 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4659 !!!next-input-character;
4660 !!!emit ($self->{ct}); # ENTITY
4661 redo A;
4662 } else {
4663 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4664 $self->{state} = NOTATION_NAME_STATE;
4665 !!!next-input-character;
4666 redo A;
4667 }
4668 } elsif ($self->{state} == NOTATION_NAME_STATE) {
4669 if ($is_space->{$self->{nc}}) {
4670 $self->{state} = AFTER_NOTATION_NAME_STATE;
4671 !!!next-input-character;
4672 redo A;
4673 } elsif ($self->{nc} == 0x003E) { # >
4674 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4675 !!!next-input-character;
4676 !!!emit ($self->{ct}); # ENTITY
4677 redo A;
4678 } elsif ($self->{nc} == -1) {
4679 !!!parse-error (type => 'unclosed md'); ## TODO: type
4680 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4681 !!!next-input-character;
4682 !!!emit ($self->{ct}); # ENTITY
4683 redo A;
4684 } else {
4685 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4686 ## Stay in the state.
4687 !!!next-input-character;
4688 redo A;
4689 }
4690 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4691 if ($self->{nc} == 0x0022) { # "
4692 $self->{state} = AFTER_NOTATION_NAME_STATE;
4693 !!!next-input-character;
4694 redo A;
4695 } elsif ($self->{nc} == 0x0026) { # &
4696 $self->{prev_state} = $self->{state};
4697 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4698 $self->{entity_add} = 0x0022; # "
4699 !!!next-input-character;
4700 redo A;
4701 ## TODO: %
4702 } elsif ($self->{nc} == -1) {
4703 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4704 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4705 ## Reconsume.
4706 !!!emit ($self->{ct}); # ENTITY
4707 redo A;
4708 } else {
4709 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4710 !!!next-input-character;
4711 redo A;
4712 }
4713 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4714 if ($self->{nc} == 0x0027) { # '
4715 $self->{state} = AFTER_NOTATION_NAME_STATE;
4716 !!!next-input-character;
4717 redo A;
4718 } elsif ($self->{nc} == 0x0026) { # &
4719 $self->{prev_state} = $self->{state};
4720 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4721 $self->{entity_add} = 0x0027; # '
4722 !!!next-input-character;
4723 redo A;
4724 ## TODO: %
4725 } elsif ($self->{nc} == -1) {
4726 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4727 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4728 ## Reconsume.
4729 !!!emit ($self->{ct}); # ENTITY
4730 redo A;
4731 } else {
4732 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4733 !!!next-input-character;
4734 redo A;
4735 }
4736 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4737 ## TODO: XMLize
4738
4739 if ($is_space->{$self->{nc}} or
4740 {
4741 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4742 $self->{entity_add} => 1,
4743 }->{$self->{nc}}) {
4744 ## Don't consume
4745 ## No error
4746 ## Return nothing.
4747 #
4748 } elsif ($self->{nc} == 0x0023) { # #
4749 $self->{ca} = $self->{ct};
4750 $self->{state} = ENTITY_HASH_STATE;
4751 $self->{kwd} = '#';
4752 !!!next-input-character;
4753 redo A;
4754 } elsif ((0x0041 <= $self->{nc} and
4755 $self->{nc} <= 0x005A) or # A..Z
4756 (0x0061 <= $self->{nc} and
4757 $self->{nc} <= 0x007A)) { # a..z
4758 #
4759 } else {
4760 !!!parse-error (type => 'bare ero');
4761 ## Return nothing.
4762 #
4763 }
4764
4765 $self->{ct}->{value} .= '&';
4766 $self->{state} = $self->{prev_state};
4767 ## Reconsume.
4768 redo A;
4769 } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
4770 if ($is_space->{$self->{nc}}) {
4771 ## Stay in the state.
4772 !!!next-input-character;
4773 redo A;
4774 } elsif ($self->{nc} == 0x003E) { # >
4775 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4776 !!!next-input-character;
4777 !!!emit ($self->{ct}); # ENTITY
4778 redo A;
4779 } elsif ($self->{nc} == -1) {
4780 !!!parse-error (type => 'unclosed md'); ## TODO: type
4781 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4782 !!!next-input-character;
4783 !!!emit ($self->{ct}); # ENTITY
4784 redo A;
4785 } else {
4786 !!!parse-error (type => 'string after notation name'); ## TODO: type
4787 $self->{state} = BOGUS_MD_STATE;
4788 ## Reconsume.
4789 redo A;
4790 }
4791 } elsif ($self->{state} == BOGUS_MD_STATE) {
4792 if ($self->{nc} == 0x003E) { # >
4793 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4794 !!!next-input-character;
4795 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4796 redo A;
4797 } elsif ($self->{nc} == -1) {
4798 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4799 ## Reconsume.
4800 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
4801 redo A;
4802 } else {
4803 ## Stay in the state.
4804 !!!next-input-character;
4805 redo A;
4806 }
4807 } else {
4808 die "$0: $self->{state}: Unknown state";
4809 }
4810 } # A
4811
4812 die "$0: _get_next_token: unexpected case";
4813 } # _get_next_token
4814
4815 1; ## A file loaded via |require|/|use| must end with a true value.
4816 ## $Date: 2008/10/19 06:14:57 $
4817

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24