/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.24 - (show annotations) (download) (as text)
Sun Oct 19 14:05:20 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.23: +10 -2 lines
File MIME type: application/x-wais-source
++ whatpm/t/xml/ChangeLog	19 Oct 2008 14:05:17 -0000
	* attlist-1.dat, eldecls-1.dat, entities-1.dat, entities-2.dat,
	notations-1.dat, pis-2.dat: Unexpanded parameter entity tests are
	added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 14:03:50 -0000
	* Tokenizer.pm.src: Set the "stop_processing" flag true when a
	parameter entity occurs in a standalone="no" document.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 14:04:25 -0000
	* Parser.pm.src: Don't process ATTLIST_TOKEN and ENTITY_TOKEN if
	the "stop_processing" flag is set.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.23 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN { ## Export lists are set up at compile time; |Whatpm::HTML| below calls |import| from its own BEGIN block.
6 require Exporter;
7 push our @ISA, 'Exporter'; ## |import| is inherited from Exporter
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 );
25
26 our %EXPORT_TAGS = ( ## The |:token| tag lists the same names as |@EXPORT_OK|, i.e. every token type constant.
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 }
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types (values of the |type| field of a token)
49
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 }
52 sub START_TAG_TOKEN () { 3 } ## NOTE: Also stands for the XML5 "empty tag token" (self-closing flag set).
53 sub END_TAG_TOKEN () { 4 } ## NOTE: An empty |tag_name| stands for the XML5 "short end tag token".
54 sub END_OF_FILE_TOKEN () { 5 }
55 sub CHARACTER_TOKEN () { 6 }
56 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags (bit flags; tested with |&| against |$self->{content_model}|)
78
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82
83 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set: neither & nor < is markup
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup and limited < markup
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup and any < markup
87
88 ## Tokenizer states (values of |$self->{state}|)
89
90 sub DATA_STATE () { 0 }
91 #sub ENTITY_DATA_STATE () { 1 } # not defined; see the NOTE above |ENTITY_STATE|
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } # not defined; see the NOTE above |ENTITY_STATE|
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
108 sub COMMENT_END_DASH_STATE () { 18 }
109 sub BOGUS_COMMENT_STATE () { 19 }
110 sub DOCTYPE_STATE () { 20 }
111 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112 sub DOCTYPE_NAME_STATE () { 22 }
113 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122 sub BOGUS_DOCTYPE_STATE () { 32 }
123 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124 sub SELF_CLOSING_START_TAG_STATE () { 34 }
125 sub CDATA_SECTION_STATE () { 35 }
126 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134 ## NOTE: "Entity data state", "entity in attribute value state", and
135 ## "consume a character reference" algorithm are jointly implemented
136 ## using the following six states:
137 sub ENTITY_STATE () { 44 }
138 sub ENTITY_HASH_STATE () { 45 }
139 sub NCR_NUM_STATE () { 46 }
140 sub HEXREF_X_STATE () { 47 }
141 sub HEXREF_HEX_STATE () { 48 }
142 sub ENTITY_NAME_STATE () { 49 }
143 sub PCDATA_STATE () { 50 } # "data state" in the spec; same as |DATA_STATE| but only for the PCDATA content model
144
145 ## XML-only states
146 sub PI_STATE () { 51 } # entered from the tag open state on "<?" when |$self->{is_xml}| is true
147 sub PI_TARGET_STATE () { 52 }
148 sub PI_TARGET_AFTER_STATE () { 53 }
149 sub PI_DATA_STATE () { 54 }
150 sub PI_AFTER_STATE () { 55 }
151 sub PI_DATA_AFTER_STATE () { 56 }
152 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155 sub DOCTYPE_TAG_STATE () { 60 }
156 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157 sub MD_ATTLIST_STATE () { 62 }
158 sub MD_E_STATE () { 63 }
159 sub MD_ELEMENT_STATE () { 64 }
160 sub MD_ENTITY_STATE () { 65 }
161 sub MD_NOTATION_STATE () { 66 }
162 sub DOCTYPE_MD_STATE () { 67 }
163 sub BEFORE_MD_NAME_STATE () { 68 }
164 sub MD_NAME_STATE () { 69 }
165 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172 sub ALLOWED_TOKEN_STATE () { 77 }
173 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 sub BEFORE_NDATA_STATE () { 85 }
181 sub NDATA_STATE () { 86 }
182 sub AFTER_NDATA_STATE () { 87 }
183 sub BEFORE_NOTATION_NAME_STATE () { 88 }
184 sub NOTATION_NAME_STATE () { 89 }
185 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186 sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187 sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188 sub AFTER_ELEMENT_NAME_STATE () { 93 }
189 sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190 sub CONTENT_KEYWORD_STATE () { 95 }
191 sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192 sub CM_ELEMENT_NAME_STATE () { 97 }
193 sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194 sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195 sub AFTER_MD_DEF_STATE () { 100 }
196 sub BOGUS_MD_STATE () { 101 }
197
198 ## Tree constructor state constants (see Whatpm::HTML for the full
199 ## list and descriptions)
200
201 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # bit 11 (0x800)
202 sub FOREIGN_EL () { 0b1_00000000000 } # also bit 11 (0x800): numerically equal to |IN_FOREIGN_CONTENT_IM| -- NOTE(review): confirm this is intended
203
204 ## Character reference mappings
205
206 my $charref_map = { # code point => replacement code point; not read in the visible part of this file (presumably used when resolving numeric character references)
207 0x0D => 0x000A, # CR is replaced by LF
208 0x80 => 0x20AC, # 0x80..0x9F: Windows-1252 characters; positions unassigned there become U+FFFD
209 0x81 => 0xFFFD,
210 0x82 => 0x201A,
211 0x83 => 0x0192,
212 0x84 => 0x201E,
213 0x85 => 0x2026,
214 0x86 => 0x2020,
215 0x87 => 0x2021,
216 0x88 => 0x02C6,
217 0x89 => 0x2030,
218 0x8A => 0x0160,
219 0x8B => 0x2039,
220 0x8C => 0x0152,
221 0x8D => 0xFFFD,
222 0x8E => 0x017D,
223 0x8F => 0xFFFD,
224 0x90 => 0xFFFD,
225 0x91 => 0x2018,
226 0x92 => 0x2019,
227 0x93 => 0x201C,
228 0x94 => 0x201D,
229 0x95 => 0x2022,
230 0x96 => 0x2013,
231 0x97 => 0x2014,
232 0x98 => 0x02DC,
233 0x99 => 0x2122,
234 0x9A => 0x0161,
235 0x9B => 0x203A,
236 0x9C => 0x0153,
237 0x9D => 0xFFFD,
238 0x9E => 0x017E,
239 0x9F => 0x0178,
240 }; # $charref_map
241 $charref_map->{$_} = 0xFFFD # every code point listed below is replaced by U+FFFD
242 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F, # control characters; HT, LF, FF and CR are not in this list
243 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (Unicode noncharacters run U+FDD0..U+FDEF; this range stops at U+FDDF)
244 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249
250 ## Implementations MUST act as if they used the state machine in the spec.
251
252 sub _initialize_tokenizer ($) { ## Resets the tokenizer fields of $self to their start-of-input values, then invokes the next-input-character macro.
253 my $self = shift;
254
255 ## NOTE: Fields set by |new| constructor:
256 #$self->{level}
257 #$self->{set_nc}
258 #$self->{parse_error}
259 #$self->{is_xml} (if XML)
260
261 $self->{state} = DATA_STATE; # MUST
262 $self->{s_kwd} = ''; # Data state keyword
263 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 #$self->{entity__value}; # initialized when used
265 #$self->{entity__match}; # initialized when used
266 $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
267 undef $self->{ct}; # current token
268 undef $self->{ca}; # current attribute
269 undef $self->{last_stag_name}; # last emitted start tag name
270 #$self->{prev_state}; # initialized when used
271 delete $self->{self_closing}; # no self-closing flag pending
272 $self->{char_buffer} = '';
273 $self->{char_buffer_pos} = 0;
274 $self->{nc} = -1; # next input character; -1 is the value the states test for EOF
275 #$self->{next_nc}
276 !!!next-input-character;
277 $self->{token} = []; # queue of pending tokens; |_get_next_token| returns from it before tokenizing further
278 # $self->{escape}
279 } # _initialize_tokenizer
280
281 ## A token has:
282 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284 ## ->{name} (DOCTYPE_TOKEN)
285 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 ## ->{target} (PI_TOKEN)
287 ## ->{pubid} (DOCTYPE_TOKEN)
288 ## ->{sysid} (DOCTYPE_TOKEN)
289 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291 ## ->{name}
292 ## ->{value}
293 ## ->{has_reference} == 1 or 0
294 ## ->{index}: Index of the attribute in a tag.
295 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299
300 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301 ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
302 ## while the token is pushed back to the stack.
303
304 ## Emitted token MUST immediately be handled by the tree construction stage.
305
306 ## Before each step, UA MAY check to see if either one of the scripts in
307 ## "list of scripts that will execute as soon as possible" or the first
308 ## script in the "list of scripts that will execute asynchronously",
309 ## has completed loading. If one has, then it MUST be executed
310 ## and removed from the list.
311
312 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313 ## (This requirement was dropped from HTML5 spec, unfortunately.)
314
315 my $is_space = { # code point => 1 for space characters; looked up as |$is_space->{$self->{nc}}| by the states
316 0x0009 => 1, # CHARACTER TABULATION (HT)
317 0x000A => 1, # LINE FEED (LF)
318 #0x000B => 0, # LINE TABULATION (VT)
319 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320 #0x000D => 1, # CARRIAGE RETURN (CR)
321 0x0020 => 1, # SPACE (SP)
322 };
323
324 sub _get_next_token ($) {
325 my $self = shift;
326
327 if ($self->{self_closing}) {
328 !!!parse-error (type => 'nestc', token => $self->{ct});
329 ## NOTE: The |self_closing| flag is only set by start tag token.
330 ## In addition, when a start tag token is emitted, it is always set to
331 ## |ct|.
332 delete $self->{self_closing};
333 }
334
335 if (@{$self->{token}}) {
336 $self->{self_closing} = $self->{token}->[0]->{self_closing};
337 return shift @{$self->{token}};
338 }
339
340 A: {
341 if ($self->{state} == PCDATA_STATE) {
342 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343
344 if ($self->{nc} == 0x0026) { # &
345 !!!cp (0.1);
346 ## NOTE: In the spec, the tokenizer is switched to the
347 ## "entity data state". In this implementation, the tokenizer
348 ## is switched to the |ENTITY_STATE|, which is an implementation
349 ## of the "consume a character reference" algorithm.
350 $self->{entity_add} = -1;
351 $self->{prev_state} = DATA_STATE;
352 $self->{state} = ENTITY_STATE;
353 !!!next-input-character;
354 redo A;
355 } elsif ($self->{nc} == 0x003C) { # <
356 !!!cp (0.2);
357 $self->{state} = TAG_OPEN_STATE;
358 !!!next-input-character;
359 redo A;
360 } elsif ($self->{nc} == -1) {
361 !!!cp (0.3);
362 !!!emit ({type => END_OF_FILE_TOKEN,
363 line => $self->{line}, column => $self->{column}});
364 last A; ## TODO: ok?
365 } else {
366 !!!cp (0.4);
367 #
368 }
369
370 # Anything else
371 my $token = {type => CHARACTER_TOKEN,
372 data => chr $self->{nc},
373 line => $self->{line}, column => $self->{column},
374 };
375 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
376
377 ## Stay in the state.
378 !!!next-input-character;
379 !!!emit ($token);
380 redo A;
381 } elsif ($self->{state} == DATA_STATE) {
382 $self->{s_kwd} = '' unless defined $self->{s_kwd};
383 if ($self->{nc} == 0x0026) { # &
384 $self->{s_kwd} = '';
385 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
386 not $self->{escape}) {
387 !!!cp (1);
388 ## NOTE: In the spec, the tokenizer is switched to the
389 ## "entity data state". In this implementation, the tokenizer
390 ## is switched to the |ENTITY_STATE|, which is an implementation
391 ## of the "consume a character reference" algorithm.
392 $self->{entity_add} = -1;
393 $self->{prev_state} = DATA_STATE;
394 $self->{state} = ENTITY_STATE;
395 !!!next-input-character;
396 redo A;
397 } else {
398 !!!cp (2);
399 #
400 }
401 } elsif ($self->{nc} == 0x002D) { # -
402 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
403 if ($self->{s_kwd} eq '<!-') {
404 !!!cp (3);
405 $self->{escape} = 1; # unless $self->{escape};
406 $self->{s_kwd} = '--';
407 #
408 } elsif ($self->{s_kwd} eq '-') {
409 !!!cp (4);
410 $self->{s_kwd} = '--';
411 #
412 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
413 !!!cp (4.1);
414 $self->{s_kwd} .= '-';
415 #
416 } else {
417 !!!cp (5);
418 $self->{s_kwd} = '-';
419 #
420 }
421 }
422
423 #
424 } elsif ($self->{nc} == 0x0021) { # !
425 if (length $self->{s_kwd}) {
426 !!!cp (5.1);
427 $self->{s_kwd} .= '!';
428 #
429 } else {
430 !!!cp (5.2);
431 #$self->{s_kwd} = '';
432 #
433 }
434 #
435 } elsif ($self->{nc} == 0x003C) { # <
436 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
437 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
438 not $self->{escape})) {
439 !!!cp (6);
440 $self->{state} = TAG_OPEN_STATE;
441 !!!next-input-character;
442 redo A;
443 } else {
444 !!!cp (7);
445 $self->{s_kwd} = '';
446 #
447 }
448 } elsif ($self->{nc} == 0x003E) { # >
449 if ($self->{escape} and
450 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
451 if ($self->{s_kwd} eq '--') {
452 !!!cp (8);
453 delete $self->{escape};
454 #
455 } else {
456 !!!cp (9);
457 #
458 }
459 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
460 !!!cp (9.1);
461 !!!parse-error (type => 'unmatched mse', ## TODO: type
462 line => $self->{line_prev},
463 column => $self->{column_prev} - 1);
464 #
465 } else {
466 !!!cp (10);
467 #
468 }
469
470 $self->{s_kwd} = '';
471 #
472 } elsif ($self->{nc} == 0x005D) { # ]
473 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
474 !!!cp (10.1);
475 $self->{s_kwd} .= ']';
476 } elsif ($self->{s_kwd} eq ']]') {
477 !!!cp (10.2);
478 #
479 } else {
480 !!!cp (10.3);
481 $self->{s_kwd} = '';
482 }
483 #
484 } elsif ($self->{nc} == -1) {
485 !!!cp (11);
486 $self->{s_kwd} = '';
487 !!!emit ({type => END_OF_FILE_TOKEN,
488 line => $self->{line}, column => $self->{column}});
489 last A; ## TODO: ok?
490 } else {
491 !!!cp (12);
492 $self->{s_kwd} = '';
493 #
494 }
495
496 # Anything else
497 my $token = {type => CHARACTER_TOKEN,
498 data => chr $self->{nc},
499 line => $self->{line}, column => $self->{column},
500 };
501 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
502 length $token->{data})) {
503 $self->{s_kwd} = '';
504 }
505
506 ## Stay in the data state.
507 if (not $self->{is_xml} and
508 $self->{content_model} == PCDATA_CONTENT_MODEL) {
509 !!!cp (13);
510 $self->{state} = PCDATA_STATE;
511 } else {
512 !!!cp (14);
513 ## Stay in the state.
514 }
515 !!!next-input-character;
516 !!!emit ($token);
517 redo A;
518 } elsif ($self->{state} == TAG_OPEN_STATE) {
519 ## XML5: "tag state".
520
521 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
522 if ($self->{nc} == 0x002F) { # /
523 !!!cp (15);
524 !!!next-input-character;
525 $self->{state} = CLOSE_TAG_OPEN_STATE;
526 redo A;
527 } elsif ($self->{nc} == 0x0021) { # !
528 !!!cp (15.1);
529 $self->{s_kwd} = $self->{escaped} ? '' : '<';
530 #
531 } else {
532 !!!cp (16);
533 $self->{s_kwd} = '';
534 #
535 }
536
537 ## reconsume
538 $self->{state} = DATA_STATE;
539 !!!emit ({type => CHARACTER_TOKEN, data => '<',
540 line => $self->{line_prev},
541 column => $self->{column_prev},
542 });
543 redo A;
544 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
545 if ($self->{nc} == 0x0021) { # !
546 !!!cp (17);
547 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
548 !!!next-input-character;
549 redo A;
550 } elsif ($self->{nc} == 0x002F) { # /
551 !!!cp (18);
552 $self->{state} = CLOSE_TAG_OPEN_STATE;
553 !!!next-input-character;
554 redo A;
555 } elsif (0x0041 <= $self->{nc} and
556 $self->{nc} <= 0x005A) { # A..Z
557 !!!cp (19);
558 $self->{ct}
559 = {type => START_TAG_TOKEN,
560 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
561 line => $self->{line_prev},
562 column => $self->{column_prev}};
563 $self->{state} = TAG_NAME_STATE;
564 !!!next-input-character;
565 redo A;
566 } elsif (0x0061 <= $self->{nc} and
567 $self->{nc} <= 0x007A) { # a..z
568 !!!cp (20);
569 $self->{ct} = {type => START_TAG_TOKEN,
570 tag_name => chr ($self->{nc}),
571 line => $self->{line_prev},
572 column => $self->{column_prev}};
573 $self->{state} = TAG_NAME_STATE;
574 !!!next-input-character;
575 redo A;
576 } elsif ($self->{nc} == 0x003E) { # >
577 !!!cp (21);
578 !!!parse-error (type => 'empty start tag',
579 line => $self->{line_prev},
580 column => $self->{column_prev});
581 $self->{state} = DATA_STATE;
582 $self->{s_kwd} = '';
583 !!!next-input-character;
584
585 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
586 line => $self->{line_prev},
587 column => $self->{column_prev},
588 });
589
590 redo A;
591 } elsif ($self->{nc} == 0x003F) { # ?
592 if ($self->{is_xml}) {
593 !!!cp (22.1);
594 $self->{state} = PI_STATE;
595 !!!next-input-character;
596 redo A;
597 } else {
598 !!!cp (22);
599 !!!parse-error (type => 'pio',
600 line => $self->{line_prev},
601 column => $self->{column_prev});
602 $self->{state} = BOGUS_COMMENT_STATE;
603 $self->{ct} = {type => COMMENT_TOKEN, data => '',
604 line => $self->{line_prev},
605 column => $self->{column_prev},
606 };
607 ## $self->{nc} is intentionally left as is
608 redo A;
609 }
610 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
611 !!!cp (23);
612 !!!parse-error (type => 'bare stago',
613 line => $self->{line_prev},
614 column => $self->{column_prev});
615 $self->{state} = DATA_STATE;
616 $self->{s_kwd} = '';
617 ## reconsume
618
619 !!!emit ({type => CHARACTER_TOKEN, data => '<',
620 line => $self->{line_prev},
621 column => $self->{column_prev},
622 });
623
624 redo A;
625 } else {
626 ## XML5: "<:" is a parse error.
627 !!!cp (23.1);
628 $self->{ct} = {type => START_TAG_TOKEN,
629 tag_name => chr ($self->{nc}),
630 line => $self->{line_prev},
631 column => $self->{column_prev}};
632 $self->{state} = TAG_NAME_STATE;
633 !!!next-input-character;
634 redo A;
635 }
636 } else {
637 die "$0: $self->{content_model} in tag open";
638 }
639 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
640 ## NOTE: The "close tag open state" in the spec is implemented as
641 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
642
643 ## XML5: "end tag state".
644
645 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
646 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
647 if (defined $self->{last_stag_name}) {
648 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
649 $self->{kwd} = '';
650 ## Reconsume.
651 redo A;
652 } else {
653 ## No start tag token has ever been emitted
654 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
655 !!!cp (28);
656 $self->{state} = DATA_STATE;
657 $self->{s_kwd} = '';
658 ## Reconsume.
659 !!!emit ({type => CHARACTER_TOKEN, data => '</',
660 line => $l, column => $c,
661 });
662 redo A;
663 }
664 }
665
666 if (0x0041 <= $self->{nc} and
667 $self->{nc} <= 0x005A) { # A..Z
668 !!!cp (29);
669 $self->{ct}
670 = {type => END_TAG_TOKEN,
671 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
672 line => $l, column => $c};
673 $self->{state} = TAG_NAME_STATE;
674 !!!next-input-character;
675 redo A;
676 } elsif (0x0061 <= $self->{nc} and
677 $self->{nc} <= 0x007A) { # a..z
678 !!!cp (30);
679 $self->{ct} = {type => END_TAG_TOKEN,
680 tag_name => chr ($self->{nc}),
681 line => $l, column => $c};
682 $self->{state} = TAG_NAME_STATE;
683 !!!next-input-character;
684 redo A;
685 } elsif ($self->{nc} == 0x003E) { # >
686 !!!parse-error (type => 'empty end tag',
687 line => $self->{line_prev}, ## "<" in "</>"
688 column => $self->{column_prev} - 1);
689 $self->{state} = DATA_STATE;
690 $self->{s_kwd} = '';
691 if ($self->{is_xml}) {
692 !!!cp (31);
693 ## XML5: No parse error.
694
695 ## NOTE: This parser raises a parse error, since it supports
696 ## XML1, not XML5.
697
698 ## NOTE: A short end tag token.
699 my $ct = {type => END_TAG_TOKEN,
700 tag_name => '',
701 line => $self->{line_prev},
702 column => $self->{column_prev} - 1,
703 };
704 !!!next-input-character;
705 !!!emit ($ct);
706 } else {
707 !!!cp (31.1);
708 !!!next-input-character;
709 }
710 redo A;
711 } elsif ($self->{nc} == -1) {
712 !!!cp (32);
713 !!!parse-error (type => 'bare etago');
714 $self->{s_kwd} = '';
715 $self->{state} = DATA_STATE;
716 # reconsume
717
718 !!!emit ({type => CHARACTER_TOKEN, data => '</',
719 line => $l, column => $c,
720 });
721
722 redo A;
723 } elsif (not $self->{is_xml} or
724 $is_space->{$self->{nc}}) {
725 !!!cp (33);
726 !!!parse-error (type => 'bogus end tag',
727 line => $self->{line_prev}, # "<" of "</"
728 column => $self->{column_prev} - 1);
729 $self->{state} = BOGUS_COMMENT_STATE;
730 $self->{ct} = {type => COMMENT_TOKEN, data => '',
731 line => $self->{line_prev}, # "<" of "</"
732 column => $self->{column_prev} - 1,
733 };
734 ## NOTE: $self->{nc} is intentionally left as is.
735 ## Although the "anything else" case of the spec not explicitly
736 ## states that the next input character is to be reconsumed,
737 ## it will be included to the |data| of the comment token
738 ## generated from the bogus end tag, as defined in the
739 ## "bogus comment state" entry.
740 redo A;
741 } else {
742 ## XML5: "</:" is a parse error.
743 !!!cp (30.1);
744 $self->{ct} = {type => END_TAG_TOKEN,
745 tag_name => chr ($self->{nc}),
746 line => $l, column => $c};
747 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
748 !!!next-input-character;
749 redo A;
750 }
751 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
752 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
753 if (length $ch) {
754 my $CH = $ch;
755 $ch =~ tr/a-z/A-Z/;
756 my $nch = chr $self->{nc};
757 if ($nch eq $ch or $nch eq $CH) {
758 !!!cp (24);
759 ## Stay in the state.
760 $self->{kwd} .= $nch;
761 !!!next-input-character;
762 redo A;
763 } else {
764 !!!cp (25);
765 $self->{state} = DATA_STATE;
766 $self->{s_kwd} = '';
767 ## Reconsume.
768 !!!emit ({type => CHARACTER_TOKEN,
769 data => '</' . $self->{kwd},
770 line => $self->{line_prev},
771 column => $self->{column_prev} - 1 - length $self->{kwd},
772 });
773 redo A;
774 }
775 } else { # after "<{tag-name}"
776 unless ($is_space->{$self->{nc}} or
777 {
778 0x003E => 1, # >
779 0x002F => 1, # /
780 -1 => 1, # EOF
781 }->{$self->{nc}}) {
782 !!!cp (26);
783 ## Reconsume.
784 $self->{state} = DATA_STATE;
785 $self->{s_kwd} = '';
786 !!!emit ({type => CHARACTER_TOKEN,
787 data => '</' . $self->{kwd},
788 line => $self->{line_prev},
789 column => $self->{column_prev} - 1 - length $self->{kwd},
790 });
791 redo A;
792 } else {
793 !!!cp (27);
794 $self->{ct}
795 = {type => END_TAG_TOKEN,
796 tag_name => $self->{last_stag_name},
797 line => $self->{line_prev},
798 column => $self->{column_prev} - 1 - length $self->{kwd}};
799 $self->{state} = TAG_NAME_STATE;
800 ## Reconsume.
801 redo A;
802 }
803 }
804 } elsif ($self->{state} == TAG_NAME_STATE) {
805 if ($is_space->{$self->{nc}}) {
806 !!!cp (34);
807 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
808 !!!next-input-character;
809 redo A;
810 } elsif ($self->{nc} == 0x003E) { # >
811 if ($self->{ct}->{type} == START_TAG_TOKEN) {
812 !!!cp (35);
813 $self->{last_stag_name} = $self->{ct}->{tag_name};
814 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
815 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
816 #if ($self->{ct}->{attributes}) {
817 # ## NOTE: This should never be reached.
818 # !!! cp (36);
819 # !!! parse-error (type => 'end tag attribute');
820 #} else {
821 !!!cp (37);
822 #}
823 } else {
824 die "$0: $self->{ct}->{type}: Unknown token type";
825 }
826 $self->{state} = DATA_STATE;
827 $self->{s_kwd} = '';
828 !!!next-input-character;
829
830 !!!emit ($self->{ct}); # start tag or end tag
831
832 redo A;
833 } elsif (0x0041 <= $self->{nc} and
834 $self->{nc} <= 0x005A) { # A..Z
835 !!!cp (38);
836 $self->{ct}->{tag_name}
837 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
838 # start tag or end tag
839 ## Stay in this state
840 !!!next-input-character;
841 redo A;
842 } elsif ($self->{nc} == -1) {
843 !!!parse-error (type => 'unclosed tag');
844 if ($self->{ct}->{type} == START_TAG_TOKEN) {
845 !!!cp (39);
846 $self->{last_stag_name} = $self->{ct}->{tag_name};
847 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
848 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
849 #if ($self->{ct}->{attributes}) {
850 # ## NOTE: This state should never be reached.
851 # !!! cp (40);
852 # !!! parse-error (type => 'end tag attribute');
853 #} else {
854 !!!cp (41);
855 #}
856 } else {
857 die "$0: $self->{ct}->{type}: Unknown token type";
858 }
859 $self->{state} = DATA_STATE;
860 $self->{s_kwd} = '';
861 # reconsume
862
863 !!!emit ($self->{ct}); # start tag or end tag
864
865 redo A;
866 } elsif ($self->{nc} == 0x002F) { # /
867 !!!cp (42);
868 $self->{state} = SELF_CLOSING_START_TAG_STATE;
869 !!!next-input-character;
870 redo A;
871 } else {
872 !!!cp (44);
873 $self->{ct}->{tag_name} .= chr $self->{nc};
874 # start tag or end tag
875 ## Stay in the state
876 !!!next-input-character;
877 redo A;
878 }
879 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
880 ## XML5: "Tag attribute name before state".
881
882 if ($is_space->{$self->{nc}}) {
883 !!!cp (45);
884 ## Stay in the state
885 !!!next-input-character;
886 redo A;
887 } elsif ($self->{nc} == 0x003E) { # >
888 if ($self->{ct}->{type} == START_TAG_TOKEN) {
889 !!!cp (46);
890 $self->{last_stag_name} = $self->{ct}->{tag_name};
891 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
892 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
893 if ($self->{ct}->{attributes}) {
894 !!!cp (47);
895 !!!parse-error (type => 'end tag attribute');
896 } else {
897 !!!cp (48);
898 }
899 } else {
900 die "$0: $self->{ct}->{type}: Unknown token type";
901 }
902 $self->{state} = DATA_STATE;
903 $self->{s_kwd} = '';
904 !!!next-input-character;
905
906 !!!emit ($self->{ct}); # start tag or end tag
907
908 redo A;
909 } elsif (0x0041 <= $self->{nc} and
910 $self->{nc} <= 0x005A) { # A..Z
911 !!!cp (49);
912 $self->{ca}
913 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
914 value => '',
915 line => $self->{line}, column => $self->{column}};
916 $self->{state} = ATTRIBUTE_NAME_STATE;
917 !!!next-input-character;
918 redo A;
919 } elsif ($self->{nc} == 0x002F) { # /
920 !!!cp (50);
921 $self->{state} = SELF_CLOSING_START_TAG_STATE;
922 !!!next-input-character;
923 redo A;
924 } elsif ($self->{nc} == -1) {
925 !!!parse-error (type => 'unclosed tag');
926 if ($self->{ct}->{type} == START_TAG_TOKEN) {
927 !!!cp (52);
928 $self->{last_stag_name} = $self->{ct}->{tag_name};
929 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
930 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
931 if ($self->{ct}->{attributes}) {
932 !!!cp (53);
933 !!!parse-error (type => 'end tag attribute');
934 } else {
935 !!!cp (54);
936 }
937 } else {
938 die "$0: $self->{ct}->{type}: Unknown token type";
939 }
940 $self->{state} = DATA_STATE;
941 $self->{s_kwd} = '';
942 # reconsume
943
944 !!!emit ($self->{ct}); # start tag or end tag
945
946 redo A;
947 } else {
948 if ({
949 0x0022 => 1, # "
950 0x0027 => 1, # '
951 0x003D => 1, # =
952 }->{$self->{nc}}) {
953 !!!cp (55);
954 ## XML5: Not a parse error.
955 !!!parse-error (type => 'bad attribute name');
956 } else {
957 !!!cp (56);
958 ## XML5: ":" raises a parse error and is ignored.
959 }
960 $self->{ca}
961 = {name => chr ($self->{nc}),
962 value => '',
963 line => $self->{line}, column => $self->{column}};
964 $self->{state} = ATTRIBUTE_NAME_STATE;
965 !!!next-input-character;
966 redo A;
967 }
968 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
969 ## XML5: "Tag attribute name state".
970
971 my $before_leave = sub {
972 if (exists $self->{ct}->{attributes} # start tag or end tag
973 ->{$self->{ca}->{name}}) { # MUST
974 !!!cp (57);
975 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
976 ## Discard $self->{ca} # MUST
977 } else {
978 !!!cp (58);
979 $self->{ct}->{attributes}->{$self->{ca}->{name}}
980 = $self->{ca};
981 $self->{ca}->{index} = ++$self->{ct}->{last_index};
982 }
983 }; # $before_leave
984
985 if ($is_space->{$self->{nc}}) {
986 !!!cp (59);
987 $before_leave->();
988 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
989 !!!next-input-character;
990 redo A;
991 } elsif ($self->{nc} == 0x003D) { # =
992 !!!cp (60);
993 $before_leave->();
994 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
995 !!!next-input-character;
996 redo A;
997 } elsif ($self->{nc} == 0x003E) { # >
998 if ($self->{is_xml}) {
999 !!!cp (60.1);
1000 ## XML5: Not a parse error.
1001 !!!parse-error (type => 'no attr value'); ## TODO: type
1002 } else {
1003 !!!cp (60.2);
1004 }
1005
1006 $before_leave->();
1007 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1008 !!!cp (61);
1009 $self->{last_stag_name} = $self->{ct}->{tag_name};
1010 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1011 !!!cp (62);
1012 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1013 if ($self->{ct}->{attributes}) {
1014 !!!parse-error (type => 'end tag attribute');
1015 }
1016 } else {
1017 die "$0: $self->{ct}->{type}: Unknown token type";
1018 }
1019 $self->{state} = DATA_STATE;
1020 $self->{s_kwd} = '';
1021 !!!next-input-character;
1022
1023 !!!emit ($self->{ct}); # start tag or end tag
1024
1025 redo A;
1026 } elsif (0x0041 <= $self->{nc} and
1027 $self->{nc} <= 0x005A) { # A..Z
1028 !!!cp (63);
1029 $self->{ca}->{name}
1030 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1031 ## Stay in the state
1032 !!!next-input-character;
1033 redo A;
1034 } elsif ($self->{nc} == 0x002F) { # /
1035 if ($self->{is_xml}) {
1036 !!!cp (64);
1037 ## XML5: Not a parse error.
1038 !!!parse-error (type => 'no attr value'); ## TODO: type
1039 } else {
1040 !!!cp (64.1);
1041 }
1042
1043 $before_leave->();
1044 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1045 !!!next-input-character;
1046 redo A;
1047 } elsif ($self->{nc} == -1) {
1048 !!!parse-error (type => 'unclosed tag');
1049 $before_leave->();
1050 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1051 !!!cp (66);
1052 $self->{last_stag_name} = $self->{ct}->{tag_name};
1053 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1054 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1055 if ($self->{ct}->{attributes}) {
1056 !!!cp (67);
1057 !!!parse-error (type => 'end tag attribute');
1058 } else {
1059 ## NOTE: This state should never be reached.
1060 !!!cp (68);
1061 }
1062 } else {
1063 die "$0: $self->{ct}->{type}: Unknown token type";
1064 }
1065 $self->{state} = DATA_STATE;
1066 $self->{s_kwd} = '';
1067 # reconsume
1068
1069 !!!emit ($self->{ct}); # start tag or end tag
1070
1071 redo A;
1072 } else {
1073 if ($self->{nc} == 0x0022 or # "
1074 $self->{nc} == 0x0027) { # '
1075 !!!cp (69);
1076 ## XML5: Not a parse error.
1077 !!!parse-error (type => 'bad attribute name');
1078 } else {
1079 !!!cp (70);
1080 }
1081 $self->{ca}->{name} .= chr ($self->{nc});
1082 ## Stay in the state
1083 !!!next-input-character;
1084 redo A;
1085 }
1086 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1087 ## XML5: "Tag attribute name after state".
1088
1089 if ($is_space->{$self->{nc}}) {
1090 !!!cp (71);
1091 ## Stay in the state
1092 !!!next-input-character;
1093 redo A;
1094 } elsif ($self->{nc} == 0x003D) { # =
1095 !!!cp (72);
1096 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1097 !!!next-input-character;
1098 redo A;
1099 } elsif ($self->{nc} == 0x003E) { # >
1100 if ($self->{is_xml}) {
1101 !!!cp (72.1);
1102 ## XML5: Not a parse error.
1103 !!!parse-error (type => 'no attr value'); ## TODO: type
1104 } else {
1105 !!!cp (72.2);
1106 }
1107
1108 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1109 !!!cp (73);
1110 $self->{last_stag_name} = $self->{ct}->{tag_name};
1111 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1112 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1113 if ($self->{ct}->{attributes}) {
1114 !!!cp (74);
1115 !!!parse-error (type => 'end tag attribute');
1116 } else {
1117 ## NOTE: This state should never be reached.
1118 !!!cp (75);
1119 }
1120 } else {
1121 die "$0: $self->{ct}->{type}: Unknown token type";
1122 }
1123 $self->{state} = DATA_STATE;
1124 $self->{s_kwd} = '';
1125 !!!next-input-character;
1126
1127 !!!emit ($self->{ct}); # start tag or end tag
1128
1129 redo A;
1130 } elsif (0x0041 <= $self->{nc} and
1131 $self->{nc} <= 0x005A) { # A..Z
1132 !!!cp (76);
1133 $self->{ca}
1134 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1135 value => '',
1136 line => $self->{line}, column => $self->{column}};
1137 $self->{state} = ATTRIBUTE_NAME_STATE;
1138 !!!next-input-character;
1139 redo A;
1140 } elsif ($self->{nc} == 0x002F) { # /
1141 if ($self->{is_xml}) {
1142 !!!cp (77);
1143 ## XML5: Not a parse error.
1144 !!!parse-error (type => 'no attr value'); ## TODO: type
1145 } else {
1146 !!!cp (77.1);
1147 }
1148
1149 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1150 !!!next-input-character;
1151 redo A;
1152 } elsif ($self->{nc} == -1) {
1153 !!!parse-error (type => 'unclosed tag');
1154 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1155 !!!cp (79);
1156 $self->{last_stag_name} = $self->{ct}->{tag_name};
1157 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1158 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1159 if ($self->{ct}->{attributes}) {
1160 !!!cp (80);
1161 !!!parse-error (type => 'end tag attribute');
1162 } else {
1163 ## NOTE: This state should never be reached.
1164 !!!cp (81);
1165 }
1166 } else {
1167 die "$0: $self->{ct}->{type}: Unknown token type";
1168 }
1169 $self->{s_kwd} = '';
1170 $self->{state} = DATA_STATE;
1171 # reconsume
1172
1173 !!!emit ($self->{ct}); # start tag or end tag
1174
1175 redo A;
1176 } else {
1177 if ($self->{is_xml}) {
1178 !!!cp (78.1);
1179 ## XML5: Not a parse error.
1180 !!!parse-error (type => 'no attr value'); ## TODO: type
1181 } else {
1182 !!!cp (78.2);
1183 }
1184
1185 if ($self->{nc} == 0x0022 or # "
1186 $self->{nc} == 0x0027) { # '
1187 !!!cp (78);
1188 ## XML5: Not a parse error.
1189 !!!parse-error (type => 'bad attribute name');
1190 } else {
1191 !!!cp (82);
1192 }
1193 $self->{ca}
1194 = {name => chr ($self->{nc}),
1195 value => '',
1196 line => $self->{line}, column => $self->{column}};
1197 $self->{state} = ATTRIBUTE_NAME_STATE;
1198 !!!next-input-character;
1199 redo A;
1200 }
1201 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1202 ## XML5: "Tag attribute value before state".
1203
1204 if ($is_space->{$self->{nc}}) {
1205 !!!cp (83);
1206 ## Stay in the state
1207 !!!next-input-character;
1208 redo A;
1209 } elsif ($self->{nc} == 0x0022) { # "
1210 !!!cp (84);
1211 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1212 !!!next-input-character;
1213 redo A;
1214 } elsif ($self->{nc} == 0x0026) { # &
1215 !!!cp (85);
1216 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1217 ## reconsume
1218 redo A;
1219 } elsif ($self->{nc} == 0x0027) { # '
1220 !!!cp (86);
1221 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1222 !!!next-input-character;
1223 redo A;
1224 } elsif ($self->{nc} == 0x003E) { # >
1225 !!!parse-error (type => 'empty unquoted attribute value');
1226 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227 !!!cp (87);
1228 $self->{last_stag_name} = $self->{ct}->{tag_name};
1229 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231 if ($self->{ct}->{attributes}) {
1232 !!!cp (88);
1233 !!!parse-error (type => 'end tag attribute');
1234 } else {
1235 ## NOTE: This state should never be reached.
1236 !!!cp (89);
1237 }
1238 } else {
1239 die "$0: $self->{ct}->{type}: Unknown token type";
1240 }
1241 $self->{state} = DATA_STATE;
1242 $self->{s_kwd} = '';
1243 !!!next-input-character;
1244
1245 !!!emit ($self->{ct}); # start tag or end tag
1246
1247 redo A;
1248 } elsif ($self->{nc} == -1) {
1249 !!!parse-error (type => 'unclosed tag');
1250 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1251 !!!cp (90);
1252 $self->{last_stag_name} = $self->{ct}->{tag_name};
1253 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1254 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1255 if ($self->{ct}->{attributes}) {
1256 !!!cp (91);
1257 !!!parse-error (type => 'end tag attribute');
1258 } else {
1259 ## NOTE: This state should never be reached.
1260 !!!cp (92);
1261 }
1262 } else {
1263 die "$0: $self->{ct}->{type}: Unknown token type";
1264 }
1265 $self->{state} = DATA_STATE;
1266 $self->{s_kwd} = '';
1267 ## reconsume
1268
1269 !!!emit ($self->{ct}); # start tag or end tag
1270
1271 redo A;
1272 } else {
1273 if ($self->{nc} == 0x003D) { # =
1274 !!!cp (93);
1275 ## XML5: Not a parse error.
1276 !!!parse-error (type => 'bad attribute value');
1277 } elsif ($self->{is_xml}) {
1278 !!!cp (93.1);
1279 ## XML5: No parse error.
1280 !!!parse-error (type => 'unquoted attr value'); ## TODO
1281 } else {
1282 !!!cp (94);
1283 }
1284 $self->{ca}->{value} .= chr ($self->{nc});
1285 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1286 !!!next-input-character;
1287 redo A;
1288 }
1289 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1290 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1291 ## ATTLIST attribute value double quoted state".
1292
1293 if ($self->{nc} == 0x0022) { # "
1294 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1295 !!!cp (95.1);
1296 ## XML5: "DOCTYPE ATTLIST name after state".
1297 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1298 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1299 } else {
1300 !!!cp (95);
1301 ## XML5: "Tag attribute name before state".
1302 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303 }
1304 !!!next-input-character;
1305 redo A;
1306 } elsif ($self->{nc} == 0x0026) { # &
1307 !!!cp (96);
1308 ## XML5: Not defined yet.
1309
1310 ## NOTE: In the spec, the tokenizer is switched to the
1311 ## "entity in attribute value state". In this implementation, the
1312 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1313 ## implementation of the "consume a character reference" algorithm.
1314 $self->{prev_state} = $self->{state};
1315 $self->{entity_add} = 0x0022; # "
1316 $self->{state} = ENTITY_STATE;
1317 !!!next-input-character;
1318 redo A;
1319 } elsif ($self->{nc} == -1) {
1320 !!!parse-error (type => 'unclosed attribute value');
1321 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1322 !!!cp (97);
1323 $self->{last_stag_name} = $self->{ct}->{tag_name};
1324
1325 $self->{state} = DATA_STATE;
1326 $self->{s_kwd} = '';
1327 ## reconsume
1328 !!!emit ($self->{ct}); # start tag
1329 redo A;
1330 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1331 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1332 if ($self->{ct}->{attributes}) {
1333 !!!cp (98);
1334 !!!parse-error (type => 'end tag attribute');
1335 } else {
1336 ## NOTE: This state should never be reached.
1337 !!!cp (99);
1338 }
1339
1340 $self->{state} = DATA_STATE;
1341 $self->{s_kwd} = '';
1342 ## reconsume
1343 !!!emit ($self->{ct}); # end tag
1344 redo A;
1345 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1346 ## XML5: No parse error above; not defined yet.
1347 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1348 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1349 ## Reconsume.
1350 !!!emit ($self->{ct}); # ATTLIST
1351 redo A;
1352 } else {
1353 die "$0: $self->{ct}->{type}: Unknown token type";
1354 }
1355 } else {
1356 ## XML5 [ATTLIST]: Not defined yet.
1357 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1358 !!!cp (100);
1359 ## XML5: Not a parse error.
1360 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1361 } else {
1362 !!!cp (100.1);
1363 }
1364 $self->{ca}->{value} .= chr ($self->{nc});
1365 $self->{read_until}->($self->{ca}->{value},
1366 q["&<],
1367 length $self->{ca}->{value});
1368
1369 ## Stay in the state
1370 !!!next-input-character;
1371 redo A;
1372 }
1373 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1374 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1375 ## ATTLIST attribute value single quoted state".
1376
1377 if ($self->{nc} == 0x0027) { # '
1378 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1379 !!!cp (101.1);
1380 ## XML5: "DOCTYPE ATTLIST name after state".
1381 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1382 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1383 } else {
1384 !!!cp (101);
1385 ## XML5: "Before attribute name state" (sic).
1386 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1387 }
1388 !!!next-input-character;
1389 redo A;
1390 } elsif ($self->{nc} == 0x0026) { # &
1391 !!!cp (102);
1392 ## XML5: Not defined yet.
1393
1394 ## NOTE: In the spec, the tokenizer is switched to the
1395 ## "entity in attribute value state". In this implementation, the
1396 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1397 ## implementation of the "consume a character reference" algorithm.
1398 $self->{entity_add} = 0x0027; # '
1399 $self->{prev_state} = $self->{state};
1400 $self->{state} = ENTITY_STATE;
1401 !!!next-input-character;
1402 redo A;
1403 } elsif ($self->{nc} == -1) {
1404 !!!parse-error (type => 'unclosed attribute value');
1405 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1406 !!!cp (103);
1407 $self->{last_stag_name} = $self->{ct}->{tag_name};
1408
1409 $self->{state} = DATA_STATE;
1410 $self->{s_kwd} = '';
1411 ## reconsume
1412 !!!emit ($self->{ct}); # start tag
1413 redo A;
1414 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1415 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1416 if ($self->{ct}->{attributes}) {
1417 !!!cp (104);
1418 !!!parse-error (type => 'end tag attribute');
1419 } else {
1420 ## NOTE: This state should never be reached.
1421 !!!cp (105);
1422 }
1423
1424 $self->{state} = DATA_STATE;
1425 $self->{s_kwd} = '';
1426 ## reconsume
1427 !!!emit ($self->{ct}); # end tag
1428 redo A;
1429 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1430 ## XML5: No parse error above; not defined yet.
1431 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1432 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1433 ## Reconsume.
1434 !!!emit ($self->{ct}); # ATTLIST
1435 redo A;
1436 } else {
1437 die "$0: $self->{ct}->{type}: Unknown token type";
1438 }
1439 } else {
1440 ## XML5 [ATTLIST]: Not defined yet.
1441 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1442 !!!cp (106);
1443 ## XML5: Not a parse error.
1444 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1445 } else {
1446 !!!cp (106.1);
1447 }
1448 $self->{ca}->{value} .= chr ($self->{nc});
1449 $self->{read_until}->($self->{ca}->{value},
1450 q['&<],
1451 length $self->{ca}->{value});
1452
1453 ## Stay in the state
1454 !!!next-input-character;
1455 redo A;
1456 }
1457 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1458 ## XML5: "Tag attribute value unquoted state".
1459
1460 if ($is_space->{$self->{nc}}) {
1461 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1462 !!!cp (107.1);
1463 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1464 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1465 } else {
1466 !!!cp (107);
1467 ## XML5: "Tag attribute name before state".
1468 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1469 }
1470 !!!next-input-character;
1471 redo A;
1472 } elsif ($self->{nc} == 0x0026) { # &
1473 !!!cp (108);
1474
1475 ## XML5: Not defined yet.
1476
1477 ## NOTE: In the spec, the tokenizer is switched to the
1478 ## "entity in attribute value state". In this implementation, the
1479 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1480 ## implementation of the "consume a character reference" algorithm.
1481 $self->{entity_add} = -1;
1482 $self->{prev_state} = $self->{state};
1483 $self->{state} = ENTITY_STATE;
1484 !!!next-input-character;
1485 redo A;
1486 } elsif ($self->{nc} == 0x003E) { # >
1487 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1488 !!!cp (109);
1489 $self->{last_stag_name} = $self->{ct}->{tag_name};
1490
1491 $self->{state} = DATA_STATE;
1492 $self->{s_kwd} = '';
1493 !!!next-input-character;
1494 !!!emit ($self->{ct}); # start tag
1495 redo A;
1496 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1497 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1498 if ($self->{ct}->{attributes}) {
1499 !!!cp (110);
1500 !!!parse-error (type => 'end tag attribute');
1501 } else {
1502 ## NOTE: This state should never be reached.
1503 !!!cp (111);
1504 }
1505
1506 $self->{state} = DATA_STATE;
1507 $self->{s_kwd} = '';
1508 !!!next-input-character;
1509 !!!emit ($self->{ct}); # end tag
1510 redo A;
1511 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1512 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1513 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1514 !!!next-input-character;
1515 !!!emit ($self->{ct}); # ATTLIST
1516 redo A;
1517 } else {
1518 die "$0: $self->{ct}->{type}: Unknown token type";
1519 }
1520 } elsif ($self->{nc} == -1) {
1521 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1522 !!!cp (112);
1523 !!!parse-error (type => 'unclosed tag');
1524 $self->{last_stag_name} = $self->{ct}->{tag_name};
1525
1526 $self->{state} = DATA_STATE;
1527 $self->{s_kwd} = '';
1528 ## reconsume
1529 !!!emit ($self->{ct}); # start tag
1530 redo A;
1531 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1532 !!!parse-error (type => 'unclosed tag');
1533 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1534 if ($self->{ct}->{attributes}) {
1535 !!!cp (113);
1536 !!!parse-error (type => 'end tag attribute');
1537 } else {
1538 ## NOTE: This state should never be reached.
1539 !!!cp (114);
1540 }
1541
1542 $self->{state} = DATA_STATE;
1543 $self->{s_kwd} = '';
1544 ## reconsume
1545 !!!emit ($self->{ct}); # end tag
1546 redo A;
1547 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1548 !!!parse-error (type => 'unclosed md'); ## TODO: type
1549 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1550 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1551 ## Reconsume.
1552 !!!emit ($self->{ct}); # ATTLIST
1553 redo A;
1554 } else {
1555 die "$0: $self->{ct}->{type}: Unknown token type";
1556 }
1557 } else {
1558 if ({
1559 0x0022 => 1, # "
1560 0x0027 => 1, # '
1561 0x003D => 1, # =
1562 }->{$self->{nc}}) {
1563 !!!cp (115);
1564 ## XML5: Not a parse error.
1565 !!!parse-error (type => 'bad attribute value');
1566 } else {
1567 !!!cp (116);
1568 }
1569 $self->{ca}->{value} .= chr ($self->{nc});
1570 $self->{read_until}->($self->{ca}->{value},
1571 q["'=& >],
1572 length $self->{ca}->{value});
1573
1574 ## Stay in the state
1575 !!!next-input-character;
1576 redo A;
1577 }
1578 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1579 if ($is_space->{$self->{nc}}) {
1580 !!!cp (118);
1581 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1582 !!!next-input-character;
1583 redo A;
1584 } elsif ($self->{nc} == 0x003E) { # >
1585 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1586 !!!cp (119);
1587 $self->{last_stag_name} = $self->{ct}->{tag_name};
1588 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1589 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1590 if ($self->{ct}->{attributes}) {
1591 !!!cp (120);
1592 !!!parse-error (type => 'end tag attribute');
1593 } else {
1594 ## NOTE: This state should never be reached.
1595 !!!cp (121);
1596 }
1597 } else {
1598 die "$0: $self->{ct}->{type}: Unknown token type";
1599 }
1600 $self->{state} = DATA_STATE;
1601 $self->{s_kwd} = '';
1602 !!!next-input-character;
1603
1604 !!!emit ($self->{ct}); # start tag or end tag
1605
1606 redo A;
1607 } elsif ($self->{nc} == 0x002F) { # /
1608 !!!cp (122);
1609 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1610 !!!next-input-character;
1611 redo A;
1612 } elsif ($self->{nc} == -1) {
1613 !!!parse-error (type => 'unclosed tag');
1614 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1615 !!!cp (122.3);
1616 $self->{last_stag_name} = $self->{ct}->{tag_name};
1617 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1618 if ($self->{ct}->{attributes}) {
1619 !!!cp (122.1);
1620 !!!parse-error (type => 'end tag attribute');
1621 } else {
1622 ## NOTE: This state should never be reached.
1623 !!!cp (122.2);
1624 }
1625 } else {
1626 die "$0: $self->{ct}->{type}: Unknown token type";
1627 }
1628 $self->{state} = DATA_STATE;
1629 $self->{s_kwd} = '';
1630 ## Reconsume.
1631 !!!emit ($self->{ct}); # start tag or end tag
1632 redo A;
1633 } else {
1634 !!!cp ('124.1');
1635 !!!parse-error (type => 'no space between attributes');
1636 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1637 ## reconsume
1638 redo A;
1639 }
1640 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1641 ## XML5: "Empty tag state".
1642
1643 if ($self->{nc} == 0x003E) { # >
1644 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1645 !!!cp ('124.2');
1646 !!!parse-error (type => 'nestc', token => $self->{ct});
1647 ## TODO: Different type than slash in start tag
1648 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1649 if ($self->{ct}->{attributes}) {
1650 !!!cp ('124.4');
1651 !!!parse-error (type => 'end tag attribute');
1652 } else {
1653 !!!cp ('124.5');
1654 }
1655 ## TODO: Test |<title></title/>|
1656 } else {
1657 !!!cp ('124.3');
1658 $self->{self_closing} = 1;
1659 }
1660
1661 $self->{state} = DATA_STATE;
1662 $self->{s_kwd} = '';
1663 !!!next-input-character;
1664
1665 !!!emit ($self->{ct}); # start tag or end tag
1666
1667 redo A;
1668 } elsif ($self->{nc} == -1) {
1669 !!!parse-error (type => 'unclosed tag');
1670 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1671 !!!cp (124.7);
1672 $self->{last_stag_name} = $self->{ct}->{tag_name};
1673 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1674 if ($self->{ct}->{attributes}) {
1675 !!!cp (124.5);
1676 !!!parse-error (type => 'end tag attribute');
1677 } else {
1678 ## NOTE: This state should never be reached.
1679 !!!cp (124.6);
1680 }
1681 } else {
1682 die "$0: $self->{ct}->{type}: Unknown token type";
1683 }
1684 ## XML5: "Tag attribute name before state".
1685 $self->{state} = DATA_STATE;
1686 $self->{s_kwd} = '';
1687 ## Reconsume.
1688 !!!emit ($self->{ct}); # start tag or end tag
1689 redo A;
1690 } else {
1691 !!!cp ('124.4');
1692 !!!parse-error (type => 'nestc');
1693 ## TODO: This error type is wrong.
1694 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1695 ## Reconsume.
1696 redo A;
1697 }
1698 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1699 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1700
1701 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1702 ## consumes characters on a one-by-one basis.
1703
1704 if ($self->{nc} == 0x003E) { # >
1705 if ($self->{in_subset}) {
1706 !!!cp (123);
1707 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1708 } else {
1709 !!!cp (124);
1710 $self->{state} = DATA_STATE;
1711 $self->{s_kwd} = '';
1712 }
1713 !!!next-input-character;
1714
1715 !!!emit ($self->{ct}); # comment
1716 redo A;
1717 } elsif ($self->{nc} == -1) {
1718 if ($self->{in_subset}) {
1719 !!!cp (125.1);
1720 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1721 } else {
1722 !!!cp (125);
1723 $self->{state} = DATA_STATE;
1724 $self->{s_kwd} = '';
1725 }
1726 ## reconsume
1727
1728 !!!emit ($self->{ct}); # comment
1729 redo A;
1730 } else {
1731 !!!cp (126);
1732 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1733 $self->{read_until}->($self->{ct}->{data},
1734 q[>],
1735 length $self->{ct}->{data});
1736
1737 ## Stay in the state.
1738 !!!next-input-character;
1739 redo A;
1740 }
1741 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1742 ## XML5: "Markup declaration state".
1743
1744 if ($self->{nc} == 0x002D) { # -
1745 !!!cp (133);
1746 $self->{state} = MD_HYPHEN_STATE;
1747 !!!next-input-character;
1748 redo A;
1749 } elsif ($self->{nc} == 0x0044 or # D
1750 $self->{nc} == 0x0064) { # d
1751 ## ASCII case-insensitive.
1752 !!!cp (130);
1753 $self->{state} = MD_DOCTYPE_STATE;
1754 $self->{kwd} = chr $self->{nc};
1755 !!!next-input-character;
1756 redo A;
1757 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1758 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1759 $self->{is_xml}) and
1760 $self->{nc} == 0x005B) { # [
1761 !!!cp (135.4);
1762 $self->{state} = MD_CDATA_STATE;
1763 $self->{kwd} = '[';
1764 !!!next-input-character;
1765 redo A;
1766 } else {
1767 !!!cp (136);
1768 }
1769
1770 !!!parse-error (type => 'bogus comment',
1771 line => $self->{line_prev},
1772 column => $self->{column_prev} - 1);
1773 ## Reconsume.
1774 $self->{state} = BOGUS_COMMENT_STATE;
1775 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1776 line => $self->{line_prev},
1777 column => $self->{column_prev} - 1,
1778 };
1779 redo A;
1780 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1781 if ($self->{nc} == 0x002D) { # -
1782 !!!cp (127);
1783 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1784 line => $self->{line_prev},
1785 column => $self->{column_prev} - 2,
1786 };
1787 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1788 !!!next-input-character;
1789 redo A;
1790 } else {
1791 !!!cp (128);
1792 !!!parse-error (type => 'bogus comment',
1793 line => $self->{line_prev},
1794 column => $self->{column_prev} - 2);
1795 $self->{state} = BOGUS_COMMENT_STATE;
1796 ## Reconsume.
1797 $self->{ct} = {type => COMMENT_TOKEN,
1798 data => '-',
1799 line => $self->{line_prev},
1800 column => $self->{column_prev} - 2,
1801 };
1802 redo A;
1803 }
1804 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1805 ## ASCII case-insensitive.
1806 if ($self->{nc} == [
1807 undef,
1808 0x004F, # O
1809 0x0043, # C
1810 0x0054, # T
1811 0x0059, # Y
1812 0x0050, # P
1813 ]->[length $self->{kwd}] or
1814 $self->{nc} == [
1815 undef,
1816 0x006F, # o
1817 0x0063, # c
1818 0x0074, # t
1819 0x0079, # y
1820 0x0070, # p
1821 ]->[length $self->{kwd}]) {
1822 !!!cp (131);
1823 ## Stay in the state.
1824 $self->{kwd} .= chr $self->{nc};
1825 !!!next-input-character;
1826 redo A;
1827 } elsif ((length $self->{kwd}) == 6 and
1828 ($self->{nc} == 0x0045 or # E
1829 $self->{nc} == 0x0065)) { # e
1830 if ($self->{is_xml} and
1831 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1832 !!!cp (129);
1833 ## XML5: case-sensitive.
1834 !!!parse-error (type => 'lowercase keyword', ## TODO
1835 text => 'DOCTYPE',
1836 line => $self->{line_prev},
1837 column => $self->{column_prev} - 5);
1838 } else {
1839 !!!cp (129.1);
1840 }
1841 $self->{state} = DOCTYPE_STATE;
1842 $self->{ct} = {type => DOCTYPE_TOKEN,
1843 quirks => 1,
1844 line => $self->{line_prev},
1845 column => $self->{column_prev} - 7,
1846 };
1847 !!!next-input-character;
1848 redo A;
1849 } else {
1850 !!!cp (132);
1851 !!!parse-error (type => 'bogus comment',
1852 line => $self->{line_prev},
1853 column => $self->{column_prev} - 1 - length $self->{kwd});
1854 $self->{state} = BOGUS_COMMENT_STATE;
1855 ## Reconsume.
1856 $self->{ct} = {type => COMMENT_TOKEN,
1857 data => $self->{kwd},
1858 line => $self->{line_prev},
1859 column => $self->{column_prev} - 1 - length $self->{kwd},
1860 };
1861 redo A;
1862 }
1863 } elsif ($self->{state} == MD_CDATA_STATE) {
1864 if ($self->{nc} == {
1865 '[' => 0x0043, # C
1866 '[C' => 0x0044, # D
1867 '[CD' => 0x0041, # A
1868 '[CDA' => 0x0054, # T
1869 '[CDAT' => 0x0041, # A
1870 }->{$self->{kwd}}) {
1871 !!!cp (135.1);
1872 ## Stay in the state.
1873 $self->{kwd} .= chr $self->{nc};
1874 !!!next-input-character;
1875 redo A;
1876 } elsif ($self->{kwd} eq '[CDATA' and
1877 $self->{nc} == 0x005B) { # [
1878 if ($self->{is_xml} and
1879 not $self->{tainted} and
1880 @{$self->{open_elements} or []} == 0) {
1881 !!!cp (135.2);
1882 !!!parse-error (type => 'cdata outside of root element',
1883 line => $self->{line_prev},
1884 column => $self->{column_prev} - 7);
1885 $self->{tainted} = 1;
1886 } else {
1887 !!!cp (135.21);
1888 }
1889
1890 $self->{ct} = {type => CHARACTER_TOKEN,
1891 data => '',
1892 line => $self->{line_prev},
1893 column => $self->{column_prev} - 7};
1894 $self->{state} = CDATA_SECTION_STATE;
1895 !!!next-input-character;
1896 redo A;
1897 } else {
1898 !!!cp (135.3);
1899 !!!parse-error (type => 'bogus comment',
1900 line => $self->{line_prev},
1901 column => $self->{column_prev} - 1 - length $self->{kwd});
1902 $self->{state} = BOGUS_COMMENT_STATE;
1903 ## Reconsume.
1904 $self->{ct} = {type => COMMENT_TOKEN,
1905 data => $self->{kwd},
1906 line => $self->{line_prev},
1907 column => $self->{column_prev} - 1 - length $self->{kwd},
1908 };
1909 redo A;
1910 }
1911 } elsif ($self->{state} == COMMENT_START_STATE) {
1912 if ($self->{nc} == 0x002D) { # -
1913 !!!cp (137);
1914 $self->{state} = COMMENT_START_DASH_STATE;
1915 !!!next-input-character;
1916 redo A;
1917 } elsif ($self->{nc} == 0x003E) { # >
1918 !!!parse-error (type => 'bogus comment');
1919 if ($self->{in_subset}) {
1920 !!!cp (138.1);
1921 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1922 } else {
1923 !!!cp (138);
1924 $self->{state} = DATA_STATE;
1925 $self->{s_kwd} = '';
1926 }
1927 !!!next-input-character;
1928
1929 !!!emit ($self->{ct}); # comment
1930
1931 redo A;
1932 } elsif ($self->{nc} == -1) {
1933 !!!parse-error (type => 'unclosed comment');
1934 if ($self->{in_subset}) {
1935 !!!cp (139.1);
1936 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937 } else {
1938 !!!cp (139);
1939 $self->{state} = DATA_STATE;
1940 $self->{s_kwd} = '';
1941 }
1942 ## reconsume
1943
1944 !!!emit ($self->{ct}); # comment
1945
1946 redo A;
1947 } else {
1948 !!!cp (140);
1949 $self->{ct}->{data} # comment
1950 .= chr ($self->{nc});
1951 $self->{state} = COMMENT_STATE;
1952 !!!next-input-character;
1953 redo A;
1954 }
1955 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1956 if ($self->{nc} == 0x002D) { # -
1957 !!!cp (141);
1958 $self->{state} = COMMENT_END_STATE;
1959 !!!next-input-character;
1960 redo A;
1961 } elsif ($self->{nc} == 0x003E) { # >
1962 !!!parse-error (type => 'bogus comment');
1963 if ($self->{in_subset}) {
1964 !!!cp (142.1);
1965 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1966 } else {
1967 !!!cp (142);
1968 $self->{state} = DATA_STATE;
1969 $self->{s_kwd} = '';
1970 }
1971 !!!next-input-character;
1972
1973 !!!emit ($self->{ct}); # comment
1974
1975 redo A;
1976 } elsif ($self->{nc} == -1) {
1977 !!!parse-error (type => 'unclosed comment');
1978 if ($self->{in_subset}) {
1979 !!!cp (143.1);
1980 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1981 } else {
1982 !!!cp (143);
1983 $self->{state} = DATA_STATE;
1984 $self->{s_kwd} = '';
1985 }
1986 ## reconsume
1987
1988 !!!emit ($self->{ct}); # comment
1989
1990 redo A;
1991 } else {
1992 !!!cp (144);
1993 $self->{ct}->{data} # comment
1994 .= '-' . chr ($self->{nc});
1995 $self->{state} = COMMENT_STATE;
1996 !!!next-input-character;
1997 redo A;
1998 }
1999 } elsif ($self->{state} == COMMENT_STATE) {
2000 ## XML5: "Comment state" and "DOCTYPE comment state".
2001
2002 if ($self->{nc} == 0x002D) { # -
2003 !!!cp (145);
2004 $self->{state} = COMMENT_END_DASH_STATE;
2005 !!!next-input-character;
2006 redo A;
2007 } elsif ($self->{nc} == -1) {
2008 !!!parse-error (type => 'unclosed comment');
2009 if ($self->{in_subset}) {
2010 !!!cp (146.1);
2011 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2012 } else {
2013 !!!cp (146);
2014 $self->{state} = DATA_STATE;
2015 $self->{s_kwd} = '';
2016 }
2017 ## reconsume
2018
2019 !!!emit ($self->{ct}); # comment
2020
2021 redo A;
2022 } else {
2023 !!!cp (147);
2024 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2025 $self->{read_until}->($self->{ct}->{data},
2026 q[-],
2027 length $self->{ct}->{data});
2028
2029 ## Stay in the state
2030 !!!next-input-character;
2031 redo A;
2032 }
2033 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2034 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2035
2036 if ($self->{nc} == 0x002D) { # -
2037 !!!cp (148);
2038 $self->{state} = COMMENT_END_STATE;
2039 !!!next-input-character;
2040 redo A;
2041 } elsif ($self->{nc} == -1) {
2042 !!!parse-error (type => 'unclosed comment');
2043 if ($self->{in_subset}) {
2044 !!!cp (149.1);
2045 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2046 } else {
2047 !!!cp (149);
2048 $self->{state} = DATA_STATE;
2049 $self->{s_kwd} = '';
2050 }
2051 ## reconsume
2052
2053 !!!emit ($self->{ct}); # comment
2054
2055 redo A;
2056 } else {
2057 !!!cp (150);
2058 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2059 $self->{state} = COMMENT_STATE;
2060 !!!next-input-character;
2061 redo A;
2062 }
2063 } elsif ($self->{state} == COMMENT_END_STATE) {
2064 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2065
2066 if ($self->{nc} == 0x003E) { # >
2067 if ($self->{in_subset}) {
2068 !!!cp (151.1);
2069 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2070 } else {
2071 !!!cp (151);
2072 $self->{state} = DATA_STATE;
2073 $self->{s_kwd} = '';
2074 }
2075 !!!next-input-character;
2076
2077 !!!emit ($self->{ct}); # comment
2078
2079 redo A;
2080 } elsif ($self->{nc} == 0x002D) { # -
2081 !!!cp (152);
2082 ## XML5: Not a parse error.
2083 !!!parse-error (type => 'dash in comment',
2084 line => $self->{line_prev},
2085 column => $self->{column_prev});
2086 $self->{ct}->{data} .= '-'; # comment
2087 ## Stay in the state
2088 !!!next-input-character;
2089 redo A;
2090 } elsif ($self->{nc} == -1) {
2091 !!!parse-error (type => 'unclosed comment');
2092 if ($self->{in_subset}) {
2093 !!!cp (153.1);
2094 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2095 } else {
2096 !!!cp (153);
2097 $self->{state} = DATA_STATE;
2098 $self->{s_kwd} = '';
2099 }
2100 ## reconsume
2101
2102 !!!emit ($self->{ct}); # comment
2103
2104 redo A;
2105 } else {
2106 !!!cp (154);
2107 ## XML5: Not a parse error.
2108 !!!parse-error (type => 'dash in comment',
2109 line => $self->{line_prev},
2110 column => $self->{column_prev});
2111 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2112 $self->{state} = COMMENT_STATE;
2113 !!!next-input-character;
2114 redo A;
2115 }
2116 } elsif ($self->{state} == DOCTYPE_STATE) {
2117 if ($is_space->{$self->{nc}}) {
2118 !!!cp (155);
2119 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2120 !!!next-input-character;
2121 redo A;
2122 } else {
2123 !!!cp (156);
2124 ## XML5: Unless EOF, switch to the bogus comment state.
2125 !!!parse-error (type => 'no space before DOCTYPE name');
2126 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2127 ## reconsume
2128 redo A;
2129 }
2130 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2131 ## XML5: "DOCTYPE root name before state".
2132
2133 if ($is_space->{$self->{nc}}) {
2134 !!!cp (157);
2135 ## Stay in the state
2136 !!!next-input-character;
2137 redo A;
2138 } elsif ($self->{nc} == 0x003E) { # >
2139 !!!cp (158);
2140 ## XML5: No parse error.
2141 !!!parse-error (type => 'no DOCTYPE name');
2142 $self->{state} = DATA_STATE;
2143 $self->{s_kwd} = '';
2144 !!!next-input-character;
2145
2146 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2147
2148 redo A;
2149 } elsif ($self->{nc} == -1) {
2150 !!!cp (159);
2151 !!!parse-error (type => 'no DOCTYPE name');
2152 $self->{state} = DATA_STATE;
2153 $self->{s_kwd} = '';
2154 ## reconsume
2155
2156 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2157
2158 redo A;
2159 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2160 !!!cp (159.1);
2161 !!!parse-error (type => 'no DOCTYPE name');
2162 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2163 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2164 $self->{in_subset} = 1;
2165 !!!next-input-character;
2166 !!!emit ($self->{ct}); # DOCTYPE
2167 redo A;
2168 } else {
2169 !!!cp (160);
2170 $self->{ct}->{name} = chr $self->{nc};
2171 delete $self->{ct}->{quirks};
2172 $self->{state} = DOCTYPE_NAME_STATE;
2173 !!!next-input-character;
2174 redo A;
2175 }
2176 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2177 ## XML5: "DOCTYPE root name state".
2178
2179 ## ISSUE: Redundant "First," in the spec.
2180
2181 if ($is_space->{$self->{nc}}) {
2182 !!!cp (161);
2183 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2184 !!!next-input-character;
2185 redo A;
2186 } elsif ($self->{nc} == 0x003E) { # >
2187 !!!cp (162);
2188 $self->{state} = DATA_STATE;
2189 $self->{s_kwd} = '';
2190 !!!next-input-character;
2191
2192 !!!emit ($self->{ct}); # DOCTYPE
2193
2194 redo A;
2195 } elsif ($self->{nc} == -1) {
2196 !!!cp (163);
2197 !!!parse-error (type => 'unclosed DOCTYPE');
2198 $self->{state} = DATA_STATE;
2199 $self->{s_kwd} = '';
2200 ## reconsume
2201
2202 $self->{ct}->{quirks} = 1;
2203 !!!emit ($self->{ct}); # DOCTYPE
2204
2205 redo A;
2206 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2207 !!!cp (163.1);
2208 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2209 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2210 $self->{in_subset} = 1;
2211 !!!next-input-character;
2212 !!!emit ($self->{ct}); # DOCTYPE
2213 redo A;
2214 } else {
2215 !!!cp (164);
2216 $self->{ct}->{name}
2217 .= chr ($self->{nc}); # DOCTYPE
2218 ## Stay in the state
2219 !!!next-input-character;
2220 redo A;
2221 }
2222 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2223 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2224 ## state", but implemented differently.
2225
2226 if ($is_space->{$self->{nc}}) {
2227 !!!cp (165);
2228 ## Stay in the state
2229 !!!next-input-character;
2230 redo A;
2231 } elsif ($self->{nc} == 0x003E) { # >
2232 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2233 !!!cp (166);
2234 $self->{state} = DATA_STATE;
2235 $self->{s_kwd} = '';
2236 } else {
2237 !!!cp (166.1);
2238 !!!parse-error (type => 'no md def'); ## TODO: type
2239 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2240 }
2241
2242 !!!next-input-character;
2243 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2244 redo A;
2245 } elsif ($self->{nc} == -1) {
2246 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2247 !!!cp (167);
2248 !!!parse-error (type => 'unclosed DOCTYPE');
2249 $self->{state} = DATA_STATE;
2250 $self->{s_kwd} = '';
2251 $self->{ct}->{quirks} = 1;
2252 } else {
2253 !!!cp (167.12);
2254 !!!parse-error (type => 'unclosed md'); ## TODO: type
2255 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2256 }
2257
2258 ## Reconsume.
2259 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2260 redo A;
2261 } elsif ($self->{nc} == 0x0050 or # P
2262 $self->{nc} == 0x0070) { # p
2263 !!!cp (167.1);
2264 $self->{state} = PUBLIC_STATE;
2265 $self->{kwd} = chr $self->{nc};
2266 !!!next-input-character;
2267 redo A;
2268 } elsif ($self->{nc} == 0x0053 or # S
2269 $self->{nc} == 0x0073) { # s
2270 !!!cp (167.2);
2271 $self->{state} = SYSTEM_STATE;
2272 $self->{kwd} = chr $self->{nc};
2273 !!!next-input-character;
2274 redo A;
2275 } elsif ($self->{nc} == 0x0022 and # "
2276 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2277 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2278 !!!cp (167.21);
2279 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2280 $self->{ct}->{value} = ''; # ENTITY
2281 !!!next-input-character;
2282 redo A;
2283 } elsif ($self->{nc} == 0x0027 and # '
2284 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2285 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2286 !!!cp (167.22);
2287 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2288 $self->{ct}->{value} = ''; # ENTITY
2289 !!!next-input-character;
2290 redo A;
2291 } elsif ($self->{is_xml} and
2292 $self->{ct}->{type} == DOCTYPE_TOKEN and
2293 $self->{nc} == 0x005B) { # [
2294 !!!cp (167.3);
2295 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2296 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2297 $self->{in_subset} = 1;
2298 !!!next-input-character;
2299 !!!emit ($self->{ct}); # DOCTYPE
2300 redo A;
2301 } else {
2302 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2303
2304 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2305 !!!cp (180);
2306 $self->{ct}->{quirks} = 1;
2307 $self->{state} = BOGUS_DOCTYPE_STATE;
2308 } else {
2309 !!!cp (180.1);
2310 $self->{state} = BOGUS_MD_STATE;
2311 }
2312
2313 !!!next-input-character;
2314 redo A;
2315 }
2316 } elsif ($self->{state} == PUBLIC_STATE) {
2317 ## ASCII case-insensitive
2318 if ($self->{nc} == [
2319 undef,
2320 0x0055, # U
2321 0x0042, # B
2322 0x004C, # L
2323 0x0049, # I
2324 ]->[length $self->{kwd}] or
2325 $self->{nc} == [
2326 undef,
2327 0x0075, # u
2328 0x0062, # b
2329 0x006C, # l
2330 0x0069, # i
2331 ]->[length $self->{kwd}]) {
2332 !!!cp (175);
2333 ## Stay in the state.
2334 $self->{kwd} .= chr $self->{nc};
2335 !!!next-input-character;
2336 redo A;
2337 } elsif ((length $self->{kwd}) == 5 and
2338 ($self->{nc} == 0x0043 or # C
2339 $self->{nc} == 0x0063)) { # c
2340 if ($self->{is_xml} and
2341 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2342 !!!cp (168.1);
2343 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2344 text => 'PUBLIC',
2345 line => $self->{line_prev},
2346 column => $self->{column_prev} - 4);
2347 } else {
2348 !!!cp (168);
2349 }
2350 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2351 !!!next-input-character;
2352 redo A;
2353 } else {
2354 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2355 line => $self->{line_prev},
2356 column => $self->{column_prev} + 1 - length $self->{kwd});
2357 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2358 !!!cp (169);
2359 $self->{ct}->{quirks} = 1;
2360 $self->{state} = BOGUS_DOCTYPE_STATE;
2361 } else {
2362 !!!cp (169.1);
2363 $self->{state} = BOGUS_MD_STATE;
2364 }
2365 ## Reconsume.
2366 redo A;
2367 }
2368 } elsif ($self->{state} == SYSTEM_STATE) {
2369 ## ASCII case-insensitive
2370 if ($self->{nc} == [
2371 undef,
2372 0x0059, # Y
2373 0x0053, # S
2374 0x0054, # T
2375 0x0045, # E
2376 ]->[length $self->{kwd}] or
2377 $self->{nc} == [
2378 undef,
2379 0x0079, # y
2380 0x0073, # s
2381 0x0074, # t
2382 0x0065, # e
2383 ]->[length $self->{kwd}]) {
2384 !!!cp (170);
2385 ## Stay in the state.
2386 $self->{kwd} .= chr $self->{nc};
2387 !!!next-input-character;
2388 redo A;
2389 } elsif ((length $self->{kwd}) == 5 and
2390 ($self->{nc} == 0x004D or # M
2391 $self->{nc} == 0x006D)) { # m
2392 if ($self->{is_xml} and
2393 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2394 !!!cp (171.1);
2395 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2396 text => 'SYSTEM',
2397 line => $self->{line_prev},
2398 column => $self->{column_prev} - 4);
2399 } else {
2400 !!!cp (171);
2401 }
2402 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2403 !!!next-input-character;
2404 redo A;
2405 } else {
2406 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2407 line => $self->{line_prev},
2408 column => $self->{column_prev} + 1 - length $self->{kwd});
2409 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2410 !!!cp (172);
2411 $self->{ct}->{quirks} = 1;
2412 $self->{state} = BOGUS_DOCTYPE_STATE;
2413 } else {
2414 !!!cp (172.1);
2415 $self->{state} = BOGUS_MD_STATE;
2416 }
2417 ## Reconsume.
2418 redo A;
2419 }
2420 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) { ## After the PUBLIC keyword: expect a quoted public identifier.
2421 if ($is_space->{$self->{nc}}) {
2422 !!!cp (181);
2423 ## Stay in the state
2424 !!!next-input-character;
2425 redo A;
2426 } elsif ($self->{nc} == 0x0022) { # "
2427 !!!cp (182);
2428 $self->{ct}->{pubid} = ''; # DOCTYPE/ENTITY/NOTATION
2429 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2430 !!!next-input-character;
2431 redo A;
2432 } elsif ($self->{nc} == 0x0027) { # '
2433 !!!cp (183);
2434 $self->{ct}->{pubid} = ''; # DOCTYPE/ENTITY/NOTATION
2435 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2436 !!!next-input-character;
2437 redo A;
2438 } elsif ($self->{nc} == 0x003E) { # >
2439 !!!parse-error (type => 'no PUBLIC literal');
2440
2441 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2442 !!!cp (184);
2443 $self->{state} = DATA_STATE;
2444 $self->{s_kwd} = '';
2445 $self->{ct}->{quirks} = 1;
2446 } else {
2447 !!!cp (184.1);
2448 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2449 }
2450
2451 !!!next-input-character;
2452 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2453 redo A;
2454 } elsif ($self->{nc} == -1) {
2455 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2456 !!!cp (185);
2457 !!!parse-error (type => 'unclosed DOCTYPE');
2458 $self->{state} = DATA_STATE;
2459 $self->{s_kwd} = '';
2460 $self->{ct}->{quirks} = 1;
2461 } else {
2462 !!!cp (185.1);
2463 !!!parse-error (type => 'unclosed md'); ## TODO: type
2464 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2465 }
2466
2467 ## reconsume
2468 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2469 redo A;
2470 } elsif ($self->{is_xml} and
2471 $self->{ct}->{type} == DOCTYPE_TOKEN and
2472 $self->{nc} == 0x005B) { # [
2473 !!!cp (186.1);
2474 !!!parse-error (type => 'no PUBLIC literal');
2475 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2476 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2477 $self->{in_subset} = 1;
2478 !!!next-input-character;
2479 !!!emit ($self->{ct}); # DOCTYPE
2480 redo A;
2481 } else {
2482 !!!parse-error (type => 'string after PUBLIC');
2483
2484 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2485 !!!cp (186);
2486 $self->{ct}->{quirks} = 1;
2487 $self->{state} = BOGUS_DOCTYPE_STATE;
2488 } else {
2489 !!!cp (186.2);
2490 $self->{state} = BOGUS_MD_STATE;
2491 }
2492
2493 !!!next-input-character;
2494 redo A;
2495 }
2496 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) { ## Inside a "..." public identifier; accumulates into {ct}->{pubid}.
2497 if ($self->{nc} == 0x0022) { # "
2498 !!!cp (187);
2499 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2500 !!!next-input-character;
2501 redo A;
2502 } elsif ($self->{nc} == 0x003E) { # >
2503 !!!parse-error (type => 'unclosed PUBLIC literal');
2504
2505 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2506 !!!cp (188);
2507 $self->{state} = DATA_STATE;
2508 $self->{s_kwd} = '';
2509 $self->{ct}->{quirks} = 1;
2510 } else {
2511 !!!cp (188.1);
2512 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2513 }
2514
2515 !!!next-input-character;
2516 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2517 redo A;
2518 } elsif ($self->{nc} == -1) {
2519 !!!parse-error (type => 'unclosed PUBLIC literal');
2520
2521 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2522 !!!cp (189);
2523 $self->{state} = DATA_STATE;
2524 $self->{s_kwd} = '';
2525 $self->{ct}->{quirks} = 1;
2526 } else {
2527 !!!cp (189.1);
2528 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2529 }
2530
2531 ## Reconsume.
2532 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2533 redo A;
2534 } else {
2535 !!!cp (190);
2536 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2537 $self->{read_until}->($self->{ct}->{pubid}, q[">], ## presumably appends following input up to a '"' or '>' -- TODO confirm against read_until
2538 length $self->{ct}->{pubid});
2539
2540 ## Stay in the state
2541 !!!next-input-character;
2542 redo A;
2543 }
2544 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2545 if ($self->{nc} == 0x0027) { # '
2546 !!!cp (191);
2547 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2548 !!!next-input-character;
2549 redo A;
2550 } elsif ($self->{nc} == 0x003E) { # >
2551 !!!parse-error (type => 'unclosed PUBLIC literal');
2552
2553 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2554 !!!cp (192);
2555 $self->{state} = DATA_STATE;
2556 $self->{s_kwd} = '';
2557 $self->{ct}->{quirks} = 1;
2558 } else {
2559 !!!cp (192.1);
2560 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2561 }
2562
2563 !!!next-input-character;
2564 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2565 redo A;
2566 } elsif ($self->{nc} == -1) {
2567 !!!parse-error (type => 'unclosed PUBLIC literal');
2568
2569 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2570 !!!cp (193);
2571 $self->{state} = DATA_STATE;
2572 $self->{s_kwd} = '';
2573 $self->{ct}->{quirks} = 1;
2574 } else {
2575 !!!cp (193.1);
2576 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2577 }
2578
2579 ## reconsume
2580 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2581 redo A;
2582 } else {
2583 !!!cp (194);
2584 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2585 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2586 length $self->{ct}->{pubid});
2587
2588 ## Stay in the state
2589 !!!next-input-character;
2590 redo A;
2591 }
2592 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2593 if ($is_space->{$self->{nc}}) {
2594 !!!cp (195);
2595 ## Stay in the state
2596 !!!next-input-character;
2597 redo A;
2598 } elsif ($self->{nc} == 0x0022) { # "
2599 !!!cp (196);
2600 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2601 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2602 !!!next-input-character;
2603 redo A;
2604 } elsif ($self->{nc} == 0x0027) { # '
2605 !!!cp (197);
2606 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2607 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2608 !!!next-input-character;
2609 redo A;
2610 } elsif ($self->{nc} == 0x003E) { # >
2611 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2612 if ($self->{is_xml}) {
2613 !!!cp (198.1);
2614 !!!parse-error (type => 'no SYSTEM literal');
2615 } else {
2616 !!!cp (198);
2617 }
2618 $self->{state} = DATA_STATE;
2619 $self->{s_kwd} = '';
2620 } else {
2621 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2622 !!!cp (198.2);
2623 } else {
2624 !!!cp (198.3);
2625 !!!parse-error (type => 'no SYSTEM literal');
2626 }
2627 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2628 }
2629
2630 !!!next-input-character;
2631 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2632 redo A;
2633 } elsif ($self->{nc} == -1) {
2634 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2635 !!!cp (199);
2636 !!!parse-error (type => 'unclosed DOCTYPE');
2637
2638 $self->{state} = DATA_STATE;
2639 $self->{s_kwd} = '';
2640 $self->{ct}->{quirks} = 1;
2641 } else {
2642 !!!parse-error (type => 'unclosed md'); ## TODO: type
2643 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2644 }
2645
2646 ## reconsume
2647 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2648 redo A;
2649 } elsif ($self->{is_xml} and
2650 $self->{ct}->{type} == DOCTYPE_TOKEN and
2651 $self->{nc} == 0x005B) { # [
2652 !!!cp (200.1);
2653 !!!parse-error (type => 'no SYSTEM literal');
2654 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2655 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2656 $self->{in_subset} = 1;
2657 !!!next-input-character;
2658 !!!emit ($self->{ct}); # DOCTYPE
2659 redo A;
2660 } else {
2661 !!!parse-error (type => 'string after PUBLIC literal');
2662
2663 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2664 !!!cp (200);
2665 $self->{ct}->{quirks} = 1;
2666 $self->{state} = BOGUS_DOCTYPE_STATE;
2667 } else {
2668 !!!cp (200.2);
2669 $self->{state} = BOGUS_MD_STATE;
2670 }
2671
2672 !!!next-input-character;
2673 redo A;
2674 }
2675 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2676 if ($is_space->{$self->{nc}}) {
2677 !!!cp (201);
2678 ## Stay in the state
2679 !!!next-input-character;
2680 redo A;
2681 } elsif ($self->{nc} == 0x0022) { # "
2682 !!!cp (202);
2683 $self->{ct}->{sysid} = ''; # DOCTYPE
2684 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2685 !!!next-input-character;
2686 redo A;
2687 } elsif ($self->{nc} == 0x0027) { # '
2688 !!!cp (203);
2689 $self->{ct}->{sysid} = ''; # DOCTYPE
2690 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2691 !!!next-input-character;
2692 redo A;
2693 } elsif ($self->{nc} == 0x003E) { # >
2694 !!!parse-error (type => 'no SYSTEM literal');
2695 !!!next-input-character;
2696
2697 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2698 !!!cp (204);
2699 $self->{state} = DATA_STATE;
2700 $self->{s_kwd} = '';
2701 $self->{ct}->{quirks} = 1;
2702 } else {
2703 !!!cp (204.1);
2704 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2705 }
2706
2707 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2708 redo A;
2709 } elsif ($self->{nc} == -1) {
2710 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2711 !!!cp (205);
2712 !!!parse-error (type => 'unclosed DOCTYPE');
2713 $self->{state} = DATA_STATE;
2714 $self->{s_kwd} = '';
2715 $self->{ct}->{quirks} = 1;
2716 } else {
2717 !!!cp (205.1);
2718 !!!parse-error (type => 'unclosed md'); ## TODO: type
2719 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2720 }
2721
2722 ## reconsume
2723 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2724 redo A;
2725 } elsif ($self->{is_xml} and
2726 $self->{ct}->{type} == DOCTYPE_TOKEN and
2727 $self->{nc} == 0x005B) { # [
2728 !!!cp (206.1);
2729 !!!parse-error (type => 'no SYSTEM literal');
2730
2731 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2732 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2733 $self->{in_subset} = 1;
2734 !!!next-input-character;
2735 !!!emit ($self->{ct}); # DOCTYPE
2736 redo A;
2737 } else {
2738 !!!parse-error (type => 'string after SYSTEM');
2739
2740 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2741 !!!cp (206);
2742 $self->{ct}->{quirks} = 1;
2743 $self->{state} = BOGUS_DOCTYPE_STATE;
2744 } else {
2745 !!!cp (206.2);
2746 $self->{state} = BOGUS_MD_STATE;
2747 }
2748
2749 !!!next-input-character;
2750 redo A;
2751 }
2752 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2753 if ($self->{nc} == 0x0022) { # "
2754 !!!cp (207);
2755 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2756 !!!next-input-character;
2757 redo A;
2758 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2759 !!!parse-error (type => 'unclosed SYSTEM literal');
2760
2761 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2762 !!!cp (208);
2763 $self->{state} = DATA_STATE;
2764 $self->{s_kwd} = '';
2765 $self->{ct}->{quirks} = 1;
2766 } else {
2767 !!!cp (208.1);
2768 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2769 }
2770
2771 !!!next-input-character;
2772 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2773 redo A;
2774 } elsif ($self->{nc} == -1) {
2775 !!!parse-error (type => 'unclosed SYSTEM literal');
2776
2777 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2778 !!!cp (209);
2779 $self->{state} = DATA_STATE;
2780 $self->{s_kwd} = '';
2781 $self->{ct}->{quirks} = 1;
2782 } else {
2783 !!!cp (209.1);
2784 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2785 }
2786
2787 ## reconsume
2788 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2789 redo A;
2790 } else {
2791 !!!cp (210);
2792 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2793 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2794 length $self->{ct}->{sysid});
2795
2796 ## Stay in the state
2797 !!!next-input-character;
2798 redo A;
2799 }
2800 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2801 if ($self->{nc} == 0x0027) { # '
2802 !!!cp (211);
2803 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2804 !!!next-input-character;
2805 redo A;
2806 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2807 !!!cp (212);
2808 !!!parse-error (type => 'unclosed SYSTEM literal');
2809
2810 $self->{state} = DATA_STATE;
2811 $self->{s_kwd} = '';
2812 !!!next-input-character;
2813
2814 $self->{ct}->{quirks} = 1;
2815 !!!emit ($self->{ct}); # DOCTYPE
2816
2817 redo A;
2818 } elsif ($self->{nc} == -1) {
2819 !!!parse-error (type => 'unclosed SYSTEM literal');
2820
2821 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2822 !!!cp (213);
2823 $self->{state} = DATA_STATE;
2824 $self->{s_kwd} = '';
2825 $self->{ct}->{quirks} = 1;
2826 } else {
2827 !!!cp (213.1);
2828 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2829 }
2830
2831 ## reconsume
2832 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2833 redo A;
2834 } else {
2835 !!!cp (214);
2836 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2837 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2838 length $self->{ct}->{sysid});
2839
2840 ## Stay in the state
2841 !!!next-input-character;
2842 redo A;
2843 }
2844 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2845 if ($is_space->{$self->{nc}}) {
2846 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2847 !!!cp (215.1);
2848 $self->{state} = BEFORE_NDATA_STATE;
2849 } else {
2850 !!!cp (215);
2851 ## Stay in the state
2852 }
2853 !!!next-input-character;
2854 redo A;
2855 } elsif ($self->{nc} == 0x003E) { # >
2856 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2857 !!!cp (216);
2858 $self->{state} = DATA_STATE;
2859 $self->{s_kwd} = '';
2860 } else {
2861 !!!cp (216.1);
2862 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2863 }
2864
2865 !!!next-input-character;
2866 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2867 redo A;
2868 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2869 ($self->{nc} == 0x004E or # N
2870 $self->{nc} == 0x006E)) { # n
2871 !!!cp (216.2);
2872 !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2873 $self->{state} = NDATA_STATE;
2874 $self->{kwd} = chr $self->{nc};
2875 !!!next-input-character;
2876 redo A;
2877 } elsif ($self->{nc} == -1) {
2878 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2879 !!!cp (217);
2880 !!!parse-error (type => 'unclosed DOCTYPE');
2881 $self->{state} = DATA_STATE;
2882 $self->{s_kwd} = '';
2883 $self->{ct}->{quirks} = 1;
2884 } else {
2885 !!!cp (217.1);
2886 !!!parse-error (type => 'unclosed md'); ## TODO: type
2887 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2888 }
2889
2890 ## reconsume
2891 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2892 redo A;
2893 } elsif ($self->{is_xml} and
2894 $self->{ct}->{type} == DOCTYPE_TOKEN and
2895 $self->{nc} == 0x005B) { # [
2896 !!!cp (218.1);
2897 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2898 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2899 $self->{in_subset} = 1;
2900 !!!next-input-character;
2901 !!!emit ($self->{ct}); # DOCTYPE
2902 redo A;
2903 } else {
2904 !!!parse-error (type => 'string after SYSTEM literal');
2905
2906 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2907 !!!cp (218);
2908 #$self->{ct}->{quirks} = 1;
2909 $self->{state} = BOGUS_DOCTYPE_STATE;
2910 } else {
2911 !!!cp (218.2);
2912 $self->{state} = BOGUS_MD_STATE;
2913 }
2914
2915 !!!next-input-character;
2916 redo A;
2917 }
2918 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2919 if ($is_space->{$self->{nc}}) {
2920 !!!cp (218.3);
2921 ## Stay in the state.
2922 !!!next-input-character;
2923 redo A;
2924 } elsif ($self->{nc} == 0x003E) { # >
2925 !!!cp (218.4);
2926 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2927 !!!next-input-character;
2928 !!!emit ($self->{ct}); # ENTITY
2929 redo A;
2930 } elsif ($self->{nc} == 0x004E or # N
2931 $self->{nc} == 0x006E) { # n
2932 !!!cp (218.5);
2933 $self->{state} = NDATA_STATE;
2934 $self->{kwd} = chr $self->{nc};
2935 !!!next-input-character;
2936 redo A;
2937 } elsif ($self->{nc} == -1) {
2938 !!!cp (218.6);
2939 !!!parse-error (type => 'unclosed md'); ## TODO: type
2940 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2941 ## reconsume
2942 !!!emit ($self->{ct}); # ENTITY
2943 redo A;
2944 } else {
2945 !!!cp (218.7);
2946 !!!parse-error (type => 'string after SYSTEM literal');
2947 $self->{state} = BOGUS_MD_STATE;
2948 !!!next-input-character;
2949 redo A;
2950 }
2951 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2952 if ($self->{nc} == 0x003E) { # >
2953 !!!cp (219);
2954 $self->{state} = DATA_STATE;
2955 $self->{s_kwd} = '';
2956 !!!next-input-character;
2957
2958 !!!emit ($self->{ct}); # DOCTYPE
2959
2960 redo A;
2961 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2962 !!!cp (220.1);
2963 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2964 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2965 $self->{in_subset} = 1;
2966 !!!next-input-character;
2967 !!!emit ($self->{ct}); # DOCTYPE
2968 redo A;
2969 } elsif ($self->{nc} == -1) {
2970 !!!cp (220);
2971 $self->{state} = DATA_STATE;
2972 $self->{s_kwd} = '';
2973 ## reconsume
2974
2975 !!!emit ($self->{ct}); # DOCTYPE
2976
2977 redo A;
2978 } else {
2979 !!!cp (221);
2980 my $s = '';
2981 $self->{read_until}->($s, q{>[}, 0);
2982
2983 ## Stay in the state
2984 !!!next-input-character;
2985 redo A;
2986 }
2987 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2988 ## NOTE: "CDATA section state" in the spec is jointly implemented
2989 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2990 ## and |CDATA_SECTION_MSE2_STATE|.
2991
2992 ## XML5: "CDATA state".
2993
2994 if ($self->{nc} == 0x005D) { # ]
2995 !!!cp (221.1);
2996 $self->{state} = CDATA_SECTION_MSE1_STATE;
2997 !!!next-input-character;
2998 redo A;
2999 } elsif ($self->{nc} == -1) {
3000 if ($self->{is_xml}) {
3001 !!!cp (221.11);
3002 !!!parse-error (type => 'no mse'); ## TODO: type
3003 } else {
3004 !!!cp (221.12);
3005 }
3006
3007 $self->{state} = DATA_STATE;
3008 $self->{s_kwd} = '';
3009 ## Reconsume.
3010 if (length $self->{ct}->{data}) { # character
3011 !!!cp (221.2);
3012 !!!emit ($self->{ct}); # character
3013 } else {
3014 !!!cp (221.3);
3015 ## No token to emit. $self->{ct} is discarded.
3016 }
3017 redo A;
3018 } else {
3019 !!!cp (221.4);
3020 $self->{ct}->{data} .= chr $self->{nc};
3021 $self->{read_until}->($self->{ct}->{data},
3022 q<]>,
3023 length $self->{ct}->{data});
3024
3025 ## Stay in the state.
3026 !!!next-input-character;
3027 redo A;
3028 }
3029
3030 ## ISSUE: "text tokens" in spec.
3031 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3032 ## XML5: "CDATA bracket state".
3033
3034 if ($self->{nc} == 0x005D) { # ]
3035 !!!cp (221.5);
3036 $self->{state} = CDATA_SECTION_MSE2_STATE;
3037 !!!next-input-character;
3038 redo A;
3039 } else {
3040 !!!cp (221.6);
3041 ## XML5: If EOF, "]" is not appended and changed to the data state.
3042 $self->{ct}->{data} .= ']';
3043 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3044 ## Reconsume.
3045 redo A;
3046 }
3047 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3048 ## XML5: "CDATA end state". Reached after "]]" has been seen inside a CDATA section.
3049
3050 if ($self->{nc} == 0x003E) { # > ("]]>" closes the section)
3051 $self->{state} = DATA_STATE;
3052 $self->{s_kwd} = '';
3053 !!!next-input-character;
3054 if (length $self->{ct}->{data}) { # character
3055 !!!cp (221.7);
3056 !!!emit ($self->{ct}); # character
3057 } else {
3058 !!!cp (221.8);
3059 ## No token to emit. $self->{ct} is discarded.
3060 }
3061 redo A;
3062 } elsif ($self->{nc} == 0x005D) { # ]
3063 !!!cp (221.9); # character
3064 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3065 ## Stay in the state.
3066 !!!next-input-character;
3067 redo A;
3068 } else {
3069 !!!cp (221.11); ## NOTE(review): id 221.11 is also used in CDATA_SECTION_STATE (EOF in XML) -- confirm whether checkpoint ids must be unique.
3070 $self->{ct}->{data} .= ']]'; # character (the pending "]]" was ordinary data after all)
3071 $self->{state} = CDATA_SECTION_STATE;
3072 ## Reconsume. ## XML5: Emit.
3073 redo A;
3074 }
3075 } elsif ($self->{state} == ENTITY_STATE) {
3076 if ($is_space->{$self->{nc}} or
3077 {
3078 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3079 $self->{entity_add} => 1,
3080 }->{$self->{nc}}) {
3081 if ($self->{is_xml}) {
3082 !!!cp (1001.1);
3083 !!!parse-error (type => 'bare ero',
3084 line => $self->{line_prev},
3085 column => $self->{column_prev}
3086 + ($self->{nc} == -1 ? 1 : 0));
3087 } else {
3088 !!!cp (1001);
3089 ## No error
3090 }
3091 ## Don't consume
3092 ## Return nothing.
3093 #
3094 } elsif ($self->{nc} == 0x0023) { # #
3095 !!!cp (999);
3096 $self->{state} = ENTITY_HASH_STATE;
3097 $self->{kwd} = '#';
3098 !!!next-input-character;
3099 redo A;
3100 } elsif ($self->{is_xml} or
3101 (0x0041 <= $self->{nc} and
3102 $self->{nc} <= 0x005A) or # A..Z
3103 (0x0061 <= $self->{nc} and
3104 $self->{nc} <= 0x007A)) { # a..z
3105 !!!cp (998);
3106 require Whatpm::_NamedEntityList;
3107 $self->{state} = ENTITY_NAME_STATE;
3108 $self->{kwd} = chr $self->{nc};
3109 $self->{entity__value} = $self->{kwd};
3110 $self->{entity__match} = 0;
3111 !!!next-input-character;
3112 redo A;
3113 } else {
3114 !!!cp (1027);
3115 !!!parse-error (type => 'bare ero');
3116 ## Return nothing.
3117 #
3118 }
3119
3120 ## NOTE: No character is consumed by the "consume a character
3121 ## reference" algorithm. In other words, there is an "&" character
3122 ## that does not introduce a character reference, which would be
3123 ## appended to the parent element or the attribute value in later
3124 ## process of the tokenizer.
3125
3126 if ($self->{prev_state} == DATA_STATE) {
3127 !!!cp (997);
3128 $self->{state} = $self->{prev_state};
3129 $self->{s_kwd} = '';
3130 ## Reconsume.
3131 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3132 line => $self->{line_prev},
3133 column => $self->{column_prev},
3134 });
3135 redo A;
3136 } else {
3137 !!!cp (996);
3138 $self->{ca}->{value} .= '&';
3139 $self->{state} = $self->{prev_state};
3140 $self->{s_kwd} = '';
3141 ## Reconsume.
3142 redo A;
3143 }
3144 } elsif ($self->{state} == ENTITY_HASH_STATE) {
## Reached after "&#"; the switch into this state leaves '#' in {kwd}.
## Decides between a hexadecimal reference ("x"/"X"), a decimal
## reference (a digit), or a literal "&#" that is not a reference.
3145 if ($self->{nc} == 0x0078) { # x
3146 !!!cp (995);
3147 $self->{state} = HEXREF_X_STATE;
3148 $self->{kwd} .= chr $self->{nc}; ## {kwd} keeps the literal text ("#x").
3149 !!!next-input-character;
3150 redo A;
3151 } elsif ($self->{nc} == 0x0058) { # X
3152 !!!cp (995.1);
## An uppercase "X" is reported as an error in XML mode only; it is
## otherwise handled exactly like the lowercase "x".
3153 if ($self->{is_xml}) {
3154 !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3155 }
3156 $self->{state} = HEXREF_X_STATE;
3157 $self->{kwd} .= chr $self->{nc}; ## {kwd} keeps the literal text ("#X").
3158 !!!next-input-character;
3159 redo A;
3160 } elsif (0x0030 <= $self->{nc} and
3161 $self->{nc} <= 0x0039) { # 0..9
3162 !!!cp (994);
3163 $self->{state} = NCR_NUM_STATE;
## From here on {kwd} is reused as the numeric accumulator; it starts
## with the value of the first digit.
3164 $self->{kwd} = $self->{nc} - 0x0030;
3165 !!!next-input-character;
3166 redo A;
3167 } else {
## "&#" is not followed by "x", "X" or a digit. The error and the
## token are positioned one column before {column_prev} (presumably
## the "&", since "#" was the previously consumed character).
3168 !!!parse-error (type => 'bare nero',
3169 line => $self->{line_prev},
3170 column => $self->{column_prev} - 1);
3171
3172 ## NOTE: According to the spec algorithm, nothing is returned,
3173 ## and then "&#" is appended to the parent element or the attribute
3174 ## value in the later processing.
3175
3176 if ($self->{prev_state} == DATA_STATE) {
3177 !!!cp (1019);
3178 $self->{state} = $self->{prev_state};
3179 $self->{s_kwd} = '';
3180 ## Reconsume.
3181 !!!emit ({type => CHARACTER_TOKEN,
3182 data => '&#',
3183 line => $self->{line_prev},
3184 column => $self->{column_prev} - 1,
3185 });
3186 redo A;
3187 } else {
## Any previous state other than the data state is treated as an
## attribute value state: the text goes into the current attribute.
3188 !!!cp (993);
3189 $self->{ca}->{value} .= '&#';
3190 $self->{state} = $self->{prev_state};
3191 $self->{s_kwd} = '';
3192 ## Reconsume.
3193 redo A;
3194 }
3195 }
3196 } elsif ($self->{state} == NCR_NUM_STATE) {
## Decimal character reference. {kwd} holds the number read so far.
## Digits are accumulated; ";" ends the reference and is consumed;
## any other character ends it with a 'no refc' error and is left to
## be reprocessed by the previous state.
3197 if (0x0030 <= $self->{nc} and
3198 $self->{nc} <= 0x0039) { # 0..9
3199 !!!cp (1012);
3200 $self->{kwd} *= 10;
3201 $self->{kwd} += $self->{nc} - 0x0030;
3202
3203 ## Stay in the state.
3204 !!!next-input-character;
3205 redo A;
3206 } elsif ($self->{nc} == 0x003B) { # ;
3207 !!!cp (1013);
3208 !!!next-input-character;
3209 #
3210 } else {
3211 !!!cp (1014);
## NOTE(review): no explicit line/column is passed here, whereas the
## same error in HEXREF_HEX_STATE passes {line}/{column}. Confirm that
## the default position of !!!parse-error is the intended one.
3212 !!!parse-error (type => 'no refc');
3213 ## Reconsume.
3214 #
3215 }
3216
## The reference is complete. Validate the code point and deliver the
## character. The position is sampled after the optional ";" has been
## consumed.
3217 my $code = $self->{kwd};
3218 my $l = $self->{line_prev};
3219 my $c = $self->{column_prev};
## $charref_map is defined outside this block. A true entry marks the
## code point as invalid and supplies the replacement code point.
3220 if ($charref_map->{$code}) {
3221 !!!cp (1015);
3222 !!!parse-error (type => 'invalid character reference',
3223 text => (sprintf 'U+%04X', $code),
3224 line => $l, column => $c);
3225 $code = $charref_map->{$code};
3226 } elsif ($code > 0x10FFFF) {
## Values above U+10FFFF are replaced by U+FFFD.
3227 !!!cp (1016);
3228 !!!parse-error (type => 'invalid character reference',
3229 text => (sprintf 'U-%08X', $code),
3230 line => $l, column => $c);
3231 $code = 0xFFFD;
3232 }
3233
3234 if ($self->{prev_state} == DATA_STATE) {
## In content: emit the character as its own token, flagged as
## coming from a reference.
3235 !!!cp (992);
3236 $self->{state} = $self->{prev_state};
3237 $self->{s_kwd} = '';
3238 ## Reconsume.
3239 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3240 has_reference => 1,
3241 line => $l, column => $c,
3242 });
3243 redo A;
3244 } else {
## Otherwise: append the character to the current attribute value.
3245 !!!cp (991);
3246 $self->{ca}->{value} .= chr $code;
3247 $self->{ca}->{has_reference} = 1;
3248 $self->{state} = $self->{prev_state};
3249 $self->{s_kwd} = '';
3250 ## Reconsume.
3251 redo A;
3252 }
3253 } elsif ($self->{state} == HEXREF_X_STATE) {
## Reached after "&#x" or "&#X"; {kwd} holds the literal "#x"/"#X".
## A hexadecimal digit starts the number; anything else means that
## the text read so far is not a character reference.
3254 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3255 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3256 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3257 # 0..9, A..F, a..f
3258 !!!cp (990);
3259 $self->{state} = HEXREF_HEX_STATE;
## {kwd} becomes the numeric accumulator. The digit itself is not
## consumed here; HEXREF_HEX_STATE processes it.
3260 $self->{kwd} = 0;
3261 ## Reconsume.
3262 redo A;
3263 } else {
## The error position uses a fixed offset of 2, while the token below
## uses the length of {kwd}; both agree while {kwd} is "#x" or "#X".
3264 !!!parse-error (type => 'bare hcro',
3265 line => $self->{line_prev},
3266 column => $self->{column_prev} - 2);
3267
3268 ## NOTE: According to the spec algorithm, nothing is returned,
3269 ## and then "&#" followed by "X" or "x" is appended to the parent
3270 ## element or the attribute value in the later processing.
3271
3272 if ($self->{prev_state} == DATA_STATE) {
3273 !!!cp (1005);
3274 $self->{state} = $self->{prev_state};
3275 $self->{s_kwd} = '';
3276 ## Reconsume.
3277 !!!emit ({type => CHARACTER_TOKEN,
3278 data => '&' . $self->{kwd},
3279 line => $self->{line_prev},
3280 column => $self->{column_prev} - length $self->{kwd},
3281 });
3282 redo A;
3283 } else {
## Not in the data state: the literal text goes into the current
## attribute value.
3284 !!!cp (989);
3285 $self->{ca}->{value} .= '&' . $self->{kwd};
3286 $self->{state} = $self->{prev_state};
3287 $self->{s_kwd} = '';
3288 ## Reconsume.
3289 redo A;
3290 }
3291 }
3292 } elsif ($self->{state} == HEXREF_HEX_STATE) {
## Hexadecimal character reference. {kwd} holds the number read so
## far. Hex digits are accumulated; ";" ends the reference and is
## consumed; any other character ends it with a 'no refc' error and
## is left to be reprocessed by the previous state.
3293 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3294 # 0..9
3295 !!!cp (1002);
3296 $self->{kwd} *= 0x10;
3297 $self->{kwd} += $self->{nc} - 0x0030;
3298 ## Stay in the state.
3299 !!!next-input-character;
3300 redo A;
3301 } elsif (0x0061 <= $self->{nc} and
3302 $self->{nc} <= 0x0066) { # a..f
3303 !!!cp (1003);
3304 $self->{kwd} *= 0x10;
## "a" (0x61) - 0x60 + 9 = 10, ..., "f" (0x66) - 0x60 + 9 = 15.
3305 $self->{kwd} += $self->{nc} - 0x0060 + 9;
3306 ## Stay in the state.
3307 !!!next-input-character;
3308 redo A;
3309 } elsif (0x0041 <= $self->{nc} and
3310 $self->{nc} <= 0x0046) { # A..F
3311 !!!cp (1004);
3312 $self->{kwd} *= 0x10;
## "A" (0x41) - 0x40 + 9 = 10, ..., "F" (0x46) - 0x40 + 9 = 15.
3313 $self->{kwd} += $self->{nc} - 0x0040 + 9;
3314 ## Stay in the state.
3315 !!!next-input-character;
3316 redo A;
3317 } elsif ($self->{nc} == 0x003B) { # ;
3318 !!!cp (1006);
3319 !!!next-input-character;
3320 #
3321 } else {
3322 !!!cp (1007);
## The missing ";" is reported at the current position.
3323 !!!parse-error (type => 'no refc',
3324 line => $self->{line},
3325 column => $self->{column});
3326 ## Reconsume.
3327 #
3328 }
3329
## The reference is complete. Validate the code point and deliver the
## character. The position is sampled after the optional ";" has been
## consumed.
3330 my $code = $self->{kwd};
3331 my $l = $self->{line_prev};
3332 my $c = $self->{column_prev};
## $charref_map is defined outside this block. A true entry marks the
## code point as invalid and supplies the replacement code point.
3333 if ($charref_map->{$code}) {
3334 !!!cp (1008);
3335 !!!parse-error (type => 'invalid character reference',
3336 text => (sprintf 'U+%04X', $code),
3337 line => $l, column => $c);
3338 $code = $charref_map->{$code};
3339 } elsif ($code > 0x10FFFF) {
## Values above U+10FFFF are replaced by U+FFFD.
3340 !!!cp (1009);
3341 !!!parse-error (type => 'invalid character reference',
3342 text => (sprintf 'U-%08X', $code),
3343 line => $l, column => $c);
3344 $code = 0xFFFD;
3345 }
3346
3347 if ($self->{prev_state} == DATA_STATE) {
## In content: emit the character as its own token, flagged as
## coming from a reference.
3348 !!!cp (988);
3349 $self->{state} = $self->{prev_state};
3350 $self->{s_kwd} = '';
3351 ## Reconsume.
3352 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3353 has_reference => 1,
3354 line => $l, column => $c,
3355 });
3356 redo A;
3357 } else {
## Otherwise: append the character to the current attribute value.
3358 !!!cp (987);
3359 $self->{ca}->{value} .= chr $code;
3360 $self->{ca}->{has_reference} = 1;
3361 $self->{state} = $self->{prev_state};
3362 $self->{s_kwd} = '';
3363 ## Reconsume.
3364 redo A;
3365 }
3366 } elsif ($self->{state} == ENTITY_NAME_STATE) {
## Named character reference. {kwd} accumulates the name as typed
## (including a final ";"). {entity__value} holds the text to insert,
## and {entity__match} records the match status:
##    0   no known name has been seen so far;
##    1   a known name terminated by ";" was matched;
##   -1   a known name without ";" is the current match;
##  < -1  a ";"-less match was followed by further name characters
##        (each such character doubles the value).
##
## A character continues the name if it is an ASCII letter, a digit
## or ";", or - in XML mode - anything except white space, "<", "&",
## EOF and the character stored in {entity_add} (set outside this
## block; presumably the attribute value delimiter - TODO confirm).
3367 if ((0x0041 <= $self->{nc} and # A
3368 $self->{nc} <= 0x005A) or # Z
3369 (0x0061 <= $self->{nc} and # a
3370 $self->{nc} <= 0x007A) or # z
3371 (0x0030 <= $self->{nc} and # 0
3372 $self->{nc} <= 0x0039) or # 9
3373 $self->{nc} == 0x003B or # ;
3374 ($self->{is_xml} and
3375 not ($is_space->{$self->{nc}} or
3376 {
3377 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3378 $self->{entity_add} => 1,
3379 }->{$self->{nc}}))) {
## $EntityChar is a package variable (presumably filled in by
## Whatpm::_NamedEntityList, which is required before this state is
## entered); {ge} is looked up by the same key (presumably the general
## entities declared in the document).
3380 our $EntityChar;
3381 $self->{kwd} .= chr $self->{nc};
3382 if (defined $EntityChar->{$self->{kwd}} or
3383 $self->{ge}->{$self->{kwd}}) {
3384 if ($self->{nc} == 0x003B) { # ;
## A complete, known reference. An entry in {ge} takes precedence
## over the predefined table.
3385 if (defined $self->{ge}->{$self->{kwd}}) {
3386 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3387 !!!cp (1020.1);
3388 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3389 } else {
3390 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3391 !!!cp (1020.2);
3392 !!!parse-error (type => 'unparsed entity', ## TODO: type
3393 value => $self->{kwd});
3394 } else {
3395 !!!cp (1020.3);
3396 }
## Entities that are not plain text are not expanded yet; the literal
## reference text is used instead.
3397 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3398 }
3399 } else {
3400 if ($self->{is_xml}) {
## In XML mode a name known only from the predefined table is
## reported as undeclared: at the "warn" level for the five XML
## predefined entities, at the "must" level for all others.
3401 !!!cp (1020.4);
3402 !!!parse-error (type => 'entity not declared', ## TODO: type
3403 value => $self->{kwd},
3404 level => {
3405 'amp;' => $self->{level}->{warn},
3406 'quot;' => $self->{level}->{warn},
3407 'lt;' => $self->{level}->{warn},
3408 'gt;' => $self->{level}->{warn},
3409 'apos;' => $self->{level}->{warn},
3410 }->{$self->{kwd}} ||
3411 $self->{level}->{must});
3412 } else {
3413 !!!cp (1020);
3414 }
3415 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3416 }
3417 $self->{entity__match} = 1;
3418 !!!next-input-character;
3419 #
3420 } else {
## A known name without ";": remember it as the best match so far and
## keep reading, since a longer name may still match.
## NOTE(review): if this branch was entered through the {ge} test
## only, $EntityChar->{...} is undef here. Verify that {ge} keys
## always end in ";".
3421 !!!cp (1021);
3422 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3423 $self->{entity__match} = -1;
3424 ## Stay in the state.
3425 !!!next-input-character;
3426 redo A;
3427 }
3428 } else {
## Not (yet) a known name: keep the character as literal text after
## the best match, and double a negative match status to record that
## extra characters followed (0 stays 0).
3429 !!!cp (1022);
3430 $self->{entity__value} .= chr $self->{nc};
3431 $self->{entity__match} *= 2;
3432 ## Stay in the state.
3433 !!!next-input-character;
3434 redo A;
3435 }
3436 }
3437
## The name has ended (";" matched, or a non-name character was
## seen). Choose the text to deliver.
3438 my $data;
3439 my $has_ref;
3440 if ($self->{entity__match} > 0) {
3441 !!!cp (1023);
3442 $data = $self->{entity__value};
3443 $has_ref = 1;
3444 #
3445 } elsif ($self->{entity__match} < 0) {
3446 !!!parse-error (type => 'no refc');
3447 if ($self->{prev_state} != DATA_STATE and # in attribute
3448 $self->{entity__match} < -1) {
## In an attribute value, a ";"-less match followed by more name
## characters is not treated as a reference: keep the literal text.
3449 !!!cp (1024);
3450 $data = '&' . $self->{kwd};
3451 #
3452 } else {
3453 !!!cp (1025);
3454 $data = $self->{entity__value};
3455 $has_ref = 1;
3456 #
3457 }
3458 } else {
## No known name at all: the "&" and the following characters are
## literal text.
3459 !!!cp (1026);
3460 !!!parse-error (type => 'bare ero',
3461 line => $self->{line_prev},
3462 column => $self->{column_prev} - length $self->{kwd});
3463 $data = '&' . $self->{kwd};
3464 #
3465 }
3466
3467 ## NOTE: In these cases, when a character reference is found,
3468 ## it is consumed and a character token is returned, or, otherwise,
3469 ## nothing is consumed and returned, according to the spec algorithm.
3470 ## In this implementation, anything that has been examined by the
3471 ## tokenizer is appended to the parent element or the attribute value
3472 ## as string, either literal string when no character reference or
3473 ## entity-replaced string otherwise, in this stage, since any characters
3474 ## that would not be consumed are appended in the data state or in an
3475 ## appropriate attribute value state anyway.
3476
3477 if ($self->{prev_state} == DATA_STATE) {
3478 !!!cp (986);
3479 $self->{state} = $self->{prev_state};
3480 $self->{s_kwd} = '';
3481 ## Reconsume.
3482 !!!emit ({type => CHARACTER_TOKEN,
3483 data => $data,
3484 has_reference => $has_ref,
3485 line => $self->{line_prev},
3486 column => $self->{column_prev} + 1 - length $self->{kwd},
3487 });
3488 redo A;
3489 } else {
3490 !!!cp (985);
3491 $self->{ca}->{value} .= $data;
3492 $self->{ca}->{has_reference} = 1 if $has_ref;
3493 $self->{state} = $self->{prev_state};
3494 $self->{s_kwd} = '';
3495 ## Reconsume.
3496 redo A;
3497 }
3498
3499 ## XML-only states
3500
3501 } elsif ($self->{state} == PI_STATE) {
## First character after "<?". White space, "?" or EOF here means
## there is no target, so the construct is handled as a bogus comment
## whose data starts with "?". Any other character starts a
## processing instruction token with that character as the first
## character of its target.
3502 ## XML5: "Pi state" and "DOCTYPE pi state".
3503
3504 if ($is_space->{$self->{nc}} or
3505 $self->{nc} == 0x003F or # ?
3506 $self->{nc} == -1) {
3507 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3508 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3509 ## "DOCTYPE pi state": Parse error, switch to the "data
3510 ## state".
## The reported column is {column_prev} minus 1, except at EOF where
## no adjustment is made (the comparison yields 1 or an empty value).
3511 !!!parse-error (type => 'bare pio', ## TODO: type
3512 line => $self->{line_prev},
3513 column => $self->{column_prev}
3514 - 1 * ($self->{nc} != -1));
3515 $self->{state} = BOGUS_COMMENT_STATE;
3516 ## Reconsume.
3517 $self->{ct} = {type => COMMENT_TOKEN,
3518 data => '?',
3519 line => $self->{line_prev},
3520 column => $self->{column_prev}
3521 - 1 * ($self->{nc} != -1),
3522 };
3523 redo A;
3524 } else {
3525 ## XML5: "DOCTYPE pi state": Stay in the state.
3526 $self->{ct} = {type => PI_TOKEN,
3527 target => chr $self->{nc},
3528 data => '',
3529 line => $self->{line_prev},
3530 column => $self->{column_prev} - 1,
3531 };
3532 $self->{state} = PI_TARGET_STATE;
3533 !!!next-input-character;
3534 redo A;
3535 }
3536 } elsif ($self->{state} == PI_TARGET_STATE) {
## Reads the remainder of the processing instruction target. White
## space ends the target, "?" may begin the closing "?>", EOF emits
## the unfinished token, and any other character extends the target.
3537 if ($is_space->{$self->{nc}}) {
3538 $self->{state} = PI_TARGET_AFTER_STATE;
3539 !!!next-input-character;
3540 redo A;
3541 } elsif ($self->{nc} == -1) {
3542 !!!parse-error (type => 'no pic'); ## TODO: type
## Return to whichever context the processing instruction appeared
## in: the DOCTYPE internal subset if {in_subset} is set, otherwise
## the data state.
3543 if ($self->{in_subset}) {
3544 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3545 } else {
3546 $self->{state} = DATA_STATE;
3547 $self->{s_kwd} = '';
3548 }
3549 ## Reconsume.
3550 !!!emit ($self->{ct}); # pi
3551 redo A;
3552 } elsif ($self->{nc} == 0x003F) { # ?
3553 $self->{state} = PI_AFTER_STATE;
3554 !!!next-input-character;
3555 redo A;
3556 } else {
3557 ## XML5: typo ("tag name" -> "target")
3558 $self->{ct}->{target} .= chr $self->{nc}; # pi
3559 !!!next-input-character;
3560 redo A;
3561 }
3562 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
## Skips the white space between the target and the data of a
## processing instruction. The first non-space character is handed
## to PI_DATA_STATE without being consumed.
3563 if ($is_space->{$self->{nc}}) {
3564 ## Stay in the state.
3565 !!!next-input-character;
3566 redo A;
3567 } else {
3568 $self->{state} = PI_DATA_STATE;
3569 ## Reprocess.
3570 redo A;
3571 }
3572 } elsif ($self->{state} == PI_DATA_STATE) {
## Collects the data of a processing instruction in {ct}->{data}.
## "?" may begin the closing "?>"; EOF emits the unfinished token.
3573 if ($self->{nc} == 0x003F) { # ?
3574 $self->{state} = PI_DATA_AFTER_STATE;
3575 !!!next-input-character;
3576 redo A;
3577 } elsif ($self->{nc} == -1) {
3578 !!!parse-error (type => 'no pic'); ## TODO: type
3579 if ($self->{in_subset}) {
3580 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3581 } else {
3582 $self->{state} = DATA_STATE;
3583 $self->{s_kwd} = '';
3584 }
3585 ## EOF is not consumed; it is reprocessed in the new state.
3586 !!!emit ($self->{ct}); # pi
3587 redo A;
3588 } else {
3589 $self->{ct}->{data} .= chr $self->{nc}; # pi
## NOTE(review): {read_until} is defined outside this block. It is
## called with the data buffer, the string "?" and the current length
## of the buffer - presumably to append the following run of non-"?"
## characters in one step. Confirm against its definition.
3590 $self->{read_until}->($self->{ct}->{data}, q[?],
3591 length $self->{ct}->{data});
3592 ## Stay in the state.
3593 !!!next-input-character;
3594 ## The character has been consumed; continue with the next one.
3595 redo A;
3596 }
3597 } elsif ($self->{state} == PI_AFTER_STATE) {
## Reached when "?" directly follows the target (no white space in
## between). ">" completes the processing instruction with empty
## data; anything else means the "?" was data that started without a
## separating space, which is reported as an error.
3598 ## XML5: Part of "Pi after state".
3599
3600 if ($self->{nc} == 0x003E) { # >
3601 if ($self->{in_subset}) {
3602 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3603 } else {
3604 $self->{state} = DATA_STATE;
3605 $self->{s_kwd} = '';
3606 }
3607 !!!next-input-character;
3608 !!!emit ($self->{ct}); # pi
3609 redo A;
3610 } elsif ($self->{nc} == 0x003F) { # ?
## "??": the first "?" becomes data; the second one may still begin
## the closing "?>".
3611 !!!parse-error (type => 'no s after target', ## TODO: type
3612 line => $self->{line_prev},
3613 column => $self->{column_prev}); ## XML5: no error
3614 $self->{ct}->{data} .= '?';
3615 $self->{state} = PI_DATA_AFTER_STATE;
3616 !!!next-input-character;
3617 redo A;
3618 } else {
## The reported column is {column_prev}, plus 1 at EOF.
3619 !!!parse-error (type => 'no s after target', ## TODO: type
3620 line => $self->{line_prev},
3621 column => $self->{column_prev}
3622 + 1 * ($self->{nc} == -1)); ## XML5: no error
3623 $self->{ct}->{data} .= '?'; ## XML5: not appended
3624 $self->{state} = PI_DATA_STATE;
3625 ## Reprocess.
3626 redo A;
3627 }
3628 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
## Reached after a "?" inside processing instruction data. ">"
## completes the processing instruction. Otherwise the pending "?"
## was ordinary data and is appended to {ct}->{data}.
3629 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3630
3631 if ($self->{nc} == 0x003E) { # >
3632 if ($self->{in_subset}) {
3633 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3634 } else {
3635 $self->{state} = DATA_STATE;
3636 $self->{s_kwd} = '';
3637 }
3638 !!!next-input-character;
3639 !!!emit ($self->{ct}); # pi
3640 redo A;
3641 } elsif ($self->{nc} == 0x003F) { # ?
## The previous "?" becomes data; the current one is now the pending
## "?".
3642 $self->{ct}->{data} .= '?';
3643 ## Stay in the state.
3644 !!!next-input-character;
3645 redo A;
3646 } else {
3647 $self->{ct}->{data} .= '?'; ## XML5: not appended
3648 $self->{state} = PI_DATA_STATE;
3649 ## Reprocess.
3650 redo A;
3651 }
3652
3653 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
## Top level of the DOCTYPE internal subset. "<" starts a markup
## declaration, comment or processing instruction; "]" ends the
## subset; white space is skipped; other characters are errors.
3654 if ($self->{nc} == 0x003C) { # <
3655 $self->{state} = DOCTYPE_TAG_STATE;
3656 !!!next-input-character;
3657 redo A;
3658 } elsif ($self->{nc} == 0x0025) { # %
3659 ## XML5: Not defined yet.
3660
3661 ## TODO:
3662
## Parameter entity references are not expanded. Only the "%" is
## consumed here; the characters of the name that follow are seen by
## the other branches of this state. The first "%" in a document for
## which xml_standalone is false sets the {stop_processing} flag and
## reports this at the "info" level.
3663 if (not $self->{stop_processing} and
3664 not $self->{document}->xml_standalone) {
3665 !!!parse-error (type => 'stop processing', ## TODO: type
3666 level => $self->{level}->{info});
3667 $self->{stop_processing} = 1;
3668 }
3669
3670 !!!next-input-character;
3671 redo A;
3672 } elsif ($self->{nc} == 0x005D) { # ]
3673 delete $self->{in_subset};
3674 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3675 !!!next-input-character;
3676 redo A;
3677 } elsif ($is_space->{$self->{nc}}) {
3678 ## Stay in the state.
3679 !!!next-input-character;
3680 redo A;
3681 } elsif ($self->{nc} == -1) {
## EOF inside the subset: leave the subset, signal the end of the
## DOCTYPE, and let the data state handle the EOF.
3682 !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3683 delete $self->{in_subset};
3684 $self->{state} = DATA_STATE;
3685 $self->{s_kwd} = '';
3686 ## Reconsume.
3687 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3688 redo A;
3689 } else {
## Stray text is reported only once per subset; the
## {internal_subset_tainted} flag suppresses further reports.
3690 unless ($self->{internal_subset_tainted}) {
3691 ## XML5: No parse error.
3692 !!!parse-error (type => 'string in internal subset');
3693 $self->{internal_subset_tainted} = 1;
3694 }
3695 ## Stay in the state.
3696 !!!next-input-character;
3697 redo A;
3698 }
3699 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
## Reached after the "]" that closes the internal subset. ">" ends
## the DOCTYPE; EOF ends it with an error; anything else is an error
## and the rest is skipped in BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE.
3700 if ($self->{nc} == 0x003E) { # >
3701 $self->{state} = DATA_STATE;
3702 $self->{s_kwd} = '';
3703 !!!next-input-character;
3704 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3705 redo A;
3706 } elsif ($self->{nc} == -1) {
3707 !!!parse-error (type => 'unclosed DOCTYPE');
3708 $self->{state} = DATA_STATE;
3709 $self->{s_kwd} = '';
3710 ## Reconsume.
3711 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3712 redo A;
3713 } else {
3714 ## XML5: No parse error and stay in the state.
3715 !!!parse-error (type => 'string after internal subset'); ## TODO: type
3716
3717 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3718 !!!next-input-character;
3719 redo A;
3720 }
3721 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
## Skips everything up to the ">" that ends the DOCTYPE (or EOF).
## No error is reported in this state; the state that switches here
## has already reported one.
3722 if ($self->{nc} == 0x003E) { # >
3723 $self->{state} = DATA_STATE;
3724 $self->{s_kwd} = '';
3725 !!!next-input-character;
3726 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3727 redo A;
3728 } elsif ($self->{nc} == -1) {
3729 $self->{state} = DATA_STATE;
3730 $self->{s_kwd} = '';
3731 ## Reconsume.
3732 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3733 redo A;
3734 } else {
3735 ## Stay in the state.
3736 !!!next-input-character;
3737 redo A;
3738 }
3739 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
## Reached after "<" in the internal subset. "!" starts a markup
## declaration or comment, "?" a processing instruction; anything
## else is an error.
3740 if ($self->{nc} == 0x0021) { # !
3741 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3742 !!!next-input-character;
3743 redo A;
3744 } elsif ($self->{nc} == 0x003F) { # ?
3745 $self->{state} = PI_STATE;
3746 !!!next-input-character;
3747 redo A;
3748 } elsif ($self->{nc} == -1) {
## NOTE(review): unlike the EOF branch of
## DOCTYPE_INTERNAL_SUBSET_STATE, this branch goes straight to the
## data state without deleting {in_subset} and without emitting
## END_OF_DOCTYPE_TOKEN. Confirm that this is intended.
3749 !!!parse-error (type => 'bare stago');
3750 $self->{state} = DATA_STATE;
3751 $self->{s_kwd} = '';
3752 ## Reconsume.
3753 redo A;
3754 } else {
## The offending character is consumed and the rest is skipped as a
## bogus comment; the comment token carries no position.
3755 !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3756 line => $self->{line_prev},
3757 column => $self->{column_prev});
3758 $self->{state} = BOGUS_COMMENT_STATE;
3759 $self->{ct} = {type => COMMENT_TOKEN,
3760 data => '',
3761 }; ## NOTE: Will be discarded.
3762 !!!next-input-character;
3763 redo A;
3764 }
3765 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
## Reached after "<!" in the internal subset. The next character
## selects the kind of declaration: "-" (comment), "E"/"e" (ENTITY or
## ELEMENT), "A"/"a" (ATTLIST) or "N"/"n" (NOTATION). For keywords the
## character is stored in {kwd}, which the keyword states extend.
3766 ## XML5: "DOCTYPE markup declaration state".
3767
3768 if ($self->{nc} == 0x002D) { # -
3769 $self->{state} = MD_HYPHEN_STATE;
3770 !!!next-input-character;
3771 redo A;
3772 } elsif ($self->{nc} == 0x0045 or # E
3773 $self->{nc} == 0x0065) { # e
3774 $self->{state} = MD_E_STATE;
3775 $self->{kwd} = chr $self->{nc};
3776 !!!next-input-character;
3777 redo A;
3778 } elsif ($self->{nc} == 0x0041 or # A
3779 $self->{nc} == 0x0061) { # a
3780 $self->{state} = MD_ATTLIST_STATE;
3781 $self->{kwd} = chr $self->{nc};
3782 !!!next-input-character;
3783 redo A;
3784 } elsif ($self->{nc} == 0x004E or # N
3785 $self->{nc} == 0x006E) { # n
3786 $self->{state} = MD_NOTATION_STATE;
3787 $self->{kwd} = chr $self->{nc};
3788 !!!next-input-character;
3789 redo A;
3790 } else {
3791 #
3792 }
3793
## Only the fall-through branch above reaches this point: the
## declaration is unknown and is skipped as a bogus comment.
3794 ## XML5: No parse error.
3795 !!!parse-error (type => 'bogus comment',
3796 line => $self->{line_prev},
3797 column => $self->{column_prev} - 1);
3798 ## Reconsume.
3799 $self->{state} = BOGUS_COMMENT_STATE;
3800 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3801 redo A;
3802 } elsif ($self->{state} == MD_E_STATE) {
## Reached after "<!E" or "<!e" ({kwd} holds that letter). The second
## letter distinguishes ENTITY ("N"/"n") from ELEMENT ("L"/"l").
3803 if ($self->{nc} == 0x004E or # N
3804 $self->{nc} == 0x006E) { # n
3805 $self->{state} = MD_ENTITY_STATE;
3806 $self->{kwd} .= chr $self->{nc};
3807 !!!next-input-character;
3808 redo A;
3809 } elsif ($self->{nc} == 0x004C or # L
3810 $self->{nc} == 0x006C) { # l
3811 ## XML5: <!ELEMENT> not supported.
3812 $self->{state} = MD_ELEMENT_STATE;
3813 $self->{kwd} .= chr $self->{nc};
3814 !!!next-input-character;
3815 redo A;
3816 } else {
3817 ## XML5: No parse error.
## The reported column is {column_prev} - 2, plus 1 at EOF.
3818 !!!parse-error (type => 'bogus comment',
3819 line => $self->{line_prev},
3820 column => $self->{column_prev} - 2
3821 + 1 * ($self->{nc} == -1));
3822 ## Reconsume.
3823 $self->{state} = BOGUS_COMMENT_STATE;
3824 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3825 redo A;
3826 }
3827 } elsif ($self->{state} == MD_ENTITY_STATE) {
## Matches the rest of the keyword "ENTITY" case-insensitively.
## {kwd} holds the letters matched so far (two on entry); its length
## selects the expected next letter from the tables below (positions
## 2..4 are "T", "I", "T"). The final "Y" is handled separately
## because it completes the keyword.
## NOTE(review): for a length outside the tables the lookup yields
## undef, which "==" compares as 0. This assumes {nc} is never 0 here.
3828 if ($self->{nc} == [
3829 undef,
3830 undef,
3831 0x0054, # T
3832 0x0049, # I
3833 0x0054, # T
3834 ]->[length $self->{kwd}] or
3835 $self->{nc} == [
3836 undef,
3837 undef,
3838 0x0074, # t
3839 0x0069, # i
3840 0x0074, # t
3841 ]->[length $self->{kwd}]) {
3842 ## Stay in the state.
3843 $self->{kwd} .= chr $self->{nc};
3844 !!!next-input-character;
3845 redo A;
3846 } elsif ((length $self->{kwd}) == 5 and
3847 ($self->{nc} == 0x0059 or # Y
3848 $self->{nc} == 0x0079)) { # y
## The keyword is complete. Any spelling other than the all-uppercase
## "ENTITY" is accepted but reported.
3849 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3850 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3851 text => 'ENTITY',
3852 line => $self->{line_prev},
3853 column => $self->{column_prev} - 4);
3854 }
## Start a general entity token; it is changed into a parameter
## entity token later if a "%" followed by white space is seen.
3855 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3856 line => $self->{line_prev},
3857 column => $self->{column_prev} - 6};
3858 $self->{state} = DOCTYPE_MD_STATE;
3859 !!!next-input-character;
3860 redo A;
3861 } else {
## Not the keyword: skip the declaration as a bogus comment. The
## reported column is {column_prev} - 1 - length of {kwd}, plus 1 at
## EOF.
3862 !!!parse-error (type => 'bogus comment',
3863 line => $self->{line_prev},
3864 column => $self->{column_prev} - 1
3865 - (length $self->{kwd})
3866 + 1 * ($self->{nc} == -1));
3867 $self->{state} = BOGUS_COMMENT_STATE;
3868 ## Reconsume.
3869 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3870 redo A;
3871 }
3872 } elsif ($self->{state} == MD_ELEMENT_STATE) {
## Matches the rest of the keyword "ELEMENT" case-insensitively.
## {kwd} holds the letters matched so far (two on entry); its length
## selects the expected next letter from the tables below (positions
## 2..5 are "E", "M", "E", "N"). The final "T" is handled separately
## because it completes the keyword.
## NOTE(review): for a length outside the tables the lookup yields
## undef, which "==" compares as 0. This assumes {nc} is never 0 here.
3873 if ($self->{nc} == [
3874 undef,
3875 undef,
3876 0x0045, # E
3877 0x004D, # M
3878 0x0045, # E
3879 0x004E, # N
3880 ]->[length $self->{kwd}] or
3881 $self->{nc} == [
3882 undef,
3883 undef,
3884 0x0065, # e
3885 0x006D, # m
3886 0x0065, # e
3887 0x006E, # n
3888 ]->[length $self->{kwd}]) {
3889 ## Stay in the state.
3890 $self->{kwd} .= chr $self->{nc};
3891 !!!next-input-character;
3892 redo A;
3893 } elsif ((length $self->{kwd}) == 6 and
3894 ($self->{nc} == 0x0054 or # T
3895 $self->{nc} == 0x0074)) { # t
## The keyword is complete. Any spelling other than the all-uppercase
## "ELEMENT" is accepted but reported.
3896 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3897 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3898 text => 'ELEMENT',
3899 line => $self->{line_prev},
3900 column => $self->{column_prev} - 5);
3901 }
3902 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3903 line => $self->{line_prev},
3904 column => $self->{column_prev} - 7};
3905 $self->{state} = DOCTYPE_MD_STATE;
3906 !!!next-input-character;
3907 redo A;
3908 } else {
## Not the keyword: skip the declaration as a bogus comment. The
## reported column is {column_prev} - 1 - length of {kwd}, plus 1 at
## EOF.
3909 !!!parse-error (type => 'bogus comment',
3910 line => $self->{line_prev},
3911 column => $self->{column_prev} - 1
3912 - (length $self->{kwd})
3913 + 1 * ($self->{nc} == -1));
3914 $self->{state} = BOGUS_COMMENT_STATE;
3915 ## Reconsume.
3916 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3917 redo A;
3918 }
3919 } elsif ($self->{state} == MD_ATTLIST_STATE) {
## Matches the rest of the keyword "ATTLIST" case-insensitively.
## {kwd} holds the letters matched so far (one on entry); its length
## selects the expected next letter from the tables below (positions
## 1..5 are "T", "T", "L", "I", "S"). The final "T" is handled
## separately because it completes the keyword.
## NOTE(review): for a length outside the tables the lookup yields
## undef, which "==" compares as 0. This assumes {nc} is never 0 here.
3920 if ($self->{nc} == [
3921 undef,
3922 0x0054, # T
3923 0x0054, # T
3924 0x004C, # L
3925 0x0049, # I
3926 0x0053, # S
3927 ]->[length $self->{kwd}] or
3928 $self->{nc} == [
3929 undef,
3930 0x0074, # t
3931 0x0074, # t
3932 0x006C, # l
3933 0x0069, # i
3934 0x0073, # s
3935 ]->[length $self->{kwd}]) {
3936 ## Stay in the state.
3937 $self->{kwd} .= chr $self->{nc};
3938 !!!next-input-character;
3939 redo A;
3940 } elsif ((length $self->{kwd}) == 6 and
3941 ($self->{nc} == 0x0054 or # T
3942 $self->{nc} == 0x0074)) { # t
## The keyword is complete. Any spelling other than the all-uppercase
## "ATTLIST" is accepted but reported.
3943 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3944 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3945 text => 'ATTLIST',
3946 line => $self->{line_prev},
3947 column => $self->{column_prev} - 5);
3948 }
## The token starts with an empty list of attribute definitions.
3949 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3950 attrdefs => [],
3951 line => $self->{line_prev},
3952 column => $self->{column_prev} - 7};
3953 $self->{state} = DOCTYPE_MD_STATE;
3954 !!!next-input-character;
3955 redo A;
3956 } else {
## Not the keyword: skip the declaration as a bogus comment. The
## reported column is {column_prev} - 1 - length of {kwd}, plus 1 at
## EOF.
3957 !!!parse-error (type => 'bogus comment',
3958 line => $self->{line_prev},
3959 column => $self->{column_prev} - 1
3960 - (length $self->{kwd})
3961 + 1 * ($self->{nc} == -1));
3962 $self->{state} = BOGUS_COMMENT_STATE;
3963 ## Reconsume.
3964 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3965 redo A;
3966 }
3967 } elsif ($self->{state} == MD_NOTATION_STATE) {
## Matches the rest of the keyword "NOTATION" case-insensitively.
## {kwd} holds the letters matched so far (one on entry); its length
## selects the expected next letter from the tables below (positions
## 1..6 are "O", "T", "A", "T", "I", "O"). The final "N" is handled
## separately because it completes the keyword.
## NOTE(review): for a length outside the tables the lookup yields
## undef, which "==" compares as 0. This assumes {nc} is never 0 here.
3968 if ($self->{nc} == [
3969 undef,
3970 0x004F, # O
3971 0x0054, # T
3972 0x0041, # A
3973 0x0054, # T
3974 0x0049, # I
3975 0x004F, # O
3976 ]->[length $self->{kwd}] or
3977 $self->{nc} == [
3978 undef,
3979 0x006F, # o
3980 0x0074, # t
3981 0x0061, # a
3982 0x0074, # t
3983 0x0069, # i
3984 0x006F, # o
3985 ]->[length $self->{kwd}]) {
3986 ## Stay in the state.
3987 $self->{kwd} .= chr $self->{nc};
3988 !!!next-input-character;
3989 redo A;
3990 } elsif ((length $self->{kwd}) == 7 and
3991 ($self->{nc} == 0x004E or # N
3992 $self->{nc} == 0x006E)) { # n
## The keyword is complete. Any spelling other than the all-uppercase
## "NOTATION" is accepted but reported.
3993 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
3994 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3995 text => 'NOTATION',
3996 line => $self->{line_prev},
3997 column => $self->{column_prev} - 6);
3998 }
3999 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4000 line => $self->{line_prev},
4001 column => $self->{column_prev} - 8};
4002 $self->{state} = DOCTYPE_MD_STATE;
4003 !!!next-input-character;
4004 redo A;
4005 } else {
## Not the keyword: skip the declaration as a bogus comment. The
## reported column is {column_prev} - 1 - length of {kwd}, plus 1 at
## EOF.
4006 !!!parse-error (type => 'bogus comment',
4007 line => $self->{line_prev},
4008 column => $self->{column_prev} - 1
4009 - (length $self->{kwd})
4010 + 1 * ($self->{nc} == -1));
4011 $self->{state} = BOGUS_COMMENT_STATE;
4012 ## Reconsume.
4013 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4014 redo A;
4015 }
4016 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
## Reached directly after a declaration keyword (ENTITY, ELEMENT,
## ATTLIST or NOTATION); {ct} is the token created for it. White
## space is expected here; a name that follows the keyword without
## white space is accepted with an error.
4017 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4018 ## "DOCTYPE NOTATION state".
4019
4020 if ($is_space->{$self->{nc}}) {
4021 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4022 $self->{state} = BEFORE_MD_NAME_STATE;
4023 !!!next-input-character;
4024 redo A;
4025 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4026 $self->{nc} == 0x0025) { # %
## "<!ENTITY%": the "%" of a parameter entity declaration without the
## white space that should precede it.
4027 ## XML5: Switch to the "DOCTYPE bogus comment state".
4028 !!!parse-error (type => 'no space before md name'); ## TODO: type
4029 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4030 !!!next-input-character;
4031 redo A;
4032 } elsif ($self->{nc} == -1) {
## EOF: the token is dropped (not emitted), and the internal subset
## state handles the EOF.
4033 !!!parse-error (type => 'unclosed md'); ## TODO: type
4034 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4035 ## Reconsume.
4036 redo A;
4037 } elsif ($self->{nc} == 0x003E) { # >
## A declaration without a name is dropped (not emitted).
4038 ## XML5: Switch to the "DOCTYPE bogus comment state".
4039 !!!parse-error (type => 'no md name'); ## TODO: type
4040 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4041 !!!next-input-character;
4042 redo A;
4043 } else {
## The character is not consumed; BEFORE_MD_NAME_STATE processes it.
4044 ## XML5: Switch to the "DOCTYPE bogus comment state".
4045 !!!parse-error (type => 'no space before md name'); ## TODO: type
4046 $self->{state} = BEFORE_MD_NAME_STATE;
4047 redo A;
4048 }
4049 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
## Skips white space before the name of a markup declaration. The
## first other character starts the name in {ct}->{name}, except
## that "%" in an ENTITY declaration (token type still
## GENERAL_ENTITY_TOKEN) may introduce a parameter entity declaration.
4050 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4051 ## before state", "DOCTYPE ATTLIST name before state".
4052
4053 if ($is_space->{$self->{nc}}) {
4054 ## Stay in the state.
4055 !!!next-input-character;
4056 redo A;
4057 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4058 $self->{nc} == 0x0025) { # %
4059 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4060 !!!next-input-character;
4061 redo A;
4062 } elsif ($self->{nc} == 0x003E) { # >
## A declaration without a name is dropped (not emitted).
4063 ## XML5: Same as "Anything else".
4064 !!!parse-error (type => 'no md name'); ## TODO: type
4065 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4066 !!!next-input-character;
4067 redo A;
4068 } elsif ($self->{nc} == -1) {
## EOF: the token is dropped, and the internal subset state handles
## the EOF.
4069 !!!parse-error (type => 'unclosed md'); ## TODO: type
4070 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4071 ## Reconsume.
4072 redo A;
4073 } else {
4074 ## XML5: [ATTLIST] Not defined yet.
4075 $self->{ct}->{name} .= chr $self->{nc};
4076 $self->{state} = MD_NAME_STATE;
4077 !!!next-input-character;
4078 redo A;
4079 }
4080 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
## Reached after the "%" in "<!ENTITY %". White space confirms a
## parameter entity declaration: the token type is changed and the
## name is read next. Anything else is an error.
4081 if ($is_space->{$self->{nc}}) {
4082 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4083 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4084 $self->{state} = BEFORE_MD_NAME_STATE;
4085 !!!next-input-character;
4086 redo A;
4087 } elsif ($self->{nc} == 0x003E) { # >
## A declaration without a name is dropped (not emitted).
4088 ## XML5: Same as "Anything else".
4089 !!!parse-error (type => 'no md name'); ## TODO: type
4090 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4091 !!!next-input-character;
4092 redo A;
4093 } elsif ($self->{nc} == -1) {
4094 !!!parse-error (type => 'unclosed md');
4095 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4096 ## Reconsume.
4097 redo A;
4098 } else {
## "%" directly followed by another character: the declaration is
## skipped as a bogus comment and the entity token is replaced.
4099 ## XML5: No parse error.
4100 !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4101 $self->{state} = BOGUS_COMMENT_STATE;
4102 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4103 ## Reconsume.
4104 redo A;
4105 }
4106 } elsif ($self->{state} == MD_NAME_STATE) {
## Reads the name of a markup declaration into {ct}->{name}. White
## space ends the name and selects the follow-up state by token type;
## ">" or EOF ends the declaration and emits the token as it is.
4107 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4108
4109 if ($is_space->{$self->{nc}}) {
4110 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4111 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4112 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4113 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4114 } else { # ENTITY/NOTATION
4115 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4116 }
4117 !!!next-input-character;
4118 redo A;
4119 } elsif ($self->{nc} == 0x003E) { # >
## Only an ATTLIST declaration may end right after its name without
## an error; the other kinds are missing their definition.
4120 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4121 #
4122 } else {
4123 !!!parse-error (type => 'no md def'); ## TODO: type
4124 }
4125 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4126 !!!next-input-character;
4127 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4128 redo A;
4129 } elsif ($self->{nc} == -1) {
4130 ## XML5: [ATTLIST] No parse error.
4131 !!!parse-error (type => 'unclosed md');
4132 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4133 ## Reconsume.
4134 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4135 redo A;
4136 } else {
4137 ## XML5: [ATTLIST] Not defined yet.
4138 $self->{ct}->{name} .= chr $self->{nc};
4139 ## Stay in the state.
4140 !!!next-input-character;
4141 redo A;
4142 }
4143 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
## Reached after the element name of an ATTLIST declaration. Skips
## white space; ">" or EOF ends the declaration; any other character
## starts a new attribute definition in {ca}.
4144 if ($is_space->{$self->{nc}}) {
4145 ## Stay in the state.
4146 !!!next-input-character;
4147 redo A;
4148 } elsif ($self->{nc} == 0x003E) { # >
4149 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4150 !!!next-input-character;
4151 !!!emit ($self->{ct}); # ATTLIST
4152 redo A;
4153 } elsif ($self->{nc} == -1) {
4154 ## XML5: No parse error.
4155 !!!parse-error (type => 'unclosed md'); ## TODO: type
4156 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
## EOF is not consumed; it is reprocessed in the internal subset
## state.
4157 !!!emit ($self->{ct});
4158 redo A;
4159 } else {
## The attribute definition starts with the first character of its
## name, an empty token list and the current position.
4160 ## XML5: Not defined yet.
4161 $self->{ca} = {name => chr ($self->{nc}), # attrdef
4162 tokens => [],
4163 line => $self->{line}, column => $self->{column}};
4164 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4165 !!!next-input-character;
4166 redo A;
4167 }
4168 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
## Reads the name of an attribute definition into {ca}->{name}.
## White space ends the name; "(" starts an enumerated type without
## the white space that should precede it; ">" or EOF ends the
## declaration early.
4169 if ($is_space->{$self->{nc}}) {
4170 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4171 !!!next-input-character;
4172 redo A;
4173 } elsif ($self->{nc} == 0x003E) { # >
## The declaration ends before the attribute type. The pending
## definition in {ca} is not added to {ct}->{attrdefs} in this branch.
4174 ## XML5: Same as "anything else".
4175 !!!parse-error (type => 'no attr type'); ## TODO: type
4176 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4177 !!!next-input-character;
4178 !!!emit ($self->{ct}); # ATTLIST
4179 redo A;
4180 } elsif ($self->{nc} == 0x0028) { # (
4181 ## XML5: Same as "anything else".
4182 !!!parse-error (type => 'no space before paren'); ## TODO: type
4183 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4184 !!!next-input-character;
4185 redo A;
4186 } elsif ($self->{nc} == -1) {
4187 ## XML5: No parse error.
4188 !!!parse-error (type => 'unclosed md'); ## TODO: type
4189 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
## NOTE(review): this EOF branch consumes a character, whereas the
## EOF branch of DOCTYPE_ATTLIST_NAME_AFTER_STATE does not. Confirm
## that reading past EOF is harmless and intended here.
4190 !!!next-input-character;
4191 !!!emit ($self->{ct}); # ATTLIST
4192 redo A;
4193 } else {
4194 ## XML5: Not defined yet.
4195 $self->{ca}->{name} .= chr $self->{nc};
4196 ## Stay in the state.
4197 !!!next-input-character;
4198 redo A;
4199 }
4200 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4201 if ($is_space->{$self->{nc}}) {
4202 ## Stay in the state.
4203 !!!next-input-character;
4204 redo A;
4205 } elsif ($self->{nc} == 0x003E) { # >
4206 ## XML5: Same as "anything else".
4207 !!!parse-error (type => 'no attr type'); ## TODO: type
4208 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4209 !!!next-input-character;
4210 !!!emit ($self->{ct}); # ATTLIST
4211 redo A;
4212 } elsif ($self->{nc} == 0x0028) { # (
4213 ## XML5: Same as "anything else".
4214 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4215 !!!next-input-character;
4216 redo A;
4217 } elsif ($self->{nc} == -1) {
4218 ## XML5: No parse error.
4219 !!!parse-error (type => 'unclosed md'); ## TODO: type
4220 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4221 !!!next-input-character;
4222 !!!emit ($self->{ct});
4223 redo A;
4224 } else {
4225 ## XML5: Not defined yet.
4226 $self->{ca}->{type} = chr $self->{nc};
4227 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4228 !!!next-input-character;
4229 redo A;
4230 }
4231 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4232 if ($is_space->{$self->{nc}}) {
4233 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4234 !!!next-input-character;
4235 redo A;
4236 } elsif ($self->{nc} == 0x0023) { # #
4237 ## XML5: Same as "anything else".
4238 !!!parse-error (type => 'no space before default value'); ## TODO: type
4239 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4240 !!!next-input-character;
4241 redo A;
4242 } elsif ($self->{nc} == 0x0022) { # "
4243 ## XML5: Same as "anything else".
4244 !!!parse-error (type => 'no space before default value'); ## TODO: type
4245 $self->{ca}->{value} = '';
4246 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4247 !!!next-input-character;
4248 redo A;
4249 } elsif ($self->{nc} == 0x0027) { # '
4250 ## XML5: Same as "anything else".
4251 !!!parse-error (type => 'no space before default value'); ## TODO: type
4252 $self->{ca}->{value} = '';
4253 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4254 !!!next-input-character;
4255 redo A;
4256 } elsif ($self->{nc} == 0x003E) { # >
4257 ## XML5: Same as "anything else".
4258 !!!parse-error (type => 'no attr default'); ## TODO: type
4259 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4260 !!!next-input-character;
4261 !!!emit ($self->{ct}); # ATTLIST
4262 redo A;
4263 } elsif ($self->{nc} == 0x0028) { # (
4264 ## XML5: Same as "anything else".
4265 !!!parse-error (type => 'no space before paren'); ## TODO: type
4266 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4267 !!!next-input-character;
4268 redo A;
4269 } elsif ($self->{nc} == -1) {
4270 ## XML5: No parse error.
4271 !!!parse-error (type => 'unclosed md'); ## TODO: type
4272 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4273 !!!next-input-character;
4274 !!!emit ($self->{ct});
4275 redo A;
4276 } else {
4277 ## XML5: Not defined yet.
4278 $self->{ca}->{type} .= chr $self->{nc};
4279 ## Stay in the state.
4280 !!!next-input-character;
4281 redo A;
4282 }
4283 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4284 if ($is_space->{$self->{nc}}) {
4285 ## Stay in the state.
4286 !!!next-input-character;
4287 redo A;
4288 } elsif ($self->{nc} == 0x0028) { # (
4289 ## XML5: Same as "anything else".
4290 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4291 !!!next-input-character;
4292 redo A;
4293 } elsif ($self->{nc} == 0x0023) { # #
4294 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4295 !!!next-input-character;
4296 redo A;
4297 } elsif ($self->{nc} == 0x0022) { # "
4298 ## XML5: Same as "anything else".
4299 $self->{ca}->{value} = '';
4300 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4301 !!!next-input-character;
4302 redo A;
4303 } elsif ($self->{nc} == 0x0027) { # '
4304 ## XML5: Same as "anything else".
4305 $self->{ca}->{value} = '';
4306 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4307 !!!next-input-character;
4308 redo A;
4309 } elsif ($self->{nc} == 0x003E) { # >
4310 ## XML5: Same as "anything else".
4311 !!!parse-error (type => 'no attr default'); ## TODO: type
4312 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4313 !!!next-input-character;
4314 !!!emit ($self->{ct}); # ATTLIST
4315 redo A;
4316 } elsif ($self->{nc} == -1) {
4317 ## XML5: No parse error.
4318 !!!parse-error (type => 'unclosed md'); ## TODO: type
4319 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4320 !!!next-input-character;
4321 !!!emit ($self->{ct});
4322 redo A;
4323 } else {
4324 ## XML5: Switch to the "DOCTYPE bogus comment state".
4325 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4326 $self->{ca}->{value} = '';
4327 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4328 ## Reconsume.
4329 redo A;
4330 }
4331 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4332 if ($is_space->{$self->{nc}}) {
4333 ## Stay in the state.
4334 !!!next-input-character;
4335 redo A;
4336 } elsif ($self->{nc} == 0x007C) { # |
4337 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4338 ## Stay in the state.
4339 !!!next-input-character;
4340 redo A;
4341 } elsif ($self->{nc} == 0x0029) { # )
4342 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4343 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4344 !!!next-input-character;
4345 redo A;
4346 } elsif ($self->{nc} == 0x003E) { # >
4347 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4348 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4349 !!!next-input-character;
4350 !!!emit ($self->{ct}); # ATTLIST
4351 redo A;
4352 } elsif ($self->{nc} == -1) {
4353 ## XML5: No parse error.
4354 !!!parse-error (type => 'unclosed md'); ## TODO: type
4355 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4356 !!!next-input-character;
4357 !!!emit ($self->{ct});
4358 redo A;
4359 } else {
4360 push @{$self->{ca}->{tokens}}, chr $self->{nc};
4361 $self->{state} = ALLOWED_TOKEN_STATE;
4362 !!!next-input-character;
4363 redo A;
4364 }
4365 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4366 if ($is_space->{$self->{nc}}) {
4367 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4368 !!!next-input-character;
4369 redo A;
4370 } elsif ($self->{nc} == 0x007C) { # |
4371 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4372 !!!next-input-character;
4373 redo A;
4374 } elsif ($self->{nc} == 0x0029) { # )
4375 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4376 !!!next-input-character;
4377 redo A;
4378 } elsif ($self->{nc} == 0x003E) { # >
4379 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4380 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4381 !!!next-input-character;
4382 !!!emit ($self->{ct}); # ATTLIST
4383 redo A;
4384 } elsif ($self->{nc} == -1) {
4385 ## XML5: No parse error.
4386 !!!parse-error (type => 'unclosed md'); ## TODO: type
4387 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4388 !!!next-input-character;
4389 !!!emit ($self->{ct});
4390 redo A;
4391 } else {
4392 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4393 ## Stay in the state.
4394 !!!next-input-character;
4395 redo A;
4396 }
4397 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4398 if ($is_space->{$self->{nc}}) {
4399 ## Stay in the state.
4400 !!!next-input-character;
4401 redo A;
4402 } elsif ($self->{nc} == 0x007C) { # |
4403 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4404 !!!next-input-character;
4405 redo A;
4406 } elsif ($self->{nc} == 0x0029) { # )
4407 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4408 !!!next-input-character;
4409 redo A;
4410 } elsif ($self->{nc} == 0x003E) { # >
4411 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4412 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4413 !!!next-input-character;
4414 !!!emit ($self->{ct}); # ATTLIST
4415 redo A;
4416 } elsif ($self->{nc} == -1) {
4417 ## XML5: No parse error.
4418 !!!parse-error (type => 'unclosed md'); ## TODO: type
4419 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4420 !!!next-input-character;
4421 !!!emit ($self->{ct});
4422 redo A;
4423 } else {
4424 !!!parse-error (type => 'space in allowed token', ## TODO: type
4425 line => $self->{line_prev},
4426 column => $self->{column_prev});
4427 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4428 $self->{state} = ALLOWED_TOKEN_STATE;
4429 !!!next-input-character;
4430 redo A;
4431 }
4432 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4433 if ($is_space->{$self->{nc}}) {
4434 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4435 !!!next-input-character;
4436 redo A;
4437 } elsif ($self->{nc} == 0x0023) { # #
4438 !!!parse-error (type => 'no space before default value'); ## TODO: type
4439 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4440 !!!next-input-character;
4441 redo A;
4442 } elsif ($self->{nc} == 0x0022) { # "
4443 !!!parse-error (type => 'no space before default value'); ## TODO: type
4444 $self->{ca}->{value} = '';
4445 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4446 !!!next-input-character;
4447 redo A;
4448 } elsif ($self->{nc} == 0x0027) { # '
4449 !!!parse-error (type => 'no space before default value'); ## TODO: type
4450 $self->{ca}->{value} = '';
4451 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4452 !!!next-input-character;
4453 redo A;
4454 } elsif ($self->{nc} == 0x003E) { # >
4455 !!!parse-error (type => 'no attr default'); ## TODO: type
4456 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4457 !!!next-input-character;
4458 !!!emit ($self->{ct}); # ATTLIST
4459 redo A;
4460 } elsif ($self->{nc} == -1) {
4461 !!!parse-error (type => 'unclosed md'); ## TODO: type
4462 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4463 !!!next-input-character;
4464 !!!emit ($self->{ct});
4465 redo A;
4466 } else {
4467 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4468 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4469 ## Reconsume.
4470 redo A;
4471 }
4472 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4473 if ($is_space->{$self->{nc}}) {
4474 ## Stay in the state.
4475 !!!next-input-character;
4476 redo A;
4477 } elsif ($self->{nc} == 0x0023) { # #
4478 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4479 !!!next-input-character;
4480 redo A;
4481 } elsif ($self->{nc} == 0x0022) { # "
4482 $self->{ca}->{value} = '';
4483 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4484 !!!next-input-character;
4485 redo A;
4486 } elsif ($self->{nc} == 0x0027) { # '
4487 $self->{ca}->{value} = '';
4488 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4489 !!!next-input-character;
4490 redo A;
4491 } elsif ($self->{nc} == 0x003E) { # >
4492 !!!parse-error (type => 'no attr default'); ## TODO: type
4493 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4494 !!!next-input-character;
4495 !!!emit ($self->{ct}); # ATTLIST
4496 redo A;
4497 } elsif ($self->{nc} == -1) {
4498 !!!parse-error (type => 'unclosed md'); ## TODO: type
4499 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4500 !!!next-input-character;
4501 !!!emit ($self->{ct});
4502 redo A;
4503 } else {
4504 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4505 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4506 ## Reconsume.
4507 redo A;
4508 }
4509 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4510 if ($is_space->{$self->{nc}}) {
4511 ## XML5: No parse error.
4512 !!!parse-error (type => 'no default type'); ## TODO: type
4513 $self->{state} = BOGUS_MD_STATE;
4514 ## Reconsume.
4515 redo A;
4516 } elsif ($self->{nc} == 0x0022) { # "
4517 ## XML5: Same as "anything else".
4518 $self->{ca}->{value} = '';
4519 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4520 !!!next-input-character;
4521 redo A;
4522 } elsif ($self->{nc} == 0x0027) { # '
4523 ## XML5: Same as "anything else".
4524 $self->{ca}->{value} = '';
4525 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4526 !!!next-input-character;
4527 redo A;
4528 } elsif ($self->{nc} == 0x003E) { # >
4529 ## XML5: Same as "anything else".
4530 !!!parse-error (type => 'no attr default'); ## TODO: type
4531 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4532 !!!next-input-character;
4533 !!!emit ($self->{ct}); # ATTLIST
4534 redo A;
4535 } elsif ($self->{nc} == -1) {
4536 ## XML5: No parse error.
4537 !!!parse-error (type => 'unclosed md'); ## TODO: type
4538 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4539 !!!next-input-character;
4540 !!!emit ($self->{ct});
4541 redo A;
4542 } else {
4543 $self->{ca}->{default} = chr $self->{nc};
4544 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4545 !!!next-input-character;
4546 redo A;
4547 }
4548 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4549 if ($is_space->{$self->{nc}}) {
4550 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4551 !!!next-input-character;
4552 redo A;
4553 } elsif ($self->{nc} == 0x0022) { # "
4554 ## XML5: Same as "anything else".
4555 !!!parse-error (type => 'no space before default value'); ## TODO: type
4556 $self->{ca}->{value} = '';
4557 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4558 !!!next-input-character;
4559 redo A;
4560 } elsif ($self->{nc} == 0x0027) { # '
4561 ## XML5: Same as "anything else".
4562 !!!parse-error (type => 'no space before default value'); ## TODO: type
4563 $self->{ca}->{value} = '';
4564 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4565 !!!next-input-character;
4566 redo A;
4567 } elsif ($self->{nc} == 0x003E) { # >
4568 ## XML5: Same as "anything else".
4569 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4570 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4571 !!!next-input-character;
4572 !!!emit ($self->{ct}); # ATTLIST
4573 redo A;
4574 } elsif ($self->{nc} == -1) {
4575 ## XML5: No parse error.
4576 !!!parse-error (type => 'unclosed md'); ## TODO: type
4577 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4578 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4579 !!!next-input-character;
4580 !!!emit ($self->{ct});
4581 redo A;
4582 } else {
4583 $self->{ca}->{default} .= chr $self->{nc};
4584 ## Stay in the state.
4585 !!!next-input-character;
4586 redo A;
4587 }
4588 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4589 if ($is_space->{$self->{nc}}) {
4590 ## Stay in the state.
4591 !!!next-input-character;
4592 redo A;
4593 } elsif ($self->{nc} == 0x0022) { # "
4594 $self->{ca}->{value} = '';
4595 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4596 !!!next-input-character;
4597 redo A;
4598 } elsif ($self->{nc} == 0x0027) { # '
4599 $self->{ca}->{value} = '';
4600 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4601 !!!next-input-character;
4602 redo A;
4603 } elsif ($self->{nc} == 0x003E) { # >
4604 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4605 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4606 !!!next-input-character;
4607 !!!emit ($self->{ct}); # ATTLIST
4608 redo A;
4609 } elsif ($self->{nc} == -1) {
4610 ## XML5: No parse error.
4611 !!!parse-error (type => 'unclosed md'); ## TODO: type
4612 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4613 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4614 !!!next-input-character;
4615 !!!emit ($self->{ct});
4616 redo A;
4617 } else {
4618 ## XML5: Not defined yet.
4619 if ($self->{ca}->{default} eq 'FIXED') {
4620 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4621 } else {
4622 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4623 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4624 }
4625 ## Reconsume.
4626 redo A;
4627 }
4628 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4629 if ($is_space->{$self->{nc}} or
4630 $self->{nc} == -1 or
4631 $self->{nc} == 0x003E) { # >
4632 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4633 ## Reconsume.
4634 redo A;
4635 } else {
4636 !!!parse-error (type => 'no space before attr name'); ## TODO: type
4637 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4638 ## Reconsume.
4639 redo A;
4640 }
4641 } elsif ($self->{state} == NDATA_STATE) {
4642 ## ASCII case-insensitive
4643 if ($self->{nc} == [
4644 undef,
4645 0x0044, # D
4646 0x0041, # A
4647 0x0054, # T
4648 ]->[length $self->{kwd}] or
4649 $self->{nc} == [
4650 undef,
4651 0x0064, # d
4652 0x0061, # a
4653 0x0074, # t
4654 ]->[length $self->{kwd}]) {
4655 !!!cp (172.2);
4656 ## Stay in the state.
4657 $self->{kwd} .= chr $self->{nc};
4658 !!!next-input-character;
4659 redo A;
4660 } elsif ((length $self->{kwd}) == 4 and
4661 ($self->{nc} == 0x0041 or # A
4662 $self->{nc} == 0x0061)) { # a
4663 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4664 !!!cp (172.3);
4665 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4666 text => 'NDATA',
4667 line => $self->{line_prev},
4668 column => $self->{column_prev} - 4);
4669 } else {
4670 !!!cp (172.4);
4671 }
4672 $self->{state} = AFTER_NDATA_STATE;
4673 !!!next-input-character;
4674 redo A;
4675 } else {
4676 !!!parse-error (type => 'string after literal', ## TODO: type
4677 line => $self->{line_prev},
4678 column => $self->{column_prev} + 1
4679 - length $self->{kwd});
4680 !!!cp (172.5);
4681 $self->{state} = BOGUS_MD_STATE;
4682 ## Reconsume.
4683 redo A;
4684 }
4685 } elsif ($self->{state} == AFTER_NDATA_STATE) {
4686 if ($is_space->{$self->{nc}}) {
4687 $self->{state} = BEFORE_NOTATION_NAME_STATE;
4688 !!!next-input-character;
4689 redo A;
4690 } elsif ($self->{nc} == 0x003E) { # >
4691 !!!parse-error (type => 'no notation name'); ## TODO: type
4692 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4693 !!!next-input-character;
4694 !!!emit ($self->{ct}); # ENTITY
4695 redo A;
4696 } elsif ($self->{nc} == -1) {
4697 !!!parse-error (type => 'unclosed md'); ## TODO: type
4698 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4699 !!!next-input-character;
4700 !!!emit ($self->{ct}); # ENTITY
4701 redo A;
4702 } else {
4703 !!!parse-error (type => 'string after literal', ## TODO: type
4704 line => $self->{line_prev},
4705 column => $self->{column_prev} + 1
4706 - length $self->{kwd});
4707 $self->{state} = BOGUS_MD_STATE;
4708 ## Reconsume.
4709 redo A;
4710 }
4711 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4712 if ($is_space->{$self->{nc}}) {
4713 ## Stay in the state.
4714 !!!next-input-character;
4715 redo A;
4716 } elsif ($self->{nc} == 0x003E) { # >
4717 !!!parse-error (type => 'no notation name'); ## TODO: type
4718 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4719 !!!next-input-character;
4720 !!!emit ($self->{ct}); # ENTITY
4721 redo A;
4722 } elsif ($self->{nc} == -1) {
4723 !!!parse-error (type => 'unclosed md'); ## TODO: type
4724 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4725 !!!next-input-character;
4726 !!!emit ($self->{ct}); # ENTITY
4727 redo A;
4728 } else {
4729 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4730 $self->{state} = NOTATION_NAME_STATE;
4731 !!!next-input-character;
4732 redo A;
4733 }
4734 } elsif ($self->{state} == NOTATION_NAME_STATE) {
4735 if ($is_space->{$self->{nc}}) {
4736 $self->{state} = AFTER_MD_DEF_STATE;
4737 !!!next-input-character;
4738 redo A;
4739 } elsif ($self->{nc} == 0x003E) { # >
4740 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4741 !!!next-input-character;
4742 !!!emit ($self->{ct}); # ENTITY
4743 redo A;
4744 } elsif ($self->{nc} == -1) {
4745 !!!parse-error (type => 'unclosed md'); ## TODO: type
4746 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4747 !!!next-input-character;
4748 !!!emit ($self->{ct}); # ENTITY
4749 redo A;
4750 } else {
4751 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4752 ## Stay in the state.
4753 !!!next-input-character;
4754 redo A;
4755 }
4756 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4757 if ($self->{nc} == 0x0022) { # "
4758 $self->{state} = AFTER_MD_DEF_STATE;
4759 !!!next-input-character;
4760 redo A;
4761 } elsif ($self->{nc} == 0x0026) { # &
4762 $self->{prev_state} = $self->{state};
4763 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4764 $self->{entity_add} = 0x0022; # "
4765 !!!next-input-character;
4766 redo A;
4767 ## TODO: %
4768 } elsif ($self->{nc} == -1) {
4769 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4770 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4771 ## Reconsume.
4772 !!!emit ($self->{ct}); # ENTITY
4773 redo A;
4774 } else {
4775 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4776 !!!next-input-character;
4777 redo A;
4778 }
4779 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4780 if ($self->{nc} == 0x0027) { # '
4781 $self->{state} = AFTER_MD_DEF_STATE;
4782 !!!next-input-character;
4783 redo A;
4784 } elsif ($self->{nc} == 0x0026) { # &
4785 $self->{prev_state} = $self->{state};
4786 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4787 $self->{entity_add} = 0x0027; # '
4788 !!!next-input-character;
4789 redo A;
4790 ## TODO: %
4791 } elsif ($self->{nc} == -1) {
4792 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4793 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4794 ## Reconsume.
4795 !!!emit ($self->{ct}); # ENTITY
4796 redo A;
4797 } else {
4798 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4799 !!!next-input-character;
4800 redo A;
4801 }
4802 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4803 if ($is_space->{$self->{nc}} or
4804 {
4805 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4806 $self->{entity_add} => 1,
4807 }->{$self->{nc}}) {
4808 !!!parse-error (type => 'bare ero',
4809 line => $self->{line_prev},
4810 column => $self->{column_prev}
4811 + ($self->{nc} == -1 ? 1 : 0));
4812 ## Don't consume
4813 ## Return nothing.
4814 #
4815 } elsif ($self->{nc} == 0x0023) { # #
4816 $self->{ca} = $self->{ct};
4817 $self->{state} = ENTITY_HASH_STATE;
4818 $self->{kwd} = '#';
4819 !!!next-input-character;
4820 redo A;
4821 } else {
4822 #
4823 }
4824
4825 $self->{ct}->{value} .= '&';
4826 $self->{state} = $self->{prev_state};
4827 ## Reconsume.
4828 redo A;
4829 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4830 if ($is_space->{$self->{nc}}) {
4831 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4832 !!!next-input-character;
4833 redo A;
4834 } elsif ($self->{nc} == 0x0028) { # (
4835 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4836 $self->{ct}->{content} = ['('];
4837 $self->{group_depth} = 1;
4838 !!!next-input-character;
4839 redo A;
4840 } elsif ($self->{nc} == 0x003E) { # >
4841 !!!parse-error (type => 'no md def'); ## TODO: type
4842 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4843 !!!next-input-character;
4844 !!!emit ($self->{ct}); # ELEMENT
4845 redo A;
4846 } elsif ($self->{nc} == -1) {
4847 !!!parse-error (type => 'unclosed md'); ## TODO: type
4848 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4849 !!!next-input-character;
4850 !!!emit ($self->{ct}); # ELEMENT
4851 redo A;
4852 } else {
4853 $self->{ct}->{content} = [chr $self->{nc}];
4854 $self->{state} = CONTENT_KEYWORD_STATE;
4855 !!!next-input-character;
4856 redo A;
4857 }
4858 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4859 if ($is_space->{$self->{nc}}) {
4860 $self->{state} = AFTER_MD_DEF_STATE;
4861 !!!next-input-character;
4862 redo A;
4863 } elsif ($self->{nc} == 0x003E) { # >
4864 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4865 !!!next-input-character;
4866 !!!emit ($self->{ct}); # ELEMENT
4867 redo A;
4868 } elsif ($self->{nc} == -1) {
4869 !!!parse-error (type => 'unclosed md'); ## TODO: type
4870 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4871 !!!next-input-character;
4872 !!!emit ($self->{ct}); # ELEMENT
4873 redo A;
4874 } else {
4875 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4876 ## Stay in the state.
4877 !!!next-input-character;
4878 redo A;
4879 }
4880 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4881 if ($is_space->{$self->{nc}}) {
4882 ## Stay in the state.
4883 !!!next-input-character;
4884 redo A;
4885 } elsif ($self->{nc} == 0x0028) { # (
4886 $self->{group_depth}++;
4887 push @{$self->{ct}->{content}}, chr $self->{nc};
4888 ## Stay in the state.
4889 !!!next-input-character;
4890 redo A;
4891 } elsif ($self->{nc} == 0x007C or # |
4892 $self->{nc} == 0x002C) { # ,
4893 !!!parse-error (type => 'empty element name'); ## TODO: type
4894 ## Stay in the state.
4895 !!!next-input-character;
4896 redo A;
4897 } elsif ($self->{nc} == 0x0029) { # )
4898 !!!parse-error (type => 'empty element name'); ## TODO: type
4899 push @{$self->{ct}->{content}}, chr $self->{nc};
4900 $self->{group_depth}--;
4901 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4902 !!!next-input-character;
4903 redo A;
4904 } elsif ($self->{nc} == 0x003E) { # >
4905 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4906 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4907 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4908 !!!next-input-character;
4909 !!!emit ($self->{ct}); # ELEMENT
4910 redo A;
4911 } elsif ($self->{nc} == -1) {
4912 !!!parse-error (type => 'unclosed md'); ## TODO: type
4913 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4914 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4915 !!!next-input-character;
4916 !!!emit ($self->{ct}); # ELEMENT
4917 redo A;
4918 } else {
4919 push @{$self->{ct}->{content}}, chr $self->{nc};
4920 $self->{state} = CM_ELEMENT_NAME_STATE;
4921 !!!next-input-character;
4922 redo A;
4923 }
4924 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4925 if ($is_space->{$self->{nc}}) {
4926 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4927 !!!next-input-character;
4928 redo A;
4929 } elsif ($self->{nc} == 0x002A or # *
4930 $self->{nc} == 0x002B or # +
4931 $self->{nc} == 0x003F) { # ?
4932 push @{$self->{ct}->{content}}, chr $self->{nc};
4933 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4934 !!!next-input-character;
4935 redo A;
4936 } elsif ($self->{nc} == 0x007C or # |
4937 $self->{nc} == 0x002C) { # ,
4938 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4939 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4940 !!!next-input-character;
4941 redo A;
4942 } elsif ($self->{nc} == 0x0029) { # )
4943 $self->{group_depth}--;
4944 push @{$self->{ct}->{content}}, chr $self->{nc};
4945 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4946 !!!next-input-character;
4947 redo A;
4948 } elsif ($self->{nc} == 0x003E) { # >
4949 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4950 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4951 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4952 !!!next-input-character;
4953 !!!emit ($self->{ct}); # ELEMENT
4954 redo A;
4955 } elsif ($self->{nc} == -1) {
4956 !!!parse-error (type => 'unclosed md'); ## TODO: type
4957 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4958 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4959 !!!next-input-character;
4960 !!!emit ($self->{ct}); # ELEMENT
4961 redo A;
4962 } else {
4963 $self->{ct}->{content}->[-1] .= chr $self->{nc};
4964 ## Stay in the state.
4965 !!!next-input-character;
4966 redo A;
4967 }
4968 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4969 if ($is_space->{$self->{nc}}) {
4970 ## Stay in the state.
4971 !!!next-input-character;
4972 redo A;
4973 } elsif ($self->{nc} == 0x007C or # |
4974 $self->{nc} == 0x002C) { # ,
4975 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4976 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4977 !!!next-input-character;
4978 redo A;
4979 } elsif ($self->{nc} == 0x0029) { # )
4980 $self->{group_depth}--;
4981 push @{$self->{ct}->{content}}, chr $self->{nc};
4982 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4983 !!!next-input-character;
4984 redo A;
4985 } elsif ($self->{nc} == 0x003E) { # >
4986 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4987 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4988 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4989 !!!next-input-character;
4990 !!!emit ($self->{ct}); # ELEMENT
4991 redo A;
4992 } elsif ($self->{nc} == -1) {
4993 !!!parse-error (type => 'unclosed md'); ## TODO: type
4994 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4995 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4996 !!!next-input-character;
4997 !!!emit ($self->{ct}); # ELEMENT
4998 redo A;
4999 } else {
5000 !!!parse-error (type => 'after element name'); ## TODO: type
5001 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5002 $self->{state} = BOGUS_MD_STATE;
5003 !!!next-input-character;
5004 redo A;
5005 }
5006 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5007 if ($is_space->{$self->{nc}}) {
5008 if ($self->{group_depth}) {
5009 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5010 } else {
5011 $self->{state} = AFTER_MD_DEF_STATE;
5012 }
5013 !!!next-input-character;
5014 redo A;
5015 } elsif ($self->{nc} == 0x002A or # *
5016 $self->{nc} == 0x002B or # +
5017 $self->{nc} == 0x003F) { # ?
5018 push @{$self->{ct}->{content}}, chr $self->{nc};
5019 if ($self->{group_depth}) {
5020 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5021 } else {
5022 $self->{state} = AFTER_MD_DEF_STATE;
5023 }
5024 !!!next-input-character;
5025 redo A;
5026 } elsif ($self->{nc} == 0x0029) { # )
5027 if ($self->{group_depth}) {
5028 $self->{group_depth}--;
5029 push @{$self->{ct}->{content}}, chr $self->{nc};
5030 ## Stay in the state.
5031 !!!next-input-character;
5032 redo A;
5033 } else {
5034 !!!parse-error (type => 'string after md def'); ## TODO: type
5035 $self->{state} = BOGUS_MD_STATE;
5036 ## Reconsume.
5037 redo A;
5038 }
5039 } elsif ($self->{nc} == 0x003E) { # >
5040 if ($self->{group_depth}) {
5041 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5042 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5043 }
5044 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5045 !!!next-input-character;
5046 !!!emit ($self->{ct}); # ELEMENT
5047 redo A;
5048 } elsif ($self->{nc} == -1) {
5049 !!!parse-error (type => 'unclosed md'); ## TODO: type
5050 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5051 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5052 !!!next-input-character;
5053 !!!emit ($self->{ct}); # ELEMENT
5054 redo A;
5055 } else {
5056 if ($self->{group_depth}) {
5057 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5058 } else {
5059 !!!parse-error (type => 'string after md def'); ## TODO: type
5060 $self->{state} = BOGUS_MD_STATE;
5061 }
5062 ## Reconsume.
5063 redo A;
5064 }
5065 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5066 if ($is_space->{$self->{nc}}) {
5067 ## Stay in the state.
5068 !!!next-input-character;
5069 redo A;
5070 } elsif ($self->{nc} == 0x003E) { # >
5071 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5072 !!!next-input-character;
5073 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5074 redo A;
5075 } elsif ($self->{nc} == -1) {
5076 !!!parse-error (type => 'unclosed md'); ## TODO: type
5077 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5078 !!!next-input-character;
5079 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5080 redo A;
5081 } else {
5082 !!!parse-error (type => 'string after md def'); ## TODO: type
5083 $self->{state} = BOGUS_MD_STATE;
5084 ## Reconsume.
5085 redo A;
5086 }
5087 } elsif ($self->{state} == BOGUS_MD_STATE) {
5088 if ($self->{nc} == 0x003E) { # >
5089 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5090 !!!next-input-character;
5091 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5092 redo A;
5093 } elsif ($self->{nc} == -1) {
5094 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5095 ## Reconsume.
5096 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5097 redo A;
5098 } else {
5099 ## Stay in the state.
5100 !!!next-input-character;
5101 redo A;
5102 }
5103 } else {
5104 die "$0: $self->{state}: Unknown state";
5105 }
5106 } # A
5107
5108 die "$0: _get_next_token: unexpected case";
5109 } # _get_next_token
5110
5111 1;
5112 ## $Date: 2008/10/19 13:43:55 $
5113

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24