/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.34 - (show annotations) (download) (as text)
Sat Sep 5 11:31:58 2009 UTC (15 years, 2 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.33: +11 -10 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	5 Sep 2009 11:31:07 -0000
	* tokenizer-test-1.test: Changed to keep non-normal character
	references (HTML5 revision 3374).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 11:31:46 -0000
	* Tokenizer.pm.src: Changed to keep non-normal character
	references as is (HTML5 revision 3374).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.33 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN { ## Compile-time setup of the export lists; a later BEGIN block in this file imports |:token|.
6 require Exporter;
7 push our @ISA, 'Exporter';
8 ## Token type constants that a caller may import by name.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 );
25 ## The |:token| tag exports the same names as |@EXPORT_OK|; keep the two lists in sync.
26 our %EXPORT_TAGS = (
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 }
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types: the values stored in the |type| field of a token.
49 ## Each sub has an empty prototype and a literal body, so perl can inline it as a constant.
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 } ## Comment; its text is in |data|.
52 sub START_TAG_TOKEN () { 3 } ## Start tag; its name is in |tag_name|.
53 sub END_TAG_TOKEN () { 4 } ## End tag; an empty |tag_name| is an XML short end tag.
54 sub END_OF_FILE_TOKEN () { 5 } ## Emitted when the next input character is -1.
55 sub CHARACTER_TOKEN () { 6 } ## Character data; its text is in |data|.
56 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags: bit flags saying which markup the tokenizer recognizes in data.
78 ## The tokenizer tests |$self->{content_model}| against these flags with bitwise AND.
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82 ## Content models, each a combination of the |CM_*| flags above.
83 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set: neither & nor < markup
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup and limited < markup
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup and any < markup
87
88 ## Tokenizer states: the values of |$self->{state}| that |_get_next_token| dispatches on.
89 ## The numbers are arbitrary but must be unique; 1 and 12 are retired (see the commented-out subs).
90 sub DATA_STATE () { 0 }
91 #sub ENTITY_DATA_STATE () { 1 }
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
108 sub COMMENT_END_BANG_STATE () { 102 } ## Numbered out of sequence.
109 sub COMMENT_END_SPACE_STATE () { 103 } ## LAST (the highest state number in use)
110 sub COMMENT_END_DASH_STATE () { 18 }
111 sub BOGUS_COMMENT_STATE () { 19 }
112 sub DOCTYPE_STATE () { 20 }
113 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
114 sub DOCTYPE_NAME_STATE () { 22 }
115 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
116 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
117 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
118 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
119 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
120 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
121 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
122 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
123 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
124 sub BOGUS_DOCTYPE_STATE () { 32 }
125 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
126 sub SELF_CLOSING_START_TAG_STATE () { 34 }
127 sub CDATA_SECTION_STATE () { 35 }
128 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
129 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
130 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
131 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
132 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
133 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
134 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
135 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
136 ## NOTE: "Entity data state", "entity in attribute value state", and
137 ## "consume a character reference" algorithm are jointly implemented
138 ## using the following six states:
139 sub ENTITY_STATE () { 44 }
140 sub ENTITY_HASH_STATE () { 45 }
141 sub NCR_NUM_STATE () { 46 }
142 sub HEXREF_X_STATE () { 47 }
143 sub HEXREF_HEX_STATE () { 48 }
144 sub ENTITY_NAME_STATE () { 49 }
145 sub PCDATA_STATE () { 50 } # "data state" in the spec
146
147 ## XML-only states, numbered consecutively from 51 to 101.
148 sub PI_STATE () { 51 }
149 sub PI_TARGET_STATE () { 52 }
150 sub PI_TARGET_AFTER_STATE () { 53 }
151 sub PI_DATA_STATE () { 54 }
152 sub PI_AFTER_STATE () { 55 }
153 sub PI_DATA_AFTER_STATE () { 56 }
154 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
155 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
156 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
157 sub DOCTYPE_TAG_STATE () { 60 }
158 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
159 sub MD_ATTLIST_STATE () { 62 }
160 sub MD_E_STATE () { 63 }
161 sub MD_ELEMENT_STATE () { 64 }
162 sub MD_ENTITY_STATE () { 65 }
163 sub MD_NOTATION_STATE () { 66 }
164 sub DOCTYPE_MD_STATE () { 67 }
165 sub BEFORE_MD_NAME_STATE () { 68 }
166 sub MD_NAME_STATE () { 69 }
167 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
168 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
171 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
172 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
173 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
174 sub ALLOWED_TOKEN_STATE () { 77 }
175 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
176 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
177 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
179 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
180 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
181 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
182 sub BEFORE_NDATA_STATE () { 85 }
183 sub NDATA_STATE () { 86 }
184 sub AFTER_NDATA_STATE () { 87 }
185 sub BEFORE_NOTATION_NAME_STATE () { 88 }
186 sub NOTATION_NAME_STATE () { 89 }
187 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
188 sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
189 sub ENTITY_VALUE_ENTITY_STATE () { 92 }
190 sub AFTER_ELEMENT_NAME_STATE () { 93 }
191 sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
192 sub CONTENT_KEYWORD_STATE () { 95 }
193 sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
194 sub CM_ELEMENT_NAME_STATE () { 97 }
195 sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
196 sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
197 sub AFTER_MD_DEF_STATE () { 100 }
198 sub BOGUS_MD_STATE () { 101 } ## Highest XML-only state number.
199
200 ## Tree constructor state constants (see Whatpm::HTML for the full
201 ## list and descriptions)
202 ## NOTE: Both constants below evaluate to the same number, 2048 (only bit 11 set).
203 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # presumably an insertion mode ("_IM") value; confirm against Whatpm::HTML
204 sub FOREIGN_EL () { 0b1_00000000000 } # presumably an element category ("_EL") flag; confirm against Whatpm::HTML
205
206 ## Character reference mappings: code point => code point to use in its place.
207 ## NOTE(review): presumably consulted when a numeric character reference is resolved (not visible in this part of the file).
208 my $charref_map = {
209 0x00 => 0xFFFD, # REPLACEMENT CHARACTER
210 0x0D => 0x000A, # CR is replaced by LF
211 0x80 => 0x20AC, # 0x80..0x9F: mostly remapped; 0x81, 0x8D, 0x8F, 0x90 and 0x9D map to themselves
212 0x81 => 0x0081,
213 0x82 => 0x201A,
214 0x83 => 0x0192,
215 0x84 => 0x201E,
216 0x85 => 0x2026,
217 0x86 => 0x2020,
218 0x87 => 0x2021,
219 0x88 => 0x02C6,
220 0x89 => 0x2030,
221 0x8A => 0x0160,
222 0x8B => 0x2039,
223 0x8C => 0x0152,
224 0x8D => 0x008D,
225 0x8E => 0x017D,
226 0x8F => 0x008F,
227 0x90 => 0x0090,
228 0x91 => 0x2018,
229 0x92 => 0x2019,
230 0x93 => 0x201C,
231 0x94 => 0x201D,
232 0x95 => 0x2022,
233 0x96 => 0x2013,
234 0x97 => 0x2014,
235 0x98 => 0x02DC,
236 0x99 => 0x2122,
237 0x9A => 0x0161,
238 0x9B => 0x203A,
239 0x9C => 0x0153,
240 0x9D => 0x009D,
241 0x9E => 0x017E,
242 0x9F => 0x0178,
243 }; # $charref_map
244 $charref_map->{$_} = $_ # identity entries: these code points are listed in the map but left unchanged
245 for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
246 0xD800..0xDFFF, 0xFDD0..0xFDEF,
247 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
248 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
249 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
250 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
251 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
252
253 ## Implementations MUST act as if they used the state machine in the spec.
254
255 sub _initialize_tokenizer ($) { ## Reset the tokenizer fields of the parser object before tokenizing an input.
256 my $self = shift;
257 ## Takes the parser object as its only argument; callers are not expected to use the return value.
258 ## NOTE: Fields set by |new| constructor:
259 #$self->{level}
260 #$self->{set_nc}
261 #$self->{parse_error}
262 #$self->{is_xml} (if XML)
263 ## Fields marked "initialized when used" below are deliberately left untouched here.
264 $self->{state} = DATA_STATE; # MUST
265 $self->{s_kwd} = ''; # Data state keyword
266 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
267 #$self->{entity__value}; # initialized when used
268 #$self->{entity__match}; # initialized when used
269 $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model: & markup and any < markup
270 undef $self->{ct}; # current token
271 undef $self->{ca}; # current attribute
272 undef $self->{last_stag_name}; # last emitted start tag name
273 #$self->{prev_state}; # initialized when used
274 delete $self->{self_closing}; # self-closing flag of the last start tag
275 $self->{char_buffer} = ''; # NOTE(review): presumably the input buffer read by the macro below -- confirm
276 $self->{char_buffer_pos} = 0;
277 $self->{nc} = -1; # next input character
278 #$self->{next_nc}
279 !!!next-input-character; # source macro (not plain Perl); presumably loads the first character into |nc|
280 delete $self->{escape}; # the data state sets this on "<!--" and clears it only on "-->"; drop any stale value from a previous input
281 $self->{token} = []; # queue of pushed-back tokens; kept as the last statement so the return value is unchanged
282 } # _initialize_tokenizer
283
284 ## A token has:
285 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
286 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
287 ## ->{name} (DOCTYPE_TOKEN)
288 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
289 ## ->{target} (PI_TOKEN)
290 ## ->{pubid} (DOCTYPE_TOKEN)
291 ## ->{sysid} (DOCTYPE_TOKEN)
292 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
293 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
294 ## ->{name}
295 ## ->{value}
296 ## ->{has_reference} == 1 or 0
297 ## ->{index}: Index of the attribute in a tag.
298 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
299 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
300 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
301 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
302
303 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
304 ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
305 ## while the token is pushed back onto |$self->{token}|.
306
307 ## Emitted tokens MUST immediately be handled by the tree construction stage.
308
309 ## Before each step, UA MAY check to see if either one of the scripts in
310 ## "list of scripts that will execute as soon as possible" or the first
311 ## script in the "list of scripts that will execute asynchronously",
312 ## has completed loading. If one has, then it MUST be executed
313 ## and removed from the list.
314
315 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
316 ## (This requirement was dropped from HTML5 spec, unfortunately.)
317
318 my $is_space = { ## Code points treated as space characters; looked up as |$is_space->{$self->{nc}}|.
319 0x0009 => 1, # CHARACTER TABULATION (HT)
320 0x000A => 1, # LINE FEED (LF)
321 #0x000B => 0, # LINE TABULATION (VT)
322 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
323 #0x000D => 1, # CARRIAGE RETURN (CR) -- NOTE(review): disabled; presumably CR never reaches the tokenizer -- confirm
324 0x0020 => 1, # SPACE (SP)
325 }; # $is_space
326
327 sub _get_next_token ($) {
328 my $self = shift;
329
330 if ($self->{self_closing}) {
331 !!!parse-error (type => 'nestc', token => $self->{ct});
332 ## NOTE: The |self_closing| flag is only set by start tag token.
333 ## In addition, when a start tag token is emitted, it is always set to
334 ## |ct|.
335 delete $self->{self_closing};
336 }
337
338 if (@{$self->{token}}) {
339 $self->{self_closing} = $self->{token}->[0]->{self_closing};
340 return shift @{$self->{token}};
341 }
342
343 A: {
344 if ($self->{state} == PCDATA_STATE) {
345 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
346
347 if ($self->{nc} == 0x0026) { # &
348 !!!cp (0.1);
349 ## NOTE: In the spec, the tokenizer is switched to the
350 ## "entity data state". In this implementation, the tokenizer
351 ## is switched to the |ENTITY_STATE|, which is an implementation
352 ## of the "consume a character reference" algorithm.
353 $self->{entity_add} = -1;
354 $self->{prev_state} = DATA_STATE;
355 $self->{state} = ENTITY_STATE;
356 !!!next-input-character;
357 redo A;
358 } elsif ($self->{nc} == 0x003C) { # <
359 !!!cp (0.2);
360 $self->{state} = TAG_OPEN_STATE;
361 !!!next-input-character;
362 redo A;
363 } elsif ($self->{nc} == -1) {
364 !!!cp (0.3);
365 !!!emit ({type => END_OF_FILE_TOKEN,
366 line => $self->{line}, column => $self->{column}});
367 last A; ## TODO: ok?
368 } else {
369 !!!cp (0.4);
370 #
371 }
372
373 # Anything else
374 my $token = {type => CHARACTER_TOKEN,
375 data => chr $self->{nc},
376 line => $self->{line}, column => $self->{column},
377 };
378 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
379
380 ## Stay in the state.
381 !!!next-input-character;
382 !!!emit ($token);
383 redo A;
384 } elsif ($self->{state} == DATA_STATE) {
385 $self->{s_kwd} = '' unless defined $self->{s_kwd};
386 if ($self->{nc} == 0x0026) { # &
387 $self->{s_kwd} = '';
388 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
389 not $self->{escape}) {
390 !!!cp (1);
391 ## NOTE: In the spec, the tokenizer is switched to the
392 ## "entity data state". In this implementation, the tokenizer
393 ## is switched to the |ENTITY_STATE|, which is an implementation
394 ## of the "consume a character reference" algorithm.
395 $self->{entity_add} = -1;
396 $self->{prev_state} = DATA_STATE;
397 $self->{state} = ENTITY_STATE;
398 !!!next-input-character;
399 redo A;
400 } else {
401 !!!cp (2);
402 #
403 }
404 } elsif ($self->{nc} == 0x002D) { # -
405 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
406 if ($self->{s_kwd} eq '<!-') {
407 !!!cp (3);
408 $self->{escape} = 1; # unless $self->{escape};
409 $self->{s_kwd} = '--';
410 #
411 } elsif ($self->{s_kwd} eq '-') {
412 !!!cp (4);
413 $self->{s_kwd} = '--';
414 #
415 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
416 !!!cp (4.1);
417 $self->{s_kwd} .= '-';
418 #
419 } else {
420 !!!cp (5);
421 $self->{s_kwd} = '-';
422 #
423 }
424 }
425
426 #
427 } elsif ($self->{nc} == 0x0021) { # !
428 if (length $self->{s_kwd}) {
429 !!!cp (5.1);
430 $self->{s_kwd} .= '!';
431 #
432 } else {
433 !!!cp (5.2);
434 #$self->{s_kwd} = '';
435 #
436 }
437 #
438 } elsif ($self->{nc} == 0x003C) { # <
439 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
440 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
441 not $self->{escape})) {
442 !!!cp (6);
443 $self->{state} = TAG_OPEN_STATE;
444 !!!next-input-character;
445 redo A;
446 } else {
447 !!!cp (7);
448 $self->{s_kwd} = '';
449 #
450 }
451 } elsif ($self->{nc} == 0x003E) { # >
452 if ($self->{escape} and
453 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
454 if ($self->{s_kwd} eq '--') {
455 !!!cp (8);
456 delete $self->{escape};
457 #
458 } else {
459 !!!cp (9);
460 #
461 }
462 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
463 !!!cp (9.1);
464 !!!parse-error (type => 'unmatched mse', ## TODO: type
465 line => $self->{line_prev},
466 column => $self->{column_prev} - 1);
467 #
468 } else {
469 !!!cp (10);
470 #
471 }
472
473 $self->{s_kwd} = '';
474 #
475 } elsif ($self->{nc} == 0x005D) { # ]
476 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
477 !!!cp (10.1);
478 $self->{s_kwd} .= ']';
479 } elsif ($self->{s_kwd} eq ']]') {
480 !!!cp (10.2);
481 #
482 } else {
483 !!!cp (10.3);
484 $self->{s_kwd} = '';
485 }
486 #
487 } elsif ($self->{nc} == -1) {
488 !!!cp (11);
489 $self->{s_kwd} = '';
490 !!!emit ({type => END_OF_FILE_TOKEN,
491 line => $self->{line}, column => $self->{column}});
492 last A; ## TODO: ok?
493 } else {
494 !!!cp (12);
495 $self->{s_kwd} = '';
496 #
497 }
498
499 # Anything else
500 my $token = {type => CHARACTER_TOKEN,
501 data => chr $self->{nc},
502 line => $self->{line}, column => $self->{column},
503 };
504 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
505 length $token->{data})) {
506 $self->{s_kwd} = '';
507 }
508
509 ## Stay in the data state.
510 if (not $self->{is_xml} and
511 $self->{content_model} == PCDATA_CONTENT_MODEL) {
512 !!!cp (13);
513 $self->{state} = PCDATA_STATE;
514 } else {
515 !!!cp (14);
516 ## Stay in the state.
517 }
518 !!!next-input-character;
519 !!!emit ($token);
520 redo A;
521 } elsif ($self->{state} == TAG_OPEN_STATE) {
522 ## XML5: "tag state".
523
524 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
525 if ($self->{nc} == 0x002F) { # /
526 !!!cp (15);
527 !!!next-input-character;
528 $self->{state} = CLOSE_TAG_OPEN_STATE;
529 redo A;
530 } elsif ($self->{nc} == 0x0021) { # !
531 !!!cp (15.1);
532 $self->{s_kwd} = $self->{escaped} ? '' : '<';
533 #
534 } else {
535 !!!cp (16);
536 $self->{s_kwd} = '';
537 #
538 }
539
540 ## reconsume
541 $self->{state} = DATA_STATE;
542 !!!emit ({type => CHARACTER_TOKEN, data => '<',
543 line => $self->{line_prev},
544 column => $self->{column_prev},
545 });
546 redo A;
547 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
548 if ($self->{nc} == 0x0021) { # !
549 !!!cp (17);
550 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
551 !!!next-input-character;
552 redo A;
553 } elsif ($self->{nc} == 0x002F) { # /
554 !!!cp (18);
555 $self->{state} = CLOSE_TAG_OPEN_STATE;
556 !!!next-input-character;
557 redo A;
558 } elsif (0x0041 <= $self->{nc} and
559 $self->{nc} <= 0x005A) { # A..Z
560 !!!cp (19);
561 $self->{ct}
562 = {type => START_TAG_TOKEN,
563 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
564 line => $self->{line_prev},
565 column => $self->{column_prev}};
566 $self->{state} = TAG_NAME_STATE;
567 !!!next-input-character;
568 redo A;
569 } elsif (0x0061 <= $self->{nc} and
570 $self->{nc} <= 0x007A) { # a..z
571 !!!cp (20);
572 $self->{ct} = {type => START_TAG_TOKEN,
573 tag_name => chr ($self->{nc}),
574 line => $self->{line_prev},
575 column => $self->{column_prev}};
576 $self->{state} = TAG_NAME_STATE;
577 !!!next-input-character;
578 redo A;
579 } elsif ($self->{nc} == 0x003E) { # >
580 !!!cp (21);
581 !!!parse-error (type => 'empty start tag',
582 line => $self->{line_prev},
583 column => $self->{column_prev});
584 $self->{state} = DATA_STATE;
585 $self->{s_kwd} = '';
586 !!!next-input-character;
587
588 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
589 line => $self->{line_prev},
590 column => $self->{column_prev},
591 });
592
593 redo A;
594 } elsif ($self->{nc} == 0x003F) { # ?
595 if ($self->{is_xml}) {
596 !!!cp (22.1);
597 $self->{state} = PI_STATE;
598 !!!next-input-character;
599 redo A;
600 } else {
601 !!!cp (22);
602 !!!parse-error (type => 'pio',
603 line => $self->{line_prev},
604 column => $self->{column_prev});
605 $self->{state} = BOGUS_COMMENT_STATE;
606 $self->{ct} = {type => COMMENT_TOKEN, data => '',
607 line => $self->{line_prev},
608 column => $self->{column_prev},
609 };
610 ## $self->{nc} is intentionally left as is
611 redo A;
612 }
613 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
614 !!!cp (23);
615 !!!parse-error (type => 'bare stago',
616 line => $self->{line_prev},
617 column => $self->{column_prev});
618 $self->{state} = DATA_STATE;
619 $self->{s_kwd} = '';
620 ## reconsume
621
622 !!!emit ({type => CHARACTER_TOKEN, data => '<',
623 line => $self->{line_prev},
624 column => $self->{column_prev},
625 });
626
627 redo A;
628 } else {
629 ## XML5: "<:" is a parse error.
630 !!!cp (23.1);
631 $self->{ct} = {type => START_TAG_TOKEN,
632 tag_name => chr ($self->{nc}),
633 line => $self->{line_prev},
634 column => $self->{column_prev}};
635 $self->{state} = TAG_NAME_STATE;
636 !!!next-input-character;
637 redo A;
638 }
639 } else {
640 die "$0: $self->{content_model} in tag open";
641 }
642 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
643 ## NOTE: The "close tag open state" in the spec is implemented as
644 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
645
646 ## XML5: "end tag state".
647
648 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
649 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
650 if (defined $self->{last_stag_name}) {
651 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
652 $self->{kwd} = '';
653 ## Reconsume.
654 redo A;
655 } else {
656 ## No start tag token has ever been emitted
657 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
658 !!!cp (28);
659 $self->{state} = DATA_STATE;
660 $self->{s_kwd} = '';
661 ## Reconsume.
662 !!!emit ({type => CHARACTER_TOKEN, data => '</',
663 line => $l, column => $c,
664 });
665 redo A;
666 }
667 }
668
669 if (0x0041 <= $self->{nc} and
670 $self->{nc} <= 0x005A) { # A..Z
671 !!!cp (29);
672 $self->{ct}
673 = {type => END_TAG_TOKEN,
674 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
675 line => $l, column => $c};
676 $self->{state} = TAG_NAME_STATE;
677 !!!next-input-character;
678 redo A;
679 } elsif (0x0061 <= $self->{nc} and
680 $self->{nc} <= 0x007A) { # a..z
681 !!!cp (30);
682 $self->{ct} = {type => END_TAG_TOKEN,
683 tag_name => chr ($self->{nc}),
684 line => $l, column => $c};
685 $self->{state} = TAG_NAME_STATE;
686 !!!next-input-character;
687 redo A;
688 } elsif ($self->{nc} == 0x003E) { # >
689 !!!parse-error (type => 'empty end tag',
690 line => $self->{line_prev}, ## "<" in "</>"
691 column => $self->{column_prev} - 1);
692 $self->{state} = DATA_STATE;
693 $self->{s_kwd} = '';
694 if ($self->{is_xml}) {
695 !!!cp (31);
696 ## XML5: No parse error.
697
698 ## NOTE: This parser raises a parse error, since it supports
699 ## XML1, not XML5.
700
701 ## NOTE: A short end tag token.
702 my $ct = {type => END_TAG_TOKEN,
703 tag_name => '',
704 line => $self->{line_prev},
705 column => $self->{column_prev} - 1,
706 };
707 !!!next-input-character;
708 !!!emit ($ct);
709 } else {
710 !!!cp (31.1);
711 !!!next-input-character;
712 }
713 redo A;
714 } elsif ($self->{nc} == -1) {
715 !!!cp (32);
716 !!!parse-error (type => 'bare etago');
717 $self->{s_kwd} = '';
718 $self->{state} = DATA_STATE;
719 # reconsume
720
721 !!!emit ({type => CHARACTER_TOKEN, data => '</',
722 line => $l, column => $c,
723 });
724
725 redo A;
726 } elsif (not $self->{is_xml} or
727 $is_space->{$self->{nc}}) {
728 !!!cp (33);
729 !!!parse-error (type => 'bogus end tag',
730 line => $self->{line_prev}, # "<" of "</"
731 column => $self->{column_prev} - 1);
732 $self->{state} = BOGUS_COMMENT_STATE;
733 $self->{ct} = {type => COMMENT_TOKEN, data => '',
734 line => $self->{line_prev}, # "<" of "</"
735 column => $self->{column_prev} - 1,
736 };
737 ## NOTE: $self->{nc} is intentionally left as is.
738 ## Although the "anything else" case of the spec not explicitly
739 ## states that the next input character is to be reconsumed,
740 ## it will be included to the |data| of the comment token
741 ## generated from the bogus end tag, as defined in the
742 ## "bogus comment state" entry.
743 redo A;
744 } else {
745 ## XML5: "</:" is a parse error.
746 !!!cp (30.1);
747 $self->{ct} = {type => END_TAG_TOKEN,
748 tag_name => chr ($self->{nc}),
749 line => $l, column => $c};
750 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
751 !!!next-input-character;
752 redo A;
753 }
754 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
755 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
756 if (length $ch) {
757 my $CH = $ch;
758 $ch =~ tr/a-z/A-Z/;
759 my $nch = chr $self->{nc};
760 if ($nch eq $ch or $nch eq $CH) {
761 !!!cp (24);
762 ## Stay in the state.
763 $self->{kwd} .= $nch;
764 !!!next-input-character;
765 redo A;
766 } else {
767 !!!cp (25);
768 $self->{state} = DATA_STATE;
769 $self->{s_kwd} = '';
770 ## Reconsume.
771 !!!emit ({type => CHARACTER_TOKEN,
772 data => '</' . $self->{kwd},
773 line => $self->{line_prev},
774 column => $self->{column_prev} - 1 - length $self->{kwd},
775 });
776 redo A;
777 }
778 } else { # after "<{tag-name}"
779 unless ($is_space->{$self->{nc}} or
780 {
781 0x003E => 1, # >
782 0x002F => 1, # /
783 -1 => 1, # EOF
784 }->{$self->{nc}}) {
785 !!!cp (26);
786 ## Reconsume.
787 $self->{state} = DATA_STATE;
788 $self->{s_kwd} = '';
789 !!!emit ({type => CHARACTER_TOKEN,
790 data => '</' . $self->{kwd},
791 line => $self->{line_prev},
792 column => $self->{column_prev} - 1 - length $self->{kwd},
793 });
794 redo A;
795 } else {
796 !!!cp (27);
797 $self->{ct}
798 = {type => END_TAG_TOKEN,
799 tag_name => $self->{last_stag_name},
800 line => $self->{line_prev},
801 column => $self->{column_prev} - 1 - length $self->{kwd}};
802 $self->{state} = TAG_NAME_STATE;
803 ## Reconsume.
804 redo A;
805 }
806 }
807 } elsif ($self->{state} == TAG_NAME_STATE) {
808 if ($is_space->{$self->{nc}}) {
809 !!!cp (34);
810 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
811 !!!next-input-character;
812 redo A;
813 } elsif ($self->{nc} == 0x003E) { # >
814 if ($self->{ct}->{type} == START_TAG_TOKEN) {
815 !!!cp (35);
816 $self->{last_stag_name} = $self->{ct}->{tag_name};
817 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
818 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
819 #if ($self->{ct}->{attributes}) {
820 # ## NOTE: This should never be reached.
821 # !!! cp (36);
822 # !!! parse-error (type => 'end tag attribute');
823 #} else {
824 !!!cp (37);
825 #}
826 } else {
827 die "$0: $self->{ct}->{type}: Unknown token type";
828 }
829 $self->{state} = DATA_STATE;
830 $self->{s_kwd} = '';
831 !!!next-input-character;
832
833 !!!emit ($self->{ct}); # start tag or end tag
834
835 redo A;
836 } elsif (0x0041 <= $self->{nc} and
837 $self->{nc} <= 0x005A) { # A..Z
838 !!!cp (38);
839 $self->{ct}->{tag_name}
840 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
841 # start tag or end tag
842 ## Stay in this state
843 !!!next-input-character;
844 redo A;
845 } elsif ($self->{nc} == -1) {
846 !!!parse-error (type => 'unclosed tag');
847 if ($self->{ct}->{type} == START_TAG_TOKEN) {
848 !!!cp (39);
849 $self->{last_stag_name} = $self->{ct}->{tag_name};
850 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
851 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
852 #if ($self->{ct}->{attributes}) {
853 # ## NOTE: This state should never be reached.
854 # !!! cp (40);
855 # !!! parse-error (type => 'end tag attribute');
856 #} else {
857 !!!cp (41);
858 #}
859 } else {
860 die "$0: $self->{ct}->{type}: Unknown token type";
861 }
862 $self->{state} = DATA_STATE;
863 $self->{s_kwd} = '';
864 # reconsume
865
866 ## Discard the token.
867 #!!!emit ($self->{ct}); # start tag or end tag
868
869 redo A;
870 } elsif ($self->{nc} == 0x002F) { # /
871 !!!cp (42);
872 $self->{state} = SELF_CLOSING_START_TAG_STATE;
873 !!!next-input-character;
874 redo A;
875 } else {
876 !!!cp (44);
877 $self->{ct}->{tag_name} .= chr $self->{nc};
878 # start tag or end tag
879 ## Stay in the state
880 !!!next-input-character;
881 redo A;
882 }
883 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
884 ## XML5: "Tag attribute name before state".
885
886 if ($is_space->{$self->{nc}}) {
887 !!!cp (45);
888 ## Stay in the state
889 !!!next-input-character;
890 redo A;
891 } elsif ($self->{nc} == 0x003E) { # >
892 if ($self->{ct}->{type} == START_TAG_TOKEN) {
893 !!!cp (46);
894 $self->{last_stag_name} = $self->{ct}->{tag_name};
895 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
896 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
897 if ($self->{ct}->{attributes}) {
898 !!!cp (47);
899 !!!parse-error (type => 'end tag attribute');
900 } else {
901 !!!cp (48);
902 }
903 } else {
904 die "$0: $self->{ct}->{type}: Unknown token type";
905 }
906 $self->{state} = DATA_STATE;
907 $self->{s_kwd} = '';
908 !!!next-input-character;
909
910 !!!emit ($self->{ct}); # start tag or end tag
911
912 redo A;
913 } elsif (0x0041 <= $self->{nc} and
914 $self->{nc} <= 0x005A) { # A..Z
915 !!!cp (49);
916 $self->{ca}
917 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
918 value => '',
919 line => $self->{line}, column => $self->{column}};
920 $self->{state} = ATTRIBUTE_NAME_STATE;
921 !!!next-input-character;
922 redo A;
923 } elsif ($self->{nc} == 0x002F) { # /
924 !!!cp (50);
925 $self->{state} = SELF_CLOSING_START_TAG_STATE;
926 !!!next-input-character;
927 redo A;
928 } elsif ($self->{nc} == -1) {
929 !!!parse-error (type => 'unclosed tag');
930 if ($self->{ct}->{type} == START_TAG_TOKEN) {
931 !!!cp (52);
932 $self->{last_stag_name} = $self->{ct}->{tag_name};
933 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
934 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
935 if ($self->{ct}->{attributes}) {
936 !!!cp (53);
937 !!!parse-error (type => 'end tag attribute');
938 } else {
939 !!!cp (54);
940 }
941 } else {
942 die "$0: $self->{ct}->{type}: Unknown token type";
943 }
944 $self->{state} = DATA_STATE;
945 $self->{s_kwd} = '';
946 # reconsume
947
948 ## Discard the token.
949 #!!!emit ($self->{ct}); # start tag or end tag
950
951 redo A;
952 } else {
953 if ({
954 0x0022 => 1, # "
955 0x0027 => 1, # '
956 0x003C => 1, # <
957 0x003D => 1, # =
958 }->{$self->{nc}}) {
959 !!!cp (55);
960 ## XML5: Not a parse error.
961 !!!parse-error (type => 'bad attribute name');
962 } else {
963 !!!cp (56);
964 ## XML5: ":" raises a parse error and is ignored.
965 }
966 $self->{ca}
967 = {name => chr ($self->{nc}),
968 value => '',
969 line => $self->{line}, column => $self->{column}};
970 $self->{state} = ATTRIBUTE_NAME_STATE;
971 !!!next-input-character;
972 redo A;
973 }
974 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
975 ## XML5: "Tag attribute name state".
976
977 my $before_leave = sub {
978 if (exists $self->{ct}->{attributes} # start tag or end tag
979 ->{$self->{ca}->{name}}) { # MUST
980 !!!cp (57);
981 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
982 ## Discard $self->{ca} # MUST
983 } else {
984 !!!cp (58);
985 $self->{ct}->{attributes}->{$self->{ca}->{name}}
986 = $self->{ca};
987 $self->{ca}->{index} = ++$self->{ct}->{last_index};
988 }
989 }; # $before_leave
990
991 if ($is_space->{$self->{nc}}) {
992 !!!cp (59);
993 $before_leave->();
994 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
995 !!!next-input-character;
996 redo A;
997 } elsif ($self->{nc} == 0x003D) { # =
998 !!!cp (60);
999 $before_leave->();
1000 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1001 !!!next-input-character;
1002 redo A;
1003 } elsif ($self->{nc} == 0x003E) { # >
1004 if ($self->{is_xml}) {
1005 !!!cp (60.1);
1006 ## XML5: Not a parse error.
1007 !!!parse-error (type => 'no attr value'); ## TODO: type
1008 } else {
1009 !!!cp (60.2);
1010 }
1011
1012 $before_leave->();
1013 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1014 !!!cp (61);
1015 $self->{last_stag_name} = $self->{ct}->{tag_name};
1016 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1017 !!!cp (62);
1018 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1019 if ($self->{ct}->{attributes}) {
1020 !!!parse-error (type => 'end tag attribute');
1021 }
1022 } else {
1023 die "$0: $self->{ct}->{type}: Unknown token type";
1024 }
1025 $self->{state} = DATA_STATE;
1026 $self->{s_kwd} = '';
1027 !!!next-input-character;
1028
1029 !!!emit ($self->{ct}); # start tag or end tag
1030
1031 redo A;
1032 } elsif (0x0041 <= $self->{nc} and
1033 $self->{nc} <= 0x005A) { # A..Z
1034 !!!cp (63);
1035 $self->{ca}->{name}
1036 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1037 ## Stay in the state
1038 !!!next-input-character;
1039 redo A;
1040 } elsif ($self->{nc} == 0x002F) { # /
1041 if ($self->{is_xml}) {
1042 !!!cp (64);
1043 ## XML5: Not a parse error.
1044 !!!parse-error (type => 'no attr value'); ## TODO: type
1045 } else {
1046 !!!cp (64.1);
1047 }
1048
1049 $before_leave->();
1050 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1051 !!!next-input-character;
1052 redo A;
1053 } elsif ($self->{nc} == -1) {
1054 !!!parse-error (type => 'unclosed tag');
1055 $before_leave->();
1056 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1057 !!!cp (66);
1058 $self->{last_stag_name} = $self->{ct}->{tag_name};
1059 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1060 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1061 if ($self->{ct}->{attributes}) {
1062 !!!cp (67);
1063 !!!parse-error (type => 'end tag attribute');
1064 } else {
1065 ## NOTE: This state should never be reached.
1066 !!!cp (68);
1067 }
1068 } else {
1069 die "$0: $self->{ct}->{type}: Unknown token type";
1070 }
1071 $self->{state} = DATA_STATE;
1072 $self->{s_kwd} = '';
1073 # reconsume
1074
1075 ## Discard the token.
1076 #!!!emit ($self->{ct}); # start tag or end tag
1077
1078 redo A;
1079 } else {
1080 if ({
1081 0x0022 => 1, # "
1082 0x0027 => 1, # '
1083 0x003C => 1, # <
1084 }->{$self->{nc}}) {
1085 !!!cp (69);
1086 ## XML5: Not a parse error.
1087 !!!parse-error (type => 'bad attribute name');
1088 } else {
1089 !!!cp (70);
1090 }
1091 $self->{ca}->{name} .= chr ($self->{nc});
1092 ## Stay in the state
1093 !!!next-input-character;
1094 redo A;
1095 }
1096 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1097 ## XML5: "Tag attribute name after state".
1098
1099 if ($is_space->{$self->{nc}}) {
1100 !!!cp (71);
1101 ## Stay in the state
1102 !!!next-input-character;
1103 redo A;
1104 } elsif ($self->{nc} == 0x003D) { # =
1105 !!!cp (72);
1106 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1107 !!!next-input-character;
1108 redo A;
1109 } elsif ($self->{nc} == 0x003E) { # >
1110 if ($self->{is_xml}) {
1111 !!!cp (72.1);
1112 ## XML5: Not a parse error.
1113 !!!parse-error (type => 'no attr value'); ## TODO: type
1114 } else {
1115 !!!cp (72.2);
1116 }
1117
1118 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1119 !!!cp (73);
1120 $self->{last_stag_name} = $self->{ct}->{tag_name};
1121 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1122 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1123 if ($self->{ct}->{attributes}) {
1124 !!!cp (74);
1125 !!!parse-error (type => 'end tag attribute');
1126 } else {
1127 ## NOTE: This state should never be reached.
1128 !!!cp (75);
1129 }
1130 } else {
1131 die "$0: $self->{ct}->{type}: Unknown token type";
1132 }
1133 $self->{state} = DATA_STATE;
1134 $self->{s_kwd} = '';
1135 !!!next-input-character;
1136
1137 !!!emit ($self->{ct}); # start tag or end tag
1138
1139 redo A;
1140 } elsif (0x0041 <= $self->{nc} and
1141 $self->{nc} <= 0x005A) { # A..Z
1142 !!!cp (76);
1143 $self->{ca}
1144 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1145 value => '',
1146 line => $self->{line}, column => $self->{column}};
1147 $self->{state} = ATTRIBUTE_NAME_STATE;
1148 !!!next-input-character;
1149 redo A;
1150 } elsif ($self->{nc} == 0x002F) { # /
1151 if ($self->{is_xml}) {
1152 !!!cp (77);
1153 ## XML5: Not a parse error.
1154 !!!parse-error (type => 'no attr value'); ## TODO: type
1155 } else {
1156 !!!cp (77.1);
1157 }
1158
1159 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1160 !!!next-input-character;
1161 redo A;
1162 } elsif ($self->{nc} == -1) {
1163 !!!parse-error (type => 'unclosed tag');
1164 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1165 !!!cp (79);
1166 $self->{last_stag_name} = $self->{ct}->{tag_name};
1167 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1168 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1169 if ($self->{ct}->{attributes}) {
1170 !!!cp (80);
1171 !!!parse-error (type => 'end tag attribute');
1172 } else {
1173 ## NOTE: This state should never be reached.
1174 !!!cp (81);
1175 }
1176 } else {
1177 die "$0: $self->{ct}->{type}: Unknown token type";
1178 }
1179 $self->{s_kwd} = '';
1180 $self->{state} = DATA_STATE;
1181 # reconsume
1182
1183 ## Discard the token.
1184 #!!!emit ($self->{ct}); # start tag or end tag
1185
1186 redo A;
1187 } else {
1188 if ($self->{is_xml}) {
1189 !!!cp (78.1);
1190 ## XML5: Not a parse error.
1191 !!!parse-error (type => 'no attr value'); ## TODO: type
1192 } else {
1193 !!!cp (78.2);
1194 }
1195
1196 if ({
1197 0x0022 => 1, # "
1198 0x0027 => 1, # '
1199 0x003C => 1, # <
1200 }->{$self->{nc}}) {
1201 !!!cp (78);
1202 ## XML5: Not a parse error.
1203 !!!parse-error (type => 'bad attribute name');
1204 } else {
1205 !!!cp (82);
1206 }
1207 $self->{ca}
1208 = {name => chr ($self->{nc}),
1209 value => '',
1210 line => $self->{line}, column => $self->{column}};
1211 $self->{state} = ATTRIBUTE_NAME_STATE;
1212 !!!next-input-character;
1213 redo A;
1214 }
1215 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1216 ## XML5: "Tag attribute value before state".
1217
1218 if ($is_space->{$self->{nc}}) {
1219 !!!cp (83);
1220 ## Stay in the state
1221 !!!next-input-character;
1222 redo A;
1223 } elsif ($self->{nc} == 0x0022) { # "
1224 !!!cp (84);
1225 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1226 !!!next-input-character;
1227 redo A;
1228 } elsif ($self->{nc} == 0x0026) { # &
1229 !!!cp (85);
1230 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1231 ## reconsume
1232 redo A;
1233 } elsif ($self->{nc} == 0x0027) { # '
1234 !!!cp (86);
1235 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1236 !!!next-input-character;
1237 redo A;
1238 } elsif ($self->{nc} == 0x003E) { # >
1239 !!!parse-error (type => 'empty unquoted attribute value');
1240 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1241 !!!cp (87);
1242 $self->{last_stag_name} = $self->{ct}->{tag_name};
1243 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1244 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1245 if ($self->{ct}->{attributes}) {
1246 !!!cp (88);
1247 !!!parse-error (type => 'end tag attribute');
1248 } else {
1249 ## NOTE: This state should never be reached.
1250 !!!cp (89);
1251 }
1252 } else {
1253 die "$0: $self->{ct}->{type}: Unknown token type";
1254 }
1255 $self->{state} = DATA_STATE;
1256 $self->{s_kwd} = '';
1257 !!!next-input-character;
1258
1259 !!!emit ($self->{ct}); # start tag or end tag
1260
1261 redo A;
1262 } elsif ($self->{nc} == -1) {
1263 !!!parse-error (type => 'unclosed tag');
1264 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1265 !!!cp (90);
1266 $self->{last_stag_name} = $self->{ct}->{tag_name};
1267 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1268 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1269 if ($self->{ct}->{attributes}) {
1270 !!!cp (91);
1271 !!!parse-error (type => 'end tag attribute');
1272 } else {
1273 ## NOTE: This state should never be reached.
1274 !!!cp (92);
1275 }
1276 } else {
1277 die "$0: $self->{ct}->{type}: Unknown token type";
1278 }
1279 $self->{state} = DATA_STATE;
1280 $self->{s_kwd} = '';
1281 ## reconsume
1282
1283 ## Discard the token.
1284 #!!!emit ($self->{ct}); # start tag or end tag
1285
1286 redo A;
1287 } else {
1288 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1289 !!!cp (93);
1290 ## XML5: Not a parse error.
1291 !!!parse-error (type => 'bad attribute value');
1292 } elsif ($self->{is_xml}) {
1293 !!!cp (93.1);
1294 ## XML5: No parse error.
1295 !!!parse-error (type => 'unquoted attr value'); ## TODO
1296 } else {
1297 !!!cp (94);
1298 }
1299 $self->{ca}->{value} .= chr ($self->{nc});
1300 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1301 !!!next-input-character;
1302 redo A;
1303 }
1304 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1305 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1306 ## ATTLIST attribute value double quoted state".
1307
1308 if ($self->{nc} == 0x0022) { # "
1309 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1310 !!!cp (95.1);
1311 ## XML5: "DOCTYPE ATTLIST name after state".
1312 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1313 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1314 } else {
1315 !!!cp (95);
1316 ## XML5: "Tag attribute name before state".
1317 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1318 }
1319 !!!next-input-character;
1320 redo A;
1321 } elsif ($self->{nc} == 0x0026) { # &
1322 !!!cp (96);
1323 ## XML5: Not defined yet.
1324
1325 ## NOTE: In the spec, the tokenizer is switched to the
1326 ## "entity in attribute value state". In this implementation, the
1327 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1328 ## implementation of the "consume a character reference" algorithm.
1329 $self->{prev_state} = $self->{state};
1330 $self->{entity_add} = 0x0022; # "
1331 $self->{state} = ENTITY_STATE;
1332 !!!next-input-character;
1333 redo A;
1334 } elsif ($self->{is_xml} and
1335 $is_space->{$self->{nc}}) {
1336 !!!cp (97.1);
1337 $self->{ca}->{value} .= ' ';
1338 ## Stay in the state.
1339 !!!next-input-character;
1340 redo A;
1341 } elsif ($self->{nc} == -1) { # EOF before the closing quotation mark
1342 !!!parse-error (type => 'unclosed attribute value');
1343 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1344 !!!cp (97);
1345 $self->{last_stag_name} = $self->{ct}->{tag_name};
1346
1347 $self->{state} = DATA_STATE;
1348 $self->{s_kwd} = '';
1349 ## reconsume.  Discard the token, as the single-quoted and unquoted states do at EOF.
1350 #!!!emit ($self->{ct}); # start tag
1351 redo A;
1352 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1353 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1354 if ($self->{ct}->{attributes}) {
1355 !!!cp (98);
1356 !!!parse-error (type => 'end tag attribute');
1357 } else {
1358 ## NOTE: This state should never be reached.
1359 !!!cp (99);
1360 }
1361
1362 $self->{state} = DATA_STATE;
1363 $self->{s_kwd} = '';
1364 ## reconsume
1365
1366 ## Discard the token.
1367 #!!!emit ($self->{ct}); # end tag
1368
1369 redo A;
1370 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1371 ## XML5: No parse error above; not defined yet.
1372 push @{$self->{ct}->{attrdefs}}, $self->{ca}; ## Keep the partial attribute definition on the token.
1373 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1374 ## Reconsume.
1375
1376 ## Discard the token.
1377 #!!!emit ($self->{ct}); # ATTLIST
1378
1379 redo A;
1380 } else {
1381 die "$0: $self->{ct}->{type}: Unknown token type";
1382 }
1383 } else {
1384 ## XML5 [ATTLIST]: Not defined yet.
1385 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1386 !!!cp (100);
1387 ## XML5: Not a parse error.
1388 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1389 } else {
1390 !!!cp (100.1);
1391 }
1392 $self->{ca}->{value} .= chr ($self->{nc});
1393 $self->{read_until}->($self->{ca}->{value},
1394 qq["&<\x09\x0C\x20],
1395 length $self->{ca}->{value});
1396
1397 ## Stay in the state
1398 !!!next-input-character;
1399 redo A;
1400 }
1401 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1402 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1403 ## ATTLIST attribute value single quoted state".
1404
1405 if ($self->{nc} == 0x0027) { # '
1406 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1407 !!!cp (101.1);
1408 ## XML5: "DOCTYPE ATTLIST name after state".
1409 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1410 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1411 } else {
1412 !!!cp (101);
1413 ## XML5: "Before attribute name state" (sic).
1414 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1415 }
1416 !!!next-input-character;
1417 redo A;
1418 } elsif ($self->{nc} == 0x0026) { # &
1419 !!!cp (102);
1420 ## XML5: Not defined yet.
1421
1422 ## NOTE: In the spec, the tokenizer is switched to the
1423 ## "entity in attribute value state". In this implementation, the
1424 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1425 ## implementation of the "consume a character reference" algorithm.
1426 $self->{entity_add} = 0x0027; # '
1427 $self->{prev_state} = $self->{state};
1428 $self->{state} = ENTITY_STATE;
1429 !!!next-input-character;
1430 redo A;
1431 } elsif ($self->{is_xml} and
1432 $is_space->{$self->{nc}}) {
1433 !!!cp (103.1);
1434 $self->{ca}->{value} .= ' ';
1435 ## Stay in the state.
1436 !!!next-input-character;
1437 redo A;
1438 } elsif ($self->{nc} == -1) {
1439 !!!parse-error (type => 'unclosed attribute value');
1440 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1441 !!!cp (103);
1442 $self->{last_stag_name} = $self->{ct}->{tag_name};
1443
1444 $self->{state} = DATA_STATE;
1445 $self->{s_kwd} = '';
1446 ## reconsume
1447
1448 ## Discard the token.
1449 #!!!emit ($self->{ct}); # start tag
1450
1451 redo A;
1452 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1453 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1454 if ($self->{ct}->{attributes}) {
1455 !!!cp (104);
1456 !!!parse-error (type => 'end tag attribute');
1457 } else {
1458 ## NOTE: This state should never be reached.
1459 !!!cp (105);
1460 }
1461
1462 $self->{state} = DATA_STATE;
1463 $self->{s_kwd} = '';
1464 ## reconsume
1465
1466 ## Discard the token.
1467 #!!!emit ($self->{ct}); # end tag
1468
1469 redo A;
1470 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1471 ## XML5: No parse error above; not defined yet.
1472 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1473 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1474 ## Reconsume.
1475
1476 ## Discard the token.
1477 #!!!emit ($self->{ct}); # ATTLIST
1478
1479 redo A;
1480 } else {
1481 die "$0: $self->{ct}->{type}: Unknown token type";
1482 }
1483 } else {
1484 ## XML5 [ATTLIST]: Not defined yet.
1485 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1486 !!!cp (106);
1487 ## XML5: Not a parse error.
1488 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1489 } else {
1490 !!!cp (106.1);
1491 }
1492 $self->{ca}->{value} .= chr ($self->{nc});
1493 $self->{read_until}->($self->{ca}->{value},
1494 qq['&<\x09\x0C\x20],
1495 length $self->{ca}->{value});
1496
1497 ## Stay in the state
1498 !!!next-input-character;
1499 redo A;
1500 }
1501 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1502 ## XML5: "Tag attribute value unquoted state".
1503
1504 if ($is_space->{$self->{nc}}) {
1505 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1506 !!!cp (107.1);
1507 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1508 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1509 } else {
1510 !!!cp (107);
1511 ## XML5: "Tag attribute name before state".
1512 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1513 }
1514 !!!next-input-character;
1515 redo A;
1516 } elsif ($self->{nc} == 0x0026) { # &
1517 !!!cp (108);
1518
1519 ## XML5: Not defined yet.
1520
1521 ## NOTE: In the spec, the tokenizer is switched to the
1522 ## "entity in attribute value state". In this implementation, the
1523 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1524 ## implementation of the "consume a character reference" algorithm.
1525 $self->{entity_add} = -1;
1526 $self->{prev_state} = $self->{state};
1527 $self->{state} = ENTITY_STATE;
1528 !!!next-input-character;
1529 redo A;
1530 } elsif ($self->{nc} == 0x003E) { # >
1531 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1532 !!!cp (109);
1533 $self->{last_stag_name} = $self->{ct}->{tag_name};
1534
1535 $self->{state} = DATA_STATE;
1536 $self->{s_kwd} = '';
1537 !!!next-input-character;
1538 !!!emit ($self->{ct}); # start tag
1539 redo A;
1540 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1541 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1542 if ($self->{ct}->{attributes}) {
1543 !!!cp (110);
1544 !!!parse-error (type => 'end tag attribute');
1545 } else {
1546 ## NOTE: This state should never be reached.
1547 !!!cp (111);
1548 }
1549
1550 $self->{state} = DATA_STATE;
1551 $self->{s_kwd} = '';
1552 !!!next-input-character;
1553 !!!emit ($self->{ct}); # end tag
1554 redo A;
1555 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1556 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1557 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1558 !!!next-input-character;
1559 !!!emit ($self->{ct}); # ATTLIST
1560 redo A;
1561 } else {
1562 die "$0: $self->{ct}->{type}: Unknown token type";
1563 }
1564 } elsif ($self->{nc} == -1) {
1565 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1566 !!!cp (112);
1567 !!!parse-error (type => 'unclosed tag');
1568 $self->{last_stag_name} = $self->{ct}->{tag_name};
1569
1570 $self->{state} = DATA_STATE;
1571 $self->{s_kwd} = '';
1572 ## reconsume
1573
1574 ## Discard the token.
1575 #!!!emit ($self->{ct}); # start tag
1576
1577 redo A;
1578 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1579 !!!parse-error (type => 'unclosed tag');
1580 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1581 if ($self->{ct}->{attributes}) {
1582 !!!cp (113);
1583 !!!parse-error (type => 'end tag attribute');
1584 } else {
1585 ## NOTE: This state should never be reached.
1586 !!!cp (114);
1587 }
1588
1589 $self->{state} = DATA_STATE;
1590 $self->{s_kwd} = '';
1591 ## reconsume
1592
1593 ## Discard the token.
1594 #!!!emit ($self->{ct}); # end tag
1595
1596 redo A;
1597 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1598 !!!parse-error (type => 'unclosed md'); ## TODO: type
1599 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1600 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1601 ## Reconsume.
1602
1603 ## Discard the token.
1604 #!!!emit ($self->{ct}); # ATTLIST
1605
1606 redo A;
1607 } else {
1608 die "$0: $self->{ct}->{type}: Unknown token type";
1609 }
1610 } else { ## Any other character: part of the unquoted value.
1611 if ({
1612 0x0022 => 1, # "
1613 0x0027 => 1, # '
1614 0x003D => 1, # =
1615 0x003C => 1, # <
1616 }->{$self->{nc}}) {
1617 !!!cp (115);
1618 ## XML5: Not a parse error.
1619 !!!parse-error (type => 'bad attribute value');
1620 } else {
1621 !!!cp (116);
1622 }
1623 $self->{ca}->{value} .= chr ($self->{nc});
1624 $self->{read_until}->($self->{ca}->{value},
1625 qq["'=&< \x09\x0C>],
1626 length $self->{ca}->{value});
1627 ## The stop set lists every character this state treats specially, including "<", so each one is examined (and reported) above.  NOTE(review): assumes read_until appends input up to the first listed character -- confirm.
1628 ## Stay in the state
1629 !!!next-input-character;
1630 redo A;
1631 }
1632 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1633 if ($is_space->{$self->{nc}}) {
1634 !!!cp (118);
1635 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1636 !!!next-input-character;
1637 redo A;
1638 } elsif ($self->{nc} == 0x003E) { # >
1639 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1640 !!!cp (119);
1641 $self->{last_stag_name} = $self->{ct}->{tag_name};
1642 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1643 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1644 if ($self->{ct}->{attributes}) {
1645 !!!cp (120);
1646 !!!parse-error (type => 'end tag attribute');
1647 } else {
1648 ## NOTE: This state should never be reached.
1649 !!!cp (121);
1650 }
1651 } else {
1652 die "$0: $self->{ct}->{type}: Unknown token type";
1653 }
1654 $self->{state} = DATA_STATE;
1655 $self->{s_kwd} = '';
1656 !!!next-input-character;
1657
1658 !!!emit ($self->{ct}); # start tag or end tag
1659
1660 redo A;
1661 } elsif ($self->{nc} == 0x002F) { # /
1662 !!!cp (122);
1663 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1664 !!!next-input-character;
1665 redo A;
1666 } elsif ($self->{nc} == -1) {
1667 !!!parse-error (type => 'unclosed tag');
1668 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1669 !!!cp (122.3);
1670 $self->{last_stag_name} = $self->{ct}->{tag_name};
1671 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1672 if ($self->{ct}->{attributes}) {
1673 !!!cp (122.1);
1674 !!!parse-error (type => 'end tag attribute');
1675 } else {
1676 ## NOTE: This state should never be reached.
1677 !!!cp (122.2);
1678 }
1679 } else {
1680 die "$0: $self->{ct}->{type}: Unknown token type";
1681 }
1682 $self->{state} = DATA_STATE;
1683 $self->{s_kwd} = '';
1684 ## Reconsume.
1685
1686 ## Discard the token.
1687 #!!!emit ($self->{ct}); # start tag or end tag
1688
1689 redo A;
1690 } else {
1691 !!!cp ('124.1');
1692 !!!parse-error (type => 'no space between attributes');
1693 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1694 ## reconsume
1695 redo A;
1696 }
1697 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1698 ## XML5: "Empty tag state".
1699 ## Entered after a "/" inside a tag: ">" emits the tag, EOF discards it, anything else resumes attribute parsing.
1700 if ($self->{nc} == 0x003E) { # >
1701 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1702 !!!cp ('124.2');
1703 !!!parse-error (type => 'nestc', token => $self->{ct});
1704 ## TODO: Different type than slash in start tag
1705 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1706 if ($self->{ct}->{attributes}) {
1707 !!!cp ('124.4');
1708 !!!parse-error (type => 'end tag attribute');
1709 } else {
1710 !!!cp ('124.5');
1711 }
1712 ## TODO: Test |<title></title/>|
1713 } else {
1714 !!!cp ('124.3');
1715 $self->{self_closing} = 1; ## Flag is stored on the tokenizer object, not on the token.
1716 }
1717
1718 $self->{state} = DATA_STATE;
1719 $self->{s_kwd} = '';
1720 !!!next-input-character;
1721
1722 !!!emit ($self->{ct}); # start tag or end tag
1723
1724 redo A;
1725 } elsif ($self->{nc} == -1) { # EOF
1726 !!!parse-error (type => 'unclosed tag');
1727 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1728 !!!cp (124.7);
1729 $self->{last_stag_name} = $self->{ct}->{tag_name};
1730 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1731 if ($self->{ct}->{attributes}) {
1732 !!!cp (124.5);
1733 !!!parse-error (type => 'end tag attribute');
1734 } else {
1735 ## NOTE: This state should never be reached.
1736 !!!cp (124.6);
1737 }
1738 } else {
1739 die "$0: $self->{ct}->{type}: Unknown token type";
1740 }
1741 ## XML5: "Tag attribute name before state".
1742 $self->{state} = DATA_STATE;
1743 $self->{s_kwd} = '';
1744 ## Reconsume.
1745
1746 ## Discard the token.
1747 #!!!emit ($self->{ct}); # start tag or end tag
1748
1749 redo A;
1750 } else {
1751 !!!cp ('124.4');
1752 !!!parse-error (type => 'nestc');
1753 ## TODO: This error type is wrong.  NOTE(review): checkpoint ids 124.4 and 124.5 each appear twice in this state.
1754 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1755 ## Reconsume.
1756 redo A;
1757 }
1758 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1759 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1760
1761 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1762 ## consumes characters on a one-by-one basis.
1763 ## The comment token in $self->{ct} is emitted on ">" or at EOF; no parse error is raised in this state.
1764 if ($self->{nc} == 0x003E) { # >
1765 if ($self->{in_subset}) {
1766 !!!cp (123);
1767 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1768 } else {
1769 !!!cp (124);
1770 $self->{state} = DATA_STATE;
1771 $self->{s_kwd} = '';
1772 }
1773 !!!next-input-character;
1774
1775 !!!emit ($self->{ct}); # comment
1776 redo A;
1777 } elsif ($self->{nc} == -1) { # EOF
1778 if ($self->{in_subset}) {
1779 !!!cp (125.1);
1780 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1781 } else {
1782 !!!cp (125);
1783 $self->{state} = DATA_STATE;
1784 $self->{s_kwd} = '';
1785 }
1786 ## reconsume
1787
1788 !!!emit ($self->{ct}); # comment
1789 redo A;
1790 } else {
1791 !!!cp (126);
1792 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1793 $self->{read_until}->($self->{ct}->{data},
1794 q[>],
1795 length $self->{ct}->{data});
1796 ## NOTE(review): read_until presumably appends following input up to the next ">" -- confirm.
1797 ## Stay in the state.
1798 !!!next-input-character;
1799 redo A;
1800 }
1801 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1802 ## XML5: "Markup declaration state".
1803 ## Dispatch on the current character: "-" (comment), "D"/"d" (DOCTYPE), "[" (CDATA section; foreign content or XML only).
1804 if ($self->{nc} == 0x002D) { # -
1805 !!!cp (133);
1806 $self->{state} = MD_HYPHEN_STATE;
1807 !!!next-input-character;
1808 redo A;
1809 } elsif ($self->{nc} == 0x0044 or # D
1810 $self->{nc} == 0x0064) { # d
1811 ## ASCII case-insensitive.
1812 !!!cp (130);
1813 $self->{state} = MD_DOCTYPE_STATE;
1814 $self->{kwd} = chr $self->{nc}; ## Keep the letter as typed; MD_DOCTYPE_STATE appends to it.
1815 !!!next-input-character;
1816 redo A;
1817 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1818 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1819 $self->{is_xml}) and
1820 $self->{nc} == 0x005B) { # [
1821 !!!cp (135.4);
1822 $self->{state} = MD_CDATA_STATE;
1823 $self->{kwd} = '[';
1824 !!!next-input-character;
1825 redo A;
1826 } else {
1827 !!!cp (136);
1828 }
1829 ## Only the final else branch falls through to here; the others end with "redo A".
1830 !!!parse-error (type => 'bogus comment',
1831 line => $self->{line_prev},
1832 column => $self->{column_prev} - 1);
1833 ## Reconsume.
1834 $self->{state} = BOGUS_COMMENT_STATE;
1835 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1836 line => $self->{line_prev},
1837 column => $self->{column_prev} - 1,
1838 };
1839 redo A;
1840 } elsif ($self->{state} == MD_HYPHEN_STATE) { ## One "-" has been consumed; a second one starts a comment.
1841 if ($self->{nc} == 0x002D) { # -
1842 !!!cp (127);
1843 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1844 line => $self->{line_prev},
1845 column => $self->{column_prev} - 2,
1846 };
1847 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1848 !!!next-input-character;
1849 redo A;
1850 } else {
1851 !!!cp (128);
1852 !!!parse-error (type => 'bogus comment',
1853 line => $self->{line_prev},
1854 column => $self->{column_prev} - 2);
1855 $self->{state} = BOGUS_COMMENT_STATE;
1856 ## Reconsume.
1857 $self->{ct} = {type => COMMENT_TOKEN,
1858 data => '-', # the hyphen already consumed becomes comment text
1859 line => $self->{line_prev},
1860 column => $self->{column_prev} - 2,
1861 };
1862 redo A;
1863 }
1864 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1865 ## ASCII case-insensitive.  $self->{kwd} holds the letters matched so far, so its length selects the next expected letter.
1866 if ($self->{nc} == [
1867 undef, # placeholder: kwd already holds the leading "D"/"d" when set by MARKUP_DECLARATION_OPEN_STATE
1868 0x004F, # O
1869 0x0043, # C
1870 0x0054, # T
1871 0x0059, # Y
1872 0x0050, # P
1873 ]->[length $self->{kwd}] or
1874 $self->{nc} == [
1875 undef,
1876 0x006F, # o
1877 0x0063, # c
1878 0x0074, # t
1879 0x0079, # y
1880 0x0070, # p
1881 ]->[length $self->{kwd}]) {
1882 !!!cp (131);
1883 ## Stay in the state.  NOTE(review): past the end of the table the lookup is undef, which == compares as 0, so nc == 0 (U+0000) would also match -- confirm NUL cannot reach here.
1884 $self->{kwd} .= chr $self->{nc};
1885 !!!next-input-character;
1886 redo A;
1887 } elsif ((length $self->{kwd}) == 6 and
1888 ($self->{nc} == 0x0045 or # E
1889 $self->{nc} == 0x0065)) { # e
1890 if ($self->{is_xml} and
1891 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1892 !!!cp (129);
1893 ## XML5: case-sensitive.  In XML mode anything other than uppercase "DOCTYPE" is reported.
1894 !!!parse-error (type => 'lowercase keyword', ## TODO
1895 text => 'DOCTYPE',
1896 line => $self->{line_prev},
1897 column => $self->{column_prev} - 5);
1898 } else {
1899 !!!cp (129.1);
1900 }
1901 $self->{state} = DOCTYPE_STATE;
1902 $self->{ct} = {type => DOCTYPE_TOKEN,
1903 quirks => 1, # token starts out flagged as quirks; presumably cleared by later DOCTYPE states -- TODO confirm
1904 line => $self->{line_prev},
1905 column => $self->{column_prev} - 7,
1906 };
1907 !!!next-input-character;
1908 redo A;
1909 } else {
1910 !!!cp (132);
1911 !!!parse-error (type => 'bogus comment',
1912 line => $self->{line_prev},
1913 column => $self->{column_prev} - 1 - length $self->{kwd});
1914 $self->{state} = BOGUS_COMMENT_STATE;
1915 ## Reconsume.  The partial keyword becomes the start of the bogus comment's data.
1916 $self->{ct} = {type => COMMENT_TOKEN,
1917 data => $self->{kwd},
1918 line => $self->{line_prev},
1919 column => $self->{column_prev} - 1 - length $self->{kwd},
1920 };
1921 redo A;
1922 }
1923 } elsif ($self->{state} == MD_CDATA_STATE) { ## Matches "[CDATA[" one character at a time (uppercase only).
1924 if ($self->{nc} == {
1925 '[' => 0x0043, # C
1926 '[C' => 0x0044, # D
1927 '[CD' => 0x0041, # A
1928 '[CDA' => 0x0054, # T
1929 '[CDAT' => 0x0041, # A
1930 }->{$self->{kwd}}) {
1931 !!!cp (135.1);
1932 ## Stay in the state.  NOTE(review): for a kwd with no table entry the lookup is undef, which == compares as 0, so nc == 0 (U+0000) would also match -- confirm NUL cannot reach here.
1933 $self->{kwd} .= chr $self->{nc};
1934 !!!next-input-character;
1935 redo A;
1936 } elsif ($self->{kwd} eq '[CDATA' and
1937 $self->{nc} == 0x005B) { # [
1938 if ($self->{is_xml} and
1939 not $self->{tainted} and
1940 @{$self->{open_elements} or []} == 0) {
1941 !!!cp (135.2);
1942 !!!parse-error (type => 'cdata outside of root element',
1943 line => $self->{line_prev},
1944 column => $self->{column_prev} - 7);
1945 $self->{tainted} = 1; ## Suppresses this error on later CDATA sections (see the guard above).
1946 } else {
1947 !!!cp (135.21);
1948 }
1949 ## Start an empty character token; presumably filled in by CDATA_SECTION_STATE -- TODO confirm.
1950 $self->{ct} = {type => CHARACTER_TOKEN,
1951 data => '',
1952 line => $self->{line_prev},
1953 column => $self->{column_prev} - 7};
1954 $self->{state} = CDATA_SECTION_STATE;
1955 !!!next-input-character;
1956 redo A;
1957 } else {
1958 !!!cp (135.3);
1959 !!!parse-error (type => 'bogus comment',
1960 line => $self->{line_prev},
1961 column => $self->{column_prev} - 1 - length $self->{kwd});
1962 $self->{state} = BOGUS_COMMENT_STATE;
1963 ## Reconsume.  The partial keyword becomes the start of the bogus comment's data.
1964 $self->{ct} = {type => COMMENT_TOKEN,
1965 data => $self->{kwd},
1966 line => $self->{line_prev},
1967 column => $self->{column_prev} - 1 - length $self->{kwd},
1968 };
1969 redo A;
1970 }
1971 } elsif ($self->{state} == COMMENT_START_STATE) { ## First character after the opening "--" (set by MD_HYPHEN_STATE).
1972 if ($self->{nc} == 0x002D) { # -
1973 !!!cp (137);
1974 $self->{state} = COMMENT_START_DASH_STATE;
1975 !!!next-input-character;
1976 redo A;
1977 } elsif ($self->{nc} == 0x003E) { # > (comment closed immediately; reported, but still emitted)
1978 !!!parse-error (type => 'bogus comment');
1979 if ($self->{in_subset}) {
1980 !!!cp (138.1);
1981 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1982 } else {
1983 !!!cp (138);
1984 $self->{state} = DATA_STATE;
1985 $self->{s_kwd} = '';
1986 }
1987 !!!next-input-character;
1988
1989 !!!emit ($self->{ct}); # comment
1990
1991 redo A;
1992 } elsif ($self->{nc} == -1) { # EOF
1993 !!!parse-error (type => 'unclosed comment');
1994 if ($self->{in_subset}) {
1995 !!!cp (139.1);
1996 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1997 } else {
1998 !!!cp (139);
1999 $self->{state} = DATA_STATE;
2000 $self->{s_kwd} = '';
2001 }
2002 ## reconsume.  Unlike an unfinished tag, the unfinished comment is still emitted.
2003
2004 !!!emit ($self->{ct}); # comment
2005
2006 redo A;
2007 } else {
2008 !!!cp (140);
2009 $self->{ct}->{data} # comment
2010 .= chr ($self->{nc});
2011 $self->{state} = COMMENT_STATE;
2012 !!!next-input-character;
2013 redo A;
2014 }
2015 } elsif ($self->{state} == COMMENT_START_DASH_STATE) { ## A "-" was seen as the first character of the comment.
2016 if ($self->{nc} == 0x002D) { # -
2017 !!!cp (141);
2018 $self->{state} = COMMENT_END_STATE;
2019 !!!next-input-character;
2020 redo A;
2021 } elsif ($self->{nc} == 0x003E) { # > (reported, but the comment is still emitted)
2022 !!!parse-error (type => 'bogus comment');
2023 if ($self->{in_subset}) {
2024 !!!cp (142.1);
2025 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2026 } else {
2027 !!!cp (142);
2028 $self->{state} = DATA_STATE;
2029 $self->{s_kwd} = '';
2030 }
2031 !!!next-input-character;
2032
2033 !!!emit ($self->{ct}); # comment
2034
2035 redo A;
2036 } elsif ($self->{nc} == -1) { # EOF
2037 !!!parse-error (type => 'unclosed comment');
2038 if ($self->{in_subset}) {
2039 !!!cp (143.1);
2040 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2041 } else {
2042 !!!cp (143);
2043 $self->{state} = DATA_STATE;
2044 $self->{s_kwd} = '';
2045 }
2046 ## reconsume.  The pending "-" is not added to the comment data here.
2047
2048 !!!emit ($self->{ct}); # comment
2049
2050 redo A;
2051 } else {
2052 !!!cp (144);
2053 $self->{ct}->{data} # comment; the pending "-" becomes ordinary text
2054 .= '-' . chr ($self->{nc});
2055 $self->{state} = COMMENT_STATE;
2056 !!!next-input-character;
2057 redo A;
2058 }
2059 } elsif ($self->{state} == COMMENT_STATE) {
2060 ## XML5: "Comment state" and "DOCTYPE comment state".
2061 ## Accumulates comment text in $self->{ct}->{data} until a "-" or EOF.
2062 if ($self->{nc} == 0x002D) { # -
2063 !!!cp (145);
2064 $self->{state} = COMMENT_END_DASH_STATE;
2065 !!!next-input-character;
2066 redo A;
2067 } elsif ($self->{nc} == -1) { # EOF
2068 !!!parse-error (type => 'unclosed comment');
2069 if ($self->{in_subset}) {
2070 !!!cp (146.1);
2071 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2072 } else {
2073 !!!cp (146);
2074 $self->{state} = DATA_STATE;
2075 $self->{s_kwd} = '';
2076 }
2077 ## reconsume
2078
2079 !!!emit ($self->{ct}); # comment
2080
2081 redo A;
2082 } else {
2083 !!!cp (147);
2084 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2085 $self->{read_until}->($self->{ct}->{data},
2086 q[-],
2087 length $self->{ct}->{data});
2088 ## NOTE(review): read_until presumably appends following input up to the next "-" -- confirm.
2089 ## Stay in the state
2090 !!!next-input-character;
2091 redo A;
2092 }
2093 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2094 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2095
2096 if ($self->{nc} == 0x002D) { # -
2097 !!!cp (148);
2098 $self->{state} = COMMENT_END_STATE;
2099 !!!next-input-character;
2100 redo A;
2101 } elsif ($self->{nc} == -1) {
2102 !!!parse-error (type => 'unclosed comment');
2103 if ($self->{in_subset}) {
2104 !!!cp (149.1);
2105 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2106 } else {
2107 !!!cp (149);
2108 $self->{state} = DATA_STATE;
2109 $self->{s_kwd} = '';
2110 }
2111 ## reconsume
2112
2113 !!!emit ($self->{ct}); # comment
2114
2115 redo A;
2116 } else {
2117 !!!cp (150);
2118 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2119 $self->{state} = COMMENT_STATE;
2120 !!!next-input-character;
2121 redo A;
2122 }
2123 } elsif ($self->{state} == COMMENT_END_STATE or
2124 $self->{state} == COMMENT_END_BANG_STATE) {
2125 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2126 ## (No comment end bang state.)
2127
2128 if ($self->{nc} == 0x003E) { # >
2129 if ($self->{in_subset}) {
2130 !!!cp (151.1);
2131 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2132 } else {
2133 !!!cp (151);
2134 $self->{state} = DATA_STATE;
2135 $self->{s_kwd} = '';
2136 }
2137 !!!next-input-character;
2138
2139 !!!emit ($self->{ct}); # comment
2140
2141 redo A;
2142 } elsif ($self->{nc} == 0x002D) { # -
2143 if ($self->{state} == COMMENT_END_BANG_STATE) {
2144 !!!cp (154.3);
2145 $self->{ct}->{data} .= '--!'; # comment
2146 $self->{state} = COMMENT_END_DASH_STATE;
2147 } else {
2148 !!!cp (152);
2149 ## XML5: Not a parse error.
2150 !!!parse-error (type => 'dash in comment',
2151 line => $self->{line_prev},
2152 column => $self->{column_prev});
2153 $self->{ct}->{data} .= '-'; # comment
2154 ## Stay in the state
2155 }
2156 !!!next-input-character;
2157 redo A;
2158 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2159 $is_space->{$self->{nc}}) {
2160 !!!cp (152.1);
2161 !!!parse-error (type => 'comment end space'); # XXX error type
2162 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2163 $self->{state} = COMMENT_END_SPACE_STATE;
2164 !!!next-input-character;
2165 redo A;
2166 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
2167 $self->{nc} == 0x0021) { # !
2168 !!!cp (152.2);
2169 !!!parse-error (type => 'comment end bang'); # XXX error type
2170 $self->{state} = COMMENT_END_BANG_STATE;
2171 !!!next-input-character;
2172 redo A;
2173 } elsif ($self->{nc} == -1) {
2174 !!!parse-error (type => 'unclosed comment');
2175 if ($self->{in_subset}) {
2176 !!!cp (153.1);
2177 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2178 } else {
2179 !!!cp (153);
2180 $self->{state} = DATA_STATE;
2181 $self->{s_kwd} = '';
2182 }
2183 ## Reconsume.
2184
2185 !!!emit ($self->{ct}); # comment
2186
2187 redo A;
2188 } else {
2189 !!!cp (154);
2190 if ($self->{state} == COMMENT_END_BANG_STATE) {
2191 $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
2192 } else {
2193 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2194 }
2195 $self->{state} = COMMENT_STATE;
2196 !!!next-input-character;
2197 redo A;
2198 }
2199 } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
2200 ## XML5: Does not exist.
2201
2202 if ($self->{nc} == 0x003E) { # >
2203 if ($self->{in_subset}) {
2204 !!!cp (154.4);
2205 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2206 } else {
2207 !!!cp (154.5);
2208 $self->{state} = DATA_STATE;
2209 $self->{s_kwd} = '';
2210 }
2211 !!!next-input-character;
2212
2213 !!!emit ($self->{ct}); # comment
2214
2215 redo A;
2216 } elsif ($is_space->{$self->{nc}}) {
2217 !!!cp (154.6);
2218 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2219 ## Stay in the state.
2220 !!!next-input-character;
2221 redo A;
2222 } elsif ($self->{nc} == -1) {
2223 !!!parse-error (type => 'unclosed comment');
2224 if ($self->{in_subset}) {
2225 !!!cp (154.7);
2226 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2227 } else {
2228 !!!cp (154.8);
2229 $self->{state} = DATA_STATE;
2230 $self->{s_kwd} = '';
2231 }
2232 ## Reconsume.
2233
2234 !!!emit ($self->{ct}); # comment
2235
2236 redo A;
2237 } else {
2238 !!!cp (154.9);
2239 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2240 $self->{state} = COMMENT_STATE;
2241 !!!next-input-character;
2242 redo A;
2243 }
2244 } elsif ($self->{state} == DOCTYPE_STATE) {
2245 if ($is_space->{$self->{nc}}) {
2246 !!!cp (155);
2247 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2248 !!!next-input-character;
2249 redo A;
2250 } elsif ($self->{nc} == -1) {
2251 !!!cp (155.1);
2252 !!!parse-error (type => 'unclosed DOCTYPE');
2253 $self->{ct}->{quirks} = 1;
2254
2255 $self->{state} = DATA_STATE;
2256 ## Reconsume.
2257 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2258
2259 redo A;
2260 } else {
2261 !!!cp (156);
2262 ## XML5: Switch to the bogus comment state.
2263 !!!parse-error (type => 'no space before DOCTYPE name');
2264 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2265 ## reconsume
2266 redo A;
2267 }
2268 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2269 ## XML5: "DOCTYPE root name before state".
2270
2271 if ($is_space->{$self->{nc}}) {
2272 !!!cp (157);
2273 ## Stay in the state
2274 !!!next-input-character;
2275 redo A;
2276 } elsif ($self->{nc} == 0x003E) { # >
2277 !!!cp (158);
2278 ## XML5: No parse error.
2279 !!!parse-error (type => 'no DOCTYPE name');
2280 $self->{state} = DATA_STATE;
2281 $self->{s_kwd} = '';
2282 !!!next-input-character;
2283
2284 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2285
2286 redo A;
2287 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2288 !!!cp (158.1);
2289 $self->{ct}->{name} # DOCTYPE
2290 = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2291 delete $self->{ct}->{quirks};
2292 $self->{state} = DOCTYPE_NAME_STATE;
2293 !!!next-input-character;
2294 redo A;
2295 } elsif ($self->{nc} == -1) {
2296 !!!cp (159);
2297 !!!parse-error (type => 'no DOCTYPE name');
2298 $self->{state} = DATA_STATE;
2299 $self->{s_kwd} = '';
2300 ## reconsume
2301
2302 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2303
2304 redo A;
2305 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2306 !!!cp (159.1);
2307 !!!parse-error (type => 'no DOCTYPE name');
2308 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2309 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2310 $self->{in_subset} = 1;
2311 !!!next-input-character;
2312 !!!emit ($self->{ct}); # DOCTYPE
2313 redo A;
2314 } else {
2315 !!!cp (160);
2316 $self->{ct}->{name} = chr $self->{nc};
2317 delete $self->{ct}->{quirks};
2318 $self->{state} = DOCTYPE_NAME_STATE;
2319 !!!next-input-character;
2320 redo A;
2321 }
2322 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2323 ## XML5: "DOCTYPE root name state".
2324
2325 ## ISSUE: Redundant "First," in the spec.
2326
2327 if ($is_space->{$self->{nc}}) {
2328 !!!cp (161);
2329 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2330 !!!next-input-character;
2331 redo A;
2332 } elsif ($self->{nc} == 0x003E) { # >
2333 !!!cp (162);
2334 $self->{state} = DATA_STATE;
2335 $self->{s_kwd} = '';
2336 !!!next-input-character;
2337
2338 !!!emit ($self->{ct}); # DOCTYPE
2339
2340 redo A;
2341 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
2342 !!!cp (162.1);
2343 $self->{ct}->{name} # DOCTYPE
2344 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
2345 delete $self->{ct}->{quirks};
2346 ## Stay in the state.
2347 !!!next-input-character;
2348 redo A;
2349 } elsif ($self->{nc} == -1) {
2350 !!!cp (163);
2351 !!!parse-error (type => 'unclosed DOCTYPE');
2352 $self->{state} = DATA_STATE;
2353 $self->{s_kwd} = '';
2354 ## reconsume
2355
2356 $self->{ct}->{quirks} = 1;
2357 !!!emit ($self->{ct}); # DOCTYPE
2358
2359 redo A;
2360 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2361 !!!cp (163.1);
2362 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2363 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2364 $self->{in_subset} = 1;
2365 !!!next-input-character;
2366 !!!emit ($self->{ct}); # DOCTYPE
2367 redo A;
2368 } else {
2369 !!!cp (164);
2370 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
2371 ## Stay in the state.
2372 !!!next-input-character;
2373 redo A;
2374 }
2375 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2376 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2377 ## state", but implemented differently.
2378
2379 if ($is_space->{$self->{nc}}) {
2380 !!!cp (165);
2381 ## Stay in the state
2382 !!!next-input-character;
2383 redo A;
2384 } elsif ($self->{nc} == 0x003E) { # >
2385 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2386 !!!cp (166);
2387 $self->{state} = DATA_STATE;
2388 $self->{s_kwd} = '';
2389 } else {
2390 !!!cp (166.1);
2391 !!!parse-error (type => 'no md def'); ## TODO: type
2392 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2393 }
2394
2395 !!!next-input-character;
2396 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2397 redo A;
2398 } elsif ($self->{nc} == -1) {
2399 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2400 !!!cp (167);
2401 !!!parse-error (type => 'unclosed DOCTYPE');
2402 $self->{state} = DATA_STATE;
2403 $self->{s_kwd} = '';
2404 $self->{ct}->{quirks} = 1;
2405 } else {
2406 !!!cp (167.12);
2407 !!!parse-error (type => 'unclosed md'); ## TODO: type
2408 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2409 }
2410
2411 ## Reconsume.
2412 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2413 redo A;
2414 } elsif ($self->{nc} == 0x0050 or # P
2415 $self->{nc} == 0x0070) { # p
2416 !!!cp (167.1);
2417 $self->{state} = PUBLIC_STATE;
2418 $self->{kwd} = chr $self->{nc};
2419 !!!next-input-character;
2420 redo A;
2421 } elsif ($self->{nc} == 0x0053 or # S
2422 $self->{nc} == 0x0073) { # s
2423 !!!cp (167.2);
2424 $self->{state} = SYSTEM_STATE;
2425 $self->{kwd} = chr $self->{nc};
2426 !!!next-input-character;
2427 redo A;
2428 } elsif ($self->{nc} == 0x0022 and # "
2429 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2430 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2431 !!!cp (167.21);
2432 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2433 $self->{ct}->{value} = ''; # ENTITY
2434 !!!next-input-character;
2435 redo A;
2436 } elsif ($self->{nc} == 0x0027 and # '
2437 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2438 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2439 !!!cp (167.22);
2440 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2441 $self->{ct}->{value} = ''; # ENTITY
2442 !!!next-input-character;
2443 redo A;
2444 } elsif ($self->{is_xml} and
2445 $self->{ct}->{type} == DOCTYPE_TOKEN and
2446 $self->{nc} == 0x005B) { # [
2447 !!!cp (167.3);
2448 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2449 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2450 $self->{in_subset} = 1;
2451 !!!next-input-character;
2452 !!!emit ($self->{ct}); # DOCTYPE
2453 redo A;
2454 } else {
2455 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2456
2457 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2458 !!!cp (180);
2459 $self->{ct}->{quirks} = 1;
2460 $self->{state} = BOGUS_DOCTYPE_STATE;
2461 } else {
2462 !!!cp (180.1);
2463 $self->{state} = BOGUS_MD_STATE;
2464 }
2465
2466 !!!next-input-character;
2467 redo A;
2468 }
2469 } elsif ($self->{state} == PUBLIC_STATE) {
2470 ## ASCII case-insensitive
2471 if ($self->{nc} == [
2472 undef,
2473 0x0055, # U
2474 0x0042, # B
2475 0x004C, # L
2476 0x0049, # I
2477 ]->[length $self->{kwd}] or
2478 $self->{nc} == [
2479 undef,
2480 0x0075, # u
2481 0x0062, # b
2482 0x006C, # l
2483 0x0069, # i
2484 ]->[length $self->{kwd}]) {
2485 !!!cp (175);
2486 ## Stay in the state.
2487 $self->{kwd} .= chr $self->{nc};
2488 !!!next-input-character;
2489 redo A;
2490 } elsif ((length $self->{kwd}) == 5 and
2491 ($self->{nc} == 0x0043 or # C
2492 $self->{nc} == 0x0063)) { # c
2493 if ($self->{is_xml} and
2494 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2495 !!!cp (168.1);
2496 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2497 text => 'PUBLIC',
2498 line => $self->{line_prev},
2499 column => $self->{column_prev} - 4);
2500 } else {
2501 !!!cp (168);
2502 }
2503 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2504 !!!next-input-character;
2505 redo A;
2506 } else {
2507 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2508 line => $self->{line_prev},
2509 column => $self->{column_prev} + 1 - length $self->{kwd});
2510 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2511 !!!cp (169);
2512 $self->{ct}->{quirks} = 1;
2513 $self->{state} = BOGUS_DOCTYPE_STATE;
2514 } else {
2515 !!!cp (169.1);
2516 $self->{state} = BOGUS_MD_STATE;
2517 }
2518 ## Reconsume.
2519 redo A;
2520 }
2521 } elsif ($self->{state} == SYSTEM_STATE) {
2522 ## ASCII case-insensitive
2523 if ($self->{nc} == [
2524 undef,
2525 0x0059, # Y
2526 0x0053, # S
2527 0x0054, # T
2528 0x0045, # E
2529 ]->[length $self->{kwd}] or
2530 $self->{nc} == [
2531 undef,
2532 0x0079, # y
2533 0x0073, # s
2534 0x0074, # t
2535 0x0065, # e
2536 ]->[length $self->{kwd}]) {
2537 !!!cp (170);
2538 ## Stay in the state.
2539 $self->{kwd} .= chr $self->{nc};
2540 !!!next-input-character;
2541 redo A;
2542 } elsif ((length $self->{kwd}) == 5 and
2543 ($self->{nc} == 0x004D or # M
2544 $self->{nc} == 0x006D)) { # m
2545 if ($self->{is_xml} and
2546 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2547 !!!cp (171.1);
2548 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2549 text => 'SYSTEM',
2550 line => $self->{line_prev},
2551 column => $self->{column_prev} - 4);
2552 } else {
2553 !!!cp (171);
2554 }
2555 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2556 !!!next-input-character;
2557 redo A;
2558 } else {
2559 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2560 line => $self->{line_prev},
2561 column => $self->{column_prev} + 1 - length $self->{kwd});
2562 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2563 !!!cp (172);
2564 $self->{ct}->{quirks} = 1;
2565 $self->{state} = BOGUS_DOCTYPE_STATE;
2566 } else {
2567 !!!cp (172.1);
2568 $self->{state} = BOGUS_MD_STATE;
2569 }
2570 ## Reconsume.
2571 redo A;
2572 }
2573 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2574 if ($is_space->{$self->{nc}}) {
2575 !!!cp (181);
2576 ## Stay in the state
2577 !!!next-input-character;
2578 redo A;
2579 } elsif ($self->{nc} eq 0x0022) { # "
2580 !!!cp (182);
2581 $self->{ct}->{pubid} = ''; # DOCTYPE
2582 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2583 !!!next-input-character;
2584 redo A;
2585 } elsif ($self->{nc} eq 0x0027) { # '
2586 !!!cp (183);
2587 $self->{ct}->{pubid} = ''; # DOCTYPE
2588 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2589 !!!next-input-character;
2590 redo A;
2591 } elsif ($self->{nc} eq 0x003E) { # >
2592 !!!parse-error (type => 'no PUBLIC literal');
2593
2594 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2595 !!!cp (184);
2596 $self->{state} = DATA_STATE;
2597 $self->{s_kwd} = '';
2598 $self->{ct}->{quirks} = 1;
2599 } else {
2600 !!!cp (184.1);
2601 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2602 }
2603
2604 !!!next-input-character;
2605 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2606 redo A;
2607 } elsif ($self->{nc} == -1) {
2608 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2609 !!!cp (185);
2610 !!!parse-error (type => 'unclosed DOCTYPE');
2611 $self->{state} = DATA_STATE;
2612 $self->{s_kwd} = '';
2613 $self->{ct}->{quirks} = 1;
2614 } else {
2615 !!!cp (185.1);
2616 !!!parse-error (type => 'unclosed md'); ## TODO: type
2617 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2618 }
2619
2620 ## reconsume
2621 !!!emit ($self->{ct}); # DOCTYPE
2622 redo A;
2623 } elsif ($self->{is_xml} and
2624 $self->{ct}->{type} == DOCTYPE_TOKEN and
2625 $self->{nc} == 0x005B) { # [
2626 !!!cp (186.1);
2627 !!!parse-error (type => 'no PUBLIC literal');
2628 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2629 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2630 $self->{in_subset} = 1;
2631 !!!next-input-character;
2632 !!!emit ($self->{ct}); # DOCTYPE
2633 redo A;
2634 } else {
2635 !!!parse-error (type => 'string after PUBLIC');
2636
2637 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2638 !!!cp (186);
2639 $self->{ct}->{quirks} = 1;
2640 $self->{state} = BOGUS_DOCTYPE_STATE;
2641 } else {
2642 !!!cp (186.2);
2643 $self->{state} = BOGUS_MD_STATE;
2644 }
2645
2646 !!!next-input-character;
2647 redo A;
2648 }
2649 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2650 if ($self->{nc} == 0x0022) { # "
2651 !!!cp (187);
2652 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2653 !!!next-input-character;
2654 redo A;
2655 } elsif ($self->{nc} == 0x003E) { # >
2656 !!!parse-error (type => 'unclosed PUBLIC literal');
2657
2658 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2659 !!!cp (188);
2660 $self->{state} = DATA_STATE;
2661 $self->{s_kwd} = '';
2662 $self->{ct}->{quirks} = 1;
2663 } else {
2664 !!!cp (188.1);
2665 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2666 }
2667
2668 !!!next-input-character;
2669 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2670 redo A;
2671 } elsif ($self->{nc} == -1) {
2672 !!!parse-error (type => 'unclosed PUBLIC literal');
2673
2674 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2675 !!!cp (189);
2676 $self->{state} = DATA_STATE;
2677 $self->{s_kwd} = '';
2678 $self->{ct}->{quirks} = 1;
2679 } else {
2680 !!!cp (189.1);
2681 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2682 }
2683
2684 ## Reconsume.
2685 !!!emit ($self->{ct}); # DOCTYPE
2686 redo A;
2687 } else {
2688 !!!cp (190);
2689 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2690 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2691 length $self->{ct}->{pubid});
2692
2693 ## Stay in the state
2694 !!!next-input-character;
2695 redo A;
2696 }
2697 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2698 if ($self->{nc} == 0x0027) { # '
2699 !!!cp (191);
2700 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2701 !!!next-input-character;
2702 redo A;
2703 } elsif ($self->{nc} == 0x003E) { # >
2704 !!!parse-error (type => 'unclosed PUBLIC literal');
2705
2706 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2707 !!!cp (192);
2708 $self->{state} = DATA_STATE;
2709 $self->{s_kwd} = '';
2710 $self->{ct}->{quirks} = 1;
2711 } else {
2712 !!!cp (192.1);
2713 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2714 }
2715
2716 !!!next-input-character;
2717 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2718 redo A;
2719 } elsif ($self->{nc} == -1) {
2720 !!!parse-error (type => 'unclosed PUBLIC literal');
2721
2722 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2723 !!!cp (193);
2724 $self->{state} = DATA_STATE;
2725 $self->{s_kwd} = '';
2726 $self->{ct}->{quirks} = 1;
2727 } else {
2728 !!!cp (193.1);
2729 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2730 }
2731
2732 ## reconsume
2733 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2734 redo A;
2735 } else {
2736 !!!cp (194);
2737 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2738 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2739 length $self->{ct}->{pubid});
2740
2741 ## Stay in the state
2742 !!!next-input-character;
2743 redo A;
2744 }
2745 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2746 if ($is_space->{$self->{nc}}) {
2747 !!!cp (195);
2748 ## Stay in the state
2749 !!!next-input-character;
2750 redo A;
2751 } elsif ($self->{nc} == 0x0022) { # "
2752 !!!cp (196);
2753 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2754 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2755 !!!next-input-character;
2756 redo A;
2757 } elsif ($self->{nc} == 0x0027) { # '
2758 !!!cp (197);
2759 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2760 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2761 !!!next-input-character;
2762 redo A;
2763 } elsif ($self->{nc} == 0x003E) { # >
2764 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2765 if ($self->{is_xml}) {
2766 !!!cp (198.1);
2767 !!!parse-error (type => 'no SYSTEM literal');
2768 } else {
2769 !!!cp (198);
2770 }
2771 $self->{state} = DATA_STATE;
2772 $self->{s_kwd} = '';
2773 } else {
2774 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2775 !!!cp (198.2);
2776 } else {
2777 !!!cp (198.3);
2778 !!!parse-error (type => 'no SYSTEM literal');
2779 }
2780 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2781 }
2782
2783 !!!next-input-character;
2784 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2785 redo A;
2786 } elsif ($self->{nc} == -1) {
2787 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2788 !!!cp (199);
2789 !!!parse-error (type => 'unclosed DOCTYPE');
2790
2791 $self->{state} = DATA_STATE;
2792 $self->{s_kwd} = '';
2793 $self->{ct}->{quirks} = 1;
2794 } else {
2795 !!!parse-error (type => 'unclosed md'); ## TODO: type
2796 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2797 }
2798
2799 ## reconsume
2800 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2801 redo A;
2802 } elsif ($self->{is_xml} and
2803 $self->{ct}->{type} == DOCTYPE_TOKEN and
2804 $self->{nc} == 0x005B) { # [
2805 !!!cp (200.1);
2806 !!!parse-error (type => 'no SYSTEM literal');
2807 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2808 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2809 $self->{in_subset} = 1;
2810 !!!next-input-character;
2811 !!!emit ($self->{ct}); # DOCTYPE
2812 redo A;
2813 } else {
2814 !!!parse-error (type => 'string after PUBLIC literal');
2815
2816 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2817 !!!cp (200);
2818 $self->{ct}->{quirks} = 1;
2819 $self->{state} = BOGUS_DOCTYPE_STATE;
2820 } else {
2821 !!!cp (200.2);
2822 $self->{state} = BOGUS_MD_STATE;
2823 }
2824
2825 !!!next-input-character;
2826 redo A;
2827 }
2828 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2829 if ($is_space->{$self->{nc}}) {
2830 !!!cp (201);
2831 ## Stay in the state
2832 !!!next-input-character;
2833 redo A;
2834 } elsif ($self->{nc} == 0x0022) { # "
2835 !!!cp (202);
2836 $self->{ct}->{sysid} = ''; # DOCTYPE
2837 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2838 !!!next-input-character;
2839 redo A;
2840 } elsif ($self->{nc} == 0x0027) { # '
2841 !!!cp (203);
2842 $self->{ct}->{sysid} = ''; # DOCTYPE
2843 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2844 !!!next-input-character;
2845 redo A;
2846 } elsif ($self->{nc} == 0x003E) { # >
2847 !!!parse-error (type => 'no SYSTEM literal');
2848 !!!next-input-character;
2849
2850 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2851 !!!cp (204);
2852 $self->{state} = DATA_STATE;
2853 $self->{s_kwd} = '';
2854 $self->{ct}->{quirks} = 1;
2855 } else {
2856 !!!cp (204.1);
2857 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2858 }
2859
2860 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2861 redo A;
2862 } elsif ($self->{nc} == -1) {
2863 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2864 !!!cp (205);
2865 !!!parse-error (type => 'unclosed DOCTYPE');
2866 $self->{state} = DATA_STATE;
2867 $self->{s_kwd} = '';
2868 $self->{ct}->{quirks} = 1;
2869 } else {
2870 !!!cp (205.1);
2871 !!!parse-error (type => 'unclosed md'); ## TODO: type
2872 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2873 }
2874
2875 ## reconsume
2876 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2877 redo A;
2878 } elsif ($self->{is_xml} and
2879 $self->{ct}->{type} == DOCTYPE_TOKEN and
2880 $self->{nc} == 0x005B) { # [
2881 !!!cp (206.1);
2882 !!!parse-error (type => 'no SYSTEM literal');
2883
2884 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2885 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2886 $self->{in_subset} = 1;
2887 !!!next-input-character;
2888 !!!emit ($self->{ct}); # DOCTYPE
2889 redo A;
2890 } else {
2891 !!!parse-error (type => 'string after SYSTEM');
2892
2893 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2894 !!!cp (206);
2895 $self->{ct}->{quirks} = 1;
2896 $self->{state} = BOGUS_DOCTYPE_STATE;
2897 } else {
2898 !!!cp (206.2);
2899 $self->{state} = BOGUS_MD_STATE;
2900 }
2901
2902 !!!next-input-character;
2903 redo A;
2904 }
2905 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2906 if ($self->{nc} == 0x0022) { # "
2907 !!!cp (207);
2908 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2909 !!!next-input-character;
2910 redo A;
2911 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2912 !!!parse-error (type => 'unclosed SYSTEM literal');
2913
2914 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2915 !!!cp (208);
2916 $self->{state} = DATA_STATE;
2917 $self->{s_kwd} = '';
2918 $self->{ct}->{quirks} = 1;
2919 } else {
2920 !!!cp (208.1);
2921 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2922 }
2923
2924 !!!next-input-character;
2925 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2926 redo A;
2927 } elsif ($self->{nc} == -1) {
2928 !!!parse-error (type => 'unclosed SYSTEM literal');
2929
2930 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2931 !!!cp (209);
2932 $self->{state} = DATA_STATE;
2933 $self->{s_kwd} = '';
2934 $self->{ct}->{quirks} = 1;
2935 } else {
2936 !!!cp (209.1);
2937 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2938 }
2939
2940 ## reconsume
2941 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2942 redo A;
2943 } else {
2944 !!!cp (210);
2945 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2946 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2947 length $self->{ct}->{sysid});
2948
2949 ## Stay in the state
2950 !!!next-input-character;
2951 redo A;
2952 }
2953 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2954 if ($self->{nc} == 0x0027) { # '
2955 !!!cp (211);
2956 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2957 !!!next-input-character;
2958 redo A;
2959 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2960 !!!cp (212);
2961 !!!parse-error (type => 'unclosed SYSTEM literal');
2962
2963 $self->{state} = DATA_STATE;
2964 $self->{s_kwd} = '';
2965 !!!next-input-character;
2966
2967 $self->{ct}->{quirks} = 1;
2968 !!!emit ($self->{ct}); # DOCTYPE
2969
2970 redo A;
2971 } elsif ($self->{nc} == -1) {
2972 !!!parse-error (type => 'unclosed SYSTEM literal');
2973
2974 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2975 !!!cp (213);
2976 $self->{state} = DATA_STATE;
2977 $self->{s_kwd} = '';
2978 $self->{ct}->{quirks} = 1;
2979 } else {
2980 !!!cp (213.1);
2981 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2982 }
2983
2984 ## reconsume
2985 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2986 redo A;
2987 } else {
2988 !!!cp (214);
2989 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2990 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2991 length $self->{ct}->{sysid});
2992
2993 ## Stay in the state
2994 !!!next-input-character;
2995 redo A;
2996 }
2997 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2998 if ($is_space->{$self->{nc}}) {
2999 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
3000 !!!cp (215.1);
3001 $self->{state} = BEFORE_NDATA_STATE;
3002 } else {
3003 !!!cp (215);
3004 ## Stay in the state
3005 }
3006 !!!next-input-character;
3007 redo A;
3008 } elsif ($self->{nc} == 0x003E) { # >
3009 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3010 !!!cp (216);
3011 $self->{state} = DATA_STATE;
3012 $self->{s_kwd} = '';
3013 } else {
3014 !!!cp (216.1);
3015 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3016 }
3017
3018 !!!next-input-character;
3019 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3020 redo A;
3021 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3022 ($self->{nc} == 0x004E or # N
3023 $self->{nc} == 0x006E)) { # n
3024 !!!cp (216.2);
3025 !!!parse-error (type => 'no space before NDATA'); ## TODO: type
3026 $self->{state} = NDATA_STATE;
3027 $self->{kwd} = chr $self->{nc};
3028 !!!next-input-character;
3029 redo A;
3030 } elsif ($self->{nc} == -1) {
3031 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3032 !!!cp (217);
3033 !!!parse-error (type => 'unclosed DOCTYPE');
3034 $self->{state} = DATA_STATE;
3035 $self->{s_kwd} = '';
3036 $self->{ct}->{quirks} = 1;
3037 } else {
3038 !!!cp (217.1);
3039 !!!parse-error (type => 'unclosed md'); ## TODO: type
3040 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3041 }
3042
3043 ## reconsume
3044 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3045 redo A;
3046 } elsif ($self->{is_xml} and
3047 $self->{ct}->{type} == DOCTYPE_TOKEN and
3048 $self->{nc} == 0x005B) { # [
3049 !!!cp (218.1);
3050 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3051 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3052 $self->{in_subset} = 1;
3053 !!!next-input-character;
3054 !!!emit ($self->{ct}); # DOCTYPE
3055 redo A;
3056 } else {
3057 !!!parse-error (type => 'string after SYSTEM literal');
3058
3059 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3060 !!!cp (218);
3061 #$self->{ct}->{quirks} = 1;
3062 $self->{state} = BOGUS_DOCTYPE_STATE;
3063 } else {
3064 !!!cp (218.2);
3065 $self->{state} = BOGUS_MD_STATE;
3066 }
3067
3068 !!!next-input-character;
3069 redo A;
3070 }
3071 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
3072 if ($is_space->{$self->{nc}}) {
3073 !!!cp (218.3);
3074 ## Stay in the state.
3075 !!!next-input-character;
3076 redo A;
3077 } elsif ($self->{nc} == 0x003E) { # >
3078 !!!cp (218.4);
3079 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3080 !!!next-input-character;
3081 !!!emit ($self->{ct}); # ENTITY
3082 redo A;
3083 } elsif ($self->{nc} == 0x004E or # N
3084 $self->{nc} == 0x006E) { # n
3085 !!!cp (218.5);
3086 $self->{state} = NDATA_STATE;
3087 $self->{kwd} = chr $self->{nc};
3088 !!!next-input-character;
3089 redo A;
3090 } elsif ($self->{nc} == -1) {
3091 !!!cp (218.6);
3092 !!!parse-error (type => 'unclosed md'); ## TODO: type
3093 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3094 ## reconsume
3095 !!!emit ($self->{ct}); # ENTITY
3096 redo A;
3097 } else {
3098 !!!cp (218.7);
3099 !!!parse-error (type => 'string after SYSTEM literal');
3100 $self->{state} = BOGUS_MD_STATE;
3101 !!!next-input-character;
3102 redo A;
3103 }
3104 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3105 if ($self->{nc} == 0x003E) { # >
3106 !!!cp (219);
3107 $self->{state} = DATA_STATE;
3108 $self->{s_kwd} = '';
3109 !!!next-input-character;
3110
3111 !!!emit ($self->{ct}); # DOCTYPE
3112
3113 redo A;
3114 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3115 !!!cp (220.1);
3116 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3117 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3118 $self->{in_subset} = 1;
3119 !!!next-input-character;
3120 !!!emit ($self->{ct}); # DOCTYPE
3121 redo A;
3122 } elsif ($self->{nc} == -1) {
3123 !!!cp (220);
3124 $self->{state} = DATA_STATE;
3125 $self->{s_kwd} = '';
3126 ## reconsume
3127
3128 !!!emit ($self->{ct}); # DOCTYPE
3129
3130 redo A;
3131 } else {
3132 !!!cp (221);
3133 my $s = '';
3134 $self->{read_until}->($s, q{>[}, 0);
3135
3136 ## Stay in the state
3137 !!!next-input-character;
3138 redo A;
3139 }
3140 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3141 ## NOTE: "CDATA section state" in the spec is jointly implemented
3142 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3143 ## and |CDATA_SECTION_MSE2_STATE|.
3144
3145 ## XML5: "CDATA state".
3146
3147 if ($self->{nc} == 0x005D) { # ]
3148 !!!cp (221.1);
3149 $self->{state} = CDATA_SECTION_MSE1_STATE;
3150 !!!next-input-character;
3151 redo A;
3152 } elsif ($self->{nc} == -1) {
3153 if ($self->{is_xml}) {
3154 !!!cp (221.11);
3155 !!!parse-error (type => 'no mse'); ## TODO: type
3156 } else {
3157 !!!cp (221.12);
3158 }
3159
3160 $self->{state} = DATA_STATE;
3161 $self->{s_kwd} = '';
3162 ## Reconsume.
3163 if (length $self->{ct}->{data}) { # character
3164 !!!cp (221.2);
3165 !!!emit ($self->{ct}); # character
3166 } else {
3167 !!!cp (221.3);
3168 ## No token to emit. $self->{ct} is discarded.
3169 }
3170 redo A;
3171 } else {
3172 !!!cp (221.4);
3173 $self->{ct}->{data} .= chr $self->{nc};
3174 $self->{read_until}->($self->{ct}->{data},
3175 q<]>,
3176 length $self->{ct}->{data});
3177
3178 ## Stay in the state.
3179 !!!next-input-character;
3180 redo A;
3181 }
3182
3183 ## ISSUE: "text tokens" in spec.
3184 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3185 ## XML5: "CDATA bracket state".
3186
3187 if ($self->{nc} == 0x005D) { # ]
3188 !!!cp (221.5);
3189 $self->{state} = CDATA_SECTION_MSE2_STATE;
3190 !!!next-input-character;
3191 redo A;
3192 } else {
3193 !!!cp (221.6);
3194 ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
3195 $self->{ct}->{data} .= ']';
3196 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3197 ## Reconsume.
3198 redo A;
3199 }
3200 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3201 ## XML5: "CDATA end state".
3202
3203 if ($self->{nc} == 0x003E) { # >
3204 $self->{state} = DATA_STATE;
3205 $self->{s_kwd} = '';
3206 !!!next-input-character;
3207 if (length $self->{ct}->{data}) { # character
3208 !!!cp (221.7);
3209 !!!emit ($self->{ct}); # character
3210 } else {
3211 !!!cp (221.8);
3212 ## No token to emit. $self->{ct} is discarded.
3213 }
3214 redo A;
3215 } elsif ($self->{nc} == 0x005D) { # ]
3216 !!!cp (221.9); # character
3217 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3218 ## Stay in the state.
3219 !!!next-input-character;
3220 redo A;
3221 } else {
3222 !!!cp (221.11);
3223 $self->{ct}->{data} .= ']]'; # character
3224 $self->{state} = CDATA_SECTION_STATE;
3225 ## Reconsume. ## XML5: Emit.
3226 redo A;
3227 }
3228 } elsif ($self->{state} == ENTITY_STATE) {
3229 if ($is_space->{$self->{nc}} or
3230 {
3231 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3232 $self->{entity_add} => 1,
3233 }->{$self->{nc}}) {
3234 if ($self->{is_xml}) {
3235 !!!cp (1001.1);
3236 !!!parse-error (type => 'bare ero',
3237 line => $self->{line_prev},
3238 column => $self->{column_prev}
3239 + ($self->{nc} == -1 ? 1 : 0));
3240 } else {
3241 !!!cp (1001);
3242 ## No error
3243 }
3244 ## Don't consume
3245 ## Return nothing.
3246 #
3247 } elsif ($self->{nc} == 0x0023) { # #
3248 !!!cp (999);
3249 $self->{state} = ENTITY_HASH_STATE;
3250 $self->{kwd} = '#';
3251 !!!next-input-character;
3252 redo A;
3253 } elsif ($self->{is_xml} or
3254 (0x0041 <= $self->{nc} and
3255 $self->{nc} <= 0x005A) or # A..Z
3256 (0x0061 <= $self->{nc} and
3257 $self->{nc} <= 0x007A)) { # a..z
3258 !!!cp (998);
3259 require Whatpm::_NamedEntityList;
3260 $self->{state} = ENTITY_NAME_STATE;
3261 $self->{kwd} = chr $self->{nc};
3262 $self->{entity__value} = $self->{kwd};
3263 $self->{entity__match} = 0;
3264 !!!next-input-character;
3265 redo A;
3266 } else {
3267 !!!cp (1027);
3268 !!!parse-error (type => 'bare ero');
3269 ## Return nothing.
3270 #
3271 }
3272
3273 ## NOTE: No character is consumed by the "consume a character
3274 ## reference" algorithm. In other words, there is an "&" character
3275 ## that does not introduce a character reference, which will be
3276 ## appended to the parent element or the attribute value in later
3277 ## processing by the tokenizer.
3278
3279 if ($self->{prev_state} == DATA_STATE) {
3280 !!!cp (997);
3281 $self->{state} = $self->{prev_state};
3282 $self->{s_kwd} = '';
3283 ## Reconsume.
3284 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3285 line => $self->{line_prev},
3286 column => $self->{column_prev},
3287 });
3288 redo A;
3289 } else {
3290 !!!cp (996);
3291 $self->{ca}->{value} .= '&';
3292 $self->{state} = $self->{prev_state};
3293 $self->{s_kwd} = '';
3294 ## Reconsume.
3295 redo A;
3296 }
3297 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3298 if ($self->{nc} == 0x0078) { # x
3299 !!!cp (995);
3300 $self->{state} = HEXREF_X_STATE;
3301 $self->{kwd} .= chr $self->{nc};
3302 !!!next-input-character;
3303 redo A;
3304 } elsif ($self->{nc} == 0x0058) { # X
3305 !!!cp (995.1);
3306 if ($self->{is_xml}) {
3307 !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3308 }
3309 $self->{state} = HEXREF_X_STATE;
3310 $self->{kwd} .= chr $self->{nc};
3311 !!!next-input-character;
3312 redo A;
3313 } elsif (0x0030 <= $self->{nc} and
3314 $self->{nc} <= 0x0039) { # 0..9
3315 !!!cp (994);
3316 $self->{state} = NCR_NUM_STATE;
3317 $self->{kwd} = $self->{nc} - 0x0030;
3318 !!!next-input-character;
3319 redo A;
3320 } else {
3321 !!!parse-error (type => 'bare nero',
3322 line => $self->{line_prev},
3323 column => $self->{column_prev} - 1);
3324
3325 ## NOTE: According to the spec algorithm, nothing is returned,
3326 ## and then "&#" is appended to the parent element or the attribute
3327 ## value in the later processing.
3328
3329 if ($self->{prev_state} == DATA_STATE) {
3330 !!!cp (1019);
3331 $self->{state} = $self->{prev_state};
3332 $self->{s_kwd} = '';
3333 ## Reconsume.
3334 !!!emit ({type => CHARACTER_TOKEN,
3335 data => '&#',
3336 line => $self->{line_prev},
3337 column => $self->{column_prev} - 1,
3338 });
3339 redo A;
3340 } else {
3341 !!!cp (993);
3342 $self->{ca}->{value} .= '&#';
3343 $self->{state} = $self->{prev_state};
3344 $self->{s_kwd} = '';
3345 ## Reconsume.
3346 redo A;
3347 }
3348 }
3349 } elsif ($self->{state} == NCR_NUM_STATE) {
3350 if (0x0030 <= $self->{nc} and
3351 $self->{nc} <= 0x0039) { # 0..9
3352 !!!cp (1012);
3353 $self->{kwd} *= 10;
3354 $self->{kwd} += $self->{nc} - 0x0030;
3355
3356 ## Stay in the state.
3357 !!!next-input-character;
3358 redo A;
3359 } elsif ($self->{nc} == 0x003B) { # ;
3360 !!!cp (1013);
3361 !!!next-input-character;
3362 #
3363 } else {
3364 !!!cp (1014);
3365 !!!parse-error (type => 'no refc');
3366 ## Reconsume.
3367 #
3368 }
3369
3370 my $code = $self->{kwd};
3371 my $l = $self->{line_prev};
3372 my $c = $self->{column_prev};
3373 if ((not $self->{is_xml} and $charref_map->{$code}) or
3374 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3375 ($self->{is_xml} and $code == 0x0000)) {
3376 !!!cp (1015);
3377 !!!parse-error (type => 'invalid character reference',
3378 text => (sprintf 'U+%04X', $code),
3379 line => $l, column => $c);
3380 $code = $charref_map->{$code};
3381 } elsif ($code > 0x10FFFF) {
3382 !!!cp (1016);
3383 !!!parse-error (type => 'invalid character reference',
3384 text => (sprintf 'U-%08X', $code),
3385 line => $l, column => $c);
3386 $code = 0xFFFD;
3387 }
3388
3389 if ($self->{prev_state} == DATA_STATE) {
3390 !!!cp (992);
3391 $self->{state} = $self->{prev_state};
3392 $self->{s_kwd} = '';
3393 ## Reconsume.
3394 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3395 has_reference => 1,
3396 line => $l, column => $c,
3397 });
3398 redo A;
3399 } else {
3400 !!!cp (991);
3401 $self->{ca}->{value} .= chr $code;
3402 $self->{ca}->{has_reference} = 1;
3403 $self->{state} = $self->{prev_state};
3404 $self->{s_kwd} = '';
3405 ## Reconsume.
3406 redo A;
3407 }
3408 } elsif ($self->{state} == HEXREF_X_STATE) {
3409 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3410 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3411 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3412 # 0..9, A..F, a..f
3413 !!!cp (990);
3414 $self->{state} = HEXREF_HEX_STATE;
3415 $self->{kwd} = 0;
3416 ## Reconsume.
3417 redo A;
3418 } else {
3419 !!!parse-error (type => 'bare hcro',
3420 line => $self->{line_prev},
3421 column => $self->{column_prev} - 2);
3422
3423 ## NOTE: According to the spec algorithm, nothing is returned,
3424 ## and then "&#" followed by "X" or "x" is appended to the parent
3425 ## element or the attribute value in the later processing.
3426
3427 if ($self->{prev_state} == DATA_STATE) {
3428 !!!cp (1005);
3429 $self->{state} = $self->{prev_state};
3430 $self->{s_kwd} = '';
3431 ## Reconsume.
3432 !!!emit ({type => CHARACTER_TOKEN,
3433 data => '&' . $self->{kwd},
3434 line => $self->{line_prev},
3435 column => $self->{column_prev} - length $self->{kwd},
3436 });
3437 redo A;
3438 } else {
3439 !!!cp (989);
3440 $self->{ca}->{value} .= '&' . $self->{kwd};
3441 $self->{state} = $self->{prev_state};
3442 $self->{s_kwd} = '';
3443 ## Reconsume.
3444 redo A;
3445 }
3446 }
3447 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3448 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3449 # 0..9
3450 !!!cp (1002);
3451 $self->{kwd} *= 0x10;
3452 $self->{kwd} += $self->{nc} - 0x0030;
3453 ## Stay in the state.
3454 !!!next-input-character;
3455 redo A;
3456 } elsif (0x0061 <= $self->{nc} and
3457 $self->{nc} <= 0x0066) { # a..f
3458 !!!cp (1003);
3459 $self->{kwd} *= 0x10;
3460 $self->{kwd} += $self->{nc} - 0x0060 + 9;
3461 ## Stay in the state.
3462 !!!next-input-character;
3463 redo A;
3464 } elsif (0x0041 <= $self->{nc} and
3465 $self->{nc} <= 0x0046) { # A..F
3466 !!!cp (1004);
3467 $self->{kwd} *= 0x10;
3468 $self->{kwd} += $self->{nc} - 0x0040 + 9;
3469 ## Stay in the state.
3470 !!!next-input-character;
3471 redo A;
3472 } elsif ($self->{nc} == 0x003B) { # ;
3473 !!!cp (1006);
3474 !!!next-input-character;
3475 #
3476 } else {
3477 !!!cp (1007);
3478 !!!parse-error (type => 'no refc',
3479 line => $self->{line},
3480 column => $self->{column});
3481 ## Reconsume.
3482 #
3483 }
3484
3485 my $code = $self->{kwd};
3486 my $l = $self->{line_prev};
3487 my $c = $self->{column_prev};
3488 if ((not $self->{is_xml} and $charref_map->{$code}) or
3489 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3490 ($self->{is_xml} and $code == 0x0000)) {
3491 !!!cp (1008);
3492 !!!parse-error (type => 'invalid character reference',
3493 text => (sprintf 'U+%04X', $code),
3494 line => $l, column => $c);
3495 $code = $charref_map->{$code};
3496 } elsif ($code > 0x10FFFF) {
3497 !!!cp (1009);
3498 !!!parse-error (type => 'invalid character reference',
3499 text => (sprintf 'U-%08X', $code),
3500 line => $l, column => $c);
3501 $code = 0xFFFD;
3502 }
3503
3504 if ($self->{prev_state} == DATA_STATE) {
3505 !!!cp (988);
3506 $self->{state} = $self->{prev_state};
3507 $self->{s_kwd} = '';
3508 ## Reconsume.
3509 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3510 has_reference => 1,
3511 line => $l, column => $c,
3512 });
3513 redo A;
3514 } else {
3515 !!!cp (987);
3516 $self->{ca}->{value} .= chr $code;
3517 $self->{ca}->{has_reference} = 1;
3518 $self->{state} = $self->{prev_state};
3519 $self->{s_kwd} = '';
3520 ## Reconsume.
3521 redo A;
3522 }
3523 } elsif ($self->{state} == ENTITY_NAME_STATE) {
3524 if ((0x0041 <= $self->{nc} and # A
3525 $self->{nc} <= 0x005A) or # Z
3526 (0x0061 <= $self->{nc} and # a
3527 $self->{nc} <= 0x007A) or # z
3528 (0x0030 <= $self->{nc} and # 0
3529 $self->{nc} <= 0x0039) or # 9
3530 $self->{nc} == 0x003B or # ;
3531 ($self->{is_xml} and
3532 not ($is_space->{$self->{nc}} or
3533 {
3534 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3535 $self->{entity_add} => 1,
3536 }->{$self->{nc}}))) {
3537 our $EntityChar;
3538 $self->{kwd} .= chr $self->{nc};
3539 if (defined $EntityChar->{$self->{kwd}} or
3540 $self->{ge}->{$self->{kwd}}) {
3541 if ($self->{nc} == 0x003B) { # ;
3542 if (defined $self->{ge}->{$self->{kwd}}) {
3543 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3544 !!!cp (1020.1);
3545 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3546 } else {
3547 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3548 !!!cp (1020.2);
3549 !!!parse-error (type => 'unparsed entity', ## TODO: type
3550 value => $self->{kwd});
3551 } else {
3552 !!!cp (1020.3);
3553 }
3554 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3555 }
3556 } else {
3557 if ($self->{is_xml}) {
3558 !!!cp (1020.4);
3559 !!!parse-error (type => 'entity not declared', ## TODO: type
3560 value => $self->{kwd},
3561 level => {
3562 'amp;' => $self->{level}->{warn},
3563 'quot;' => $self->{level}->{warn},
3564 'lt;' => $self->{level}->{warn},
3565 'gt;' => $self->{level}->{warn},
3566 'apos;' => $self->{level}->{warn},
3567 }->{$self->{kwd}} ||
3568 $self->{level}->{must});
3569 } else {
3570 !!!cp (1020);
3571 }
3572 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3573 }
3574 $self->{entity__match} = 1;
3575 !!!next-input-character;
3576 #
3577 } else {
3578 !!!cp (1021);
3579 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3580 $self->{entity__match} = -1;
3581 ## Stay in the state.
3582 !!!next-input-character;
3583 redo A;
3584 }
3585 } else {
3586 !!!cp (1022);
3587 $self->{entity__value} .= chr $self->{nc};
3588 $self->{entity__match} *= 2;
3589 ## Stay in the state.
3590 !!!next-input-character;
3591 redo A;
3592 }
3593 }
3594
3595 my $data;
3596 my $has_ref;
3597 if ($self->{entity__match} > 0) {
3598 !!!cp (1023);
3599 $data = $self->{entity__value};
3600 $has_ref = 1;
3601 #
3602 } elsif ($self->{entity__match} < 0) {
3603 !!!parse-error (type => 'no refc');
3604 if ($self->{prev_state} != DATA_STATE and # in attribute
3605 $self->{entity__match} < -1) {
3606 !!!cp (1024);
3607 $data = '&' . $self->{kwd};
3608 #
3609 } else {
3610 !!!cp (1025);
3611 $data = $self->{entity__value};
3612 $has_ref = 1;
3613 #
3614 }
3615 } else {
3616 !!!cp (1026);
3617 !!!parse-error (type => 'bare ero',
3618 line => $self->{line_prev},
3619 column => $self->{column_prev} - length $self->{kwd});
3620 $data = '&' . $self->{kwd};
3621 #
3622 }
3623
3624 ## NOTE: In these cases, when a character reference is found,
3625 ## it is consumed and a character token is returned, or, otherwise,
3626 ## nothing is consumed and returned, according to the spec algorithm.
3627 ## In this implementation, anything that has been examined by the
3628 ## tokenizer is appended to the parent element or the attribute value
3629 ## as a string, either a literal string when there is no character
3630 ## reference or an entity-replaced string otherwise, at this stage,
3631 ## since any characters that would not be consumed are appended in the
3632 ## data state or in an appropriate attribute value state anyway.
3633
3634 if ($self->{prev_state} == DATA_STATE) {
3635 !!!cp (986);
3636 $self->{state} = $self->{prev_state};
3637 $self->{s_kwd} = '';
3638 ## Reconsume.
3639 !!!emit ({type => CHARACTER_TOKEN,
3640 data => $data,
3641 has_reference => $has_ref,
3642 line => $self->{line_prev},
3643 column => $self->{column_prev} + 1 - length $self->{kwd},
3644 });
3645 redo A;
3646 } else {
3647 !!!cp (985);
3648 $self->{ca}->{value} .= $data;
3649 $self->{ca}->{has_reference} = 1 if $has_ref;
3650 $self->{state} = $self->{prev_state};
3651 $self->{s_kwd} = '';
3652 ## Reconsume.
3653 redo A;
3654 }
3655
3656 ## XML-only states
3657
3658 } elsif ($self->{state} == PI_STATE) {
3659 ## XML5: "Pi state" and "DOCTYPE pi state".
3660
3661 if ($is_space->{$self->{nc}} or
3662 $self->{nc} == 0x003F or # ?
3663 $self->{nc} == -1) {
3664 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3665 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3666 ## "DOCTYPE pi state": Parse error, switch to the "data
3667 ## state".
3668 !!!parse-error (type => 'bare pio', ## TODO: type
3669 line => $self->{line_prev},
3670 column => $self->{column_prev}
3671 - 1 * ($self->{nc} != -1));
3672 $self->{state} = BOGUS_COMMENT_STATE;
3673 ## Reconsume.
3674 $self->{ct} = {type => COMMENT_TOKEN,
3675 data => '?',
3676 line => $self->{line_prev},
3677 column => $self->{column_prev}
3678 - 1 * ($self->{nc} != -1),
3679 };
3680 redo A;
3681 } else {
3682 ## XML5: "DOCTYPE pi state": Stay in the state.
3683 $self->{ct} = {type => PI_TOKEN,
3684 target => chr $self->{nc},
3685 data => '',
3686 line => $self->{line_prev},
3687 column => $self->{column_prev} - 1,
3688 };
3689 $self->{state} = PI_TARGET_STATE;
3690 !!!next-input-character;
3691 redo A;
3692 }
3693 } elsif ($self->{state} == PI_TARGET_STATE) {
3694 if ($is_space->{$self->{nc}}) {
3695 $self->{state} = PI_TARGET_AFTER_STATE;
3696 !!!next-input-character;
3697 redo A;
3698 } elsif ($self->{nc} == -1) {
3699 !!!parse-error (type => 'no pic'); ## TODO: type
3700 if ($self->{in_subset}) {
3701 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3702 } else {
3703 $self->{state} = DATA_STATE;
3704 $self->{s_kwd} = '';
3705 }
3706 ## Reconsume.
3707 !!!emit ($self->{ct}); # pi
3708 redo A;
3709 } elsif ($self->{nc} == 0x003F) { # ?
3710 $self->{state} = PI_AFTER_STATE;
3711 !!!next-input-character;
3712 redo A;
3713 } else {
3714 ## XML5: typo ("tag name" -> "target")
3715 $self->{ct}->{target} .= chr $self->{nc}; # pi
3716 !!!next-input-character;
3717 redo A;
3718 }
3719 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3720 if ($is_space->{$self->{nc}}) {
3721 ## Stay in the state.
3722 !!!next-input-character;
3723 redo A;
3724 } else {
3725 $self->{state} = PI_DATA_STATE;
3726 ## Reprocess.
3727 redo A;
3728 }
3729 } elsif ($self->{state} == PI_DATA_STATE) {
3730 if ($self->{nc} == 0x003F) { # ?
3731 $self->{state} = PI_DATA_AFTER_STATE;
3732 !!!next-input-character;
3733 redo A;
3734 } elsif ($self->{nc} == -1) {
3735 !!!parse-error (type => 'no pic'); ## TODO: type
3736 if ($self->{in_subset}) {
3737 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3738 } else {
3739 $self->{state} = DATA_STATE;
3740 $self->{s_kwd} = '';
3741 }
3742 ## Reprocess.
3743 !!!emit ($self->{ct}); # pi
3744 redo A;
3745 } else {
3746 $self->{ct}->{data} .= chr $self->{nc}; # pi
3747 $self->{read_until}->($self->{ct}->{data}, q[?],
3748 length $self->{ct}->{data});
3749 ## Stay in the state.
3750 !!!next-input-character;
3751 ## Reprocess.
3752 redo A;
3753 }
3754 } elsif ($self->{state} == PI_AFTER_STATE) {
3755 ## XML5: Part of "Pi after state".
3756
3757 if ($self->{nc} == 0x003E) { # >
3758 if ($self->{in_subset}) {
3759 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3760 } else {
3761 $self->{state} = DATA_STATE;
3762 $self->{s_kwd} = '';
3763 }
3764 !!!next-input-character;
3765 !!!emit ($self->{ct}); # pi
3766 redo A;
3767 } elsif ($self->{nc} == 0x003F) { # ?
3768 !!!parse-error (type => 'no s after target', ## TODO: type
3769 line => $self->{line_prev},
3770 column => $self->{column_prev}); ## XML5: no error
3771 $self->{ct}->{data} .= '?';
3772 $self->{state} = PI_DATA_AFTER_STATE;
3773 !!!next-input-character;
3774 redo A;
3775 } else {
3776 !!!parse-error (type => 'no s after target', ## TODO: type
3777 line => $self->{line_prev},
3778 column => $self->{column_prev}
3779 + 1 * ($self->{nc} == -1)); ## XML5: no error
3780 $self->{ct}->{data} .= '?'; ## XML5: not appended
3781 $self->{state} = PI_DATA_STATE;
3782 ## Reprocess.
3783 redo A;
3784 }
3785 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3786 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3787
3788 if ($self->{nc} == 0x003E) { # >
3789 if ($self->{in_subset}) {
3790 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3791 } else {
3792 $self->{state} = DATA_STATE;
3793 $self->{s_kwd} = '';
3794 }
3795 !!!next-input-character;
3796 !!!emit ($self->{ct}); # pi
3797 redo A;
3798 } elsif ($self->{nc} == 0x003F) { # ?
3799 $self->{ct}->{data} .= '?';
3800 ## Stay in the state.
3801 !!!next-input-character;
3802 redo A;
3803 } else {
3804 $self->{ct}->{data} .= '?'; ## XML5: not appended
3805 $self->{state} = PI_DATA_STATE;
3806 ## Reprocess.
3807 redo A;
3808 }
3809
3810 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3811 if ($self->{nc} == 0x003C) { # <
3812 $self->{state} = DOCTYPE_TAG_STATE;
3813 !!!next-input-character;
3814 redo A;
3815 } elsif ($self->{nc} == 0x0025) { # %
3816 ## XML5: Not defined yet.
3817
3818 ## TODO:
3819
3820 if (not $self->{stop_processing} and
3821 not $self->{document}->xml_standalone) {
3822 !!!parse-error (type => 'stop processing', ## TODO: type
3823 level => $self->{level}->{info});
3824 $self->{stop_processing} = 1;
3825 }
3826
3827 !!!next-input-character;
3828 redo A;
3829 } elsif ($self->{nc} == 0x005D) { # ]
3830 delete $self->{in_subset};
3831 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3832 !!!next-input-character;
3833 redo A;
3834 } elsif ($is_space->{$self->{nc}}) {
3835 ## Stay in the state.
3836 !!!next-input-character;
3837 redo A;
3838 } elsif ($self->{nc} == -1) {
3839 !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3840 delete $self->{in_subset};
3841 $self->{state} = DATA_STATE;
3842 $self->{s_kwd} = '';
3843 ## Reconsume.
3844 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3845 redo A;
3846 } else {
3847 unless ($self->{internal_subset_tainted}) {
3848 ## XML5: No parse error.
3849 !!!parse-error (type => 'string in internal subset');
3850 $self->{internal_subset_tainted} = 1;
3851 }
3852 ## Stay in the state.
3853 !!!next-input-character;
3854 redo A;
3855 }
3856 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3857 if ($self->{nc} == 0x003E) { # >
3858 $self->{state} = DATA_STATE;
3859 $self->{s_kwd} = '';
3860 !!!next-input-character;
3861 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3862 redo A;
3863 } elsif ($self->{nc} == -1) {
3864 !!!parse-error (type => 'unclosed DOCTYPE');
3865 $self->{state} = DATA_STATE;
3866 $self->{s_kwd} = '';
3867 ## Reconsume.
3868 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3869 redo A;
3870 } else {
3871 ## XML5: No parse error and stay in the state.
3872 !!!parse-error (type => 'string after internal subset'); ## TODO: type
3873
3874 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3875 !!!next-input-character;
3876 redo A;
3877 }
3878 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3879 if ($self->{nc} == 0x003E) { # >
3880 $self->{state} = DATA_STATE;
3881 $self->{s_kwd} = '';
3882 !!!next-input-character;
3883 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3884 redo A;
3885 } elsif ($self->{nc} == -1) {
3886 $self->{state} = DATA_STATE;
3887 $self->{s_kwd} = '';
3888 ## Reconsume.
3889 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3890 redo A;
3891 } else {
3892 ## Stay in the state.
3893 !!!next-input-character;
3894 redo A;
3895 }
3896 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3897 if ($self->{nc} == 0x0021) { # !
3898 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3899 !!!next-input-character;
3900 redo A;
3901 } elsif ($self->{nc} == 0x003F) { # ?
3902 $self->{state} = PI_STATE;
3903 !!!next-input-character;
3904 redo A;
3905 } elsif ($self->{nc} == -1) {
3906 !!!parse-error (type => 'bare stago');
3907 $self->{state} = DATA_STATE;
3908 $self->{s_kwd} = '';
3909 ## Reconsume.
3910 redo A;
3911 } else {
3912 !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3913 line => $self->{line_prev},
3914 column => $self->{column_prev});
3915 $self->{state} = BOGUS_COMMENT_STATE;
3916 $self->{ct} = {type => COMMENT_TOKEN,
3917 data => '',
3918 }; ## NOTE: Will be discarded.
3919 !!!next-input-character;
3920 redo A;
3921 }
3922 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3923 ## XML5: "DOCTYPE markup declaration state".
3924
3925 if ($self->{nc} == 0x002D) { # -
3926 $self->{state} = MD_HYPHEN_STATE;
3927 !!!next-input-character;
3928 redo A;
3929 } elsif ($self->{nc} == 0x0045 or # E
3930 $self->{nc} == 0x0065) { # e
3931 $self->{state} = MD_E_STATE;
3932 $self->{kwd} = chr $self->{nc};
3933 !!!next-input-character;
3934 redo A;
3935 } elsif ($self->{nc} == 0x0041 or # A
3936 $self->{nc} == 0x0061) { # a
3937 $self->{state} = MD_ATTLIST_STATE;
3938 $self->{kwd} = chr $self->{nc};
3939 !!!next-input-character;
3940 redo A;
3941 } elsif ($self->{nc} == 0x004E or # N
3942 $self->{nc} == 0x006E) { # n
3943 $self->{state} = MD_NOTATION_STATE;
3944 $self->{kwd} = chr $self->{nc};
3945 !!!next-input-character;
3946 redo A;
3947 } else {
3948 #
3949 }
3950
3951 ## XML5: No parse error.
3952 !!!parse-error (type => 'bogus comment',
3953 line => $self->{line_prev},
3954 column => $self->{column_prev} - 1);
3955 ## Reconsume.
3956 $self->{state} = BOGUS_COMMENT_STATE;
3957 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3958 redo A;
3959 } elsif ($self->{state} == MD_E_STATE) {
3960 if ($self->{nc} == 0x004E or # N
3961 $self->{nc} == 0x006E) { # n
3962 $self->{state} = MD_ENTITY_STATE;
3963 $self->{kwd} .= chr $self->{nc};
3964 !!!next-input-character;
3965 redo A;
3966 } elsif ($self->{nc} == 0x004C or # L
3967 $self->{nc} == 0x006C) { # l
3968 ## XML5: <!ELEMENT> not supported.
3969 $self->{state} = MD_ELEMENT_STATE;
3970 $self->{kwd} .= chr $self->{nc};
3971 !!!next-input-character;
3972 redo A;
3973 } else {
3974 ## XML5: No parse error.
3975 !!!parse-error (type => 'bogus comment',
3976 line => $self->{line_prev},
3977 column => $self->{column_prev} - 2
3978 + 1 * ($self->{nc} == -1));
3979 ## Reconsume.
3980 $self->{state} = BOGUS_COMMENT_STATE;
3981 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3982 redo A;
3983 }
3984 } elsif ($self->{state} == MD_ENTITY_STATE) {
3985 if ($self->{nc} == [
3986 undef,
3987 undef,
3988 0x0054, # T
3989 0x0049, # I
3990 0x0054, # T
3991 ]->[length $self->{kwd}] or
3992 $self->{nc} == [
3993 undef,
3994 undef,
3995 0x0074, # t
3996 0x0069, # i
3997 0x0074, # t
3998 ]->[length $self->{kwd}]) {
3999 ## Stay in the state.
4000 $self->{kwd} .= chr $self->{nc};
4001 !!!next-input-character;
4002 redo A;
4003 } elsif ((length $self->{kwd}) == 5 and
4004 ($self->{nc} == 0x0059 or # Y
4005 $self->{nc} == 0x0079)) { # y
4006 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
4007 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4008 text => 'ENTITY',
4009 line => $self->{line_prev},
4010 column => $self->{column_prev} - 4);
4011 }
4012 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
4013 line => $self->{line_prev},
4014 column => $self->{column_prev} - 6};
4015 $self->{state} = DOCTYPE_MD_STATE;
4016 !!!next-input-character;
4017 redo A;
4018 } else {
4019 !!!parse-error (type => 'bogus comment',
4020 line => $self->{line_prev},
4021 column => $self->{column_prev} - 1
4022 - (length $self->{kwd})
4023 + 1 * ($self->{nc} == -1));
4024 $self->{state} = BOGUS_COMMENT_STATE;
4025 ## Reconsume.
4026 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4027 redo A;
4028 }
4029 } elsif ($self->{state} == MD_ELEMENT_STATE) {
4030 if ($self->{nc} == [
4031 undef,
4032 undef,
4033 0x0045, # E
4034 0x004D, # M
4035 0x0045, # E
4036 0x004E, # N
4037 ]->[length $self->{kwd}] or
4038 $self->{nc} == [
4039 undef,
4040 undef,
4041 0x0065, # e
4042 0x006D, # m
4043 0x0065, # e
4044 0x006E, # n
4045 ]->[length $self->{kwd}]) {
4046 ## Stay in the state.
4047 $self->{kwd} .= chr $self->{nc};
4048 !!!next-input-character;
4049 redo A;
4050 } elsif ((length $self->{kwd}) == 6 and
4051 ($self->{nc} == 0x0054 or # T
4052 $self->{nc} == 0x0074)) { # t
4053 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
4054 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4055 text => 'ELEMENT',
4056 line => $self->{line_prev},
4057 column => $self->{column_prev} - 5);
4058 }
4059 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
4060 line => $self->{line_prev},
4061 column => $self->{column_prev} - 7};
4062 $self->{state} = DOCTYPE_MD_STATE;
4063 !!!next-input-character;
4064 redo A;
4065 } else {
4066 !!!parse-error (type => 'bogus comment',
4067 line => $self->{line_prev},
4068 column => $self->{column_prev} - 1
4069 - (length $self->{kwd})
4070 + 1 * ($self->{nc} == -1));
4071 $self->{state} = BOGUS_COMMENT_STATE;
4072 ## Reconsume.
4073 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4074 redo A;
4075 }
4076 } elsif ($self->{state} == MD_ATTLIST_STATE) {
4077 if ($self->{nc} == [
4078 undef,
4079 0x0054, # T
4080 0x0054, # T
4081 0x004C, # L
4082 0x0049, # I
4083 0x0053, # S
4084 ]->[length $self->{kwd}] or
4085 $self->{nc} == [
4086 undef,
4087 0x0074, # t
4088 0x0074, # t
4089 0x006C, # l
4090 0x0069, # i
4091 0x0073, # s
4092 ]->[length $self->{kwd}]) {
4093 ## Stay in the state.
4094 $self->{kwd} .= chr $self->{nc};
4095 !!!next-input-character;
4096 redo A;
4097 } elsif ((length $self->{kwd}) == 6 and
4098 ($self->{nc} == 0x0054 or # T
4099 $self->{nc} == 0x0074)) { # t
4100 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
4101 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4102 text => 'ATTLIST',
4103 line => $self->{line_prev},
4104 column => $self->{column_prev} - 5);
4105 }
4106 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
4107 attrdefs => [],
4108 line => $self->{line_prev},
4109 column => $self->{column_prev} - 7};
4110 $self->{state} = DOCTYPE_MD_STATE;
4111 !!!next-input-character;
4112 redo A;
4113 } else {
4114 !!!parse-error (type => 'bogus comment',
4115 line => $self->{line_prev},
4116 column => $self->{column_prev} - 1
4117 - (length $self->{kwd})
4118 + 1 * ($self->{nc} == -1));
4119 $self->{state} = BOGUS_COMMENT_STATE;
4120 ## Reconsume.
4121 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4122 redo A;
4123 }
4124 } elsif ($self->{state} == MD_NOTATION_STATE) {
4125 if ($self->{nc} == [
4126 undef,
4127 0x004F, # O
4128 0x0054, # T
4129 0x0041, # A
4130 0x0054, # T
4131 0x0049, # I
4132 0x004F, # O
4133 ]->[length $self->{kwd}] or
4134 $self->{nc} == [
4135 undef,
4136 0x006F, # o
4137 0x0074, # t
4138 0x0061, # a
4139 0x0074, # t
4140 0x0069, # i
4141 0x006F, # o
4142 ]->[length $self->{kwd}]) {
4143 ## Stay in the state.
4144 $self->{kwd} .= chr $self->{nc};
4145 !!!next-input-character;
4146 redo A;
4147 } elsif ((length $self->{kwd}) == 7 and
4148 ($self->{nc} == 0x004E or # N
4149 $self->{nc} == 0x006E)) { # n
4150 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4151 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4152 text => 'NOTATION',
4153 line => $self->{line_prev},
4154 column => $self->{column_prev} - 6);
4155 }
4156 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4157 line => $self->{line_prev},
4158 column => $self->{column_prev} - 8};
4159 $self->{state} = DOCTYPE_MD_STATE;
4160 !!!next-input-character;
4161 redo A;
4162 } else {
4163 !!!parse-error (type => 'bogus comment',
4164 line => $self->{line_prev},
4165 column => $self->{column_prev} - 1
4166 - (length $self->{kwd})
4167 + 1 * ($self->{nc} == -1));
4168 $self->{state} = BOGUS_COMMENT_STATE;
4169 ## Reconsume.
4170 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4171 redo A;
4172 }
4173 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4174 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4175 ## "DOCTYPE NOTATION state".
4176
4177 if ($is_space->{$self->{nc}}) {
4178 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4179 $self->{state} = BEFORE_MD_NAME_STATE;
4180 !!!next-input-character;
4181 redo A;
4182 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4183 $self->{nc} == 0x0025) { # %
4184 ## XML5: Switch to the "DOCTYPE bogus comment state".
4185 !!!parse-error (type => 'no space before md name'); ## TODO: type
4186 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4187 !!!next-input-character;
4188 redo A;
4189 } elsif ($self->{nc} == -1) {
4190 !!!parse-error (type => 'unclosed md'); ## TODO: type
4191 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4192 ## Reconsume.
4193 redo A;
4194 } elsif ($self->{nc} == 0x003E) { # >
4195 ## XML5: Switch to the "DOCTYPE bogus comment state".
4196 !!!parse-error (type => 'no md name'); ## TODO: type
4197 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4198 !!!next-input-character;
4199 redo A;
4200 } else {
4201 ## XML5: Switch to the "DOCTYPE bogus comment state".
4202 !!!parse-error (type => 'no space before md name'); ## TODO: type
4203 $self->{state} = BEFORE_MD_NAME_STATE;
4204 redo A;
4205 }
4206 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4207 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4208 ## before state", "DOCTYPE ATTLIST name before state".
## Skips white space before the declaration name; the first name
## character is appended to $self->{ct}->{name}.
4209
4210 if ($is_space->{$self->{nc}}) {
4211 ## Stay in the state.
4212 !!!next-input-character;
4213 redo A;
4214 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4215 $self->{nc} == 0x0025) { # %
## "%" is only special while the token is still a general entity.
4216 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4217 !!!next-input-character;
4218 redo A;
4219 } elsif ($self->{nc} == 0x003E) { # >
4220 ## XML5: Same as "Anything else".
## Declaration closed without a name: the token is not emitted here.
4221 !!!parse-error (type => 'no md name'); ## TODO: type
4222 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4223 !!!next-input-character;
4224 redo A;
4225 } elsif ($self->{nc} == -1) {
4226 !!!parse-error (type => 'unclosed md'); ## TODO: type
4227 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4228 ## Reconsume.
4229 redo A;
4230 } else {
4231 ## XML5: [ATTLIST] Not defined yet.
## First character of the name.
4232 $self->{ct}->{name} .= chr $self->{nc};
4233 $self->{state} = MD_NAME_STATE;
4234 !!!next-input-character;
4235 redo A;
4236 }
4237 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
## After the "%" of an ENTITY declaration.  White space confirms a
## parameter entity declaration; anything else is an error.
4238 if ($is_space->{$self->{nc}}) {
4239 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
## Retype the current token, then go on to read its name.
4240 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4241 $self->{state} = BEFORE_MD_NAME_STATE;
4242 !!!next-input-character;
4243 redo A;
4244 } elsif ($self->{nc} == 0x003E) { # >
4245 ## XML5: Same as "Anything else".
4246 !!!parse-error (type => 'no md name'); ## TODO: type
4247 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4248 !!!next-input-character;
4249 redo A;
4250 } elsif ($self->{nc} == -1) {
4251 !!!parse-error (type => 'unclosed md');
4252 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4253 ## Reconsume.
4254 redo A;
4255 } else {
4256 ## XML5: No parse error.
## The declaration is abandoned: the current token is replaced by a
## placeholder comment token and the rest is read as a bogus comment.
4257 !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4258 $self->{state} = BOGUS_COMMENT_STATE;
4259 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4260 ## Reconsume.
4261 redo A;
4262 }
4263 } elsif ($self->{state} == MD_NAME_STATE) {
4264 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
## Accumulates the declaration name in $self->{ct}->{name}.  The state
## that follows the name depends on the kind of declaration.
4265
4266 if ($is_space->{$self->{nc}}) {
## Dispatch on the token type once the name is complete.
4267 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4268 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4269 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4270 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4271 } else { # ENTITY/NOTATION
4272 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4273 }
4274 !!!next-input-character;
4275 redo A;
4276 } elsif ($self->{nc} == 0x003E) { # >
## An ATTLIST with a name only is accepted silently; every other
## declaration kind is missing its definition.
4277 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4278 #
4279 } else {
4280 !!!parse-error (type => 'no md def'); ## TODO: type
4281 }
4282 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4283 !!!next-input-character;
4284 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4285 redo A;
4286 } elsif ($self->{nc} == -1) {
4287 ## XML5: [ATTLIST] No parse error.
4288 !!!parse-error (type => 'unclosed md');
4289 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4290 ## Reconsume.
4291 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4292 redo A;
4293 } else {
4294 ## XML5: [ATTLIST] Not defined yet.
4295 $self->{ct}->{name} .= chr $self->{nc};
4296 ## Stay in the state.
4297 !!!next-input-character;
4298 redo A;
4299 }
4300 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
## Between attribute definitions of an <!ATTLIST> declaration: either
## the declaration ends here or a new attribute definition begins.
4301 if ($is_space->{$self->{nc}}) {
4302 ## Stay in the state.
4303 !!!next-input-character;
4304 redo A;
4305 } elsif ($self->{nc} == 0x003E) { # >
4306 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4307 !!!next-input-character;
4308 !!!emit ($self->{ct}); # ATTLIST
4309 redo A;
4310 } elsif ($self->{nc} == -1) {
4311 ## XML5: No parse error.
4312 !!!parse-error (type => 'unclosed md'); ## TODO: type
4313 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
## Reconsume (no next-input-character here).
4314 !!!emit ($self->{ct});
4315 redo A;
4316 } else {
4317 ## XML5: Not defined yet.
## Start a new attribute definition record.  The current character is
## the first character of its name; the position is recorded as well.
4318 $self->{ca} = {name => chr ($self->{nc}), # attrdef
4319 tokens => [],
4320 line => $self->{line}, column => $self->{column}};
4321 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4322 !!!next-input-character;
4323 redo A;
4324 }
4325 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
## Accumulates the name of an attribute definition of an <!ATTLIST>
## declaration in $self->{ca}->{name} until white space, ">", "(" or
## EOF is seen.
4326 if ($is_space->{$self->{nc}}) {
4327 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4328 !!!next-input-character;
4329 redo A;
4330 } elsif ($self->{nc} == 0x003E) { # >
4331 ## XML5: Same as "anything else".
4332 !!!parse-error (type => 'no attr type'); ## TODO: type
4333 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4334 !!!next-input-character;
4335 !!!emit ($self->{ct}); # ATTLIST
4336 redo A;
4337 } elsif ($self->{nc} == 0x0028) { # (
4338 ## XML5: Same as "anything else".
4339 !!!parse-error (type => 'no space before paren'); ## TODO: type
4340 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4341 !!!next-input-character;
4342 redo A;
4343 } elsif ($self->{nc} == -1) {
4344 ## XML5: No parse error.
4345 !!!parse-error (type => 'unclosed md'); ## TODO: type
4346 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4347 ## Reconsume: do not read past EOF.
4348 !!!emit ($self->{ct}); # ATTLIST
4349 redo A;
4350 } else {
4351 ## XML5: Not defined yet.
4352 $self->{ca}->{name} .= chr $self->{nc};
4353 ## Stay in the state.
4354 !!!next-input-character;
4355 redo A;
4356 }
4357 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
## After the attribute name: skips white space, then expects either an
## enumeration "(" or the first character of the attribute type.
4358 if ($is_space->{$self->{nc}}) {
4359 ## Stay in the state.
4360 !!!next-input-character;
4361 redo A;
4362 } elsif ($self->{nc} == 0x003E) { # >
4363 ## XML5: Same as "anything else".
4364 !!!parse-error (type => 'no attr type'); ## TODO: type
4365 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4366 !!!next-input-character;
4367 !!!emit ($self->{ct}); # ATTLIST
4368 redo A;
4369 } elsif ($self->{nc} == 0x0028) { # (
4370 ## XML5: Same as "anything else".
4371 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4372 !!!next-input-character;
4373 redo A;
4374 } elsif ($self->{nc} == -1) {
4375 ## XML5: No parse error.
4376 !!!parse-error (type => 'unclosed md'); ## TODO: type
4377 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4378 ## Reconsume: do not read past EOF.
4379 !!!emit ($self->{ct});
4380 redo A;
4381 } else {
4382 ## XML5: Not defined yet.
## First character of the attribute type keyword.
4383 $self->{ca}->{type} = chr $self->{nc};
4384 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4385 !!!next-input-character;
4386 redo A;
4387 }
4388 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
## Accumulates the attribute type in $self->{ca}->{type}.  A "#", a
## quote or "(" directly after the type is accepted, but reported as a
## missing space.
4389 if ($is_space->{$self->{nc}}) {
4390 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4391 !!!next-input-character;
4392 redo A;
4393 } elsif ($self->{nc} == 0x0023) { # #
4394 ## XML5: Same as "anything else".
4395 !!!parse-error (type => 'no space before default value'); ## TODO: type
4396 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4397 !!!next-input-character;
4398 redo A;
4399 } elsif ($self->{nc} == 0x0022) { # "
4400 ## XML5: Same as "anything else".
4401 !!!parse-error (type => 'no space before default value'); ## TODO: type
4402 $self->{ca}->{value} = '';
4403 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4404 !!!next-input-character;
4405 redo A;
4406 } elsif ($self->{nc} == 0x0027) { # '
4407 ## XML5: Same as "anything else".
4408 !!!parse-error (type => 'no space before default value'); ## TODO: type
4409 $self->{ca}->{value} = '';
4410 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4411 !!!next-input-character;
4412 redo A;
4413 } elsif ($self->{nc} == 0x003E) { # >
4414 ## XML5: Same as "anything else".
4415 !!!parse-error (type => 'no attr default'); ## TODO: type
4416 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4417 !!!next-input-character;
4418 !!!emit ($self->{ct}); # ATTLIST
4419 redo A;
4420 } elsif ($self->{nc} == 0x0028) { # (
4421 ## XML5: Same as "anything else".
4422 !!!parse-error (type => 'no space before paren'); ## TODO: type
4423 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4424 !!!next-input-character;
4425 redo A;
4426 } elsif ($self->{nc} == -1) {
4427 ## XML5: No parse error.
4428 !!!parse-error (type => 'unclosed md'); ## TODO: type
4429 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4430 ## Reconsume: do not read past EOF.
4431 !!!emit ($self->{ct});
4432 redo A;
4433 } else {
4434 ## XML5: Not defined yet.
4435 $self->{ca}->{type} .= chr $self->{nc};
4436 ## Stay in the state.
4437 !!!next-input-character;
4438 redo A;
4439 }
4440 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
## After the attribute type: skips white space, then expects "(", a
## "#" default keyword, or a quoted default value.
4441 if ($is_space->{$self->{nc}}) {
4442 ## Stay in the state.
4443 !!!next-input-character;
4444 redo A;
4445 } elsif ($self->{nc} == 0x0028) { # (
4446 ## XML5: Same as "anything else".
4447 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4448 !!!next-input-character;
4449 redo A;
4450 } elsif ($self->{nc} == 0x0023) { # #
4451 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4452 !!!next-input-character;
4453 redo A;
4454 } elsif ($self->{nc} == 0x0022) { # "
4455 ## XML5: Same as "anything else".
4456 $self->{ca}->{value} = '';
4457 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4458 !!!next-input-character;
4459 redo A;
4460 } elsif ($self->{nc} == 0x0027) { # '
4461 ## XML5: Same as "anything else".
4462 $self->{ca}->{value} = '';
4463 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4464 !!!next-input-character;
4465 redo A;
4466 } elsif ($self->{nc} == 0x003E) { # >
4467 ## XML5: Same as "anything else".
4468 !!!parse-error (type => 'no attr default'); ## TODO: type
4469 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4470 !!!next-input-character;
4471 !!!emit ($self->{ct}); # ATTLIST
4472 redo A;
4473 } elsif ($self->{nc} == -1) {
4474 ## XML5: No parse error.
4475 !!!parse-error (type => 'unclosed md'); ## TODO: type
4476 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4477 ## Reconsume: do not read past EOF.
4478 !!!emit ($self->{ct});
4479 redo A;
4480 } else {
4481 ## XML5: Switch to the "DOCTYPE bogus comment state".
## Any other character starts an unquoted default value (an error).
4482 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4483 $self->{ca}->{value} = '';
4484 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4485 ## Reconsume.
4486 redo A;
4487 }
4488 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
## Inside the "(...)" enumeration of an attribute definition, before a
## token.  Each token is pushed onto @{$self->{ca}->{tokens}}.
4489 if ($is_space->{$self->{nc}}) {
4490 ## Stay in the state.
4491 !!!next-input-character;
4492 redo A;
4493 } elsif ($self->{nc} == 0x007C) { # |
4494 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4495 ## Stay in the state.
4496 !!!next-input-character;
4497 redo A;
4498 } elsif ($self->{nc} == 0x0029) { # )
4499 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4500 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4501 !!!next-input-character;
4502 redo A;
4503 } elsif ($self->{nc} == 0x003E) { # >
4504 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4505 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4506 !!!next-input-character;
4507 !!!emit ($self->{ct}); # ATTLIST
4508 redo A;
4509 } elsif ($self->{nc} == -1) {
4510 ## XML5: No parse error.
4511 !!!parse-error (type => 'unclosed md'); ## TODO: type
4512 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4513 ## Reconsume: do not read past EOF.
4514 !!!emit ($self->{ct});
4515 redo A;
4516 } else {
## First character of a new allowed token.
4517 push @{$self->{ca}->{tokens}}, chr $self->{nc};
4518 $self->{state} = ALLOWED_TOKEN_STATE;
4519 !!!next-input-character;
4520 redo A;
4521 }
4522 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
## Inside an enumeration token: characters are appended to the last
## entry of @{$self->{ca}->{tokens}}.
4523 if ($is_space->{$self->{nc}}) {
4524 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4525 !!!next-input-character;
4526 redo A;
4527 } elsif ($self->{nc} == 0x007C) { # |
4528 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4529 !!!next-input-character;
4530 redo A;
4531 } elsif ($self->{nc} == 0x0029) { # )
4532 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4533 !!!next-input-character;
4534 redo A;
4535 } elsif ($self->{nc} == 0x003E) { # >
4536 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4537 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4538 !!!next-input-character;
4539 !!!emit ($self->{ct}); # ATTLIST
4540 redo A;
4541 } elsif ($self->{nc} == -1) {
4542 ## XML5: No parse error.
4543 !!!parse-error (type => 'unclosed md'); ## TODO: type
4544 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4545 ## Reconsume: do not read past EOF.
4546 !!!emit ($self->{ct});
4547 redo A;
4548 } else {
4549 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4550 ## Stay in the state.
4551 !!!next-input-character;
4552 redo A;
4553 }
4554 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
## After white space following an enumeration token: expects "|" or
## ")".  Any other character is treated as a continuation of the token.
4555 if ($is_space->{$self->{nc}}) {
4556 ## Stay in the state.
4557 !!!next-input-character;
4558 redo A;
4559 } elsif ($self->{nc} == 0x007C) { # |
4560 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4561 !!!next-input-character;
4562 redo A;
4563 } elsif ($self->{nc} == 0x0029) { # )
4564 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4565 !!!next-input-character;
4566 redo A;
4567 } elsif ($self->{nc} == 0x003E) { # >
4568 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4569 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4570 !!!next-input-character;
4571 !!!emit ($self->{ct}); # ATTLIST
4572 redo A;
4573 } elsif ($self->{nc} == -1) {
4574 ## XML5: No parse error.
4575 !!!parse-error (type => 'unclosed md'); ## TODO: type
4576 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4577 ## Reconsume: do not read past EOF.
4578 !!!emit ($self->{ct});
4579 redo A;
4580 } else {
## The error is reported at the previous character position.  The
## token continues with a single space followed by this character.
4581 !!!parse-error (type => 'space in allowed token', ## TODO: type
4582 line => $self->{line_prev},
4583 column => $self->{column_prev});
4584 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4585 $self->{state} = ALLOWED_TOKEN_STATE;
4586 !!!next-input-character;
4587 redo A;
4588 }
4589 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
## Directly after the ")" of an enumeration.  White space is expected;
## a default that starts immediately is accepted with an error.
4590 if ($is_space->{$self->{nc}}) {
4591 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4592 !!!next-input-character;
4593 redo A;
4594 } elsif ($self->{nc} == 0x0023) { # #
4595 !!!parse-error (type => 'no space before default value'); ## TODO: type
4596 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4597 !!!next-input-character;
4598 redo A;
4599 } elsif ($self->{nc} == 0x0022) { # "
4600 !!!parse-error (type => 'no space before default value'); ## TODO: type
4601 $self->{ca}->{value} = '';
4602 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4603 !!!next-input-character;
4604 redo A;
4605 } elsif ($self->{nc} == 0x0027) { # '
4606 !!!parse-error (type => 'no space before default value'); ## TODO: type
4607 $self->{ca}->{value} = '';
4608 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4609 !!!next-input-character;
4610 redo A;
4611 } elsif ($self->{nc} == 0x003E) { # >
4612 !!!parse-error (type => 'no attr default'); ## TODO: type
4613 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4614 !!!next-input-character;
4615 !!!emit ($self->{ct}); # ATTLIST
4616 redo A;
4617 } elsif ($self->{nc} == -1) {
4618 !!!parse-error (type => 'unclosed md'); ## TODO: type
4619 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4620 ## Reconsume: do not read past EOF.
4621 !!!emit ($self->{ct});
4622 redo A;
4623 } else {
4624 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4625 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4626 ## Reconsume.
4627 redo A;
4628 }
4629 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
## After the enumeration and white space: expects a "#" default
## keyword or a quoted default value.
4630 if ($is_space->{$self->{nc}}) {
4631 ## Stay in the state.
4632 !!!next-input-character;
4633 redo A;
4634 } elsif ($self->{nc} == 0x0023) { # #
4635 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4636 !!!next-input-character;
4637 redo A;
4638 } elsif ($self->{nc} == 0x0022) { # "
4639 $self->{ca}->{value} = '';
4640 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4641 !!!next-input-character;
4642 redo A;
4643 } elsif ($self->{nc} == 0x0027) { # '
4644 $self->{ca}->{value} = '';
4645 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4646 !!!next-input-character;
4647 redo A;
4648 } elsif ($self->{nc} == 0x003E) { # >
4649 !!!parse-error (type => 'no attr default'); ## TODO: type
4650 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4651 !!!next-input-character;
4652 !!!emit ($self->{ct}); # ATTLIST
4653 redo A;
4654 } elsif ($self->{nc} == -1) {
4655 !!!parse-error (type => 'unclosed md'); ## TODO: type
4656 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4657 ## Reconsume: do not read past EOF.
4658 !!!emit ($self->{ct});
4659 redo A;
4660 } else {
4661 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4662 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4663 ## Reconsume.
4664 redo A;
4665 }
4666 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
## Directly after the "#" of a default declaration: the next character
## starts the keyword stored in $self->{ca}->{default}.
4667 if ($is_space->{$self->{nc}}) {
4668 ## XML5: No parse error.
4669 !!!parse-error (type => 'no default type'); ## TODO: type
4670 $self->{state} = BOGUS_MD_STATE;
4671 ## Reconsume.
4672 redo A;
4673 } elsif ($self->{nc} == 0x0022) { # "
4674 ## XML5: Same as "anything else".
4675 $self->{ca}->{value} = '';
4676 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4677 !!!next-input-character;
4678 redo A;
4679 } elsif ($self->{nc} == 0x0027) { # '
4680 ## XML5: Same as "anything else".
4681 $self->{ca}->{value} = '';
4682 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4683 !!!next-input-character;
4684 redo A;
4685 } elsif ($self->{nc} == 0x003E) { # >
4686 ## XML5: Same as "anything else".
4687 !!!parse-error (type => 'no attr default'); ## TODO: type
4688 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4689 !!!next-input-character;
4690 !!!emit ($self->{ct}); # ATTLIST
4691 redo A;
4692 } elsif ($self->{nc} == -1) {
4693 ## XML5: No parse error.
4694 !!!parse-error (type => 'unclosed md'); ## TODO: type
4695 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4696 ## Reconsume: do not read past EOF.
4697 !!!emit ($self->{ct});
4698 redo A;
4699 } else {
## First character of the default keyword.
4700 $self->{ca}->{default} = chr $self->{nc};
4701 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4702 !!!next-input-character;
4703 redo A;
4704 }
4705 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
## Accumulates the default keyword in $self->{ca}->{default}.  On ">"
## or EOF the attribute definition is appended to the ATTLIST token's
## attrdefs before the token is emitted.
4706 if ($is_space->{$self->{nc}}) {
4707 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4708 !!!next-input-character;
4709 redo A;
4710 } elsif ($self->{nc} == 0x0022) { # "
4711 ## XML5: Same as "anything else".
4712 !!!parse-error (type => 'no space before default value'); ## TODO: type
4713 $self->{ca}->{value} = '';
4714 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4715 !!!next-input-character;
4716 redo A;
4717 } elsif ($self->{nc} == 0x0027) { # '
4718 ## XML5: Same as "anything else".
4719 !!!parse-error (type => 'no space before default value'); ## TODO: type
4720 $self->{ca}->{value} = '';
4721 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4722 !!!next-input-character;
4723 redo A;
4724 } elsif ($self->{nc} == 0x003E) { # >
4725 ## XML5: Same as "anything else".
4726 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4727 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4728 !!!next-input-character;
4729 !!!emit ($self->{ct}); # ATTLIST
4730 redo A;
4731 } elsif ($self->{nc} == -1) {
4732 ## XML5: No parse error.
4733 !!!parse-error (type => 'unclosed md'); ## TODO: type
4734 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4735 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4736 ## Reconsume: do not read past EOF.
4737 !!!emit ($self->{ct});
4738 redo A;
4739 } else {
4740 $self->{ca}->{default} .= chr $self->{nc};
4741 ## Stay in the state.
4742 !!!next-input-character;
4743 redo A;
4744 }
4745 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
## After the default keyword and white space: a quoted value may
## follow; otherwise the attribute definition is complete.
4746 if ($is_space->{$self->{nc}}) {
4747 ## Stay in the state.
4748 !!!next-input-character;
4749 redo A;
4750 } elsif ($self->{nc} == 0x0022) { # "
4751 $self->{ca}->{value} = '';
4752 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4753 !!!next-input-character;
4754 redo A;
4755 } elsif ($self->{nc} == 0x0027) { # '
4756 $self->{ca}->{value} = '';
4757 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4758 !!!next-input-character;
4759 redo A;
4760 } elsif ($self->{nc} == 0x003E) { # >
4761 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4762 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4763 !!!next-input-character;
4764 !!!emit ($self->{ct}); # ATTLIST
4765 redo A;
4766 } elsif ($self->{nc} == -1) {
4767 ## XML5: No parse error.
4768 !!!parse-error (type => 'unclosed md'); ## TODO: type
4769 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4770 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4771 ## Reconsume: do not read past EOF.
4772 !!!emit ($self->{ct});
4773 redo A;
4774 } else {
4775 ## XML5: Not defined yet.
## After "FIXED" the character starts an unquoted value.  After any
## other keyword the definition is stored, and the character is
## reprocessed as the start of the next attribute definition.
4776 if ($self->{ca}->{default} eq 'FIXED') {
4777 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4778 } else {
4779 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4780 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4781 }
4782 ## Reconsume.
4783 redo A;
4784 }
4785 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
## Checks the character that follows a quoted default value.  Both
## branches hand the same character on to
## DOCTYPE_ATTLIST_NAME_AFTER_STATE; they differ only in whether a
## missing-space error is reported first.
4786 if ($is_space->{$self->{nc}} or
4787 $self->{nc} == -1 or
4788 $self->{nc} == 0x003E) { # >
4789 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4790 ## Reconsume.
4791 redo A;
4792 } else {
4793 !!!parse-error (type => 'no space before attr name'); ## TODO: type
4794 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4795 ## Reconsume.
4796 redo A;
4797 }
4798 } elsif ($self->{state} == NDATA_STATE) {
4799 ## ASCII case-insensitive
## Matches the rest of the keyword "NDATA" one character per pass.
## $self->{kwd} holds what has been matched so far, and its length
## selects the expected next letter.  NOTE(review): kwd presumably
## already holds the initial "N" on entry (index 0 of the tables is
## undef) -- confirm against the state that switches here.
## NOTE(review): when length kwd is 0 or >= 4 the table lookup yields
## undef, which compares numerically as 0; if $self->{nc} can ever be
## 0 (U+0000), this first branch would match it -- confirm.
4800 if ($self->{nc} == [
4801 undef,
4802 0x0044, # D
4803 0x0041, # A
4804 0x0054, # T
4805 ]->[length $self->{kwd}] or
4806 $self->{nc} == [
4807 undef,
4808 0x0064, # d
4809 0x0061, # a
4810 0x0074, # t
4811 ]->[length $self->{kwd}]) {
4812 !!!cp (172.2);
4813 ## Stay in the state.
4814 $self->{kwd} .= chr $self->{nc};
4815 !!!next-input-character;
4816 redo A;
4817 } elsif ((length $self->{kwd}) == 4 and
4818 ($self->{nc} == 0x0041 or # A
4819 $self->{nc} == 0x0061)) { # a
## Final letter.  The keyword is accepted in any case, but an error is
## reported unless it was written exactly as upper-case "NDATA".
4820 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4821 !!!cp (172.3);
4822 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4823 text => 'NDATA',
4824 line => $self->{line_prev},
4825 column => $self->{column_prev} - 4);
4826 } else {
4827 !!!cp (172.4);
4828 }
4829 $self->{state} = AFTER_NDATA_STATE;
4830 !!!next-input-character;
4831 redo A;
4832 } else {
## Not the keyword: the error column is moved back by the number of
## characters matched so far.
4833 !!!parse-error (type => 'string after literal', ## TODO: type
4834 line => $self->{line_prev},
4835 column => $self->{column_prev} + 1
4836 - length $self->{kwd});
4837 !!!cp (172.5);
4838 $self->{state} = BOGUS_MD_STATE;
4839 ## Reconsume.
4840 redo A;
4841 }
4842 } elsif ($self->{state} == AFTER_NDATA_STATE) {
## Directly after the "NDATA" keyword of an ENTITY declaration: white
## space must separate the keyword from the notation name.
4843 if ($is_space->{$self->{nc}}) {
4844 $self->{state} = BEFORE_NOTATION_NAME_STATE;
4845 !!!next-input-character;
4846 redo A;
4847 } elsif ($self->{nc} == 0x003E) { # >
4848 !!!parse-error (type => 'no notation name'); ## TODO: type
4849 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4850 !!!next-input-character;
4851 !!!emit ($self->{ct}); # ENTITY
4852 redo A;
4853 } elsif ($self->{nc} == -1) {
4854 !!!parse-error (type => 'unclosed md'); ## TODO: type
4855 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4856 ## Reconsume: do not read past EOF.
4857 !!!emit ($self->{ct}); # ENTITY
4858 redo A;
4859 } else {
4860 !!!parse-error (type => 'string after literal', ## TODO: type
4861 line => $self->{line_prev},
4862 column => $self->{column_prev} + 1
4863 - length $self->{kwd});
4864 $self->{state} = BOGUS_MD_STATE;
4865 ## Reconsume.
4866 redo A;
4867 }
4868 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
## Skips white space after "NDATA"; the first other character starts
## the notation name stored in $self->{ct}->{notation}.
4869 if ($is_space->{$self->{nc}}) {
4870 ## Stay in the state.
4871 !!!next-input-character;
4872 redo A;
4873 } elsif ($self->{nc} == 0x003E) { # >
4874 !!!parse-error (type => 'no notation name'); ## TODO: type
4875 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4876 !!!next-input-character;
4877 !!!emit ($self->{ct}); # ENTITY
4878 redo A;
4879 } elsif ($self->{nc} == -1) {
4880 !!!parse-error (type => 'unclosed md'); ## TODO: type
4881 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4882 ## Reconsume: do not read past EOF.
4883 !!!emit ($self->{ct}); # ENTITY
4884 redo A;
4885 } else {
4886 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4887 $self->{state} = NOTATION_NAME_STATE;
4888 !!!next-input-character;
4889 redo A;
4890 }
4891 } elsif ($self->{state} == NOTATION_NAME_STATE) {
## Accumulates the notation name of an unparsed entity in
## $self->{ct}->{notation} until white space, ">" or EOF.
4892 if ($is_space->{$self->{nc}}) {
4893 $self->{state} = AFTER_MD_DEF_STATE;
4894 !!!next-input-character;
4895 redo A;
4896 } elsif ($self->{nc} == 0x003E) { # >
4897 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4898 !!!next-input-character;
4899 !!!emit ($self->{ct}); # ENTITY
4900 redo A;
4901 } elsif ($self->{nc} == -1) {
4902 !!!parse-error (type => 'unclosed md'); ## TODO: type
4903 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4904 ## Reconsume: do not read past EOF.
4905 !!!emit ($self->{ct}); # ENTITY
4906 redo A;
4907 } else {
4908 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4909 ## Stay in the state.
4910 !!!next-input-character;
4911 redo A;
4912 }
4913 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
## Inside a double-quoted entity value: characters are appended to
## $self->{ct}->{value} until the closing quotation mark.
4914 if ($self->{nc} == 0x0022) { # "
4915 $self->{state} = AFTER_MD_DEF_STATE;
4916 !!!next-input-character;
4917 redo A;
4918 } elsif ($self->{nc} == 0x0026) { # &
## Remember this state so that reference handling can return to it;
## entity_add records this literal's closing quote character.
4919 $self->{prev_state} = $self->{state};
4920 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4921 $self->{entity_add} = 0x0022; # "
4922 !!!next-input-character;
4923 redo A;
4924 ## TODO: %
4925 } elsif ($self->{nc} == -1) {
4926 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4927 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4928 ## Reconsume.
4929 !!!emit ($self->{ct}); # ENTITY
4930 redo A;
4931 } else {
## Stay in the state.
4932 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4933 !!!next-input-character;
4934 redo A;
4935 }
4936 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
## Inside a single-quoted entity value: characters are appended to
## $self->{ct}->{value} until the closing apostrophe.
4937 if ($self->{nc} == 0x0027) { # '
4938 $self->{state} = AFTER_MD_DEF_STATE;
4939 !!!next-input-character;
4940 redo A;
4941 } elsif ($self->{nc} == 0x0026) { # &
## Remember this state so that reference handling can return to it;
## entity_add records this literal's closing quote character.
4942 $self->{prev_state} = $self->{state};
4943 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4944 $self->{entity_add} = 0x0027; # '
4945 !!!next-input-character;
4946 redo A;
4947 ## TODO: %
4948 } elsif ($self->{nc} == -1) {
4949 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4950 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4951 ## Reconsume.
4952 !!!emit ($self->{ct}); # ENTITY
4953 redo A;
4954 } else {
## Stay in the state.
4955 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4956 !!!next-input-character;
4957 redo A;
4958 }
4959 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
## After "&" inside an entity value.  Only "#" starts reference
## processing here; in every other case the "&" is kept literally and
## the quoted-value state saved in prev_state resumes.
4960 if ($is_space->{$self->{nc}} or
4961 {
4962 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4963 $self->{entity_add} => 1,
4964 }->{$self->{nc}}) {
## White space, "<", "&", EOF or the literal's closing quote: a bare
## "&".  At EOF the reported column is one further on.
4965 !!!parse-error (type => 'bare ero',
4966 line => $self->{line_prev},
4967 column => $self->{column_prev}
4968 + ($self->{nc} == -1 ? 1 : 0));
4969 ## Don't consume
4970 ## Return nothing.
4971 #
4972 } elsif ($self->{nc} == 0x0023) { # #
## $self->{ca} is pointed at the entity token itself.
## NOTE(review): presumably so the shared reference states append to
## its value -- confirm in ENTITY_HASH_STATE.
4973 $self->{ca} = $self->{ct};
4974 $self->{state} = ENTITY_HASH_STATE;
4975 $self->{kwd} = '#';
4976 !!!next-input-character;
4977 redo A;
4978 } else {
4979 #
4980 }
4981
## Reached by the first and last branches only: keep the "&" as data.
4982 $self->{ct}->{value} .= '&';
4983 $self->{state} = $self->{prev_state};
4984 ## Reconsume.
4985 redo A;
4986 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
## Directly after the name of an <!ELEMENT> declaration.  The content
## specification is collected as a list in $self->{ct}->{content};
## $self->{group_depth} counts the currently open "(" groups.
4987 if ($is_space->{$self->{nc}}) {
4988 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4989 !!!next-input-character;
4990 redo A;
4991 } elsif ($self->{nc} == 0x0028) { # (
4992 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4993 $self->{ct}->{content} = ['('];
4994 $self->{group_depth} = 1;
4995 !!!next-input-character;
4996 redo A;
4997 } elsif ($self->{nc} == 0x003E) { # >
4998 !!!parse-error (type => 'no md def'); ## TODO: type
4999 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5000 !!!next-input-character;
5001 !!!emit ($self->{ct}); # ELEMENT
5002 redo A;
5003 } elsif ($self->{nc} == -1) {
5004 !!!parse-error (type => 'unclosed md'); ## TODO: type
5005 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5006 ## Reconsume: do not read past EOF.
5007 !!!emit ($self->{ct}); # ELEMENT
5008 redo A;
5009 } else {
## First character of a content keyword.
5010 $self->{ct}->{content} = [chr $self->{nc}];
5011 $self->{state} = CONTENT_KEYWORD_STATE;
5012 !!!next-input-character;
5013 redo A;
5014 }
5015 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
## Accumulates a content keyword in the last entry of
## @{$self->{ct}->{content}} until white space, ">" or EOF.
5016 if ($is_space->{$self->{nc}}) {
5017 $self->{state} = AFTER_MD_DEF_STATE;
5018 !!!next-input-character;
5019 redo A;
5020 } elsif ($self->{nc} == 0x003E) { # >
5021 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5022 !!!next-input-character;
5023 !!!emit ($self->{ct}); # ELEMENT
5024 redo A;
5025 } elsif ($self->{nc} == -1) {
5026 !!!parse-error (type => 'unclosed md'); ## TODO: type
5027 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5028 ## Reconsume: do not read past EOF.
5029 !!!emit ($self->{ct}); # ELEMENT
5030 redo A;
5031 } else {
5032 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
5033 ## Stay in the state.
5034 !!!next-input-character;
5035 redo A;
5036 }
5037 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
## Inside a content model group, where an element name or a nested
## group is expected (after "(", "|" or ",").  When the declaration
## ends early, one ")" per open group is appended before emitting.
5038 if ($is_space->{$self->{nc}}) {
5039 ## Stay in the state.
5040 !!!next-input-character;
5041 redo A;
5042 } elsif ($self->{nc} == 0x0028) { # (
5043 $self->{group_depth}++;
5044 push @{$self->{ct}->{content}}, chr $self->{nc};
5045 ## Stay in the state.
5046 !!!next-input-character;
5047 redo A;
5048 } elsif ($self->{nc} == 0x007C or # |
5049 $self->{nc} == 0x002C) { # ,
## A separator with no name before it: reported and not recorded.
5050 !!!parse-error (type => 'empty element name'); ## TODO: type
5051 ## Stay in the state.
5052 !!!next-input-character;
5053 redo A;
5054 } elsif ($self->{nc} == 0x0029) { # )
5055 !!!parse-error (type => 'empty element name'); ## TODO: type
5056 push @{$self->{ct}->{content}}, chr $self->{nc};
5057 $self->{group_depth}--;
5058 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5059 !!!next-input-character;
5060 redo A;
5061 } elsif ($self->{nc} == 0x003E) { # >
5062 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5063 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5064 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5065 !!!next-input-character;
5066 !!!emit ($self->{ct}); # ELEMENT
5067 redo A;
5068 } elsif ($self->{nc} == -1) {
5069 !!!parse-error (type => 'unclosed md'); ## TODO: type
5070 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5071 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5072 ## Reconsume: do not read past EOF.
5073 !!!emit ($self->{ct}); # ELEMENT
5074 redo A;
5075 } else {
## First character of an element name in the content model.
5076 push @{$self->{ct}->{content}}, chr $self->{nc};
5077 $self->{state} = CM_ELEMENT_NAME_STATE;
5078 !!!next-input-character;
5079 redo A;
5080 }
5081 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
## Accumulates an element name of the content model in the last entry
## of @{$self->{ct}->{content}}.  Occurrence indicators, separators
## and ")" end the name and are pushed as separate entries.
5082 if ($is_space->{$self->{nc}}) {
5083 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5084 !!!next-input-character;
5085 redo A;
5086 } elsif ($self->{nc} == 0x002A or # *
5087 $self->{nc} == 0x002B or # +
5088 $self->{nc} == 0x003F) { # ?
5089 push @{$self->{ct}->{content}}, chr $self->{nc};
5090 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5091 !!!next-input-character;
5092 redo A;
5093 } elsif ($self->{nc} == 0x007C or # |
5094 $self->{nc} == 0x002C) { # ,
## Separators are stored with normalized spacing (" | " or ", ").
5095 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5096 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5097 !!!next-input-character;
5098 redo A;
5099 } elsif ($self->{nc} == 0x0029) { # )
5100 $self->{group_depth}--;
5101 push @{$self->{ct}->{content}}, chr $self->{nc};
5102 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5103 !!!next-input-character;
5104 redo A;
5105 } elsif ($self->{nc} == 0x003E) { # >
5106 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5107 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5108 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5109 !!!next-input-character;
5110 !!!emit ($self->{ct}); # ELEMENT
5111 redo A;
5112 } elsif ($self->{nc} == -1) {
5113 !!!parse-error (type => 'unclosed md'); ## TODO: type
5114 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5115 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5116 ## Reconsume: do not read past EOF.
5117 !!!emit ($self->{ct}); # ELEMENT
5118 redo A;
5119 } else {
5120 $self->{ct}->{content}->[-1] .= chr $self->{nc};
5121 ## Stay in the state.
5122 !!!next-input-character;
5123 redo A;
5124 }
5125 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
5126 if ($is_space->{$self->{nc}}) {
5127 ## Stay in the state.
5128 !!!next-input-character;
5129 redo A;
5130 } elsif ($self->{nc} == 0x007C or # |
5131 $self->{nc} == 0x002C) { # ,
5132 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5133 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5134 !!!next-input-character;
5135 redo A;
5136 } elsif ($self->{nc} == 0x0029) { # )
5137 $self->{group_depth}--;
5138 push @{$self->{ct}->{content}}, chr $self->{nc};
5139 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5140 !!!next-input-character;
5141 redo A;
5142 } elsif ($self->{nc} == 0x003E) { # >
5143 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5144 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5145 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5146 !!!next-input-character;
5147 !!!emit ($self->{ct}); # ELEMENT
5148 redo A;
5149 } elsif ($self->{nc} == -1) {
5150 !!!parse-error (type => 'unclosed md'); ## TODO: type
5151 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5152 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5153 !!!next-input-character;
5154 !!!emit ($self->{ct}); # ELEMENT
5155 redo A;
5156 } else {
5157 !!!parse-error (type => 'after element name'); ## TODO: type
5158 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5159 $self->{state} = BOGUS_MD_STATE;
5160 !!!next-input-character;
5161 redo A;
5162 }
5163 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5164 if ($is_space->{$self->{nc}}) {
5165 if ($self->{group_depth}) {
5166 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5167 } else {
5168 $self->{state} = AFTER_MD_DEF_STATE;
5169 }
5170 !!!next-input-character;
5171 redo A;
5172 } elsif ($self->{nc} == 0x002A or # *
5173 $self->{nc} == 0x002B or # +
5174 $self->{nc} == 0x003F) { # ?
5175 push @{$self->{ct}->{content}}, chr $self->{nc};
5176 if ($self->{group_depth}) {
5177 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5178 } else {
5179 $self->{state} = AFTER_MD_DEF_STATE;
5180 }
5181 !!!next-input-character;
5182 redo A;
5183 } elsif ($self->{nc} == 0x0029) { # )
5184 if ($self->{group_depth}) {
5185 $self->{group_depth}--;
5186 push @{$self->{ct}->{content}}, chr $self->{nc};
5187 ## Stay in the state.
5188 !!!next-input-character;
5189 redo A;
5190 } else {
5191 !!!parse-error (type => 'string after md def'); ## TODO: type
5192 $self->{state} = BOGUS_MD_STATE;
5193 ## Reconsume.
5194 redo A;
5195 }
5196 } elsif ($self->{nc} == 0x003E) { # >
5197 if ($self->{group_depth}) {
5198 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5199 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5200 }
5201 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5202 !!!next-input-character;
5203 !!!emit ($self->{ct}); # ELEMENT
5204 redo A;
5205 } elsif ($self->{nc} == -1) {
5206 !!!parse-error (type => 'unclosed md'); ## TODO: type
5207 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5208 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5209 !!!next-input-character;
5210 !!!emit ($self->{ct}); # ELEMENT
5211 redo A;
5212 } else {
5213 if ($self->{group_depth}) {
5214 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5215 } else {
5216 !!!parse-error (type => 'string after md def'); ## TODO: type
5217 $self->{state} = BOGUS_MD_STATE;
5218 }
5219 ## Reconsume.
5220 redo A;
5221 }
5222 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5223 if ($is_space->{$self->{nc}}) {
5224 ## Stay in the state.
5225 !!!next-input-character;
5226 redo A;
5227 } elsif ($self->{nc} == 0x003E) { # >
5228 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5229 !!!next-input-character;
5230 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5231 redo A;
5232 } elsif ($self->{nc} == -1) {
5233 !!!parse-error (type => 'unclosed md'); ## TODO: type
5234 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5235 !!!next-input-character;
5236 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5237 redo A;
5238 } else {
5239 !!!parse-error (type => 'string after md def'); ## TODO: type
5240 $self->{state} = BOGUS_MD_STATE;
5241 ## Reconsume.
5242 redo A;
5243 }
5244 } elsif ($self->{state} == BOGUS_MD_STATE) {
5245 if ($self->{nc} == 0x003E) { # >
5246 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5247 !!!next-input-character;
5248 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5249 redo A;
5250 } elsif ($self->{nc} == -1) {
5251 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5252 ## Reconsume.
5253 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5254 redo A;
5255 } else {
5256 ## Stay in the state.
5257 !!!next-input-character;
5258 redo A;
5259 }
5260 } else {
5261 die "$0: $self->{state}: Unknown state";
5262 }
5263 } # A
5264
5265 die "$0: _get_next_token: unexpected case";
5266 } # _get_next_token
5267
## A module file must end by evaluating to a true value so that
## |require| / |use| of this package succeeds.
5268 1;
5269 ## $Date: 2009/09/05 10:41:07 $
5270

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24