/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.28 - (show annotations) (download) (as text)
Sun Jul 5 04:38:45 2009 UTC (15 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.27: +13 -3 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	5 Jul 2009 04:38:11 -0000
2009-07-05  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: Updated the result (c.f. HTML5 revision
	3121).

++ whatpm/Whatpm/HTML/ChangeLog	5 Jul 2009 04:38:33 -0000
2009-07-05  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: Reduced the number of parse errors on broken
	DOCTYPE (HTML5 revision 3121).

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.27 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 );
25
26 our %EXPORT_TAGS = (
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 }
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 }
52 sub START_TAG_TOKEN () { 3 }
53 sub END_TAG_TOKEN () { 4 }
54 sub END_OF_FILE_TOKEN () { 5 }
55 sub CHARACTER_TOKEN () { 6 }
56 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} set to
71 ## an empty string.
72
73 package Whatpm::HTML;
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82
83 sub PLAINTEXT_CONTENT_MODEL () { 0 }
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87
88 ## Tokenizer states
89
90 sub DATA_STATE () { 0 }
91 #sub ENTITY_DATA_STATE () { 1 }
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
108 sub COMMENT_END_DASH_STATE () { 18 }
109 sub BOGUS_COMMENT_STATE () { 19 }
110 sub DOCTYPE_STATE () { 20 }
111 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112 sub DOCTYPE_NAME_STATE () { 22 }
113 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122 sub BOGUS_DOCTYPE_STATE () { 32 }
123 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124 sub SELF_CLOSING_START_TAG_STATE () { 34 }
125 sub CDATA_SECTION_STATE () { 35 }
126 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134 ## NOTE: "Entity data state", "entity in attribute value state", and
135 ## "consume a character reference" algorithm are jointly implemented
136 ## using the following six states:
137 sub ENTITY_STATE () { 44 }
138 sub ENTITY_HASH_STATE () { 45 }
139 sub NCR_NUM_STATE () { 46 }
140 sub HEXREF_X_STATE () { 47 }
141 sub HEXREF_HEX_STATE () { 48 }
142 sub ENTITY_NAME_STATE () { 49 }
143 sub PCDATA_STATE () { 50 } # "data state" in the spec
144
145 ## XML-only states
146 sub PI_STATE () { 51 }
147 sub PI_TARGET_STATE () { 52 }
148 sub PI_TARGET_AFTER_STATE () { 53 }
149 sub PI_DATA_STATE () { 54 }
150 sub PI_AFTER_STATE () { 55 }
151 sub PI_DATA_AFTER_STATE () { 56 }
152 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155 sub DOCTYPE_TAG_STATE () { 60 }
156 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157 sub MD_ATTLIST_STATE () { 62 }
158 sub MD_E_STATE () { 63 }
159 sub MD_ELEMENT_STATE () { 64 }
160 sub MD_ENTITY_STATE () { 65 }
161 sub MD_NOTATION_STATE () { 66 }
162 sub DOCTYPE_MD_STATE () { 67 }
163 sub BEFORE_MD_NAME_STATE () { 68 }
164 sub MD_NAME_STATE () { 69 }
165 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172 sub ALLOWED_TOKEN_STATE () { 77 }
173 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 sub BEFORE_NDATA_STATE () { 85 }
181 sub NDATA_STATE () { 86 }
182 sub AFTER_NDATA_STATE () { 87 }
183 sub BEFORE_NOTATION_NAME_STATE () { 88 }
184 sub NOTATION_NAME_STATE () { 89 }
185 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186 sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187 sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188 sub AFTER_ELEMENT_NAME_STATE () { 93 }
189 sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190 sub CONTENT_KEYWORD_STATE () { 95 }
191 sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192 sub CM_ELEMENT_NAME_STATE () { 97 }
193 sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194 sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195 sub AFTER_MD_DEF_STATE () { 100 }
196 sub BOGUS_MD_STATE () { 101 }
197
198 ## Tree constructor state constants (see Whatpm::HTML for the full
199 ## list and descriptions)
200
201 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202 sub FOREIGN_EL () { 0b1_00000000000 }
203
204 ## Character reference mappings
205
206 my $charref_map = {
207 0x0D => 0x000A,
208 0x80 => 0x20AC,
209 0x81 => 0xFFFD,
210 0x82 => 0x201A,
211 0x83 => 0x0192,
212 0x84 => 0x201E,
213 0x85 => 0x2026,
214 0x86 => 0x2020,
215 0x87 => 0x2021,
216 0x88 => 0x02C6,
217 0x89 => 0x2030,
218 0x8A => 0x0160,
219 0x8B => 0x2039,
220 0x8C => 0x0152,
221 0x8D => 0xFFFD,
222 0x8E => 0x017D,
223 0x8F => 0xFFFD,
224 0x90 => 0xFFFD,
225 0x91 => 0x2018,
226 0x92 => 0x2019,
227 0x93 => 0x201C,
228 0x94 => 0x201D,
229 0x95 => 0x2022,
230 0x96 => 0x2013,
231 0x97 => 0x2014,
232 0x98 => 0x02DC,
233 0x99 => 0x2122,
234 0x9A => 0x0161,
235 0x9B => 0x203A,
236 0x9C => 0x0153,
237 0x9D => 0xFFFD,
238 0x9E => 0x017E,
239 0x9F => 0x0178,
240 }; # $charref_map
241 $charref_map->{$_} = 0xFFFD
242 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
243 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: Unicode noncharacters run through 0xFDEF, but this range stops at 0xFDDF
244 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
245 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249
250 ## Implementations MUST act as if they used the state machine in the spec.
251
252 sub _initialize_tokenizer ($) {
253 my $self = shift;
254 ## Reset the tokenizer fields on |$self| to their start-of-input values.
255 ## NOTE: Fields set by |new| constructor:
256 #$self->{level}
257 #$self->{set_nc}
258 #$self->{parse_error}
259 #$self->{is_xml} (if XML)
260
261 $self->{state} = DATA_STATE; # MUST: tokenization starts in the data state
262 $self->{s_kwd} = ''; # Data state keyword
263 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 #$self->{entity__value}; # initialized when used
265 #$self->{entity__match}; # initialized when used
266 $self->{content_model} = PCDATA_CONTENT_MODEL; # content model starts as PCDATA
267 undef $self->{ct}; # current token
268 undef $self->{ca}; # current attribute
269 undef $self->{last_stag_name}; # last emitted start tag name
270 #$self->{prev_state}; # initialized when used
271 delete $self->{self_closing}; # clear any pending self-closing flag
272 $self->{char_buffer} = '';
273 $self->{char_buffer_pos} = 0;
274 $self->{nc} = -1; # next input character; -1 is tested as end-of-input in |_get_next_token|
275 #$self->{next_nc}
276 !!!next-input-character;
277 $self->{token} = []; # pending tokens; |_get_next_token| returns these first
278 # $self->{escape}
279 } # _initialize_tokenizer
280
281 ## A token has:
282 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284 ## ->{name} (DOCTYPE_TOKEN)
285 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 ## ->{target} (PI_TOKEN)
287 ## ->{pubid} (DOCTYPE_TOKEN)
288 ## ->{sysid} (DOCTYPE_TOKEN)
289 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291 ## ->{name}
292 ## ->{value}
293 ## ->{has_reference} == 1 or 0
294 ## ->{index}: Index of the attribute in a tag.
295 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299
300 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301 ## |$token->{self_closing}| is used to save the value of
302 ## |$self->{self_closing}| while the token is pushed back to the stack.
303
304 ## An emitted token MUST immediately be handled by the tree construction stage.
305
306 ## Before each step, UA MAY check to see if either one of the scripts in
307 ## "list of scripts that will execute as soon as possible" or the first
308 ## script in the "list of scripts that will execute asynchronously",
309 ## has completed loading. If one has, then it MUST be executed
310 ## and removed from the list.
311
312 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313 ## (This requirement was dropped from HTML5 spec, unfortunately.)
314
315 my $is_space = {
316 0x0009 => 1, # CHARACTER TABULATION (HT)
317 0x000A => 1, # LINE FEED (LF)
318 #0x000B => 0, # LINE TABULATION (VT)
319 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320 #0x000D => 1, # CARRIAGE RETURN (CR)
321 0x0020 => 1, # SPACE (SP)
322 };
323
324 sub _get_next_token ($) {
325 my $self = shift;
326
327 if ($self->{self_closing}) {
328 !!!parse-error (type => 'nestc', token => $self->{ct});
329 ## NOTE: The |self_closing| flag is only set by start tag token.
330 ## In addition, when a start tag token is emitted, it is always set to
331 ## |ct|.
332 delete $self->{self_closing};
333 }
334
335 if (@{$self->{token}}) {
336 $self->{self_closing} = $self->{token}->[0]->{self_closing};
337 return shift @{$self->{token}};
338 }
339
340 A: {
341 if ($self->{state} == PCDATA_STATE) {
342 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343
344 if ($self->{nc} == 0x0026) { # &
345 !!!cp (0.1);
346 ## NOTE: In the spec, the tokenizer is switched to the
347 ## "entity data state". In this implementation, the tokenizer
348 ## is switched to the |ENTITY_STATE|, which is an implementation
349 ## of the "consume a character reference" algorithm.
350 $self->{entity_add} = -1;
351 $self->{prev_state} = DATA_STATE;
352 $self->{state} = ENTITY_STATE;
353 !!!next-input-character;
354 redo A;
355 } elsif ($self->{nc} == 0x003C) { # <
356 !!!cp (0.2);
357 $self->{state} = TAG_OPEN_STATE;
358 !!!next-input-character;
359 redo A;
360 } elsif ($self->{nc} == -1) {
361 !!!cp (0.3);
362 !!!emit ({type => END_OF_FILE_TOKEN,
363 line => $self->{line}, column => $self->{column}});
364 last A; ## TODO: ok?
365 } else {
366 !!!cp (0.4);
367 #
368 }
369
370 # Anything else
371 my $token = {type => CHARACTER_TOKEN,
372 data => chr $self->{nc},
373 line => $self->{line}, column => $self->{column},
374 };
375 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
376
377 ## Stay in the state.
378 !!!next-input-character;
379 !!!emit ($token);
380 redo A;
381 } elsif ($self->{state} == DATA_STATE) {
382 $self->{s_kwd} = '' unless defined $self->{s_kwd};
383 if ($self->{nc} == 0x0026) { # &
384 $self->{s_kwd} = '';
385 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
386 not $self->{escape}) {
387 !!!cp (1);
388 ## NOTE: In the spec, the tokenizer is switched to the
389 ## "entity data state". In this implementation, the tokenizer
390 ## is switched to the |ENTITY_STATE|, which is an implementation
391 ## of the "consume a character reference" algorithm.
392 $self->{entity_add} = -1;
393 $self->{prev_state} = DATA_STATE;
394 $self->{state} = ENTITY_STATE;
395 !!!next-input-character;
396 redo A;
397 } else {
398 !!!cp (2);
399 #
400 }
401 } elsif ($self->{nc} == 0x002D) { # -
402 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
403 if ($self->{s_kwd} eq '<!-') {
404 !!!cp (3);
405 $self->{escape} = 1; # unless $self->{escape};
406 $self->{s_kwd} = '--';
407 #
408 } elsif ($self->{s_kwd} eq '-') {
409 !!!cp (4);
410 $self->{s_kwd} = '--';
411 #
412 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
413 !!!cp (4.1);
414 $self->{s_kwd} .= '-';
415 #
416 } else {
417 !!!cp (5);
418 $self->{s_kwd} = '-';
419 #
420 }
421 }
422
423 #
424 } elsif ($self->{nc} == 0x0021) { # !
425 if (length $self->{s_kwd}) {
426 !!!cp (5.1);
427 $self->{s_kwd} .= '!';
428 #
429 } else {
430 !!!cp (5.2);
431 #$self->{s_kwd} = '';
432 #
433 }
434 #
435 } elsif ($self->{nc} == 0x003C) { # <
436 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
437 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
438 not $self->{escape})) {
439 !!!cp (6);
440 $self->{state} = TAG_OPEN_STATE;
441 !!!next-input-character;
442 redo A;
443 } else {
444 !!!cp (7);
445 $self->{s_kwd} = '';
446 #
447 }
448 } elsif ($self->{nc} == 0x003E) { # >
449 if ($self->{escape} and
450 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
451 if ($self->{s_kwd} eq '--') {
452 !!!cp (8);
453 delete $self->{escape};
454 #
455 } else {
456 !!!cp (9);
457 #
458 }
459 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
460 !!!cp (9.1);
461 !!!parse-error (type => 'unmatched mse', ## TODO: type
462 line => $self->{line_prev},
463 column => $self->{column_prev} - 1);
464 #
465 } else {
466 !!!cp (10);
467 #
468 }
469
470 $self->{s_kwd} = '';
471 #
472 } elsif ($self->{nc} == 0x005D) { # ]
473 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
474 !!!cp (10.1);
475 $self->{s_kwd} .= ']';
476 } elsif ($self->{s_kwd} eq ']]') {
477 !!!cp (10.2);
478 #
479 } else {
480 !!!cp (10.3);
481 $self->{s_kwd} = '';
482 }
483 #
484 } elsif ($self->{nc} == -1) {
485 !!!cp (11);
486 $self->{s_kwd} = '';
487 !!!emit ({type => END_OF_FILE_TOKEN,
488 line => $self->{line}, column => $self->{column}});
489 last A; ## TODO: ok?
490 } else {
491 !!!cp (12);
492 $self->{s_kwd} = '';
493 #
494 }
495
496 # Anything else
497 my $token = {type => CHARACTER_TOKEN,
498 data => chr $self->{nc},
499 line => $self->{line}, column => $self->{column},
500 };
501 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
502 length $token->{data})) {
503 $self->{s_kwd} = '';
504 }
505
506 ## Stay in the data state.
507 if (not $self->{is_xml} and
508 $self->{content_model} == PCDATA_CONTENT_MODEL) {
509 !!!cp (13);
510 $self->{state} = PCDATA_STATE;
511 } else {
512 !!!cp (14);
513 ## Stay in the state.
514 }
515 !!!next-input-character;
516 !!!emit ($token);
517 redo A;
518 } elsif ($self->{state} == TAG_OPEN_STATE) {
519 ## XML5: "tag state".
520
521 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
522 if ($self->{nc} == 0x002F) { # /
523 !!!cp (15);
524 !!!next-input-character;
525 $self->{state} = CLOSE_TAG_OPEN_STATE;
526 redo A;
527 } elsif ($self->{nc} == 0x0021) { # !
528 !!!cp (15.1);
529 $self->{s_kwd} = $self->{escaped} ? '' : '<';
530 #
531 } else {
532 !!!cp (16);
533 $self->{s_kwd} = '';
534 #
535 }
536
537 ## reconsume
538 $self->{state} = DATA_STATE;
539 !!!emit ({type => CHARACTER_TOKEN, data => '<',
540 line => $self->{line_prev},
541 column => $self->{column_prev},
542 });
543 redo A;
544 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
545 if ($self->{nc} == 0x0021) { # !
546 !!!cp (17);
547 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
548 !!!next-input-character;
549 redo A;
550 } elsif ($self->{nc} == 0x002F) { # /
551 !!!cp (18);
552 $self->{state} = CLOSE_TAG_OPEN_STATE;
553 !!!next-input-character;
554 redo A;
555 } elsif (0x0041 <= $self->{nc} and
556 $self->{nc} <= 0x005A) { # A..Z
557 !!!cp (19);
558 $self->{ct}
559 = {type => START_TAG_TOKEN,
560 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
561 line => $self->{line_prev},
562 column => $self->{column_prev}};
563 $self->{state} = TAG_NAME_STATE;
564 !!!next-input-character;
565 redo A;
566 } elsif (0x0061 <= $self->{nc} and
567 $self->{nc} <= 0x007A) { # a..z
568 !!!cp (20);
569 $self->{ct} = {type => START_TAG_TOKEN,
570 tag_name => chr ($self->{nc}),
571 line => $self->{line_prev},
572 column => $self->{column_prev}};
573 $self->{state} = TAG_NAME_STATE;
574 !!!next-input-character;
575 redo A;
576 } elsif ($self->{nc} == 0x003E) { # >
577 !!!cp (21);
578 !!!parse-error (type => 'empty start tag',
579 line => $self->{line_prev},
580 column => $self->{column_prev});
581 $self->{state} = DATA_STATE;
582 $self->{s_kwd} = '';
583 !!!next-input-character;
584
585 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
586 line => $self->{line_prev},
587 column => $self->{column_prev},
588 });
589
590 redo A;
591 } elsif ($self->{nc} == 0x003F) { # ?
592 if ($self->{is_xml}) {
593 !!!cp (22.1);
594 $self->{state} = PI_STATE;
595 !!!next-input-character;
596 redo A;
597 } else {
598 !!!cp (22);
599 !!!parse-error (type => 'pio',
600 line => $self->{line_prev},
601 column => $self->{column_prev});
602 $self->{state} = BOGUS_COMMENT_STATE;
603 $self->{ct} = {type => COMMENT_TOKEN, data => '',
604 line => $self->{line_prev},
605 column => $self->{column_prev},
606 };
607 ## $self->{nc} is intentionally left as is
608 redo A;
609 }
610 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
611 !!!cp (23);
612 !!!parse-error (type => 'bare stago',
613 line => $self->{line_prev},
614 column => $self->{column_prev});
615 $self->{state} = DATA_STATE;
616 $self->{s_kwd} = '';
617 ## reconsume
618
619 !!!emit ({type => CHARACTER_TOKEN, data => '<',
620 line => $self->{line_prev},
621 column => $self->{column_prev},
622 });
623
624 redo A;
625 } else {
626 ## XML5: "<:" is a parse error.
627 !!!cp (23.1);
628 $self->{ct} = {type => START_TAG_TOKEN,
629 tag_name => chr ($self->{nc}),
630 line => $self->{line_prev},
631 column => $self->{column_prev}};
632 $self->{state} = TAG_NAME_STATE;
633 !!!next-input-character;
634 redo A;
635 }
636 } else {
637 die "$0: $self->{content_model} in tag open";
638 }
639 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
640 ## NOTE: The "close tag open state" in the spec is implemented as
641 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
642
643 ## XML5: "end tag state".
644
645 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
646 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
647 if (defined $self->{last_stag_name}) {
648 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
649 $self->{kwd} = '';
650 ## Reconsume.
651 redo A;
652 } else {
653 ## No start tag token has ever been emitted
654 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
655 !!!cp (28);
656 $self->{state} = DATA_STATE;
657 $self->{s_kwd} = '';
658 ## Reconsume.
659 !!!emit ({type => CHARACTER_TOKEN, data => '</',
660 line => $l, column => $c,
661 });
662 redo A;
663 }
664 }
665
666 if (0x0041 <= $self->{nc} and
667 $self->{nc} <= 0x005A) { # A..Z
668 !!!cp (29);
669 $self->{ct}
670 = {type => END_TAG_TOKEN,
671 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
672 line => $l, column => $c};
673 $self->{state} = TAG_NAME_STATE;
674 !!!next-input-character;
675 redo A;
676 } elsif (0x0061 <= $self->{nc} and
677 $self->{nc} <= 0x007A) { # a..z
678 !!!cp (30);
679 $self->{ct} = {type => END_TAG_TOKEN,
680 tag_name => chr ($self->{nc}),
681 line => $l, column => $c};
682 $self->{state} = TAG_NAME_STATE;
683 !!!next-input-character;
684 redo A;
685 } elsif ($self->{nc} == 0x003E) { # >
686 !!!parse-error (type => 'empty end tag',
687 line => $self->{line_prev}, ## "<" in "</>"
688 column => $self->{column_prev} - 1);
689 $self->{state} = DATA_STATE;
690 $self->{s_kwd} = '';
691 if ($self->{is_xml}) {
692 !!!cp (31);
693 ## XML5: No parse error.
694
695 ## NOTE: This parser raises a parse error, since it supports
696 ## XML1, not XML5.
697
698 ## NOTE: A short end tag token.
699 my $ct = {type => END_TAG_TOKEN,
700 tag_name => '',
701 line => $self->{line_prev},
702 column => $self->{column_prev} - 1,
703 };
704 !!!next-input-character;
705 !!!emit ($ct);
706 } else {
707 !!!cp (31.1);
708 !!!next-input-character;
709 }
710 redo A;
711 } elsif ($self->{nc} == -1) {
712 !!!cp (32);
713 !!!parse-error (type => 'bare etago');
714 $self->{s_kwd} = '';
715 $self->{state} = DATA_STATE;
716 # reconsume
717
718 !!!emit ({type => CHARACTER_TOKEN, data => '</',
719 line => $l, column => $c,
720 });
721
722 redo A;
723 } elsif (not $self->{is_xml} or
724 $is_space->{$self->{nc}}) {
725 !!!cp (33);
726 !!!parse-error (type => 'bogus end tag',
727 line => $self->{line_prev}, # "<" of "</"
728 column => $self->{column_prev} - 1);
729 $self->{state} = BOGUS_COMMENT_STATE;
730 $self->{ct} = {type => COMMENT_TOKEN, data => '',
731 line => $self->{line_prev}, # "<" of "</"
732 column => $self->{column_prev} - 1,
733 };
734 ## NOTE: $self->{nc} is intentionally left as is.
735 ## Although the "anything else" case of the spec not explicitly
736 ## states that the next input character is to be reconsumed,
737 ## it will be included to the |data| of the comment token
738 ## generated from the bogus end tag, as defined in the
739 ## "bogus comment state" entry.
740 redo A;
741 } else {
742 ## XML5: "</:" is a parse error.
743 !!!cp (30.1);
744 $self->{ct} = {type => END_TAG_TOKEN,
745 tag_name => chr ($self->{nc}),
746 line => $l, column => $c};
747 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
748 !!!next-input-character;
749 redo A;
750 }
751 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
752 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
753 if (length $ch) {
754 my $CH = $ch;
755 $ch =~ tr/a-z/A-Z/;
756 my $nch = chr $self->{nc};
757 if ($nch eq $ch or $nch eq $CH) {
758 !!!cp (24);
759 ## Stay in the state.
760 $self->{kwd} .= $nch;
761 !!!next-input-character;
762 redo A;
763 } else {
764 !!!cp (25);
765 $self->{state} = DATA_STATE;
766 $self->{s_kwd} = '';
767 ## Reconsume.
768 !!!emit ({type => CHARACTER_TOKEN,
769 data => '</' . $self->{kwd},
770 line => $self->{line_prev},
771 column => $self->{column_prev} - 1 - length $self->{kwd},
772 });
773 redo A;
774 }
775 } else { # after "<{tag-name}"
776 unless ($is_space->{$self->{nc}} or
777 {
778 0x003E => 1, # >
779 0x002F => 1, # /
780 -1 => 1, # EOF
781 }->{$self->{nc}}) {
782 !!!cp (26);
783 ## Reconsume.
784 $self->{state} = DATA_STATE;
785 $self->{s_kwd} = '';
786 !!!emit ({type => CHARACTER_TOKEN,
787 data => '</' . $self->{kwd},
788 line => $self->{line_prev},
789 column => $self->{column_prev} - 1 - length $self->{kwd},
790 });
791 redo A;
792 } else {
793 !!!cp (27);
794 $self->{ct}
795 = {type => END_TAG_TOKEN,
796 tag_name => $self->{last_stag_name},
797 line => $self->{line_prev},
798 column => $self->{column_prev} - 1 - length $self->{kwd}};
799 $self->{state} = TAG_NAME_STATE;
800 ## Reconsume.
801 redo A;
802 }
803 }
804 } elsif ($self->{state} == TAG_NAME_STATE) {
805 if ($is_space->{$self->{nc}}) {
806 !!!cp (34);
807 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
808 !!!next-input-character;
809 redo A;
810 } elsif ($self->{nc} == 0x003E) { # >
811 if ($self->{ct}->{type} == START_TAG_TOKEN) {
812 !!!cp (35);
813 $self->{last_stag_name} = $self->{ct}->{tag_name};
814 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
815 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
816 #if ($self->{ct}->{attributes}) {
817 # ## NOTE: This should never be reached.
818 # !!! cp (36);
819 # !!! parse-error (type => 'end tag attribute');
820 #} else {
821 !!!cp (37);
822 #}
823 } else {
824 die "$0: $self->{ct}->{type}: Unknown token type";
825 }
826 $self->{state} = DATA_STATE;
827 $self->{s_kwd} = '';
828 !!!next-input-character;
829
830 !!!emit ($self->{ct}); # start tag or end tag
831
832 redo A;
833 } elsif (0x0041 <= $self->{nc} and
834 $self->{nc} <= 0x005A) { # A..Z
835 !!!cp (38);
836 $self->{ct}->{tag_name}
837 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
838 # start tag or end tag
839 ## Stay in this state
840 !!!next-input-character;
841 redo A;
842 } elsif ($self->{nc} == -1) {
843 !!!parse-error (type => 'unclosed tag');
844 if ($self->{ct}->{type} == START_TAG_TOKEN) {
845 !!!cp (39);
846 $self->{last_stag_name} = $self->{ct}->{tag_name};
847 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
848 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
849 #if ($self->{ct}->{attributes}) {
850 # ## NOTE: This state should never be reached.
851 # !!! cp (40);
852 # !!! parse-error (type => 'end tag attribute');
853 #} else {
854 !!!cp (41);
855 #}
856 } else {
857 die "$0: $self->{ct}->{type}: Unknown token type";
858 }
859 $self->{state} = DATA_STATE;
860 $self->{s_kwd} = '';
861 # reconsume
862
863 !!!emit ($self->{ct}); # start tag or end tag
864
865 redo A;
866 } elsif ($self->{nc} == 0x002F) { # /
867 !!!cp (42);
868 $self->{state} = SELF_CLOSING_START_TAG_STATE;
869 !!!next-input-character;
870 redo A;
871 } else {
872 !!!cp (44);
873 $self->{ct}->{tag_name} .= chr $self->{nc};
874 # start tag or end tag
875 ## Stay in the state
876 !!!next-input-character;
877 redo A;
878 }
879 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
880 ## XML5: "Tag attribute name before state".
881
882 if ($is_space->{$self->{nc}}) {
883 !!!cp (45);
884 ## Stay in the state
885 !!!next-input-character;
886 redo A;
887 } elsif ($self->{nc} == 0x003E) { # >
888 if ($self->{ct}->{type} == START_TAG_TOKEN) {
889 !!!cp (46);
890 $self->{last_stag_name} = $self->{ct}->{tag_name};
891 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
892 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
893 if ($self->{ct}->{attributes}) {
894 !!!cp (47);
895 !!!parse-error (type => 'end tag attribute');
896 } else {
897 !!!cp (48);
898 }
899 } else {
900 die "$0: $self->{ct}->{type}: Unknown token type";
901 }
902 $self->{state} = DATA_STATE;
903 $self->{s_kwd} = '';
904 !!!next-input-character;
905
906 !!!emit ($self->{ct}); # start tag or end tag
907
908 redo A;
909 } elsif (0x0041 <= $self->{nc} and
910 $self->{nc} <= 0x005A) { # A..Z
911 !!!cp (49);
912 $self->{ca}
913 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
914 value => '',
915 line => $self->{line}, column => $self->{column}};
916 $self->{state} = ATTRIBUTE_NAME_STATE;
917 !!!next-input-character;
918 redo A;
919 } elsif ($self->{nc} == 0x002F) { # /
920 !!!cp (50);
921 $self->{state} = SELF_CLOSING_START_TAG_STATE;
922 !!!next-input-character;
923 redo A;
924 } elsif ($self->{nc} == -1) {
925 !!!parse-error (type => 'unclosed tag');
926 if ($self->{ct}->{type} == START_TAG_TOKEN) {
927 !!!cp (52);
928 $self->{last_stag_name} = $self->{ct}->{tag_name};
929 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
930 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
931 if ($self->{ct}->{attributes}) {
932 !!!cp (53);
933 !!!parse-error (type => 'end tag attribute');
934 } else {
935 !!!cp (54);
936 }
937 } else {
938 die "$0: $self->{ct}->{type}: Unknown token type";
939 }
940 $self->{state} = DATA_STATE;
941 $self->{s_kwd} = '';
942 # reconsume
943
944 !!!emit ($self->{ct}); # start tag or end tag
945
946 redo A;
947 } else {
948 if ({
949 0x0022 => 1, # "
950 0x0027 => 1, # '
951 0x003D => 1, # =
952 }->{$self->{nc}}) {
953 !!!cp (55);
954 ## XML5: Not a parse error.
955 !!!parse-error (type => 'bad attribute name');
956 } else {
957 !!!cp (56);
958 ## XML5: ":" raises a parse error and is ignored.
959 }
960 $self->{ca}
961 = {name => chr ($self->{nc}),
962 value => '',
963 line => $self->{line}, column => $self->{column}};
964 $self->{state} = ATTRIBUTE_NAME_STATE;
965 !!!next-input-character;
966 redo A;
967 }
968 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
969 ## XML5: "Tag attribute name state".
970
971 my $before_leave = sub {
972 if (exists $self->{ct}->{attributes} # start tag or end tag
973 ->{$self->{ca}->{name}}) { # MUST
974 !!!cp (57);
975 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
976 ## Discard $self->{ca} # MUST
977 } else {
978 !!!cp (58);
979 $self->{ct}->{attributes}->{$self->{ca}->{name}}
980 = $self->{ca};
981 $self->{ca}->{index} = ++$self->{ct}->{last_index};
982 }
983 }; # $before_leave
984
985 if ($is_space->{$self->{nc}}) {
986 !!!cp (59);
987 $before_leave->();
988 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
989 !!!next-input-character;
990 redo A;
991 } elsif ($self->{nc} == 0x003D) { # =
992 !!!cp (60);
993 $before_leave->();
994 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
995 !!!next-input-character;
996 redo A;
997 } elsif ($self->{nc} == 0x003E) { # >
998 if ($self->{is_xml}) {
999 !!!cp (60.1);
1000 ## XML5: Not a parse error.
1001 !!!parse-error (type => 'no attr value'); ## TODO: type
1002 } else {
1003 !!!cp (60.2);
1004 }
1005
1006 $before_leave->();
1007 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1008 !!!cp (61);
1009 $self->{last_stag_name} = $self->{ct}->{tag_name};
1010 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1011 !!!cp (62);
1012 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1013 if ($self->{ct}->{attributes}) {
1014 !!!parse-error (type => 'end tag attribute');
1015 }
1016 } else {
1017 die "$0: $self->{ct}->{type}: Unknown token type";
1018 }
1019 $self->{state} = DATA_STATE;
1020 $self->{s_kwd} = '';
1021 !!!next-input-character;
1022
1023 !!!emit ($self->{ct}); # start tag or end tag
1024
1025 redo A;
1026 } elsif (0x0041 <= $self->{nc} and
1027 $self->{nc} <= 0x005A) { # A..Z
1028 !!!cp (63);
1029 $self->{ca}->{name}
1030 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1031 ## Stay in the state
1032 !!!next-input-character;
1033 redo A;
1034 } elsif ($self->{nc} == 0x002F) { # /
1035 if ($self->{is_xml}) {
1036 !!!cp (64);
1037 ## XML5: Not a parse error.
1038 !!!parse-error (type => 'no attr value'); ## TODO: type
1039 } else {
1040 !!!cp (64.1);
1041 }
1042
1043 $before_leave->();
1044 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1045 !!!next-input-character;
1046 redo A;
1047 } elsif ($self->{nc} == -1) {
1048 !!!parse-error (type => 'unclosed tag');
1049 $before_leave->();
1050 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1051 !!!cp (66);
1052 $self->{last_stag_name} = $self->{ct}->{tag_name};
1053 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1054 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1055 if ($self->{ct}->{attributes}) {
1056 !!!cp (67);
1057 !!!parse-error (type => 'end tag attribute');
1058 } else {
1059 ## NOTE: This state should never be reached.
1060 !!!cp (68);
1061 }
1062 } else {
1063 die "$0: $self->{ct}->{type}: Unknown token type";
1064 }
1065 $self->{state} = DATA_STATE;
1066 $self->{s_kwd} = '';
1067 # reconsume
1068
1069 !!!emit ($self->{ct}); # start tag or end tag
1070
1071 redo A;
1072 } else {
1073 if ($self->{nc} == 0x0022 or # "
1074 $self->{nc} == 0x0027) { # '
1075 !!!cp (69);
1076 ## XML5: Not a parse error.
1077 !!!parse-error (type => 'bad attribute name');
1078 } else {
1079 !!!cp (70);
1080 }
1081 $self->{ca}->{name} .= chr ($self->{nc});
1082 ## Stay in the state
1083 !!!next-input-character;
1084 redo A;
1085 }
1086 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1087 ## XML5: "Tag attribute name after state".
1088
1089 if ($is_space->{$self->{nc}}) {
1090 !!!cp (71);
1091 ## Stay in the state
1092 !!!next-input-character;
1093 redo A;
1094 } elsif ($self->{nc} == 0x003D) { # =
1095 !!!cp (72);
1096 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1097 !!!next-input-character;
1098 redo A;
1099 } elsif ($self->{nc} == 0x003E) { # >
1100 if ($self->{is_xml}) {
1101 !!!cp (72.1);
1102 ## XML5: Not a parse error.
1103 !!!parse-error (type => 'no attr value'); ## TODO: type
1104 } else {
1105 !!!cp (72.2);
1106 }
1107
1108 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1109 !!!cp (73);
1110 $self->{last_stag_name} = $self->{ct}->{tag_name};
1111 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1112 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1113 if ($self->{ct}->{attributes}) {
1114 !!!cp (74);
1115 !!!parse-error (type => 'end tag attribute');
1116 } else {
1117 ## NOTE: This state should never be reached.
1118 !!!cp (75);
1119 }
1120 } else {
1121 die "$0: $self->{ct}->{type}: Unknown token type";
1122 }
1123 $self->{state} = DATA_STATE;
1124 $self->{s_kwd} = '';
1125 !!!next-input-character;
1126
1127 !!!emit ($self->{ct}); # start tag or end tag
1128
1129 redo A;
1130 } elsif (0x0041 <= $self->{nc} and
1131 $self->{nc} <= 0x005A) { # A..Z
1132 !!!cp (76);
1133 $self->{ca}
1134 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1135 value => '',
1136 line => $self->{line}, column => $self->{column}};
1137 $self->{state} = ATTRIBUTE_NAME_STATE;
1138 !!!next-input-character;
1139 redo A;
1140 } elsif ($self->{nc} == 0x002F) { # /
1141 if ($self->{is_xml}) {
1142 !!!cp (77);
1143 ## XML5: Not a parse error.
1144 !!!parse-error (type => 'no attr value'); ## TODO: type
1145 } else {
1146 !!!cp (77.1);
1147 }
1148
1149 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1150 !!!next-input-character;
1151 redo A;
1152 } elsif ($self->{nc} == -1) {
1153 !!!parse-error (type => 'unclosed tag');
1154 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1155 !!!cp (79);
1156 $self->{last_stag_name} = $self->{ct}->{tag_name};
1157 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1158 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1159 if ($self->{ct}->{attributes}) {
1160 !!!cp (80);
1161 !!!parse-error (type => 'end tag attribute');
1162 } else {
1163 ## NOTE: This state should never be reached.
1164 !!!cp (81);
1165 }
1166 } else {
1167 die "$0: $self->{ct}->{type}: Unknown token type";
1168 }
1169 $self->{s_kwd} = '';
1170 $self->{state} = DATA_STATE;
1171 # reconsume
1172
1173 !!!emit ($self->{ct}); # start tag or end tag
1174
1175 redo A;
1176 } else {
1177 if ($self->{is_xml}) {
1178 !!!cp (78.1);
1179 ## XML5: Not a parse error.
1180 !!!parse-error (type => 'no attr value'); ## TODO: type
1181 } else {
1182 !!!cp (78.2);
1183 }
1184
1185 if ($self->{nc} == 0x0022 or # "
1186 $self->{nc} == 0x0027) { # '
1187 !!!cp (78);
1188 ## XML5: Not a parse error.
1189 !!!parse-error (type => 'bad attribute name');
1190 } else {
1191 !!!cp (82);
1192 }
1193 $self->{ca}
1194 = {name => chr ($self->{nc}),
1195 value => '',
1196 line => $self->{line}, column => $self->{column}};
1197 $self->{state} = ATTRIBUTE_NAME_STATE;
1198 !!!next-input-character;
1199 redo A;
1200 }
1201 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1202 ## XML5: "Tag attribute value before state".
1203
1204 if ($is_space->{$self->{nc}}) {
1205 !!!cp (83);
1206 ## Stay in the state
1207 !!!next-input-character;
1208 redo A;
1209 } elsif ($self->{nc} == 0x0022) { # "
1210 !!!cp (84);
1211 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1212 !!!next-input-character;
1213 redo A;
1214 } elsif ($self->{nc} == 0x0026) { # &
1215 !!!cp (85);
1216 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1217 ## reconsume
1218 redo A;
1219 } elsif ($self->{nc} == 0x0027) { # '
1220 !!!cp (86);
1221 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1222 !!!next-input-character;
1223 redo A;
1224 } elsif ($self->{nc} == 0x003E) { # >
1225 !!!parse-error (type => 'empty unquoted attribute value');
1226 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227 !!!cp (87);
1228 $self->{last_stag_name} = $self->{ct}->{tag_name};
1229 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231 if ($self->{ct}->{attributes}) {
1232 !!!cp (88);
1233 !!!parse-error (type => 'end tag attribute');
1234 } else {
1235 ## NOTE: This state should never be reached.
1236 !!!cp (89);
1237 }
1238 } else {
1239 die "$0: $self->{ct}->{type}: Unknown token type";
1240 }
1241 $self->{state} = DATA_STATE;
1242 $self->{s_kwd} = '';
1243 !!!next-input-character;
1244
1245 !!!emit ($self->{ct}); # start tag or end tag
1246
1247 redo A;
1248 } elsif ($self->{nc} == -1) {
1249 !!!parse-error (type => 'unclosed tag');
1250 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1251 !!!cp (90);
1252 $self->{last_stag_name} = $self->{ct}->{tag_name};
1253 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1254 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1255 if ($self->{ct}->{attributes}) {
1256 !!!cp (91);
1257 !!!parse-error (type => 'end tag attribute');
1258 } else {
1259 ## NOTE: This state should never be reached.
1260 !!!cp (92);
1261 }
1262 } else {
1263 die "$0: $self->{ct}->{type}: Unknown token type";
1264 }
1265 $self->{state} = DATA_STATE;
1266 $self->{s_kwd} = '';
1267 ## reconsume
1268
1269 !!!emit ($self->{ct}); # start tag or end tag
1270
1271 redo A;
1272 } else {
1273 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1274 !!!cp (93);
1275 ## XML5: Not a parse error.
1276 !!!parse-error (type => 'bad attribute value');
1277 } elsif ($self->{is_xml}) {
1278 !!!cp (93.1);
1279 ## XML5: No parse error.
1280 !!!parse-error (type => 'unquoted attr value'); ## TODO
1281 } else {
1282 !!!cp (94);
1283 }
1284 $self->{ca}->{value} .= chr ($self->{nc});
1285 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1286 !!!next-input-character;
1287 redo A;
1288 }
1289 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1290 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1291 ## ATTLIST attribute value double quoted state".
1292
1293 if ($self->{nc} == 0x0022) { # "
1294 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1295 !!!cp (95.1);
1296 ## XML5: "DOCTYPE ATTLIST name after state".
1297 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1298 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1299 } else {
1300 !!!cp (95);
1301 ## XML5: "Tag attribute name before state".
1302 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1303 }
1304 !!!next-input-character;
1305 redo A;
1306 } elsif ($self->{nc} == 0x0026) { # &
1307 !!!cp (96);
1308 ## XML5: Not defined yet.
1309
1310 ## NOTE: In the spec, the tokenizer is switched to the
1311 ## "entity in attribute value state". In this implementation, the
1312 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1313 ## implementation of the "consume a character reference" algorithm.
1314 $self->{prev_state} = $self->{state};
1315 $self->{entity_add} = 0x0022; # "
1316 $self->{state} = ENTITY_STATE;
1317 !!!next-input-character;
1318 redo A;
1319 } elsif ($self->{is_xml} and
1320 $is_space->{$self->{nc}}) {
1321 !!!cp (97.1);
1322 $self->{ca}->{value} .= ' ';
1323 ## Stay in the state.
1324 !!!next-input-character;
1325 redo A;
1326 } elsif ($self->{nc} == -1) {
1327 !!!parse-error (type => 'unclosed attribute value');
1328 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1329 !!!cp (97);
1330 $self->{last_stag_name} = $self->{ct}->{tag_name};
1331
1332 $self->{state} = DATA_STATE;
1333 $self->{s_kwd} = '';
1334 ## reconsume
1335 !!!emit ($self->{ct}); # start tag
1336 redo A;
1337 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1338 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1339 if ($self->{ct}->{attributes}) {
1340 !!!cp (98);
1341 !!!parse-error (type => 'end tag attribute');
1342 } else {
1343 ## NOTE: This state should never be reached.
1344 !!!cp (99);
1345 }
1346
1347 $self->{state} = DATA_STATE;
1348 $self->{s_kwd} = '';
1349 ## reconsume
1350 !!!emit ($self->{ct}); # end tag
1351 redo A;
1352 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1353 ## XML5: No parse error above; not defined yet.
1354 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1355 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1356 ## Reconsume.
1357 !!!emit ($self->{ct}); # ATTLIST
1358 redo A;
1359 } else {
1360 die "$0: $self->{ct}->{type}: Unknown token type";
1361 }
1362 } else {
1363 ## XML5 [ATTLIST]: Not defined yet.
1364 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1365 !!!cp (100);
1366 ## XML5: Not a parse error.
1367 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1368 } else {
1369 !!!cp (100.1);
1370 }
1371 $self->{ca}->{value} .= chr ($self->{nc});
1372 $self->{read_until}->($self->{ca}->{value},
1373 qq["&<\x09\x0C\x20],
1374 length $self->{ca}->{value});
1375
1376 ## Stay in the state
1377 !!!next-input-character;
1378 redo A;
1379 }
1380 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1381 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1382 ## ATTLIST attribute value single quoted state".
1383
1384 if ($self->{nc} == 0x0027) { # '
1385 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1386 !!!cp (101.1);
1387 ## XML5: "DOCTYPE ATTLIST name after state".
1388 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1389 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1390 } else {
1391 !!!cp (101);
1392 ## XML5: "Before attribute name state" (sic).
1393 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1394 }
1395 !!!next-input-character;
1396 redo A;
1397 } elsif ($self->{nc} == 0x0026) { # &
1398 !!!cp (102);
1399 ## XML5: Not defined yet.
1400
1401 ## NOTE: In the spec, the tokenizer is switched to the
1402 ## "entity in attribute value state". In this implementation, the
1403 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1404 ## implementation of the "consume a character reference" algorithm.
1405 $self->{entity_add} = 0x0027; # '
1406 $self->{prev_state} = $self->{state};
1407 $self->{state} = ENTITY_STATE;
1408 !!!next-input-character;
1409 redo A;
1410 } elsif ($self->{is_xml} and
1411 $is_space->{$self->{nc}}) {
1412 !!!cp (103.1);
1413 $self->{ca}->{value} .= ' ';
1414 ## Stay in the state.
1415 !!!next-input-character;
1416 redo A;
1417 } elsif ($self->{nc} == -1) {
1418 !!!parse-error (type => 'unclosed attribute value');
1419 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1420 !!!cp (103);
1421 $self->{last_stag_name} = $self->{ct}->{tag_name};
1422
1423 $self->{state} = DATA_STATE;
1424 $self->{s_kwd} = '';
1425 ## reconsume
1426 !!!emit ($self->{ct}); # start tag
1427 redo A;
1428 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1429 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1430 if ($self->{ct}->{attributes}) {
1431 !!!cp (104);
1432 !!!parse-error (type => 'end tag attribute');
1433 } else {
1434 ## NOTE: This state should never be reached.
1435 !!!cp (105);
1436 }
1437
1438 $self->{state} = DATA_STATE;
1439 $self->{s_kwd} = '';
1440 ## reconsume
1441 !!!emit ($self->{ct}); # end tag
1442 redo A;
1443 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1444 ## XML5: No parse error above; not defined yet.
1445 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1446 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1447 ## Reconsume.
1448 !!!emit ($self->{ct}); # ATTLIST
1449 redo A;
1450 } else {
1451 die "$0: $self->{ct}->{type}: Unknown token type";
1452 }
1453 } else {
1454 ## XML5 [ATTLIST]: Not defined yet.
1455 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1456 !!!cp (106);
1457 ## XML5: Not a parse error.
1458 !!!parse-error (type => 'lt in attr value'); ## TODO: type
1459 } else {
1460 !!!cp (106.1);
1461 }
1462 $self->{ca}->{value} .= chr ($self->{nc});
1463 $self->{read_until}->($self->{ca}->{value},
1464 qq['&<\x09\x0C\x20],
1465 length $self->{ca}->{value});
1466
1467 ## Stay in the state
1468 !!!next-input-character;
1469 redo A;
1470 }
1471 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1472 ## XML5: "Tag attribute value unquoted state".
1473
1474 if ($is_space->{$self->{nc}}) {
1475 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1476 !!!cp (107.1);
1477 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1478 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1479 } else {
1480 !!!cp (107);
1481 ## XML5: "Tag attribute name before state".
1482 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1483 }
1484 !!!next-input-character;
1485 redo A;
1486 } elsif ($self->{nc} == 0x0026) { # &
1487 !!!cp (108);
1488
1489 ## XML5: Not defined yet.
1490
1491 ## NOTE: In the spec, the tokenizer is switched to the
1492 ## "entity in attribute value state". In this implementation, the
1493 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1494 ## implementation of the "consume a character reference" algorithm.
1495 $self->{entity_add} = -1;
1496 $self->{prev_state} = $self->{state};
1497 $self->{state} = ENTITY_STATE;
1498 !!!next-input-character;
1499 redo A;
1500 } elsif ($self->{nc} == 0x003E) { # >
1501 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1502 !!!cp (109);
1503 $self->{last_stag_name} = $self->{ct}->{tag_name};
1504
1505 $self->{state} = DATA_STATE;
1506 $self->{s_kwd} = '';
1507 !!!next-input-character;
1508 !!!emit ($self->{ct}); # start tag
1509 redo A;
1510 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1511 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1512 if ($self->{ct}->{attributes}) {
1513 !!!cp (110);
1514 !!!parse-error (type => 'end tag attribute');
1515 } else {
1516 ## NOTE: This state should never be reached.
1517 !!!cp (111);
1518 }
1519
1520 $self->{state} = DATA_STATE;
1521 $self->{s_kwd} = '';
1522 !!!next-input-character;
1523 !!!emit ($self->{ct}); # end tag
1524 redo A;
1525 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1526 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1527 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1528 !!!next-input-character;
1529 !!!emit ($self->{ct}); # ATTLIST
1530 redo A;
1531 } else {
1532 die "$0: $self->{ct}->{type}: Unknown token type";
1533 }
1534 } elsif ($self->{nc} == -1) {
1535 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1536 !!!cp (112);
1537 !!!parse-error (type => 'unclosed tag');
1538 $self->{last_stag_name} = $self->{ct}->{tag_name};
1539
1540 $self->{state} = DATA_STATE;
1541 $self->{s_kwd} = '';
1542 ## reconsume
1543 !!!emit ($self->{ct}); # start tag
1544 redo A;
1545 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1546 !!!parse-error (type => 'unclosed tag');
1547 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1548 if ($self->{ct}->{attributes}) {
1549 !!!cp (113);
1550 !!!parse-error (type => 'end tag attribute');
1551 } else {
1552 ## NOTE: This state should never be reached.
1553 !!!cp (114);
1554 }
1555
1556 $self->{state} = DATA_STATE;
1557 $self->{s_kwd} = '';
1558 ## reconsume
1559 !!!emit ($self->{ct}); # end tag
1560 redo A;
1561 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1562 !!!parse-error (type => 'unclosed md'); ## TODO: type
1563 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1564 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1565 ## Reconsume.
1566 !!!emit ($self->{ct}); # ATTLIST
1567 redo A;
1568 } else {
1569 die "$0: $self->{ct}->{type}: Unknown token type";
1570 }
1571 } else {
1572 if ({
1573 0x0022 => 1, # "
1574 0x0027 => 1, # '
1575 0x003D => 1, # =
1576 0x003C => 1, # <
1577 }->{$self->{nc}}) {
1578 !!!cp (115);
1579 ## XML5: Not a parse error.
1580 !!!parse-error (type => 'bad attribute value');
1581 } else {
1582 !!!cp (116);
1583 }
1584 $self->{ca}->{value} .= chr ($self->{nc});
1585 $self->{read_until}->($self->{ca}->{value},
1586 qq["'=& \x09\x0C>],
1587 length $self->{ca}->{value});
1588
1589 ## Stay in the state
1590 !!!next-input-character;
1591 redo A;
1592 }
1593 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1594 if ($is_space->{$self->{nc}}) {
1595 !!!cp (118);
1596 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1597 !!!next-input-character;
1598 redo A;
1599 } elsif ($self->{nc} == 0x003E) { # >
1600 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1601 !!!cp (119);
1602 $self->{last_stag_name} = $self->{ct}->{tag_name};
1603 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1604 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1605 if ($self->{ct}->{attributes}) {
1606 !!!cp (120);
1607 !!!parse-error (type => 'end tag attribute');
1608 } else {
1609 ## NOTE: This state should never be reached.
1610 !!!cp (121);
1611 }
1612 } else {
1613 die "$0: $self->{ct}->{type}: Unknown token type";
1614 }
1615 $self->{state} = DATA_STATE;
1616 $self->{s_kwd} = '';
1617 !!!next-input-character;
1618
1619 !!!emit ($self->{ct}); # start tag or end tag
1620
1621 redo A;
1622 } elsif ($self->{nc} == 0x002F) { # /
1623 !!!cp (122);
1624 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1625 !!!next-input-character;
1626 redo A;
1627 } elsif ($self->{nc} == -1) {
1628 !!!parse-error (type => 'unclosed tag');
1629 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1630 !!!cp (122.3);
1631 $self->{last_stag_name} = $self->{ct}->{tag_name};
1632 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1633 if ($self->{ct}->{attributes}) {
1634 !!!cp (122.1);
1635 !!!parse-error (type => 'end tag attribute');
1636 } else {
1637 ## NOTE: This state should never be reached.
1638 !!!cp (122.2);
1639 }
1640 } else {
1641 die "$0: $self->{ct}->{type}: Unknown token type";
1642 }
1643 $self->{state} = DATA_STATE;
1644 $self->{s_kwd} = '';
1645 ## Reconsume.
1646 !!!emit ($self->{ct}); # start tag or end tag
1647 redo A;
1648 } else {
1649 !!!cp ('124.1');
1650 !!!parse-error (type => 'no space between attributes');
1651 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1652 ## reconsume
1653 redo A;
1654 }
1655 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1656 ## XML5: "Empty tag state".
1657
1658 if ($self->{nc} == 0x003E) { # >
1659 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1660 !!!cp ('124.2');
1661 !!!parse-error (type => 'nestc', token => $self->{ct});
1662 ## TODO: Different type than slash in start tag
1663 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1664 if ($self->{ct}->{attributes}) {
1665 !!!cp ('124.4');
1666 !!!parse-error (type => 'end tag attribute');
1667 } else {
1668 !!!cp ('124.5');
1669 }
1670 ## TODO: Test |<title></title/>|
1671 } else {
1672 !!!cp ('124.3');
1673 $self->{self_closing} = 1;
1674 }
1675
1676 $self->{state} = DATA_STATE;
1677 $self->{s_kwd} = '';
1678 !!!next-input-character;
1679
1680 !!!emit ($self->{ct}); # start tag or end tag
1681
1682 redo A;
1683 } elsif ($self->{nc} == -1) {
1684 !!!parse-error (type => 'unclosed tag');
1685 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1686 !!!cp (124.7);
1687 $self->{last_stag_name} = $self->{ct}->{tag_name};
1688 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1689 if ($self->{ct}->{attributes}) {
1690 !!!cp (124.5);
1691 !!!parse-error (type => 'end tag attribute');
1692 } else {
1693 ## NOTE: This state should never be reached.
1694 !!!cp (124.6);
1695 }
1696 } else {
1697 die "$0: $self->{ct}->{type}: Unknown token type";
1698 }
1699 ## XML5: "Tag attribute name before state".
1700 $self->{state} = DATA_STATE;
1701 $self->{s_kwd} = '';
1702 ## Reconsume.
1703 !!!emit ($self->{ct}); # start tag or end tag
1704 redo A;
1705 } else {
1706 !!!cp ('124.4');
1707 !!!parse-error (type => 'nestc');
1708 ## TODO: This error type is wrong.
1709 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1710 ## Reconsume.
1711 redo A;
1712 }
1713 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1714 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1715
1716 ## NOTE: Unlike spec's "bogus comment state", this implementation
1717 ## consumes characters on a one-by-one basis.
1718
1719 if ($self->{nc} == 0x003E) { # >
1720 if ($self->{in_subset}) {
1721 !!!cp (123);
1722 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1723 } else {
1724 !!!cp (124);
1725 $self->{state} = DATA_STATE;
1726 $self->{s_kwd} = '';
1727 }
1728 !!!next-input-character;
1729
1730 !!!emit ($self->{ct}); # comment
1731 redo A;
1732 } elsif ($self->{nc} == -1) {
1733 if ($self->{in_subset}) {
1734 !!!cp (125.1);
1735 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1736 } else {
1737 !!!cp (125);
1738 $self->{state} = DATA_STATE;
1739 $self->{s_kwd} = '';
1740 }
1741 ## reconsume
1742
1743 !!!emit ($self->{ct}); # comment
1744 redo A;
1745 } else {
1746 !!!cp (126);
1747 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1748 $self->{read_until}->($self->{ct}->{data},
1749 q[>],
1750 length $self->{ct}->{data});
1751
1752 ## Stay in the state.
1753 !!!next-input-character;
1754 redo A;
1755 }
1756 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1757 ## XML5: "Markup declaration state".
1758
1759 if ($self->{nc} == 0x002D) { # -
1760 !!!cp (133);
1761 $self->{state} = MD_HYPHEN_STATE;
1762 !!!next-input-character;
1763 redo A;
1764 } elsif ($self->{nc} == 0x0044 or # D
1765 $self->{nc} == 0x0064) { # d
1766 ## ASCII case-insensitive.
1767 !!!cp (130);
1768 $self->{state} = MD_DOCTYPE_STATE;
1769 $self->{kwd} = chr $self->{nc};
1770 !!!next-input-character;
1771 redo A;
1772 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1773 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1774 $self->{is_xml}) and
1775 $self->{nc} == 0x005B) { # [
1776 !!!cp (135.4);
1777 $self->{state} = MD_CDATA_STATE;
1778 $self->{kwd} = '[';
1779 !!!next-input-character;
1780 redo A;
1781 } else {
1782 !!!cp (136);
1783 }
1784
1785 !!!parse-error (type => 'bogus comment',
1786 line => $self->{line_prev},
1787 column => $self->{column_prev} - 1);
1788 ## Reconsume.
1789 $self->{state} = BOGUS_COMMENT_STATE;
1790 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1791 line => $self->{line_prev},
1792 column => $self->{column_prev} - 1,
1793 };
1794 redo A;
1795 } elsif ($self->{state} == MD_HYPHEN_STATE) {
1796 if ($self->{nc} == 0x002D) { # -
1797 !!!cp (127);
1798 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1799 line => $self->{line_prev},
1800 column => $self->{column_prev} - 2,
1801 };
1802 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1803 !!!next-input-character;
1804 redo A;
1805 } else {
1806 !!!cp (128);
1807 !!!parse-error (type => 'bogus comment',
1808 line => $self->{line_prev},
1809 column => $self->{column_prev} - 2);
1810 $self->{state} = BOGUS_COMMENT_STATE;
1811 ## Reconsume.
1812 $self->{ct} = {type => COMMENT_TOKEN,
1813 data => '-',
1814 line => $self->{line_prev},
1815 column => $self->{column_prev} - 2,
1816 };
1817 redo A;
1818 }
1819 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1820 ## ASCII case-insensitive.
1821 if ($self->{nc} == [
1822 undef,
1823 0x004F, # O
1824 0x0043, # C
1825 0x0054, # T
1826 0x0059, # Y
1827 0x0050, # P
1828 ]->[length $self->{kwd}] or
1829 $self->{nc} == [
1830 undef,
1831 0x006F, # o
1832 0x0063, # c
1833 0x0074, # t
1834 0x0079, # y
1835 0x0070, # p
1836 ]->[length $self->{kwd}]) {
1837 !!!cp (131);
1838 ## Stay in the state.
1839 $self->{kwd} .= chr $self->{nc};
1840 !!!next-input-character;
1841 redo A;
1842 } elsif ((length $self->{kwd}) == 6 and
1843 ($self->{nc} == 0x0045 or # E
1844 $self->{nc} == 0x0065)) { # e
1845 if ($self->{is_xml} and
1846 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1847 !!!cp (129);
1848 ## XML5: case-sensitive.
1849 !!!parse-error (type => 'lowercase keyword', ## TODO
1850 text => 'DOCTYPE',
1851 line => $self->{line_prev},
1852 column => $self->{column_prev} - 5);
1853 } else {
1854 !!!cp (129.1);
1855 }
1856 $self->{state} = DOCTYPE_STATE;
1857 $self->{ct} = {type => DOCTYPE_TOKEN,
1858 quirks => 1,
1859 line => $self->{line_prev},
1860 column => $self->{column_prev} - 7,
1861 };
1862 !!!next-input-character;
1863 redo A;
1864 } else {
1865 !!!cp (132);
1866 !!!parse-error (type => 'bogus comment',
1867 line => $self->{line_prev},
1868 column => $self->{column_prev} - 1 - length $self->{kwd});
1869 $self->{state} = BOGUS_COMMENT_STATE;
1870 ## Reconsume.
1871 $self->{ct} = {type => COMMENT_TOKEN,
1872 data => $self->{kwd},
1873 line => $self->{line_prev},
1874 column => $self->{column_prev} - 1 - length $self->{kwd},
1875 };
1876 redo A;
1877 }
1878 } elsif ($self->{state} == MD_CDATA_STATE) {
1879 if ($self->{nc} == {
1880 '[' => 0x0043, # C
1881 '[C' => 0x0044, # D
1882 '[CD' => 0x0041, # A
1883 '[CDA' => 0x0054, # T
1884 '[CDAT' => 0x0041, # A
1885 }->{$self->{kwd}}) {
1886 !!!cp (135.1);
1887 ## Stay in the state.
1888 $self->{kwd} .= chr $self->{nc};
1889 !!!next-input-character;
1890 redo A;
1891 } elsif ($self->{kwd} eq '[CDATA' and
1892 $self->{nc} == 0x005B) { # [
1893 if ($self->{is_xml} and
1894 not $self->{tainted} and
1895 @{$self->{open_elements} or []} == 0) {
1896 !!!cp (135.2);
1897 !!!parse-error (type => 'cdata outside of root element',
1898 line => $self->{line_prev},
1899 column => $self->{column_prev} - 7);
1900 $self->{tainted} = 1;
1901 } else {
1902 !!!cp (135.21);
1903 }
1904
1905 $self->{ct} = {type => CHARACTER_TOKEN,
1906 data => '',
1907 line => $self->{line_prev},
1908 column => $self->{column_prev} - 7};
1909 $self->{state} = CDATA_SECTION_STATE;
1910 !!!next-input-character;
1911 redo A;
1912 } else {
1913 !!!cp (135.3);
1914 !!!parse-error (type => 'bogus comment',
1915 line => $self->{line_prev},
1916 column => $self->{column_prev} - 1 - length $self->{kwd});
1917 $self->{state} = BOGUS_COMMENT_STATE;
1918 ## Reconsume.
1919 $self->{ct} = {type => COMMENT_TOKEN,
1920 data => $self->{kwd},
1921 line => $self->{line_prev},
1922 column => $self->{column_prev} - 1 - length $self->{kwd},
1923 };
1924 redo A;
1925 }
1926 } elsif ($self->{state} == COMMENT_START_STATE) {
1927 if ($self->{nc} == 0x002D) { # -
1928 !!!cp (137);
1929 $self->{state} = COMMENT_START_DASH_STATE;
1930 !!!next-input-character;
1931 redo A;
1932 } elsif ($self->{nc} == 0x003E) { # >
1933 !!!parse-error (type => 'bogus comment');
1934 if ($self->{in_subset}) {
1935 !!!cp (138.1);
1936 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1937 } else {
1938 !!!cp (138);
1939 $self->{state} = DATA_STATE;
1940 $self->{s_kwd} = '';
1941 }
1942 !!!next-input-character;
1943
1944 !!!emit ($self->{ct}); # comment
1945
1946 redo A;
1947 } elsif ($self->{nc} == -1) {
1948 !!!parse-error (type => 'unclosed comment');
1949 if ($self->{in_subset}) {
1950 !!!cp (139.1);
1951 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1952 } else {
1953 !!!cp (139);
1954 $self->{state} = DATA_STATE;
1955 $self->{s_kwd} = '';
1956 }
1957 ## reconsume
1958
1959 !!!emit ($self->{ct}); # comment
1960
1961 redo A;
1962 } else {
1963 !!!cp (140);
1964 $self->{ct}->{data} # comment
1965 .= chr ($self->{nc});
1966 $self->{state} = COMMENT_STATE;
1967 !!!next-input-character;
1968 redo A;
1969 }
1970 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1971 if ($self->{nc} == 0x002D) { # -
1972 !!!cp (141);
1973 $self->{state} = COMMENT_END_STATE;
1974 !!!next-input-character;
1975 redo A;
1976 } elsif ($self->{nc} == 0x003E) { # >
1977 !!!parse-error (type => 'bogus comment');
1978 if ($self->{in_subset}) {
1979 !!!cp (142.1);
1980 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1981 } else {
1982 !!!cp (142);
1983 $self->{state} = DATA_STATE;
1984 $self->{s_kwd} = '';
1985 }
1986 !!!next-input-character;
1987
1988 !!!emit ($self->{ct}); # comment
1989
1990 redo A;
1991 } elsif ($self->{nc} == -1) {
1992 !!!parse-error (type => 'unclosed comment');
1993 if ($self->{in_subset}) {
1994 !!!cp (143.1);
1995 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1996 } else {
1997 !!!cp (143);
1998 $self->{state} = DATA_STATE;
1999 $self->{s_kwd} = '';
2000 }
2001 ## reconsume
2002
2003 !!!emit ($self->{ct}); # comment
2004
2005 redo A;
2006 } else {
2007 !!!cp (144);
2008 $self->{ct}->{data} # comment
2009 .= '-' . chr ($self->{nc});
2010 $self->{state} = COMMENT_STATE;
2011 !!!next-input-character;
2012 redo A;
2013 }
2014 } elsif ($self->{state} == COMMENT_STATE) {
2015 ## XML5: "Comment state" and "DOCTYPE comment state".
2016
2017 if ($self->{nc} == 0x002D) { # -
2018 !!!cp (145);
2019 $self->{state} = COMMENT_END_DASH_STATE;
2020 !!!next-input-character;
2021 redo A;
2022 } elsif ($self->{nc} == -1) {
2023 !!!parse-error (type => 'unclosed comment');
2024 if ($self->{in_subset}) {
2025 !!!cp (146.1);
2026 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2027 } else {
2028 !!!cp (146);
2029 $self->{state} = DATA_STATE;
2030 $self->{s_kwd} = '';
2031 }
2032 ## reconsume
2033
2034 !!!emit ($self->{ct}); # comment
2035
2036 redo A;
2037 } else {
2038 !!!cp (147);
2039 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2040 $self->{read_until}->($self->{ct}->{data},
2041 q[-],
2042 length $self->{ct}->{data});
2043
2044 ## Stay in the state
2045 !!!next-input-character;
2046 redo A;
2047 }
2048 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2049 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2050
2051 if ($self->{nc} == 0x002D) { # -
2052 !!!cp (148);
2053 $self->{state} = COMMENT_END_STATE;
2054 !!!next-input-character;
2055 redo A;
2056 } elsif ($self->{nc} == -1) {
2057 !!!parse-error (type => 'unclosed comment');
2058 if ($self->{in_subset}) {
2059 !!!cp (149.1);
2060 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2061 } else {
2062 !!!cp (149);
2063 $self->{state} = DATA_STATE;
2064 $self->{s_kwd} = '';
2065 }
2066 ## reconsume
2067
2068 !!!emit ($self->{ct}); # comment
2069
2070 redo A;
2071 } else {
2072 !!!cp (150);
2073 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2074 $self->{state} = COMMENT_STATE;
2075 !!!next-input-character;
2076 redo A;
2077 }
2078 } elsif ($self->{state} == COMMENT_END_STATE) {
2079 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2080
2081 if ($self->{nc} == 0x003E) { # >
2082 if ($self->{in_subset}) {
2083 !!!cp (151.1);
2084 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2085 } else {
2086 !!!cp (151);
2087 $self->{state} = DATA_STATE;
2088 $self->{s_kwd} = '';
2089 }
2090 !!!next-input-character;
2091
2092 !!!emit ($self->{ct}); # comment
2093
2094 redo A;
2095 } elsif ($self->{nc} == 0x002D) { # -
2096 !!!cp (152);
2097 ## XML5: Not a parse error.
2098 !!!parse-error (type => 'dash in comment',
2099 line => $self->{line_prev},
2100 column => $self->{column_prev});
2101 $self->{ct}->{data} .= '-'; # comment
2102 ## Stay in the state
2103 !!!next-input-character;
2104 redo A;
2105 } elsif ($self->{nc} == -1) {
2106 !!!parse-error (type => 'unclosed comment');
2107 if ($self->{in_subset}) {
2108 !!!cp (153.1);
2109 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2110 } else {
2111 !!!cp (153);
2112 $self->{state} = DATA_STATE;
2113 $self->{s_kwd} = '';
2114 }
2115 ## reconsume
2116
2117 !!!emit ($self->{ct}); # comment
2118
2119 redo A;
2120 } else {
2121 !!!cp (154);
2122 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2123 $self->{state} = COMMENT_STATE;
2124 !!!next-input-character;
2125 redo A;
2126 }
2127 } elsif ($self->{state} == DOCTYPE_STATE) {
2128 if ($is_space->{$self->{nc}}) {
2129 !!!cp (155);
2130 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2131 !!!next-input-character;
2132 redo A;
2133 } elsif ($self->{nc} == -1) {
2134 !!!cp (155.1);
2135 !!!parse-error (type => 'unclosed DOCTYPE');
2136 $self->{ct}->{quirks} = 1;
2137
2138 $self->{state} = DATA_STATE;
2139 ## Reconsume.
2140 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2141
2142 redo A;
2143 } else {
2144 !!!cp (156);
2145 ## XML5: Switch to the bogus comment state.
2146 !!!parse-error (type => 'no space before DOCTYPE name');
2147 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2148 ## reconsume
2149 redo A;
2150 }
2151 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2152 ## XML5: "DOCTYPE root name before state".
2153
2154 if ($is_space->{$self->{nc}}) {
2155 !!!cp (157);
2156 ## Stay in the state
2157 !!!next-input-character;
2158 redo A;
2159 } elsif ($self->{nc} == 0x003E) { # >
2160 !!!cp (158);
2161 ## XML5: No parse error.
2162 !!!parse-error (type => 'no DOCTYPE name');
2163 $self->{state} = DATA_STATE;
2164 $self->{s_kwd} = '';
2165 !!!next-input-character;
2166
2167 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2168
2169 redo A;
2170 } elsif ($self->{nc} == -1) {
2171 !!!cp (159);
2172 !!!parse-error (type => 'no DOCTYPE name');
2173 $self->{state} = DATA_STATE;
2174 $self->{s_kwd} = '';
2175 ## reconsume
2176
2177 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2178
2179 redo A;
2180 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2181 !!!cp (159.1);
2182 !!!parse-error (type => 'no DOCTYPE name');
2183 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2184 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2185 $self->{in_subset} = 1;
2186 !!!next-input-character;
2187 !!!emit ($self->{ct}); # DOCTYPE
2188 redo A;
2189 } else {
2190 !!!cp (160);
2191 $self->{ct}->{name} = chr $self->{nc};
2192 delete $self->{ct}->{quirks};
2193 $self->{state} = DOCTYPE_NAME_STATE;
2194 !!!next-input-character;
2195 redo A;
2196 }
2197 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2198 ## XML5: "DOCTYPE root name state".
2199
2200 ## ISSUE: Redundant "First," in the spec.
2201
2202 if ($is_space->{$self->{nc}}) {
2203 !!!cp (161);
2204 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2205 !!!next-input-character;
2206 redo A;
2207 } elsif ($self->{nc} == 0x003E) { # >
2208 !!!cp (162);
2209 $self->{state} = DATA_STATE;
2210 $self->{s_kwd} = '';
2211 !!!next-input-character;
2212
2213 !!!emit ($self->{ct}); # DOCTYPE
2214
2215 redo A;
2216 } elsif ($self->{nc} == -1) {
2217 !!!cp (163);
2218 !!!parse-error (type => 'unclosed DOCTYPE');
2219 $self->{state} = DATA_STATE;
2220 $self->{s_kwd} = '';
2221 ## reconsume
2222
2223 $self->{ct}->{quirks} = 1;
2224 !!!emit ($self->{ct}); # DOCTYPE
2225
2226 redo A;
2227 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2228 !!!cp (163.1);
2229 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2230 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2231 $self->{in_subset} = 1;
2232 !!!next-input-character;
2233 !!!emit ($self->{ct}); # DOCTYPE
2234 redo A;
2235 } else {
2236 !!!cp (164);
2237 $self->{ct}->{name}
2238 .= chr ($self->{nc}); # DOCTYPE
2239 ## Stay in the state
2240 !!!next-input-character;
2241 redo A;
2242 }
2243 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2244 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2245 ## state", but implemented differently.
2246
2247 if ($is_space->{$self->{nc}}) {
2248 !!!cp (165);
2249 ## Stay in the state
2250 !!!next-input-character;
2251 redo A;
2252 } elsif ($self->{nc} == 0x003E) { # >
2253 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2254 !!!cp (166);
2255 $self->{state} = DATA_STATE;
2256 $self->{s_kwd} = '';
2257 } else {
2258 !!!cp (166.1);
2259 !!!parse-error (type => 'no md def'); ## TODO: type
2260 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2261 }
2262
2263 !!!next-input-character;
2264 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2265 redo A;
2266 } elsif ($self->{nc} == -1) {
2267 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2268 !!!cp (167);
2269 !!!parse-error (type => 'unclosed DOCTYPE');
2270 $self->{state} = DATA_STATE;
2271 $self->{s_kwd} = '';
2272 $self->{ct}->{quirks} = 1;
2273 } else {
2274 !!!cp (167.12);
2275 !!!parse-error (type => 'unclosed md'); ## TODO: type
2276 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2277 }
2278
2279 ## Reconsume.
2280 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2281 redo A;
2282 } elsif ($self->{nc} == 0x0050 or # P
2283 $self->{nc} == 0x0070) { # p
2284 !!!cp (167.1);
2285 $self->{state} = PUBLIC_STATE;
2286 $self->{kwd} = chr $self->{nc};
2287 !!!next-input-character;
2288 redo A;
2289 } elsif ($self->{nc} == 0x0053 or # S
2290 $self->{nc} == 0x0073) { # s
2291 !!!cp (167.2);
2292 $self->{state} = SYSTEM_STATE;
2293 $self->{kwd} = chr $self->{nc};
2294 !!!next-input-character;
2295 redo A;
2296 } elsif ($self->{nc} == 0x0022 and # "
2297 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2298 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2299 !!!cp (167.21);
2300 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
2301 $self->{ct}->{value} = ''; # ENTITY
2302 !!!next-input-character;
2303 redo A;
2304 } elsif ($self->{nc} == 0x0027 and # '
2305 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
2306 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
2307 !!!cp (167.22);
2308 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
2309 $self->{ct}->{value} = ''; # ENTITY
2310 !!!next-input-character;
2311 redo A;
2312 } elsif ($self->{is_xml} and
2313 $self->{ct}->{type} == DOCTYPE_TOKEN and
2314 $self->{nc} == 0x005B) { # [
2315 !!!cp (167.3);
2316 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2317 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2318 $self->{in_subset} = 1;
2319 !!!next-input-character;
2320 !!!emit ($self->{ct}); # DOCTYPE
2321 redo A;
2322 } else {
2323 !!!parse-error (type => 'string after DOCTYPE name'); ## TODO: type
2324
2325 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2326 !!!cp (180);
2327 $self->{ct}->{quirks} = 1;
2328 $self->{state} = BOGUS_DOCTYPE_STATE;
2329 } else {
2330 !!!cp (180.1);
2331 $self->{state} = BOGUS_MD_STATE;
2332 }
2333
2334 !!!next-input-character;
2335 redo A;
2336 }
2337 } elsif ($self->{state} == PUBLIC_STATE) {
2338 ## ASCII case-insensitive
2339 if ($self->{nc} == [
2340 undef,
2341 0x0055, # U
2342 0x0042, # B
2343 0x004C, # L
2344 0x0049, # I
2345 ]->[length $self->{kwd}] or
2346 $self->{nc} == [
2347 undef,
2348 0x0075, # u
2349 0x0062, # b
2350 0x006C, # l
2351 0x0069, # i
2352 ]->[length $self->{kwd}]) {
2353 !!!cp (175);
2354 ## Stay in the state.
2355 $self->{kwd} .= chr $self->{nc};
2356 !!!next-input-character;
2357 redo A;
2358 } elsif ((length $self->{kwd}) == 5 and
2359 ($self->{nc} == 0x0043 or # C
2360 $self->{nc} == 0x0063)) { # c
2361 if ($self->{is_xml} and
2362 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2363 !!!cp (168.1);
2364 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2365 text => 'PUBLIC',
2366 line => $self->{line_prev},
2367 column => $self->{column_prev} - 4);
2368 } else {
2369 !!!cp (168);
2370 }
2371 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2372 !!!next-input-character;
2373 redo A;
2374 } else {
2375 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2376 line => $self->{line_prev},
2377 column => $self->{column_prev} + 1 - length $self->{kwd});
2378 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2379 !!!cp (169);
2380 $self->{ct}->{quirks} = 1;
2381 $self->{state} = BOGUS_DOCTYPE_STATE;
2382 } else {
2383 !!!cp (169.1);
2384 $self->{state} = BOGUS_MD_STATE;
2385 }
2386 ## Reconsume.
2387 redo A;
2388 }
2389 } elsif ($self->{state} == SYSTEM_STATE) {
2390 ## ASCII case-insensitive
2391 if ($self->{nc} == [
2392 undef,
2393 0x0059, # Y
2394 0x0053, # S
2395 0x0054, # T
2396 0x0045, # E
2397 ]->[length $self->{kwd}] or
2398 $self->{nc} == [
2399 undef,
2400 0x0079, # y
2401 0x0073, # s
2402 0x0074, # t
2403 0x0065, # e
2404 ]->[length $self->{kwd}]) {
2405 !!!cp (170);
2406 ## Stay in the state.
2407 $self->{kwd} .= chr $self->{nc};
2408 !!!next-input-character;
2409 redo A;
2410 } elsif ((length $self->{kwd}) == 5 and
2411 ($self->{nc} == 0x004D or # M
2412 $self->{nc} == 0x006D)) { # m
2413 if ($self->{is_xml} and
2414 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2415 !!!cp (171.1);
2416 !!!parse-error (type => 'lowercase keyword', ## TODO: type
2417 text => 'SYSTEM',
2418 line => $self->{line_prev},
2419 column => $self->{column_prev} - 4);
2420 } else {
2421 !!!cp (171);
2422 }
2423 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2424 !!!next-input-character;
2425 redo A;
2426 } else {
2427 !!!parse-error (type => 'string after DOCTYPE name', ## TODO: type
2428 line => $self->{line_prev},
2429 column => $self->{column_prev} + 1 - length $self->{kwd});
2430 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2431 !!!cp (172);
2432 $self->{ct}->{quirks} = 1;
2433 $self->{state} = BOGUS_DOCTYPE_STATE;
2434 } else {
2435 !!!cp (172.1);
2436 $self->{state} = BOGUS_MD_STATE;
2437 }
2438 ## Reconsume.
2439 redo A;
2440 }
2441 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2442 if ($is_space->{$self->{nc}}) {
2443 !!!cp (181);
2444 ## Stay in the state
2445 !!!next-input-character;
2446 redo A;
2447 } elsif ($self->{nc} eq 0x0022) { # "
2448 !!!cp (182);
2449 $self->{ct}->{pubid} = ''; # DOCTYPE
2450 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2451 !!!next-input-character;
2452 redo A;
2453 } elsif ($self->{nc} eq 0x0027) { # '
2454 !!!cp (183);
2455 $self->{ct}->{pubid} = ''; # DOCTYPE
2456 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2457 !!!next-input-character;
2458 redo A;
2459 } elsif ($self->{nc} eq 0x003E) { # >
2460 !!!parse-error (type => 'no PUBLIC literal');
2461
2462 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2463 !!!cp (184);
2464 $self->{state} = DATA_STATE;
2465 $self->{s_kwd} = '';
2466 $self->{ct}->{quirks} = 1;
2467 } else {
2468 !!!cp (184.1);
2469 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2470 }
2471
2472 !!!next-input-character;
2473 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2474 redo A;
2475 } elsif ($self->{nc} == -1) {
2476 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2477 !!!cp (185);
2478 !!!parse-error (type => 'unclosed DOCTYPE');
2479 $self->{state} = DATA_STATE;
2480 $self->{s_kwd} = '';
2481 $self->{ct}->{quirks} = 1;
2482 } else {
2483 !!!cp (185.1);
2484 !!!parse-error (type => 'unclosed md'); ## TODO: type
2485 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2486 }
2487
2488 ## reconsume
2489 !!!emit ($self->{ct}); # DOCTYPE
2490 redo A;
2491 } elsif ($self->{is_xml} and
2492 $self->{ct}->{type} == DOCTYPE_TOKEN and
2493 $self->{nc} == 0x005B) { # [
2494 !!!cp (186.1);
2495 !!!parse-error (type => 'no PUBLIC literal');
2496 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2497 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2498 $self->{in_subset} = 1;
2499 !!!next-input-character;
2500 !!!emit ($self->{ct}); # DOCTYPE
2501 redo A;
2502 } else {
2503 !!!parse-error (type => 'string after PUBLIC');
2504
2505 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2506 !!!cp (186);
2507 $self->{ct}->{quirks} = 1;
2508 $self->{state} = BOGUS_DOCTYPE_STATE;
2509 } else {
2510 !!!cp (186.2);
2511 $self->{state} = BOGUS_MD_STATE;
2512 }
2513
2514 !!!next-input-character;
2515 redo A;
2516 }
2517 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2518 if ($self->{nc} == 0x0022) { # "
2519 !!!cp (187);
2520 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2521 !!!next-input-character;
2522 redo A;
2523 } elsif ($self->{nc} == 0x003E) { # >
2524 !!!parse-error (type => 'unclosed PUBLIC literal');
2525
2526 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2527 !!!cp (188);
2528 $self->{state} = DATA_STATE;
2529 $self->{s_kwd} = '';
2530 $self->{ct}->{quirks} = 1;
2531 } else {
2532 !!!cp (188.1);
2533 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2534 }
2535
2536 !!!next-input-character;
2537 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2538 redo A;
2539 } elsif ($self->{nc} == -1) {
2540 !!!parse-error (type => 'unclosed PUBLIC literal');
2541
2542 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2543 !!!cp (189);
2544 $self->{state} = DATA_STATE;
2545 $self->{s_kwd} = '';
2546 $self->{ct}->{quirks} = 1;
2547 } else {
2548 !!!cp (189.1);
2549 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2550 }
2551
2552 ## Reconsume.
2553 !!!emit ($self->{ct}); # DOCTYPE
2554 redo A;
2555 } else {
2556 !!!cp (190);
2557 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2558 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2559 length $self->{ct}->{pubid});
2560
2561 ## Stay in the state
2562 !!!next-input-character;
2563 redo A;
2564 }
2565 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2566 if ($self->{nc} == 0x0027) { # '
2567 !!!cp (191);
2568 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2569 !!!next-input-character;
2570 redo A;
2571 } elsif ($self->{nc} == 0x003E) { # >
2572 !!!parse-error (type => 'unclosed PUBLIC literal');
2573
2574 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2575 !!!cp (192);
2576 $self->{state} = DATA_STATE;
2577 $self->{s_kwd} = '';
2578 $self->{ct}->{quirks} = 1;
2579 } else {
2580 !!!cp (192.1);
2581 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2582 }
2583
2584 !!!next-input-character;
2585 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2586 redo A;
2587 } elsif ($self->{nc} == -1) {
2588 !!!parse-error (type => 'unclosed PUBLIC literal');
2589
2590 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2591 !!!cp (193);
2592 $self->{state} = DATA_STATE;
2593 $self->{s_kwd} = '';
2594 $self->{ct}->{quirks} = 1;
2595 } else {
2596 !!!cp (193.1);
2597 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2598 }
2599
2600 ## reconsume
2601 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2602 redo A;
2603 } else {
2604 !!!cp (194);
2605 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2606 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2607 length $self->{ct}->{pubid});
2608
2609 ## Stay in the state
2610 !!!next-input-character;
2611 redo A;
2612 }
2613 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2614 if ($is_space->{$self->{nc}}) {
2615 !!!cp (195);
2616 ## Stay in the state
2617 !!!next-input-character;
2618 redo A;
2619 } elsif ($self->{nc} == 0x0022) { # "
2620 !!!cp (196);
2621 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2622 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2623 !!!next-input-character;
2624 redo A;
2625 } elsif ($self->{nc} == 0x0027) { # '
2626 !!!cp (197);
2627 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
2628 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2629 !!!next-input-character;
2630 redo A;
2631 } elsif ($self->{nc} == 0x003E) { # >
2632 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2633 if ($self->{is_xml}) {
2634 !!!cp (198.1);
2635 !!!parse-error (type => 'no SYSTEM literal');
2636 } else {
2637 !!!cp (198);
2638 }
2639 $self->{state} = DATA_STATE;
2640 $self->{s_kwd} = '';
2641 } else {
2642 if ($self->{ct}->{type} == NOTATION_TOKEN) {
2643 !!!cp (198.2);
2644 } else {
2645 !!!cp (198.3);
2646 !!!parse-error (type => 'no SYSTEM literal');
2647 }
2648 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2649 }
2650
2651 !!!next-input-character;
2652 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2653 redo A;
2654 } elsif ($self->{nc} == -1) {
2655 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2656 !!!cp (199);
2657 !!!parse-error (type => 'unclosed DOCTYPE');
2658
2659 $self->{state} = DATA_STATE;
2660 $self->{s_kwd} = '';
2661 $self->{ct}->{quirks} = 1;
2662 } else {
2663 !!!parse-error (type => 'unclosed md'); ## TODO: type
2664 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2665 }
2666
2667 ## reconsume
2668 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2669 redo A;
2670 } elsif ($self->{is_xml} and
2671 $self->{ct}->{type} == DOCTYPE_TOKEN and
2672 $self->{nc} == 0x005B) { # [
2673 !!!cp (200.1);
2674 !!!parse-error (type => 'no SYSTEM literal');
2675 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2676 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2677 $self->{in_subset} = 1;
2678 !!!next-input-character;
2679 !!!emit ($self->{ct}); # DOCTYPE
2680 redo A;
2681 } else {
2682 !!!parse-error (type => 'string after PUBLIC literal');
2683
2684 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2685 !!!cp (200);
2686 $self->{ct}->{quirks} = 1;
2687 $self->{state} = BOGUS_DOCTYPE_STATE;
2688 } else {
2689 !!!cp (200.2);
2690 $self->{state} = BOGUS_MD_STATE;
2691 }
2692
2693 !!!next-input-character;
2694 redo A;
2695 }
2696 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2697 if ($is_space->{$self->{nc}}) {
2698 !!!cp (201);
2699 ## Stay in the state
2700 !!!next-input-character;
2701 redo A;
2702 } elsif ($self->{nc} == 0x0022) { # "
2703 !!!cp (202);
2704 $self->{ct}->{sysid} = ''; # DOCTYPE
2705 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2706 !!!next-input-character;
2707 redo A;
2708 } elsif ($self->{nc} == 0x0027) { # '
2709 !!!cp (203);
2710 $self->{ct}->{sysid} = ''; # DOCTYPE
2711 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2712 !!!next-input-character;
2713 redo A;
2714 } elsif ($self->{nc} == 0x003E) { # >
2715 !!!parse-error (type => 'no SYSTEM literal');
2716 !!!next-input-character;
2717
2718 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2719 !!!cp (204);
2720 $self->{state} = DATA_STATE;
2721 $self->{s_kwd} = '';
2722 $self->{ct}->{quirks} = 1;
2723 } else {
2724 !!!cp (204.1);
2725 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2726 }
2727
2728 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2729 redo A;
2730 } elsif ($self->{nc} == -1) {
2731 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2732 !!!cp (205);
2733 !!!parse-error (type => 'unclosed DOCTYPE');
2734 $self->{state} = DATA_STATE;
2735 $self->{s_kwd} = '';
2736 $self->{ct}->{quirks} = 1;
2737 } else {
2738 !!!cp (205.1);
2739 !!!parse-error (type => 'unclosed md'); ## TODO: type
2740 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2741 }
2742
2743 ## reconsume
2744 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2745 redo A;
2746 } elsif ($self->{is_xml} and
2747 $self->{ct}->{type} == DOCTYPE_TOKEN and
2748 $self->{nc} == 0x005B) { # [
2749 !!!cp (206.1);
2750 !!!parse-error (type => 'no SYSTEM literal');
2751
2752 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2753 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2754 $self->{in_subset} = 1;
2755 !!!next-input-character;
2756 !!!emit ($self->{ct}); # DOCTYPE
2757 redo A;
2758 } else {
2759 !!!parse-error (type => 'string after SYSTEM');
2760
2761 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2762 !!!cp (206);
2763 $self->{ct}->{quirks} = 1;
2764 $self->{state} = BOGUS_DOCTYPE_STATE;
2765 } else {
2766 !!!cp (206.2);
2767 $self->{state} = BOGUS_MD_STATE;
2768 }
2769
2770 !!!next-input-character;
2771 redo A;
2772 }
2773 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2774 if ($self->{nc} == 0x0022) { # "
2775 !!!cp (207);
2776 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2777 !!!next-input-character;
2778 redo A;
2779 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2780 !!!parse-error (type => 'unclosed SYSTEM literal');
2781
2782 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2783 !!!cp (208);
2784 $self->{state} = DATA_STATE;
2785 $self->{s_kwd} = '';
2786 $self->{ct}->{quirks} = 1;
2787 } else {
2788 !!!cp (208.1);
2789 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2790 }
2791
2792 !!!next-input-character;
2793 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2794 redo A;
2795 } elsif ($self->{nc} == -1) {
2796 !!!parse-error (type => 'unclosed SYSTEM literal');
2797
2798 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2799 !!!cp (209);
2800 $self->{state} = DATA_STATE;
2801 $self->{s_kwd} = '';
2802 $self->{ct}->{quirks} = 1;
2803 } else {
2804 !!!cp (209.1);
2805 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2806 }
2807
2808 ## reconsume
2809 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2810 redo A;
2811 } else {
2812 !!!cp (210);
2813 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2814 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2815 length $self->{ct}->{sysid});
2816
2817 ## Stay in the state
2818 !!!next-input-character;
2819 redo A;
2820 }
2821 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2822 if ($self->{nc} == 0x0027) { # '
2823 !!!cp (211);
2824 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2825 !!!next-input-character;
2826 redo A;
2827 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2828 !!!cp (212);
2829 !!!parse-error (type => 'unclosed SYSTEM literal');
2830
2831 $self->{state} = DATA_STATE;
2832 $self->{s_kwd} = '';
2833 !!!next-input-character;
2834
2835 $self->{ct}->{quirks} = 1;
2836 !!!emit ($self->{ct}); # DOCTYPE
2837
2838 redo A;
2839 } elsif ($self->{nc} == -1) {
2840 !!!parse-error (type => 'unclosed SYSTEM literal');
2841
2842 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2843 !!!cp (213);
2844 $self->{state} = DATA_STATE;
2845 $self->{s_kwd} = '';
2846 $self->{ct}->{quirks} = 1;
2847 } else {
2848 !!!cp (213.1);
2849 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2850 }
2851
2852 ## reconsume
2853 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2854 redo A;
2855 } else {
2856 !!!cp (214);
2857 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
2858 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2859 length $self->{ct}->{sysid});
2860
2861 ## Stay in the state
2862 !!!next-input-character;
2863 redo A;
2864 }
2865 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2866 if ($is_space->{$self->{nc}}) {
2867 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
2868 !!!cp (215.1);
2869 $self->{state} = BEFORE_NDATA_STATE;
2870 } else {
2871 !!!cp (215);
2872 ## Stay in the state
2873 }
2874 !!!next-input-character;
2875 redo A;
2876 } elsif ($self->{nc} == 0x003E) { # >
2877 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2878 !!!cp (216);
2879 $self->{state} = DATA_STATE;
2880 $self->{s_kwd} = '';
2881 } else {
2882 !!!cp (216.1);
2883 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2884 }
2885
2886 !!!next-input-character;
2887 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2888 redo A;
2889 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
2890 ($self->{nc} == 0x004E or # N
2891 $self->{nc} == 0x006E)) { # n
2892 !!!cp (216.2);
2893 !!!parse-error (type => 'no space before NDATA'); ## TODO: type
2894 $self->{state} = NDATA_STATE;
2895 $self->{kwd} = chr $self->{nc};
2896 !!!next-input-character;
2897 redo A;
2898 } elsif ($self->{nc} == -1) {
2899 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2900 !!!cp (217);
2901 !!!parse-error (type => 'unclosed DOCTYPE');
2902 $self->{state} = DATA_STATE;
2903 $self->{s_kwd} = '';
2904 $self->{ct}->{quirks} = 1;
2905 } else {
2906 !!!cp (217.1);
2907 !!!parse-error (type => 'unclosed md'); ## TODO: type
2908 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2909 }
2910
2911 ## reconsume
2912 !!!emit ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
2913 redo A;
2914 } elsif ($self->{is_xml} and
2915 $self->{ct}->{type} == DOCTYPE_TOKEN and
2916 $self->{nc} == 0x005B) { # [
2917 !!!cp (218.1);
2918 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2919 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2920 $self->{in_subset} = 1;
2921 !!!next-input-character;
2922 !!!emit ($self->{ct}); # DOCTYPE
2923 redo A;
2924 } else {
2925 !!!parse-error (type => 'string after SYSTEM literal');
2926
2927 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
2928 !!!cp (218);
2929 #$self->{ct}->{quirks} = 1;
2930 $self->{state} = BOGUS_DOCTYPE_STATE;
2931 } else {
2932 !!!cp (218.2);
2933 $self->{state} = BOGUS_MD_STATE;
2934 }
2935
2936 !!!next-input-character;
2937 redo A;
2938 }
2939 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
2940 if ($is_space->{$self->{nc}}) {
2941 !!!cp (218.3);
2942 ## Stay in the state.
2943 !!!next-input-character;
2944 redo A;
2945 } elsif ($self->{nc} == 0x003E) { # >
2946 !!!cp (218.4);
2947 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2948 !!!next-input-character;
2949 !!!emit ($self->{ct}); # ENTITY
2950 redo A;
2951 } elsif ($self->{nc} == 0x004E or # N
2952 $self->{nc} == 0x006E) { # n
2953 !!!cp (218.5);
2954 $self->{state} = NDATA_STATE;
2955 $self->{kwd} = chr $self->{nc};
2956 !!!next-input-character;
2957 redo A;
2958 } elsif ($self->{nc} == -1) {
2959 !!!cp (218.6);
2960 !!!parse-error (type => 'unclosed md'); ## TODO: type
2961 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2962 ## reconsume
2963 !!!emit ($self->{ct}); # ENTITY
2964 redo A;
2965 } else {
2966 !!!cp (218.7);
2967 !!!parse-error (type => 'string after SYSTEM literal');
2968 $self->{state} = BOGUS_MD_STATE;
2969 !!!next-input-character;
2970 redo A;
2971 }
2972 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2973 if ($self->{nc} == 0x003E) { # >
2974 !!!cp (219);
2975 $self->{state} = DATA_STATE;
2976 $self->{s_kwd} = '';
2977 !!!next-input-character;
2978
2979 !!!emit ($self->{ct}); # DOCTYPE
2980
2981 redo A;
2982 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2983 !!!cp (220.1);
2984 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2985 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2986 $self->{in_subset} = 1;
2987 !!!next-input-character;
2988 !!!emit ($self->{ct}); # DOCTYPE
2989 redo A;
2990 } elsif ($self->{nc} == -1) {
2991 !!!cp (220);
2992 $self->{state} = DATA_STATE;
2993 $self->{s_kwd} = '';
2994 ## reconsume
2995
2996 !!!emit ($self->{ct}); # DOCTYPE
2997
2998 redo A;
2999 } else {
3000 !!!cp (221);
3001 my $s = '';
3002 $self->{read_until}->($s, q{>[}, 0);
3003
3004 ## Stay in the state
3005 !!!next-input-character;
3006 redo A;
3007 }
3008 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3009 ## NOTE: "CDATA section state" in the spec is jointly implemented
3010 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3011 ## and |CDATA_SECTION_MSE2_STATE|.
3012
3013 ## XML5: "CDATA state".
3014
3015 if ($self->{nc} == 0x005D) { # ]
3016 !!!cp (221.1);
3017 $self->{state} = CDATA_SECTION_MSE1_STATE;
3018 !!!next-input-character;
3019 redo A;
3020 } elsif ($self->{nc} == -1) {
3021 if ($self->{is_xml}) {
3022 !!!cp (221.11);
3023 !!!parse-error (type => 'no mse'); ## TODO: type
3024 } else {
3025 !!!cp (221.12);
3026 }
3027
3028 $self->{state} = DATA_STATE;
3029 $self->{s_kwd} = '';
3030 ## Reconsume.
3031 if (length $self->{ct}->{data}) { # character
3032 !!!cp (221.2);
3033 !!!emit ($self->{ct}); # character
3034 } else {
3035 !!!cp (221.3);
3036 ## No token to emit. $self->{ct} is discarded.
3037 }
3038 redo A;
3039 } else {
3040 !!!cp (221.4);
3041 $self->{ct}->{data} .= chr $self->{nc};
3042 $self->{read_until}->($self->{ct}->{data},
3043 q<]>,
3044 length $self->{ct}->{data});
3045
3046 ## Stay in the state.
3047 !!!next-input-character;
3048 redo A;
3049 }
3050
3051 ## ISSUE: "text tokens" in spec.
3052 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3053 ## XML5: "CDATA bracket state".
3054
3055 if ($self->{nc} == 0x005D) { # ]
3056 !!!cp (221.5);
3057 $self->{state} = CDATA_SECTION_MSE2_STATE;
3058 !!!next-input-character;
3059 redo A;
3060 } else {
3061 !!!cp (221.6);
3062 ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
3063 $self->{ct}->{data} .= ']';
3064 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3065 ## Reconsume.
3066 redo A;
3067 }
3068 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3069 ## XML5: "CDATA end state".
3070
3071 if ($self->{nc} == 0x003E) { # >
3072 $self->{state} = DATA_STATE;
3073 $self->{s_kwd} = '';
3074 !!!next-input-character;
3075 if (length $self->{ct}->{data}) { # character
3076 !!!cp (221.7);
3077 !!!emit ($self->{ct}); # character
3078 } else {
3079 !!!cp (221.8);
3080 ## No token to emit. $self->{ct} is discarded.
3081 }
3082 redo A;
3083 } elsif ($self->{nc} == 0x005D) { # ]
3084 !!!cp (221.9); # character
3085 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3086 ## Stay in the state.
3087 !!!next-input-character;
3088 redo A;
3089 } else {
3090 !!!cp (221.11);
3091 $self->{ct}->{data} .= ']]'; # character
3092 $self->{state} = CDATA_SECTION_STATE;
3093 ## Reconsume. ## XML5: Emit.
3094 redo A;
3095 }
3096 } elsif ($self->{state} == ENTITY_STATE) {
3097 if ($is_space->{$self->{nc}} or
3098 {
3099 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3100 $self->{entity_add} => 1,
3101 }->{$self->{nc}}) {
3102 if ($self->{is_xml}) {
3103 !!!cp (1001.1);
3104 !!!parse-error (type => 'bare ero',
3105 line => $self->{line_prev},
3106 column => $self->{column_prev}
3107 + ($self->{nc} == -1 ? 1 : 0));
3108 } else {
3109 !!!cp (1001);
3110 ## No error
3111 }
3112 ## Don't consume
3113 ## Return nothing.
3114 #
3115 } elsif ($self->{nc} == 0x0023) { # #
3116 !!!cp (999);
3117 $self->{state} = ENTITY_HASH_STATE;
3118 $self->{kwd} = '#';
3119 !!!next-input-character;
3120 redo A;
3121 } elsif ($self->{is_xml} or
3122 (0x0041 <= $self->{nc} and
3123 $self->{nc} <= 0x005A) or # A..Z
3124 (0x0061 <= $self->{nc} and
3125 $self->{nc} <= 0x007A)) { # a..z
3126 !!!cp (998);
3127 require Whatpm::_NamedEntityList;
3128 $self->{state} = ENTITY_NAME_STATE;
3129 $self->{kwd} = chr $self->{nc};
3130 $self->{entity__value} = $self->{kwd};
3131 $self->{entity__match} = 0;
3132 !!!next-input-character;
3133 redo A;
3134 } else {
3135 !!!cp (1027);
3136 !!!parse-error (type => 'bare ero');
3137 ## Return nothing.
3138 #
3139 }
3140
3141 ## NOTE: No character is consumed by the "consume a character
3142 ## reference" algorithm. In other words, there is an "&" character
3143 ## that does not introduce a character reference, which would be
3144 ## appended to the parent element or the attribute value in the later
3145 ## processing of the tokenizer.
3146
3147 if ($self->{prev_state} == DATA_STATE) {
3148 !!!cp (997);
3149 $self->{state} = $self->{prev_state};
3150 $self->{s_kwd} = '';
3151 ## Reconsume.
3152 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3153 line => $self->{line_prev},
3154 column => $self->{column_prev},
3155 });
3156 redo A;
3157 } else {
3158 !!!cp (996);
3159 $self->{ca}->{value} .= '&';
3160 $self->{state} = $self->{prev_state};
3161 $self->{s_kwd} = '';
3162 ## Reconsume.
3163 redo A;
3164 }
3165 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3166 if ($self->{nc} == 0x0078) { # x
3167 !!!cp (995);
3168 $self->{state} = HEXREF_X_STATE;
3169 $self->{kwd} .= chr $self->{nc};
3170 !!!next-input-character;
3171 redo A;
3172 } elsif ($self->{nc} == 0x0058) { # X
3173 !!!cp (995.1);
3174 if ($self->{is_xml}) {
3175 !!!parse-error (type => 'uppercase hcro'); ## TODO: type
3176 }
3177 $self->{state} = HEXREF_X_STATE;
3178 $self->{kwd} .= chr $self->{nc};
3179 !!!next-input-character;
3180 redo A;
3181 } elsif (0x0030 <= $self->{nc} and
3182 $self->{nc} <= 0x0039) { # 0..9
3183 !!!cp (994);
3184 $self->{state} = NCR_NUM_STATE;
3185 $self->{kwd} = $self->{nc} - 0x0030;
3186 !!!next-input-character;
3187 redo A;
3188 } else {
3189 !!!parse-error (type => 'bare nero',
3190 line => $self->{line_prev},
3191 column => $self->{column_prev} - 1);
3192
3193 ## NOTE: According to the spec algorithm, nothing is returned,
3194 ## and then "&#" is appended to the parent element or the attribute
3195 ## value in the later processing.
3196
3197 if ($self->{prev_state} == DATA_STATE) {
3198 !!!cp (1019);
3199 $self->{state} = $self->{prev_state};
3200 $self->{s_kwd} = '';
3201 ## Reconsume.
3202 !!!emit ({type => CHARACTER_TOKEN,
3203 data => '&#',
3204 line => $self->{line_prev},
3205 column => $self->{column_prev} - 1,
3206 });
3207 redo A;
3208 } else {
3209 !!!cp (993);
3210 $self->{ca}->{value} .= '&#';
3211 $self->{state} = $self->{prev_state};
3212 $self->{s_kwd} = '';
3213 ## Reconsume.
3214 redo A;
3215 }
3216 }
3217 } elsif ($self->{state} == NCR_NUM_STATE) {
3218 if (0x0030 <= $self->{nc} and
3219 $self->{nc} <= 0x0039) { # 0..9
3220 !!!cp (1012);
3221 $self->{kwd} *= 10;
3222 $self->{kwd} += $self->{nc} - 0x0030;
3223
3224 ## Stay in the state.
3225 !!!next-input-character;
3226 redo A;
3227 } elsif ($self->{nc} == 0x003B) { # ;
3228 !!!cp (1013);
3229 !!!next-input-character;
3230 #
3231 } else {
3232 !!!cp (1014);
3233 !!!parse-error (type => 'no refc');
3234 ## Reconsume.
3235 #
3236 }
3237
3238 my $code = $self->{kwd};
3239 my $l = $self->{line_prev};
3240 my $c = $self->{column_prev};
3241 if ((not $self->{is_xml} and $charref_map->{$code}) or
3242 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3243 ($self->{is_xml} and $code == 0x0000)) {
3244 !!!cp (1015);
3245 !!!parse-error (type => 'invalid character reference',
3246 text => (sprintf 'U+%04X', $code),
3247 line => $l, column => $c);
3248 $code = $charref_map->{$code};
3249 } elsif ($code > 0x10FFFF) {
3250 !!!cp (1016);
3251 !!!parse-error (type => 'invalid character reference',
3252 text => (sprintf 'U-%08X', $code),
3253 line => $l, column => $c);
3254 $code = 0xFFFD;
3255 }
3256
3257 if ($self->{prev_state} == DATA_STATE) {
3258 !!!cp (992);
3259 $self->{state} = $self->{prev_state};
3260 $self->{s_kwd} = '';
3261 ## Reconsume.
3262 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3263 has_reference => 1,
3264 line => $l, column => $c,
3265 });
3266 redo A;
3267 } else {
3268 !!!cp (991);
3269 $self->{ca}->{value} .= chr $code;
3270 $self->{ca}->{has_reference} = 1;
3271 $self->{state} = $self->{prev_state};
3272 $self->{s_kwd} = '';
3273 ## Reconsume.
3274 redo A;
3275 }
3276 } elsif ($self->{state} == HEXREF_X_STATE) {
3277 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3278 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3279 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3280 # 0..9, A..F, a..f
3281 !!!cp (990);
3282 $self->{state} = HEXREF_HEX_STATE;
3283 $self->{kwd} = 0;
3284 ## Reconsume.
3285 redo A;
3286 } else {
3287 !!!parse-error (type => 'bare hcro',
3288 line => $self->{line_prev},
3289 column => $self->{column_prev} - 2);
3290
3291 ## NOTE: According to the spec algorithm, nothing is returned,
3292 ## and then "&#" followed by "X" or "x" is appended to the parent
3293 ## element or the attribute value in the later processing.
3294
3295 if ($self->{prev_state} == DATA_STATE) {
3296 !!!cp (1005);
3297 $self->{state} = $self->{prev_state};
3298 $self->{s_kwd} = '';
3299 ## Reconsume.
3300 !!!emit ({type => CHARACTER_TOKEN,
3301 data => '&' . $self->{kwd},
3302 line => $self->{line_prev},
3303 column => $self->{column_prev} - length $self->{kwd},
3304 });
3305 redo A;
3306 } else {
3307 !!!cp (989);
3308 $self->{ca}->{value} .= '&' . $self->{kwd};
3309 $self->{state} = $self->{prev_state};
3310 $self->{s_kwd} = '';
3311 ## Reconsume.
3312 redo A;
3313 }
3314 }
3315 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3316 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3317 # 0..9
3318 !!!cp (1002);
3319 $self->{kwd} *= 0x10;
3320 $self->{kwd} += $self->{nc} - 0x0030;
3321 ## Stay in the state.
3322 !!!next-input-character;
3323 redo A;
3324 } elsif (0x0061 <= $self->{nc} and
3325 $self->{nc} <= 0x0066) { # a..f
3326 !!!cp (1003);
3327 $self->{kwd} *= 0x10;
3328 $self->{kwd} += $self->{nc} - 0x0060 + 9;
3329 ## Stay in the state.
3330 !!!next-input-character;
3331 redo A;
3332 } elsif (0x0041 <= $self->{nc} and
3333 $self->{nc} <= 0x0046) { # A..F
3334 !!!cp (1004);
3335 $self->{kwd} *= 0x10;
3336 $self->{kwd} += $self->{nc} - 0x0040 + 9;
3337 ## Stay in the state.
3338 !!!next-input-character;
3339 redo A;
3340 } elsif ($self->{nc} == 0x003B) { # ;
3341 !!!cp (1006);
3342 !!!next-input-character;
3343 #
3344 } else {
3345 !!!cp (1007);
3346 !!!parse-error (type => 'no refc',
3347 line => $self->{line},
3348 column => $self->{column});
3349 ## Reconsume.
3350 #
3351 }
3352
3353 my $code = $self->{kwd};
3354 my $l = $self->{line_prev};
3355 my $c = $self->{column_prev};
3356 if ((not $self->{is_xml} and $charref_map->{$code}) or
3357 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
3358 ($self->{is_xml} and $code == 0x0000)) {
3359 !!!cp (1008);
3360 !!!parse-error (type => 'invalid character reference',
3361 text => (sprintf 'U+%04X', $code),
3362 line => $l, column => $c);
3363 $code = $charref_map->{$code};
3364 } elsif ($code > 0x10FFFF) {
3365 !!!cp (1009);
3366 !!!parse-error (type => 'invalid character reference',
3367 text => (sprintf 'U-%08X', $code),
3368 line => $l, column => $c);
3369 $code = 0xFFFD;
3370 }
3371
3372 if ($self->{prev_state} == DATA_STATE) {
3373 !!!cp (988);
3374 $self->{state} = $self->{prev_state};
3375 $self->{s_kwd} = '';
3376 ## Reconsume.
3377 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3378 has_reference => 1,
3379 line => $l, column => $c,
3380 });
3381 redo A;
3382 } else {
3383 !!!cp (987);
3384 $self->{ca}->{value} .= chr $code;
3385 $self->{ca}->{has_reference} = 1;
3386 $self->{state} = $self->{prev_state};
3387 $self->{s_kwd} = '';
3388 ## Reconsume.
3389 redo A;
3390 }
3391 } elsif ($self->{state} == ENTITY_NAME_STATE) {
3392 if ((0x0041 <= $self->{nc} and # A
3393 $self->{nc} <= 0x005A) or # Z
3394 (0x0061 <= $self->{nc} and # a
3395 $self->{nc} <= 0x007A) or # z
3396 (0x0030 <= $self->{nc} and # 0
3397 $self->{nc} <= 0x0039) or # 9
3398 $self->{nc} == 0x003B or # ;
3399 ($self->{is_xml} and
3400 not ($is_space->{$self->{nc}} or
3401 {
3402 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &, EOF
3403 $self->{entity_add} => 1,
3404 }->{$self->{nc}}))) {
3405 our $EntityChar;
3406 $self->{kwd} .= chr $self->{nc};
3407 if (defined $EntityChar->{$self->{kwd}} or
3408 $self->{ge}->{$self->{kwd}}) {
3409 if ($self->{nc} == 0x003B) { # ;
3410 if (defined $self->{ge}->{$self->{kwd}}) {
3411 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
3412 !!!cp (1020.1);
3413 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
3414 } else {
3415 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
3416 !!!cp (1020.2);
3417 !!!parse-error (type => 'unparsed entity', ## TODO: type
3418 value => $self->{kwd});
3419 } else {
3420 !!!cp (1020.3);
3421 }
3422 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
3423 }
3424 } else {
3425 if ($self->{is_xml}) {
3426 !!!cp (1020.4);
3427 !!!parse-error (type => 'entity not declared', ## TODO: type
3428 value => $self->{kwd},
3429 level => {
3430 'amp;' => $self->{level}->{warn},
3431 'quot;' => $self->{level}->{warn},
3432 'lt;' => $self->{level}->{warn},
3433 'gt;' => $self->{level}->{warn},
3434 'apos;' => $self->{level}->{warn},
3435 }->{$self->{kwd}} ||
3436 $self->{level}->{must});
3437 } else {
3438 !!!cp (1020);
3439 }
3440 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3441 }
3442 $self->{entity__match} = 1;
3443 !!!next-input-character;
3444 #
3445 } else {
3446 !!!cp (1021);
3447 $self->{entity__value} = $EntityChar->{$self->{kwd}};
3448 $self->{entity__match} = -1;
3449 ## Stay in the state.
3450 !!!next-input-character;
3451 redo A;
3452 }
3453 } else {
3454 !!!cp (1022);
3455 $self->{entity__value} .= chr $self->{nc};
3456 $self->{entity__match} *= 2;
3457 ## Stay in the state.
3458 !!!next-input-character;
3459 redo A;
3460 }
3461 }
3462
3463 my $data;
3464 my $has_ref;
3465 if ($self->{entity__match} > 0) {
3466 !!!cp (1023);
3467 $data = $self->{entity__value};
3468 $has_ref = 1;
3469 #
3470 } elsif ($self->{entity__match} < 0) {
3471 !!!parse-error (type => 'no refc');
3472 if ($self->{prev_state} != DATA_STATE and # in attribute
3473 $self->{entity__match} < -1) {
3474 !!!cp (1024);
3475 $data = '&' . $self->{kwd};
3476 #
3477 } else {
3478 !!!cp (1025);
3479 $data = $self->{entity__value};
3480 $has_ref = 1;
3481 #
3482 }
3483 } else {
3484 !!!cp (1026);
3485 !!!parse-error (type => 'bare ero',
3486 line => $self->{line_prev},
3487 column => $self->{column_prev} - length $self->{kwd});
3488 $data = '&' . $self->{kwd};
3489 #
3490 }
3491
3492 ## NOTE: In these cases, when a character reference is found,
3493 ## it is consumed and a character token is returned, or, otherwise,
3494 ## nothing is consumed and returned, according to the spec algorithm.
3495 ## In this implementation, anything that has been examined by the
3496 ## tokenizer is appended to the parent element or the attribute value
3497 ## as string, either literal string when no character reference or
3498 ## entity-replaced string otherwise, in this stage, since any characters
3499 ## that would not be consumed are appended in the data state or in an
3500 ## appropriate attribute value state anyway.
3501
3502 if ($self->{prev_state} == DATA_STATE) {
3503 !!!cp (986);
3504 $self->{state} = $self->{prev_state};
3505 $self->{s_kwd} = '';
3506 ## Reconsume.
3507 !!!emit ({type => CHARACTER_TOKEN,
3508 data => $data,
3509 has_reference => $has_ref,
3510 line => $self->{line_prev},
3511 column => $self->{column_prev} + 1 - length $self->{kwd},
3512 });
3513 redo A;
3514 } else {
3515 !!!cp (985);
3516 $self->{ca}->{value} .= $data;
3517 $self->{ca}->{has_reference} = 1 if $has_ref;
3518 $self->{state} = $self->{prev_state};
3519 $self->{s_kwd} = '';
3520 ## Reconsume.
3521 redo A;
3522 }
3523
3524 ## XML-only states
3525
3526 } elsif ($self->{state} == PI_STATE) {
3527 ## XML5: "Pi state" and "DOCTYPE pi state".
3528
3529 if ($is_space->{$self->{nc}} or
3530 $self->{nc} == 0x003F or # ?
3531 $self->{nc} == -1) {
3532 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3533 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
3534 ## "DOCTYPE pi state": Parse error, switch to the "data
3535 ## state".
3536 !!!parse-error (type => 'bare pio', ## TODO: type
3537 line => $self->{line_prev},
3538 column => $self->{column_prev}
3539 - 1 * ($self->{nc} != -1));
3540 $self->{state} = BOGUS_COMMENT_STATE;
3541 ## Reconsume.
3542 $self->{ct} = {type => COMMENT_TOKEN,
3543 data => '?',
3544 line => $self->{line_prev},
3545 column => $self->{column_prev}
3546 - 1 * ($self->{nc} != -1),
3547 };
3548 redo A;
3549 } else {
3550 ## XML5: "DOCTYPE pi state": Stay in the state.
3551 $self->{ct} = {type => PI_TOKEN,
3552 target => chr $self->{nc},
3553 data => '',
3554 line => $self->{line_prev},
3555 column => $self->{column_prev} - 1,
3556 };
3557 $self->{state} = PI_TARGET_STATE;
3558 !!!next-input-character;
3559 redo A;
3560 }
3561 } elsif ($self->{state} == PI_TARGET_STATE) {
3562 if ($is_space->{$self->{nc}}) {
3563 $self->{state} = PI_TARGET_AFTER_STATE;
3564 !!!next-input-character;
3565 redo A;
3566 } elsif ($self->{nc} == -1) {
3567 !!!parse-error (type => 'no pic'); ## TODO: type
3568 if ($self->{in_subset}) {
3569 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3570 } else {
3571 $self->{state} = DATA_STATE;
3572 $self->{s_kwd} = '';
3573 }
3574 ## Reconsume.
3575 !!!emit ($self->{ct}); # pi
3576 redo A;
3577 } elsif ($self->{nc} == 0x003F) { # ?
3578 $self->{state} = PI_AFTER_STATE;
3579 !!!next-input-character;
3580 redo A;
3581 } else {
3582 ## XML5: typo ("tag name" -> "target")
3583 $self->{ct}->{target} .= chr $self->{nc}; # pi
3584 !!!next-input-character;
3585 redo A;
3586 }
3587 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3588 if ($is_space->{$self->{nc}}) {
3589 ## Stay in the state.
3590 !!!next-input-character;
3591 redo A;
3592 } else {
3593 $self->{state} = PI_DATA_STATE;
3594 ## Reprocess.
3595 redo A;
3596 }
3597 } elsif ($self->{state} == PI_DATA_STATE) {
3598 if ($self->{nc} == 0x003F) { # ?
3599 $self->{state} = PI_DATA_AFTER_STATE;
3600 !!!next-input-character;
3601 redo A;
3602 } elsif ($self->{nc} == -1) {
3603 !!!parse-error (type => 'no pic'); ## TODO: type
3604 if ($self->{in_subset}) {
3605 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3606 } else {
3607 $self->{state} = DATA_STATE;
3608 $self->{s_kwd} = '';
3609 }
3610 ## Reprocess.
3611 !!!emit ($self->{ct}); # pi
3612 redo A;
3613 } else {
3614 $self->{ct}->{data} .= chr $self->{nc}; # pi
3615 $self->{read_until}->($self->{ct}->{data}, q[?],
3616 length $self->{ct}->{data});
3617 ## Stay in the state.
3618 !!!next-input-character;
3619 ## Reprocess.
3620 redo A;
3621 }
3622 } elsif ($self->{state} == PI_AFTER_STATE) {
3623 ## XML5: Part of "Pi after state".
3624
3625 if ($self->{nc} == 0x003E) { # >
3626 if ($self->{in_subset}) {
3627 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3628 } else {
3629 $self->{state} = DATA_STATE;
3630 $self->{s_kwd} = '';
3631 }
3632 !!!next-input-character;
3633 !!!emit ($self->{ct}); # pi
3634 redo A;
3635 } elsif ($self->{nc} == 0x003F) { # ?
3636 !!!parse-error (type => 'no s after target', ## TODO: type
3637 line => $self->{line_prev},
3638 column => $self->{column_prev}); ## XML5: no error
3639 $self->{ct}->{data} .= '?';
3640 $self->{state} = PI_DATA_AFTER_STATE;
3641 !!!next-input-character;
3642 redo A;
3643 } else {
3644 !!!parse-error (type => 'no s after target', ## TODO: type
3645 line => $self->{line_prev},
3646 column => $self->{column_prev}
3647 + 1 * ($self->{nc} == -1)); ## XML5: no error
3648 $self->{ct}->{data} .= '?'; ## XML5: not appended
3649 $self->{state} = PI_DATA_STATE;
3650 ## Reprocess.
3651 redo A;
3652 }
3653 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3654 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3655
3656 if ($self->{nc} == 0x003E) { # >
3657 if ($self->{in_subset}) {
3658 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3659 } else {
3660 $self->{state} = DATA_STATE;
3661 $self->{s_kwd} = '';
3662 }
3663 !!!next-input-character;
3664 !!!emit ($self->{ct}); # pi
3665 redo A;
3666 } elsif ($self->{nc} == 0x003F) { # ?
3667 $self->{ct}->{data} .= '?';
3668 ## Stay in the state.
3669 !!!next-input-character;
3670 redo A;
3671 } else {
3672 $self->{ct}->{data} .= '?'; ## XML5: not appended
3673 $self->{state} = PI_DATA_STATE;
3674 ## Reprocess.
3675 redo A;
3676 }
3677
3678 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3679 if ($self->{nc} == 0x003C) { # <
3680 $self->{state} = DOCTYPE_TAG_STATE;
3681 !!!next-input-character;
3682 redo A;
3683 } elsif ($self->{nc} == 0x0025) { # %
3684 ## XML5: Not defined yet.
3685
3686 ## TODO:
3687
3688 if (not $self->{stop_processing} and
3689 not $self->{document}->xml_standalone) {
3690 !!!parse-error (type => 'stop processing', ## TODO: type
3691 level => $self->{level}->{info});
3692 $self->{stop_processing} = 1;
3693 }
3694
3695 !!!next-input-character;
3696 redo A;
3697 } elsif ($self->{nc} == 0x005D) { # ]
3698 delete $self->{in_subset};
3699 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3700 !!!next-input-character;
3701 redo A;
3702 } elsif ($is_space->{$self->{nc}}) {
3703 ## Stay in the state.
3704 !!!next-input-character;
3705 redo A;
3706 } elsif ($self->{nc} == -1) {
3707 !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3708 delete $self->{in_subset};
3709 $self->{state} = DATA_STATE;
3710 $self->{s_kwd} = '';
3711 ## Reconsume.
3712 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3713 redo A;
3714 } else {
3715 unless ($self->{internal_subset_tainted}) {
3716 ## XML5: No parse error.
3717 !!!parse-error (type => 'string in internal subset');
3718 $self->{internal_subset_tainted} = 1;
3719 }
3720 ## Stay in the state.
3721 !!!next-input-character;
3722 redo A;
3723 }
3724 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3725 if ($self->{nc} == 0x003E) { # >
3726 $self->{state} = DATA_STATE;
3727 $self->{s_kwd} = '';
3728 !!!next-input-character;
3729 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3730 redo A;
3731 } elsif ($self->{nc} == -1) {
3732 !!!parse-error (type => 'unclosed DOCTYPE');
3733 $self->{state} = DATA_STATE;
3734 $self->{s_kwd} = '';
3735 ## Reconsume.
3736 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3737 redo A;
3738 } else {
3739 ## XML5: No parse error and stay in the state.
3740 !!!parse-error (type => 'string after internal subset'); ## TODO: type
3741
3742 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3743 !!!next-input-character;
3744 redo A;
3745 }
3746 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3747 if ($self->{nc} == 0x003E) { # >
3748 $self->{state} = DATA_STATE;
3749 $self->{s_kwd} = '';
3750 !!!next-input-character;
3751 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3752 redo A;
3753 } elsif ($self->{nc} == -1) {
3754 $self->{state} = DATA_STATE;
3755 $self->{s_kwd} = '';
3756 ## Reconsume.
3757 !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3758 redo A;
3759 } else {
3760 ## Stay in the state.
3761 !!!next-input-character;
3762 redo A;
3763 }
3764 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3765 if ($self->{nc} == 0x0021) { # !
3766 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3767 !!!next-input-character;
3768 redo A;
3769 } elsif ($self->{nc} == 0x003F) { # ?
3770 $self->{state} = PI_STATE;
3771 !!!next-input-character;
3772 redo A;
3773 } elsif ($self->{nc} == -1) {
3774 !!!parse-error (type => 'bare stago');
3775 $self->{state} = DATA_STATE;
3776 $self->{s_kwd} = '';
3777 ## Reconsume.
3778 redo A;
3779 } else {
3780 !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3781 line => $self->{line_prev},
3782 column => $self->{column_prev});
3783 $self->{state} = BOGUS_COMMENT_STATE;
3784 $self->{ct} = {type => COMMENT_TOKEN,
3785 data => '',
3786 }; ## NOTE: Will be discarded.
3787 !!!next-input-character;
3788 redo A;
3789 }
3790 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3791 ## XML5: "DOCTYPE markup declaration state".
3792
3793 if ($self->{nc} == 0x002D) { # -
3794 $self->{state} = MD_HYPHEN_STATE;
3795 !!!next-input-character;
3796 redo A;
3797 } elsif ($self->{nc} == 0x0045 or # E
3798 $self->{nc} == 0x0065) { # e
3799 $self->{state} = MD_E_STATE;
3800 $self->{kwd} = chr $self->{nc};
3801 !!!next-input-character;
3802 redo A;
3803 } elsif ($self->{nc} == 0x0041 or # A
3804 $self->{nc} == 0x0061) { # a
3805 $self->{state} = MD_ATTLIST_STATE;
3806 $self->{kwd} = chr $self->{nc};
3807 !!!next-input-character;
3808 redo A;
3809 } elsif ($self->{nc} == 0x004E or # N
3810 $self->{nc} == 0x006E) { # n
3811 $self->{state} = MD_NOTATION_STATE;
3812 $self->{kwd} = chr $self->{nc};
3813 !!!next-input-character;
3814 redo A;
3815 } else {
3816 #
3817 }
3818
3819 ## XML5: No parse error.
3820 !!!parse-error (type => 'bogus comment',
3821 line => $self->{line_prev},
3822 column => $self->{column_prev} - 1);
3823 ## Reconsume.
3824 $self->{state} = BOGUS_COMMENT_STATE;
3825 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3826 redo A;
3827 } elsif ($self->{state} == MD_E_STATE) {
3828 if ($self->{nc} == 0x004E or # N
3829 $self->{nc} == 0x006E) { # n
3830 $self->{state} = MD_ENTITY_STATE;
3831 $self->{kwd} .= chr $self->{nc};
3832 !!!next-input-character;
3833 redo A;
3834 } elsif ($self->{nc} == 0x004C or # L
3835 $self->{nc} == 0x006C) { # l
3836 ## XML5: <!ELEMENT> not supported.
3837 $self->{state} = MD_ELEMENT_STATE;
3838 $self->{kwd} .= chr $self->{nc};
3839 !!!next-input-character;
3840 redo A;
3841 } else {
3842 ## XML5: No parse error.
3843 !!!parse-error (type => 'bogus comment',
3844 line => $self->{line_prev},
3845 column => $self->{column_prev} - 2
3846 + 1 * ($self->{nc} == -1));
3847 ## Reconsume.
3848 $self->{state} = BOGUS_COMMENT_STATE;
3849 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3850 redo A;
3851 }
3852 } elsif ($self->{state} == MD_ENTITY_STATE) {
3853 if ($self->{nc} == [
3854 undef,
3855 undef,
3856 0x0054, # T
3857 0x0049, # I
3858 0x0054, # T
3859 ]->[length $self->{kwd}] or
3860 $self->{nc} == [
3861 undef,
3862 undef,
3863 0x0074, # t
3864 0x0069, # i
3865 0x0074, # t
3866 ]->[length $self->{kwd}]) {
3867 ## Stay in the state.
3868 $self->{kwd} .= chr $self->{nc};
3869 !!!next-input-character;
3870 redo A;
3871 } elsif ((length $self->{kwd}) == 5 and
3872 ($self->{nc} == 0x0059 or # Y
3873 $self->{nc} == 0x0079)) { # y
3874 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
3875 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3876 text => 'ENTITY',
3877 line => $self->{line_prev},
3878 column => $self->{column_prev} - 4);
3879 }
3880 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
3881 line => $self->{line_prev},
3882 column => $self->{column_prev} - 6};
3883 $self->{state} = DOCTYPE_MD_STATE;
3884 !!!next-input-character;
3885 redo A;
3886 } else {
3887 !!!parse-error (type => 'bogus comment',
3888 line => $self->{line_prev},
3889 column => $self->{column_prev} - 1
3890 - (length $self->{kwd})
3891 + 1 * ($self->{nc} == -1));
3892 $self->{state} = BOGUS_COMMENT_STATE;
3893 ## Reconsume.
3894 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3895 redo A;
3896 }
3897 } elsif ($self->{state} == MD_ELEMENT_STATE) {
3898 if ($self->{nc} == [
3899 undef,
3900 undef,
3901 0x0045, # E
3902 0x004D, # M
3903 0x0045, # E
3904 0x004E, # N
3905 ]->[length $self->{kwd}] or
3906 $self->{nc} == [
3907 undef,
3908 undef,
3909 0x0065, # e
3910 0x006D, # m
3911 0x0065, # e
3912 0x006E, # n
3913 ]->[length $self->{kwd}]) {
3914 ## Stay in the state.
3915 $self->{kwd} .= chr $self->{nc};
3916 !!!next-input-character;
3917 redo A;
3918 } elsif ((length $self->{kwd}) == 6 and
3919 ($self->{nc} == 0x0054 or # T
3920 $self->{nc} == 0x0074)) { # t
3921 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
3922 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3923 text => 'ELEMENT',
3924 line => $self->{line_prev},
3925 column => $self->{column_prev} - 5);
3926 }
3927 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3928 line => $self->{line_prev},
3929 column => $self->{column_prev} - 7};
3930 $self->{state} = DOCTYPE_MD_STATE;
3931 !!!next-input-character;
3932 redo A;
3933 } else {
3934 !!!parse-error (type => 'bogus comment',
3935 line => $self->{line_prev},
3936 column => $self->{column_prev} - 1
3937 - (length $self->{kwd})
3938 + 1 * ($self->{nc} == -1));
3939 $self->{state} = BOGUS_COMMENT_STATE;
3940 ## Reconsume.
3941 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3942 redo A;
3943 }
3944 } elsif ($self->{state} == MD_ATTLIST_STATE) {
3945 if ($self->{nc} == [
3946 undef,
3947 0x0054, # T
3948 0x0054, # T
3949 0x004C, # L
3950 0x0049, # I
3951 0x0053, # S
3952 ]->[length $self->{kwd}] or
3953 $self->{nc} == [
3954 undef,
3955 0x0074, # t
3956 0x0074, # t
3957 0x006C, # l
3958 0x0069, # i
3959 0x0073, # s
3960 ]->[length $self->{kwd}]) {
3961 ## Stay in the state.
3962 $self->{kwd} .= chr $self->{nc};
3963 !!!next-input-character;
3964 redo A;
3965 } elsif ((length $self->{kwd}) == 6 and
3966 ($self->{nc} == 0x0054 or # T
3967 $self->{nc} == 0x0074)) { # t
3968 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
3969 !!!parse-error (type => 'lowercase keyword', ## TODO: type
3970 text => 'ATTLIST',
3971 line => $self->{line_prev},
3972 column => $self->{column_prev} - 5);
3973 }
3974 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3975 attrdefs => [],
3976 line => $self->{line_prev},
3977 column => $self->{column_prev} - 7};
3978 $self->{state} = DOCTYPE_MD_STATE;
3979 !!!next-input-character;
3980 redo A;
3981 } else {
3982 !!!parse-error (type => 'bogus comment',
3983 line => $self->{line_prev},
3984 column => $self->{column_prev} - 1
3985 - (length $self->{kwd})
3986 + 1 * ($self->{nc} == -1));
3987 $self->{state} = BOGUS_COMMENT_STATE;
3988 ## Reconsume.
3989 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3990 redo A;
3991 }
3992 } elsif ($self->{state} == MD_NOTATION_STATE) {
3993 if ($self->{nc} == [
3994 undef,
3995 0x004F, # O
3996 0x0054, # T
3997 0x0041, # A
3998 0x0054, # T
3999 0x0049, # I
4000 0x004F, # O
4001 ]->[length $self->{kwd}] or
4002 $self->{nc} == [
4003 undef,
4004 0x006F, # o
4005 0x0074, # t
4006 0x0061, # a
4007 0x0074, # t
4008 0x0069, # i
4009 0x006F, # o
4010 ]->[length $self->{kwd}]) {
4011 ## Stay in the state.
4012 $self->{kwd} .= chr $self->{nc};
4013 !!!next-input-character;
4014 redo A;
4015 } elsif ((length $self->{kwd}) == 7 and
4016 ($self->{nc} == 0x004E or # N
4017 $self->{nc} == 0x006E)) { # n
4018 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
4019 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4020 text => 'NOTATION',
4021 line => $self->{line_prev},
4022 column => $self->{column_prev} - 6);
4023 }
4024 $self->{ct} = {type => NOTATION_TOKEN, name => '',
4025 line => $self->{line_prev},
4026 column => $self->{column_prev} - 8};
4027 $self->{state} = DOCTYPE_MD_STATE;
4028 !!!next-input-character;
4029 redo A;
4030 } else {
4031 !!!parse-error (type => 'bogus comment',
4032 line => $self->{line_prev},
4033 column => $self->{column_prev} - 1
4034 - (length $self->{kwd})
4035 + 1 * ($self->{nc} == -1));
4036 $self->{state} = BOGUS_COMMENT_STATE;
4037 ## Reconsume.
4038 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4039 redo A;
4040 }
4041 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
4042 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
4043 ## "DOCTYPE NOTATION state".
4044
4045 if ($is_space->{$self->{nc}}) {
4046 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
4047 $self->{state} = BEFORE_MD_NAME_STATE;
4048 !!!next-input-character;
4049 redo A;
4050 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4051 $self->{nc} == 0x0025) { # %
4052 ## XML5: Switch to the "DOCTYPE bogus comment state".
4053 !!!parse-error (type => 'no space before md name'); ## TODO: type
4054 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4055 !!!next-input-character;
4056 redo A;
4057 } elsif ($self->{nc} == -1) {
4058 !!!parse-error (type => 'unclosed md'); ## TODO: type
4059 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4060 ## Reconsume.
4061 redo A;
4062 } elsif ($self->{nc} == 0x003E) { # >
4063 ## XML5: Switch to the "DOCTYPE bogus comment state".
4064 !!!parse-error (type => 'no md name'); ## TODO: type
4065 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4066 !!!next-input-character;
4067 redo A;
4068 } else {
4069 ## XML5: Switch to the "DOCTYPE bogus comment state".
4070 !!!parse-error (type => 'no space before md name'); ## TODO: type
4071 $self->{state} = BEFORE_MD_NAME_STATE;
4072 redo A;
4073 }
4074 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
4075 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
4076 ## before state", "DOCTYPE ATTLIST name before state".
4077
4078 if ($is_space->{$self->{nc}}) {
4079 ## Stay in the state.
4080 !!!next-input-character;
4081 redo A;
4082 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4083 $self->{nc} == 0x0025) { # %
4084 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
4085 !!!next-input-character;
4086 redo A;
4087 } elsif ($self->{nc} == 0x003E) { # >
4088 ## XML5: Same as "Anything else".
4089 !!!parse-error (type => 'no md name'); ## TODO: type
4090 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4091 !!!next-input-character;
4092 redo A;
4093 } elsif ($self->{nc} == -1) {
4094 !!!parse-error (type => 'unclosed md'); ## TODO: type
4095 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4096 ## Reconsume.
4097 redo A;
4098 } else {
4099 ## XML5: [ATTLIST] Not defined yet.
4100 $self->{ct}->{name} .= chr $self->{nc};
4101 $self->{state} = MD_NAME_STATE;
4102 !!!next-input-character;
4103 redo A;
4104 }
4105 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
4106 if ($is_space->{$self->{nc}}) {
4107 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
4108 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
4109 $self->{state} = BEFORE_MD_NAME_STATE;
4110 !!!next-input-character;
4111 redo A;
4112 } elsif ($self->{nc} == 0x003E) { # >
4113 ## XML5: Same as "Anything else".
4114 !!!parse-error (type => 'no md name'); ## TODO: type
4115 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4116 !!!next-input-character;
4117 redo A;
4118 } elsif ($self->{nc} == -1) {
4119 !!!parse-error (type => 'unclosed md');
4120 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4121 ## Reconsume.
4122 redo A;
4123 } else {
4124 ## XML5: No parse error.
4125 !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
4126 $self->{state} = BOGUS_COMMENT_STATE;
4127 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4128 ## Reconsume.
4129 redo A;
4130 }
4131 } elsif ($self->{state} == MD_NAME_STATE) {
4132 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
4133
4134 if ($is_space->{$self->{nc}}) {
4135 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4136 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4137 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
4138 $self->{state} = AFTER_ELEMENT_NAME_STATE;
4139 } else { # ENTITY/NOTATION
4140 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
4141 }
4142 !!!next-input-character;
4143 redo A;
4144 } elsif ($self->{nc} == 0x003E) { # >
4145 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
4146 #
4147 } else {
4148 !!!parse-error (type => 'no md def'); ## TODO: type
4149 }
4150 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4151 !!!next-input-character;
4152 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4153 redo A;
4154 } elsif ($self->{nc} == -1) {
4155 ## XML5: [ATTLIST] No parse error.
4156 !!!parse-error (type => 'unclosed md');
4157 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4158 ## Reconsume.
4159 !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
4160 redo A;
4161 } else {
4162 ## XML5: [ATTLIST] Not defined yet.
4163 $self->{ct}->{name} .= chr $self->{nc};
4164 ## Stay in the state.
4165 !!!next-input-character;
4166 redo A;
4167 }
4168 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
4169 if ($is_space->{$self->{nc}}) {
4170 ## Stay in the state.
4171 !!!next-input-character;
4172 redo A;
4173 } elsif ($self->{nc} == 0x003E) { # >
4174 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4175 !!!next-input-character;
4176 !!!emit ($self->{ct}); # ATTLIST
4177 redo A;
4178 } elsif ($self->{nc} == -1) {
4179 ## XML5: No parse error.
4180 !!!parse-error (type => 'unclosed md'); ## TODO: type
4181 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4182 !!!emit ($self->{ct});
4183 redo A;
4184 } else {
4185 ## XML5: Not defined yet.
4186 $self->{ca} = {name => chr ($self->{nc}), # attrdef
4187 tokens => [],
4188 line => $self->{line}, column => $self->{column}};
4189 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
4190 !!!next-input-character;
4191 redo A;
4192 }
4193 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
4194 if ($is_space->{$self->{nc}}) {
4195 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
4196 !!!next-input-character;
4197 redo A;
4198 } elsif ($self->{nc} == 0x003E) { # >
4199 ## XML5: Same as "anything else".
4200 !!!parse-error (type => 'no attr type'); ## TODO: type
4201 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4202 !!!next-input-character;
4203 !!!emit ($self->{ct}); # ATTLIST
4204 redo A;
4205 } elsif ($self->{nc} == 0x0028) { # (
4206 ## XML5: Same as "anything else".
4207 !!!parse-error (type => 'no space before paren'); ## TODO: type
4208 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4209 !!!next-input-character;
4210 redo A;
4211 } elsif ($self->{nc} == -1) {
4212 ## XML5: No parse error.
4213 !!!parse-error (type => 'unclosed md'); ## TODO: type
4214 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4215 !!!next-input-character;
4216 !!!emit ($self->{ct}); # ATTLIST
4217 redo A;
4218 } else {
4219 ## XML5: Not defined yet.
4220 $self->{ca}->{name} .= chr $self->{nc};
4221 ## Stay in the state.
4222 !!!next-input-character;
4223 redo A;
4224 }
4225 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
4226 if ($is_space->{$self->{nc}}) {
4227 ## Stay in the state.
4228 !!!next-input-character;
4229 redo A;
4230 } elsif ($self->{nc} == 0x003E) { # >
4231 ## XML5: Same as "anything else".
4232 !!!parse-error (type => 'no attr type'); ## TODO: type
4233 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4234 !!!next-input-character;
4235 !!!emit ($self->{ct}); # ATTLIST
4236 redo A;
4237 } elsif ($self->{nc} == 0x0028) { # (
4238 ## XML5: Same as "anything else".
4239 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4240 !!!next-input-character;
4241 redo A;
4242 } elsif ($self->{nc} == -1) {
4243 ## XML5: No parse error.
4244 !!!parse-error (type => 'unclosed md'); ## TODO: type
4245 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4246 !!!next-input-character;
4247 !!!emit ($self->{ct});
4248 redo A;
4249 } else {
4250 ## XML5: Not defined yet.
4251 $self->{ca}->{type} = chr $self->{nc};
4252 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
4253 !!!next-input-character;
4254 redo A;
4255 }
4256 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
4257 if ($is_space->{$self->{nc}}) {
4258 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
4259 !!!next-input-character;
4260 redo A;
4261 } elsif ($self->{nc} == 0x0023) { # #
4262 ## XML5: Same as "anything else".
4263 !!!parse-error (type => 'no space before default value'); ## TODO: type
4264 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4265 !!!next-input-character;
4266 redo A;
4267 } elsif ($self->{nc} == 0x0022) { # "
4268 ## XML5: Same as "anything else".
4269 !!!parse-error (type => 'no space before default value'); ## TODO: type
4270 $self->{ca}->{value} = '';
4271 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4272 !!!next-input-character;
4273 redo A;
4274 } elsif ($self->{nc} == 0x0027) { # '
4275 ## XML5: Same as "anything else".
4276 !!!parse-error (type => 'no space before default value'); ## TODO: type
4277 $self->{ca}->{value} = '';
4278 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4279 !!!next-input-character;
4280 redo A;
4281 } elsif ($self->{nc} == 0x003E) { # >
4282 ## XML5: Same as "anything else".
4283 !!!parse-error (type => 'no attr default'); ## TODO: type
4284 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4285 !!!next-input-character;
4286 !!!emit ($self->{ct}); # ATTLIST
4287 redo A;
4288 } elsif ($self->{nc} == 0x0028) { # (
4289 ## XML5: Same as "anything else".
4290 !!!parse-error (type => 'no space before paren'); ## TODO: type
4291 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4292 !!!next-input-character;
4293 redo A;
4294 } elsif ($self->{nc} == -1) {
4295 ## XML5: No parse error.
4296 !!!parse-error (type => 'unclosed md'); ## TODO: type
4297 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4298 !!!next-input-character;
4299 !!!emit ($self->{ct});
4300 redo A;
4301 } else {
4302 ## XML5: Not defined yet.
4303 $self->{ca}->{type} .= chr $self->{nc};
4304 ## Stay in the state.
4305 !!!next-input-character;
4306 redo A;
4307 }
4308 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
4309 if ($is_space->{$self->{nc}}) {
4310 ## Stay in the state.
4311 !!!next-input-character;
4312 redo A;
4313 } elsif ($self->{nc} == 0x0028) { # (
4314 ## XML5: Same as "anything else".
4315 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4316 !!!next-input-character;
4317 redo A;
4318 } elsif ($self->{nc} == 0x0023) { # #
4319 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4320 !!!next-input-character;
4321 redo A;
4322 } elsif ($self->{nc} == 0x0022) { # "
4323 ## XML5: Same as "anything else".
4324 $self->{ca}->{value} = '';
4325 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4326 !!!next-input-character;
4327 redo A;
4328 } elsif ($self->{nc} == 0x0027) { # '
4329 ## XML5: Same as "anything else".
4330 $self->{ca}->{value} = '';
4331 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4332 !!!next-input-character;
4333 redo A;
4334 } elsif ($self->{nc} == 0x003E) { # >
4335 ## XML5: Same as "anything else".
4336 !!!parse-error (type => 'no attr default'); ## TODO: type
4337 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4338 !!!next-input-character;
4339 !!!emit ($self->{ct}); # ATTLIST
4340 redo A;
4341 } elsif ($self->{nc} == -1) {
4342 ## XML5: No parse error.
4343 !!!parse-error (type => 'unclosed md'); ## TODO: type
4344 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4345 !!!next-input-character;
4346 !!!emit ($self->{ct});
4347 redo A;
4348 } else {
4349 ## XML5: Switch to the "DOCTYPE bogus comment state".
4350 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4351 $self->{ca}->{value} = '';
4352 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4353 ## Reconsume.
4354 redo A;
4355 }
4356 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
4357 if ($is_space->{$self->{nc}}) {
4358 ## Stay in the state.
4359 !!!next-input-character;
4360 redo A;
4361 } elsif ($self->{nc} == 0x007C) { # |
4362 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4363 ## Stay in the state.
4364 !!!next-input-character;
4365 redo A;
4366 } elsif ($self->{nc} == 0x0029) { # )
4367 !!!parse-error (type => 'empty allowed token'); ## TODO: type
4368 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4369 !!!next-input-character;
4370 redo A;
4371 } elsif ($self->{nc} == 0x003E) { # >
4372 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4373 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4374 !!!next-input-character;
4375 !!!emit ($self->{ct}); # ATTLIST
4376 redo A;
4377 } elsif ($self->{nc} == -1) {
4378 ## XML5: No parse error.
4379 !!!parse-error (type => 'unclosed md'); ## TODO: type
4380 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4381 !!!next-input-character;
4382 !!!emit ($self->{ct});
4383 redo A;
4384 } else {
4385 push @{$self->{ca}->{tokens}}, chr $self->{nc};
4386 $self->{state} = ALLOWED_TOKEN_STATE;
4387 !!!next-input-character;
4388 redo A;
4389 }
4390 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4391 if ($is_space->{$self->{nc}}) {
4392 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4393 !!!next-input-character;
4394 redo A;
4395 } elsif ($self->{nc} == 0x007C) { # |
4396 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4397 !!!next-input-character;
4398 redo A;
4399 } elsif ($self->{nc} == 0x0029) { # )
4400 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4401 !!!next-input-character;
4402 redo A;
4403 } elsif ($self->{nc} == 0x003E) { # >
4404 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4405 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4406 !!!next-input-character;
4407 !!!emit ($self->{ct}); # ATTLIST
4408 redo A;
4409 } elsif ($self->{nc} == -1) {
4410 ## XML5: No parse error.
4411 !!!parse-error (type => 'unclosed md'); ## TODO: type
4412 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4413 !!!next-input-character;
4414 !!!emit ($self->{ct});
4415 redo A;
4416 } else {
4417 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4418 ## Stay in the state.
4419 !!!next-input-character;
4420 redo A;
4421 }
4422 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4423 if ($is_space->{$self->{nc}}) {
4424 ## Stay in the state.
4425 !!!next-input-character;
4426 redo A;
4427 } elsif ($self->{nc} == 0x007C) { # |
4428 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4429 !!!next-input-character;
4430 redo A;
4431 } elsif ($self->{nc} == 0x0029) { # )
4432 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4433 !!!next-input-character;
4434 redo A;
4435 } elsif ($self->{nc} == 0x003E) { # >
4436 !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4437 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4438 !!!next-input-character;
4439 !!!emit ($self->{ct}); # ATTLIST
4440 redo A;
4441 } elsif ($self->{nc} == -1) {
4442 ## XML5: No parse error.
4443 !!!parse-error (type => 'unclosed md'); ## TODO: type
4444 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4445 !!!next-input-character;
4446 !!!emit ($self->{ct});
4447 redo A;
4448 } else {
4449 !!!parse-error (type => 'space in allowed token', ## TODO: type
4450 line => $self->{line_prev},
4451 column => $self->{column_prev});
4452 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4453 $self->{state} = ALLOWED_TOKEN_STATE;
4454 !!!next-input-character;
4455 redo A;
4456 }
4457 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4458 if ($is_space->{$self->{nc}}) {
4459 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4460 !!!next-input-character;
4461 redo A;
4462 } elsif ($self->{nc} == 0x0023) { # #
4463 !!!parse-error (type => 'no space before default value'); ## TODO: type
4464 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4465 !!!next-input-character;
4466 redo A;
4467 } elsif ($self->{nc} == 0x0022) { # "
4468 !!!parse-error (type => 'no space before default value'); ## TODO: type
4469 $self->{ca}->{value} = '';
4470 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4471 !!!next-input-character;
4472 redo A;
4473 } elsif ($self->{nc} == 0x0027) { # '
4474 !!!parse-error (type => 'no space before default value'); ## TODO: type
4475 $self->{ca}->{value} = '';
4476 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4477 !!!next-input-character;
4478 redo A;
4479 } elsif ($self->{nc} == 0x003E) { # >
4480 !!!parse-error (type => 'no attr default'); ## TODO: type
4481 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4482 !!!next-input-character;
4483 !!!emit ($self->{ct}); # ATTLIST
4484 redo A;
4485 } elsif ($self->{nc} == -1) {
4486 !!!parse-error (type => 'unclosed md'); ## TODO: type
4487 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4488 !!!next-input-character;
4489 !!!emit ($self->{ct});
4490 redo A;
4491 } else {
4492 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4493 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4494 ## Reconsume.
4495 redo A;
4496 }
4497 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4498 if ($is_space->{$self->{nc}}) {
4499 ## Stay in the state.
4500 !!!next-input-character;
4501 redo A;
4502 } elsif ($self->{nc} == 0x0023) { # #
4503 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4504 !!!next-input-character;
4505 redo A;
4506 } elsif ($self->{nc} == 0x0022) { # "
4507 $self->{ca}->{value} = '';
4508 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4509 !!!next-input-character;
4510 redo A;
4511 } elsif ($self->{nc} == 0x0027) { # '
4512 $self->{ca}->{value} = '';
4513 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4514 !!!next-input-character;
4515 redo A;
4516 } elsif ($self->{nc} == 0x003E) { # >
4517 !!!parse-error (type => 'no attr default'); ## TODO: type
4518 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4519 !!!next-input-character;
4520 !!!emit ($self->{ct}); # ATTLIST
4521 redo A;
4522 } elsif ($self->{nc} == -1) {
4523 !!!parse-error (type => 'unclosed md'); ## TODO: type
4524 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4525 !!!next-input-character;
4526 !!!emit ($self->{ct});
4527 redo A;
4528 } else {
4529 !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4530 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4531 ## Reconsume.
4532 redo A;
4533 }
4534 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4535 if ($is_space->{$self->{nc}}) {
4536 ## XML5: No parse error.
4537 !!!parse-error (type => 'no default type'); ## TODO: type
4538 $self->{state} = BOGUS_MD_STATE;
4539 ## Reconsume.
4540 redo A;
4541 } elsif ($self->{nc} == 0x0022) { # "
4542 ## XML5: Same as "anything else".
4543 $self->{ca}->{value} = '';
4544 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4545 !!!next-input-character;
4546 redo A;
4547 } elsif ($self->{nc} == 0x0027) { # '
4548 ## XML5: Same as "anything else".
4549 $self->{ca}->{value} = '';
4550 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4551 !!!next-input-character;
4552 redo A;
4553 } elsif ($self->{nc} == 0x003E) { # >
4554 ## XML5: Same as "anything else".
4555 !!!parse-error (type => 'no attr default'); ## TODO: type
4556 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4557 !!!next-input-character;
4558 !!!emit ($self->{ct}); # ATTLIST
4559 redo A;
4560 } elsif ($self->{nc} == -1) {
4561 ## XML5: No parse error.
4562 !!!parse-error (type => 'unclosed md'); ## TODO: type
4563 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4564 !!!next-input-character;
4565 !!!emit ($self->{ct});
4566 redo A;
4567 } else {
4568 $self->{ca}->{default} = chr $self->{nc};
4569 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4570 !!!next-input-character;
4571 redo A;
4572 }
4573 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4574 if ($is_space->{$self->{nc}}) {
4575 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4576 !!!next-input-character;
4577 redo A;
4578 } elsif ($self->{nc} == 0x0022) { # "
4579 ## XML5: Same as "anything else".
4580 !!!parse-error (type => 'no space before default value'); ## TODO: type
4581 $self->{ca}->{value} = '';
4582 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4583 !!!next-input-character;
4584 redo A;
4585 } elsif ($self->{nc} == 0x0027) { # '
4586 ## XML5: Same as "anything else".
4587 !!!parse-error (type => 'no space before default value'); ## TODO: type
4588 $self->{ca}->{value} = '';
4589 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4590 !!!next-input-character;
4591 redo A;
4592 } elsif ($self->{nc} == 0x003E) { # >
4593 ## XML5: Same as "anything else".
4594 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4595 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4596 !!!next-input-character;
4597 !!!emit ($self->{ct}); # ATTLIST
4598 redo A;
4599 } elsif ($self->{nc} == -1) {
4600 ## XML5: No parse error.
4601 !!!parse-error (type => 'unclosed md'); ## TODO: type
4602 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4603 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4604 !!!next-input-character;
4605 !!!emit ($self->{ct});
4606 redo A;
4607 } else {
4608 $self->{ca}->{default} .= chr $self->{nc};
4609 ## Stay in the state.
4610 !!!next-input-character;
4611 redo A;
4612 }
4613 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4614 if ($is_space->{$self->{nc}}) {
4615 ## Stay in the state.
4616 !!!next-input-character;
4617 redo A;
4618 } elsif ($self->{nc} == 0x0022) { # "
4619 $self->{ca}->{value} = '';
4620 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4621 !!!next-input-character;
4622 redo A;
4623 } elsif ($self->{nc} == 0x0027) { # '
4624 $self->{ca}->{value} = '';
4625 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4626 !!!next-input-character;
4627 redo A;
4628 } elsif ($self->{nc} == 0x003E) { # >
4629 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4630 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4631 !!!next-input-character;
4632 !!!emit ($self->{ct}); # ATTLIST
4633 redo A;
4634 } elsif ($self->{nc} == -1) {
4635 ## XML5: No parse error.
4636 !!!parse-error (type => 'unclosed md'); ## TODO: type
4637 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4638 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4639 !!!next-input-character;
4640 !!!emit ($self->{ct});
4641 redo A;
4642 } else {
4643 ## XML5: Not defined yet.
4644 if ($self->{ca}->{default} eq 'FIXED') {
4645 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4646 } else {
4647 push @{$self->{ct}->{attrdefs}}, $self->{ca};
4648 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4649 }
4650 ## Reconsume.
4651 redo A;
4652 }
4653 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4654 if ($is_space->{$self->{nc}} or
4655 $self->{nc} == -1 or
4656 $self->{nc} == 0x003E) { # >
4657 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4658 ## Reconsume.
4659 redo A;
4660 } else {
4661 !!!parse-error (type => 'no space before attr name'); ## TODO: type
4662 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4663 ## Reconsume.
4664 redo A;
4665 }
4666 } elsif ($self->{state} == NDATA_STATE) {
4667 ## ASCII case-insensitive
4668 if ($self->{nc} == [
4669 undef,
4670 0x0044, # D
4671 0x0041, # A
4672 0x0054, # T
4673 ]->[length $self->{kwd}] or
4674 $self->{nc} == [
4675 undef,
4676 0x0064, # d
4677 0x0061, # a
4678 0x0074, # t
4679 ]->[length $self->{kwd}]) {
4680 !!!cp (172.2);
4681 ## Stay in the state.
4682 $self->{kwd} .= chr $self->{nc};
4683 !!!next-input-character;
4684 redo A;
4685 } elsif ((length $self->{kwd}) == 4 and
4686 ($self->{nc} == 0x0041 or # A
4687 $self->{nc} == 0x0061)) { # a
4688 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
4689 !!!cp (172.3);
4690 !!!parse-error (type => 'lowercase keyword', ## TODO: type
4691 text => 'NDATA',
4692 line => $self->{line_prev},
4693 column => $self->{column_prev} - 4);
4694 } else {
4695 !!!cp (172.4);
4696 }
4697 $self->{state} = AFTER_NDATA_STATE;
4698 !!!next-input-character;
4699 redo A;
4700 } else {
4701 !!!parse-error (type => 'string after literal', ## TODO: type
4702 line => $self->{line_prev},
4703 column => $self->{column_prev} + 1
4704 - length $self->{kwd});
4705 !!!cp (172.5);
4706 $self->{state} = BOGUS_MD_STATE;
4707 ## Reconsume.
4708 redo A;
4709 }
4710 } elsif ($self->{state} == AFTER_NDATA_STATE) {
4711 if ($is_space->{$self->{nc}}) {
4712 $self->{state} = BEFORE_NOTATION_NAME_STATE;
4713 !!!next-input-character;
4714 redo A;
4715 } elsif ($self->{nc} == 0x003E) { # >
4716 !!!parse-error (type => 'no notation name'); ## TODO: type
4717 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4718 !!!next-input-character;
4719 !!!emit ($self->{ct}); # ENTITY
4720 redo A;
4721 } elsif ($self->{nc} == -1) {
4722 !!!parse-error (type => 'unclosed md'); ## TODO: type
4723 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4724 !!!next-input-character;
4725 !!!emit ($self->{ct}); # ENTITY
4726 redo A;
4727 } else {
4728 !!!parse-error (type => 'string after literal', ## TODO: type
4729 line => $self->{line_prev},
4730 column => $self->{column_prev} + 1
4731 - length $self->{kwd});
4732 $self->{state} = BOGUS_MD_STATE;
4733 ## Reconsume.
4734 redo A;
4735 }
4736 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
4737 if ($is_space->{$self->{nc}}) {
4738 ## Stay in the state.
4739 !!!next-input-character;
4740 redo A;
4741 } elsif ($self->{nc} == 0x003E) { # >
4742 !!!parse-error (type => 'no notation name'); ## TODO: type
4743 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4744 !!!next-input-character;
4745 !!!emit ($self->{ct}); # ENTITY
4746 redo A;
4747 } elsif ($self->{nc} == -1) {
4748 !!!parse-error (type => 'unclosed md'); ## TODO: type
4749 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4750 !!!next-input-character;
4751 !!!emit ($self->{ct}); # ENTITY
4752 redo A;
4753 } else {
4754 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
4755 $self->{state} = NOTATION_NAME_STATE;
4756 !!!next-input-character;
4757 redo A;
4758 }
4759 } elsif ($self->{state} == NOTATION_NAME_STATE) {
4760 if ($is_space->{$self->{nc}}) {
4761 $self->{state} = AFTER_MD_DEF_STATE;
4762 !!!next-input-character;
4763 redo A;
4764 } elsif ($self->{nc} == 0x003E) { # >
4765 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4766 !!!next-input-character;
4767 !!!emit ($self->{ct}); # ENTITY
4768 redo A;
4769 } elsif ($self->{nc} == -1) {
4770 !!!parse-error (type => 'unclosed md'); ## TODO: type
4771 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4772 !!!next-input-character;
4773 !!!emit ($self->{ct}); # ENTITY
4774 redo A;
4775 } else {
4776 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
4777 ## Stay in the state.
4778 !!!next-input-character;
4779 redo A;
4780 }
4781 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
4782 if ($self->{nc} == 0x0022) { # "
4783 $self->{state} = AFTER_MD_DEF_STATE;
4784 !!!next-input-character;
4785 redo A;
4786 } elsif ($self->{nc} == 0x0026) { # &
4787 $self->{prev_state} = $self->{state};
4788 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4789 $self->{entity_add} = 0x0022; # "
4790 !!!next-input-character;
4791 redo A;
4792 ## TODO: %
4793 } elsif ($self->{nc} == -1) {
4794 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4795 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4796 ## Reconsume.
4797 !!!emit ($self->{ct}); # ENTITY
4798 redo A;
4799 } else {
4800 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4801 !!!next-input-character;
4802 redo A;
4803 }
4804 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
4805 if ($self->{nc} == 0x0027) { # '
4806 $self->{state} = AFTER_MD_DEF_STATE;
4807 !!!next-input-character;
4808 redo A;
4809 } elsif ($self->{nc} == 0x0026) { # &
4810 $self->{prev_state} = $self->{state};
4811 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
4812 $self->{entity_add} = 0x0027; # '
4813 !!!next-input-character;
4814 redo A;
4815 ## TODO: %
4816 } elsif ($self->{nc} == -1) {
4817 !!!parse-error (type => 'unclosed entity value'); ## TODO: type
4818 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4819 ## Reconsume.
4820 !!!emit ($self->{ct}); # ENTITY
4821 redo A;
4822 } else {
4823 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
4824 !!!next-input-character;
4825 redo A;
4826 }
4827 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
4828 if ($is_space->{$self->{nc}} or
4829 {
4830 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4831 $self->{entity_add} => 1,
4832 }->{$self->{nc}}) {
4833 !!!parse-error (type => 'bare ero',
4834 line => $self->{line_prev},
4835 column => $self->{column_prev}
4836 + ($self->{nc} == -1 ? 1 : 0));
4837 ## Don't consume
4838 ## Return nothing.
4839 #
4840 } elsif ($self->{nc} == 0x0023) { # #
4841 $self->{ca} = $self->{ct};
4842 $self->{state} = ENTITY_HASH_STATE;
4843 $self->{kwd} = '#';
4844 !!!next-input-character;
4845 redo A;
4846 } else {
4847 #
4848 }
4849
4850 $self->{ct}->{value} .= '&';
4851 $self->{state} = $self->{prev_state};
4852 ## Reconsume.
4853 redo A;
4854 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
4855 if ($is_space->{$self->{nc}}) {
4856 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
4857 !!!next-input-character;
4858 redo A;
4859 } elsif ($self->{nc} == 0x0028) { # (
4860 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4861 $self->{ct}->{content} = ['('];
4862 $self->{group_depth} = 1;
4863 !!!next-input-character;
4864 redo A;
4865 } elsif ($self->{nc} == 0x003E) { # >
4866 !!!parse-error (type => 'no md def'); ## TODO: type
4867 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4868 !!!next-input-character;
4869 !!!emit ($self->{ct}); # ELEMENT
4870 redo A;
4871 } elsif ($self->{nc} == -1) {
4872 !!!parse-error (type => 'unclosed md'); ## TODO: type
4873 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4874 !!!next-input-character;
4875 !!!emit ($self->{ct}); # ELEMENT
4876 redo A;
4877 } else {
4878 $self->{ct}->{content} = [chr $self->{nc}];
4879 $self->{state} = CONTENT_KEYWORD_STATE;
4880 !!!next-input-character;
4881 redo A;
4882 }
4883 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
4884 if ($is_space->{$self->{nc}}) {
4885 $self->{state} = AFTER_MD_DEF_STATE;
4886 !!!next-input-character;
4887 redo A;
4888 } elsif ($self->{nc} == 0x003E) { # >
4889 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4890 !!!next-input-character;
4891 !!!emit ($self->{ct}); # ELEMENT
4892 redo A;
4893 } elsif ($self->{nc} == -1) {
4894 !!!parse-error (type => 'unclosed md'); ## TODO: type
4895 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4896 !!!next-input-character;
4897 !!!emit ($self->{ct}); # ELEMENT
4898 redo A;
4899 } else {
4900 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
4901 ## Stay in the state.
4902 !!!next-input-character;
4903 redo A;
4904 }
4905 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
4906 if ($is_space->{$self->{nc}}) {
4907 ## Stay in the state.
4908 !!!next-input-character;
4909 redo A;
4910 } elsif ($self->{nc} == 0x0028) { # (
4911 $self->{group_depth}++;
4912 push @{$self->{ct}->{content}}, chr $self->{nc};
4913 ## Stay in the state.
4914 !!!next-input-character;
4915 redo A;
4916 } elsif ($self->{nc} == 0x007C or # |
4917 $self->{nc} == 0x002C) { # ,
4918 !!!parse-error (type => 'empty element name'); ## TODO: type
4919 ## Stay in the state.
4920 !!!next-input-character;
4921 redo A;
4922 } elsif ($self->{nc} == 0x0029) { # )
4923 !!!parse-error (type => 'empty element name'); ## TODO: type
4924 push @{$self->{ct}->{content}}, chr $self->{nc};
4925 $self->{group_depth}--;
4926 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4927 !!!next-input-character;
4928 redo A;
4929 } elsif ($self->{nc} == 0x003E) { # >
4930 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4931 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4932 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4933 !!!next-input-character;
4934 !!!emit ($self->{ct}); # ELEMENT
4935 redo A;
4936 } elsif ($self->{nc} == -1) {
4937 !!!parse-error (type => 'unclosed md'); ## TODO: type
4938 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4939 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4940 !!!next-input-character;
4941 !!!emit ($self->{ct}); # ELEMENT
4942 redo A;
4943 } else {
4944 push @{$self->{ct}->{content}}, chr $self->{nc};
4945 $self->{state} = CM_ELEMENT_NAME_STATE;
4946 !!!next-input-character;
4947 redo A;
4948 }
4949 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
4950 if ($is_space->{$self->{nc}}) {
4951 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4952 !!!next-input-character;
4953 redo A;
4954 } elsif ($self->{nc} == 0x002A or # *
4955 $self->{nc} == 0x002B or # +
4956 $self->{nc} == 0x003F) { # ?
4957 push @{$self->{ct}->{content}}, chr $self->{nc};
4958 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
4959 !!!next-input-character;
4960 redo A;
4961 } elsif ($self->{nc} == 0x007C or # |
4962 $self->{nc} == 0x002C) { # ,
4963 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
4964 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
4965 !!!next-input-character;
4966 redo A;
4967 } elsif ($self->{nc} == 0x0029) { # )
4968 $self->{group_depth}--;
4969 push @{$self->{ct}->{content}}, chr $self->{nc};
4970 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
4971 !!!next-input-character;
4972 redo A;
4973 } elsif ($self->{nc} == 0x003E) { # >
4974 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
4975 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4976 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4977 !!!next-input-character;
4978 !!!emit ($self->{ct}); # ELEMENT
4979 redo A;
4980 } elsif ($self->{nc} == -1) {
4981 !!!parse-error (type => 'unclosed md'); ## TODO: type
4982 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
4983 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4984 !!!next-input-character;
4985 !!!emit ($self->{ct}); # ELEMENT
4986 redo A;
4987 } else {
4988 $self->{ct}->{content}->[-1] .= chr $self->{nc};
4989 ## Stay in the state.
4990 !!!next-input-character;
4991 redo A;
4992 }
4993 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
4994 if ($is_space->{$self->{nc}}) {
4995 ## Stay in the state.
4996 !!!next-input-character;
4997 redo A;
4998 } elsif ($self->{nc} == 0x007C or # |
4999 $self->{nc} == 0x002C) { # ,
5000 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
5001 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
5002 !!!next-input-character;
5003 redo A;
5004 } elsif ($self->{nc} == 0x0029) { # )
5005 $self->{group_depth}--;
5006 push @{$self->{ct}->{content}}, chr $self->{nc};
5007 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
5008 !!!next-input-character;
5009 redo A;
5010 } elsif ($self->{nc} == 0x003E) { # >
5011 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5012 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5013 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5014 !!!next-input-character;
5015 !!!emit ($self->{ct}); # ELEMENT
5016 redo A;
5017 } elsif ($self->{nc} == -1) {
5018 !!!parse-error (type => 'unclosed md'); ## TODO: type
5019 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5020 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5021 !!!next-input-character;
5022 !!!emit ($self->{ct}); # ELEMENT
5023 redo A;
5024 } else {
5025 !!!parse-error (type => 'after element name'); ## TODO: type
5026 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5027 $self->{state} = BOGUS_MD_STATE;
5028 !!!next-input-character;
5029 redo A;
5030 }
5031 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
5032 if ($is_space->{$self->{nc}}) {
5033 if ($self->{group_depth}) {
5034 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5035 } else {
5036 $self->{state} = AFTER_MD_DEF_STATE;
5037 }
5038 !!!next-input-character;
5039 redo A;
5040 } elsif ($self->{nc} == 0x002A or # *
5041 $self->{nc} == 0x002B or # +
5042 $self->{nc} == 0x003F) { # ?
5043 push @{$self->{ct}->{content}}, chr $self->{nc};
5044 if ($self->{group_depth}) {
5045 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5046 } else {
5047 $self->{state} = AFTER_MD_DEF_STATE;
5048 }
5049 !!!next-input-character;
5050 redo A;
5051 } elsif ($self->{nc} == 0x0029) { # )
5052 if ($self->{group_depth}) {
5053 $self->{group_depth}--;
5054 push @{$self->{ct}->{content}}, chr $self->{nc};
5055 ## Stay in the state.
5056 !!!next-input-character;
5057 redo A;
5058 } else {
5059 !!!parse-error (type => 'string after md def'); ## TODO: type
5060 $self->{state} = BOGUS_MD_STATE;
5061 ## Reconsume.
5062 redo A;
5063 }
5064 } elsif ($self->{nc} == 0x003E) { # >
5065 if ($self->{group_depth}) {
5066 !!!parse-error (type => 'unclosed cm group'); ## TODO: type
5067 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5068 }
5069 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5070 !!!next-input-character;
5071 !!!emit ($self->{ct}); # ELEMENT
5072 redo A;
5073 } elsif ($self->{nc} == -1) {
5074 !!!parse-error (type => 'unclosed md'); ## TODO: type
5075 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
5076 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5077 !!!next-input-character;
5078 !!!emit ($self->{ct}); # ELEMENT
5079 redo A;
5080 } else {
5081 if ($self->{group_depth}) {
5082 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
5083 } else {
5084 !!!parse-error (type => 'string after md def'); ## TODO: type
5085 $self->{state} = BOGUS_MD_STATE;
5086 }
5087 ## Reconsume.
5088 redo A;
5089 }
5090 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
5091 if ($is_space->{$self->{nc}}) {
5092 ## Stay in the state.
5093 !!!next-input-character;
5094 redo A;
5095 } elsif ($self->{nc} == 0x003E) { # >
5096 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5097 !!!next-input-character;
5098 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5099 redo A;
5100 } elsif ($self->{nc} == -1) {
5101 !!!parse-error (type => 'unclosed md'); ## TODO: type
5102 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5103 !!!next-input-character;
5104 !!!emit ($self->{ct}); # ENTITY/ELEMENT
5105 redo A;
5106 } else {
5107 !!!parse-error (type => 'string after md def'); ## TODO: type
5108 $self->{state} = BOGUS_MD_STATE;
5109 ## Reconsume.
5110 redo A;
5111 }
5112 } elsif ($self->{state} == BOGUS_MD_STATE) {
5113 if ($self->{nc} == 0x003E) { # >
5114 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5115 !!!next-input-character;
5116 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5117 redo A;
5118 } elsif ($self->{nc} == -1) {
5119 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5120 ## Reconsume.
5121 !!!emit ($self->{ct}); # ATTLIST/ENTITY/NOTATION
5122 redo A;
5123 } else {
5124 ## Stay in the state.
5125 !!!next-input-character;
5126 redo A;
5127 }
5128 } else {
5129 die "$0: $self->{state}: Unknown state";
5130 }
5131 } # A
5132
5133 die "$0: _get_next_token: unexpected case";
5134 } # _get_next_token
5135
5136 1;
5137 ## $Date: 2009/07/02 22:24:28 $
5138

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24