/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.31 - (show annotations) (download)
Sat Sep 5 09:26:55 2009 UTC (16 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.30: +40 -11 lines
++ whatpm/t/ChangeLog	5 Sep 2009 09:26:39 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* tokenizer-test-1.test: Added test cases for "comment end bang
	state" (HTML5 revision 3191).

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 09:26:12 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src (_get_next_token): Implemented the "comment end
	bang state" (HTML5 revision 3191).

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.30 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## Names of all token type constants.  Each of them can be imported
  ## individually, or all of them at once through the |:token| tag.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_type_names;
  our %EXPORT_TAGS = (token => [@token_type_names]);
}
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49
## Numeric codes stored in the |type| field of a token.  Each name is
## defined as a constant function taking no arguments.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} flag set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78
## Bit flags describing which kinds of markup are recognized in data.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, expressed as combinations of the flags above.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
87
88 ## Tokenizer states
89
## Numeric identifiers of the tokenizer states, as stored in
## |$self->{state}|.  The numbers 1 and 12 are not assigned; the
## constants that used them are kept below as comments.
use constant {
  DATA_STATE                                    => 0,
  #ENTITY_DATA_STATE                            => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  #ENTITY_IN_ATTRIBUTE_VALUE_STATE              => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_BANG_STATE                        => 102, ## LAST
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_SECTION_STATE                           => 35,
  MD_HYPHEN_STATE  => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE   => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE     => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE     => 41, # "CDATA section state" in the spec
  PUBLIC_STATE => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE => 43, # "after DOCTYPE name state" in the spec
  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE      => 44,
  ENTITY_HASH_STATE => 45,
  NCR_NUM_STATE     => 46,
  HEXREF_X_STATE    => 47,
  HEXREF_HEX_STATE  => 48,
  ENTITY_NAME_STATE => 49,
  PCDATA_STATE      => 50, # "data state" in the spec
};
145
146 ## XML-only states
## Numeric identifiers of the tokenizer states that are only used for
## XML; they continue the numbering of the other state constants.
use constant {
  PI_STATE                                           => 51,
  PI_TARGET_STATE                                    => 52,
  PI_TARGET_AFTER_STATE                              => 53,
  PI_DATA_STATE                                      => 54,
  PI_AFTER_STATE                                     => 55,
  PI_DATA_AFTER_STATE                                => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE                      => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE                => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE          => 59,
  DOCTYPE_TAG_STATE                                  => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE              => 61,
  MD_ATTLIST_STATE                                   => 62,
  MD_E_STATE                                         => 63,
  MD_ELEMENT_STATE                                   => 64,
  MD_ENTITY_STATE                                    => 65,
  MD_NOTATION_STATE                                  => 66,
  DOCTYPE_MD_STATE                                   => 67,
  BEFORE_MD_NAME_STATE                               => 68,
  MD_NAME_STATE                                      => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE              => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE                   => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE               => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE         => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE               => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE         => 75,
  BEFORE_ALLOWED_TOKEN_STATE                         => 76,
  ALLOWED_TOKEN_STATE                                => 77,
  AFTER_ALLOWED_TOKEN_STATE                          => 78,
  AFTER_ALLOWED_TOKENS_STATE                         => 79,
  BEFORE_ATTR_DEFAULT_STATE                          => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE        => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE  => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE              => 84,
  BEFORE_NDATA_STATE                                 => 85,
  NDATA_STATE                                        => 86,
  AFTER_NDATA_STATE                                  => 87,
  BEFORE_NOTATION_NAME_STATE                         => 88,
  NOTATION_NAME_STATE                                => 89,
  DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE           => 90,
  DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE           => 91,
  ENTITY_VALUE_ENTITY_STATE                          => 92,
  AFTER_ELEMENT_NAME_STATE                           => 93,
  BEFORE_ELEMENT_CONTENT_STATE                       => 94,
  CONTENT_KEYWORD_STATE                              => 95,
  AFTER_CM_GROUP_OPEN_STATE                          => 96,
  CM_ELEMENT_NAME_STATE                              => 97,
  AFTER_CM_ELEMENT_NAME_STATE                        => 98,
  AFTER_CM_GROUP_CLOSE_STATE                         => 99,
  AFTER_MD_DEF_STATE                                 => 100,
  BOGUS_MD_STATE                                     => 101,
};
198
199 ## Tree constructor state constants (see Whatpm::HTML for the full
200 ## list and descriptions)
201
## NOTE: Both constants have the same numeric value (bit 11 set).
use constant IN_FOREIGN_CONTENT_IM => 0b100000000000;
use constant FOREIGN_EL            => 0b1_00000000000;
204
205 ## Character reference mappings
206
## Replacement table for code points: each key is mapped to the code
## point that is to be used in its place.
my $charref_map = {0x0D => 0x000A};

## Replacements for 0x80..0x9F, listed in code point order.
@$charref_map{0x80 .. 0x9F} = (
  0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
  0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
);

## Code points that are all replaced by U+FFFD.
{
  my @replaced_by_fffd = (
    0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
    0xD800 .. 0xDFFF,
    0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
    ## U+xxFFFE and U+xxFFFF of every plane (0x00..0x10).
    (map {
      my $plane_base = $_ << 16;
      ($plane_base | 0xFFFE, $plane_base | 0xFFFF);
    } 0x00 .. 0x10),
  );
  @$charref_map{@replaced_by_fffd} = (0xFFFD) x @replaced_by_fffd;
}
250
251 ## Implementations MUST act as if they used the state machine described in the spec.
252
## Resets the tokenizer-related fields of |$self| to their initial
## values and requests the first input character.
##
## Fields that are expected to have been set by the |new| constructor
## and are not modified here: |level|, |set_nc|, |parse_error|, and
## |is_xml| (if XML).
##
## Fields that are not initialized here because they are initialized
## when they are used: |kwd| (state-dependent keyword),
## |entity__value|, |entity__match|, and |prev_state|.  The fields
## |next_nc| and |escape| are not set here either.
##
## Returns the value of the last assignment, i.e. the new (empty)
## |token| array reference.
sub _initialize_tokenizer ($) {
  my $self = shift;

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  $self->{content_model} = PCDATA_CONTENT_MODEL;
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character

  ## The character buffer has just been emptied, so there is nothing
  ## to read from it; the next input character has to be supplied by
  ## the |set_nc| callback.  (The former test of |char_buffer_pos|
  ## against the buffer length could never succeed at this point.)
  $self->{set_nc}->($self);

  $self->{token} = [];
} # _initialize_tokenizer
291
292 ## A token has:
293 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
294 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
295 ## ->{name} (DOCTYPE_TOKEN)
296 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
297 ## ->{target} (PI_TOKEN)
298 ## ->{pubid} (DOCTYPE_TOKEN)
299 ## ->{sysid} (DOCTYPE_TOKEN)
300 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
301 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
302 ## ->{name}
303 ## ->{value}
304 ## ->{has_reference} == 1 or 0
305 ## ->{index}: Index of the attribute in a tag.
306 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
307 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
308 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
309 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
310
311 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
312 ## The token's |->{self_closing}| field is used to save the value of
313 ## |$self->{self_closing}| while the token is pushed back to the stack.
314
315 ## Emitted token MUST immediately be handled by the tree construction state.
316
317 ## Before each step, UA MAY check to see if either one of the scripts in
318 ## "list of scripts that will execute as soon as possible" or the first
319 ## script in the "list of scripts that will execute asynchronously",
320 ## has completed loading. If one has, then it MUST be executed
321 ## and removed from the list.
322
323 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
324 ## (This requirement was dropped from HTML5 spec, unfortunately.)
325
## Set of code points treated as space characters (code point => 1).
## LINE TABULATION (VT, 0x000B) and CARRIAGE RETURN (CR, 0x000D) are
## not members of the set.
my $is_space = {
  map { $_ => 1 } (
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    0x0020, # SPACE (SP)
  )
};
334
335 sub _get_next_token ($) {
336 my $self = shift;
337
338 if ($self->{self_closing}) {
339 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
340 ## NOTE: The |self_closing| flag is only set by start tag token.
341 ## In addition, when a start tag token is emitted, it is always set to
342 ## |ct|.
343 delete $self->{self_closing};
344 }
345
346 if (@{$self->{token}}) {
347 $self->{self_closing} = $self->{token}->[0]->{self_closing};
348 return shift @{$self->{token}};
349 }
350
351 A: {
352 if ($self->{state} == PCDATA_STATE) {
353 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
354
355 if ($self->{nc} == 0x0026) { # &
356
357 ## NOTE: In the spec, the tokenizer is switched to the
358 ## "entity data state". In this implementation, the tokenizer
359 ## is switched to the |ENTITY_STATE|, which is an implementation
360 ## of the "consume a character reference" algorithm.
361 $self->{entity_add} = -1;
362 $self->{prev_state} = DATA_STATE;
363 $self->{state} = ENTITY_STATE;
364
365 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
366 $self->{line_prev} = $self->{line};
367 $self->{column_prev} = $self->{column};
368 $self->{column}++;
369 $self->{nc}
370 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
371 } else {
372 $self->{set_nc}->($self);
373 }
374
375 redo A;
376 } elsif ($self->{nc} == 0x003C) { # <
377
378 $self->{state} = TAG_OPEN_STATE;
379
380 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
381 $self->{line_prev} = $self->{line};
382 $self->{column_prev} = $self->{column};
383 $self->{column}++;
384 $self->{nc}
385 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
386 } else {
387 $self->{set_nc}->($self);
388 }
389
390 redo A;
391 } elsif ($self->{nc} == -1) {
392
393 return ({type => END_OF_FILE_TOKEN,
394 line => $self->{line}, column => $self->{column}});
395 last A; ## TODO: ok?
396 } else {
397
398 #
399 }
400
401 # Anything else
402 my $token = {type => CHARACTER_TOKEN,
403 data => chr $self->{nc},
404 line => $self->{line}, column => $self->{column},
405 };
406 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
407
408 ## Stay in the state.
409
410 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
411 $self->{line_prev} = $self->{line};
412 $self->{column_prev} = $self->{column};
413 $self->{column}++;
414 $self->{nc}
415 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
416 } else {
417 $self->{set_nc}->($self);
418 }
419
420 return ($token);
421 redo A;
422 } elsif ($self->{state} == DATA_STATE) {
423 $self->{s_kwd} = '' unless defined $self->{s_kwd};
424 if ($self->{nc} == 0x0026) { # &
425 $self->{s_kwd} = '';
426 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
427 not $self->{escape}) {
428
429 ## NOTE: In the spec, the tokenizer is switched to the
430 ## "entity data state". In this implementation, the tokenizer
431 ## is switched to the |ENTITY_STATE|, which is an implementation
432 ## of the "consume a character reference" algorithm.
433 $self->{entity_add} = -1;
434 $self->{prev_state} = DATA_STATE;
435 $self->{state} = ENTITY_STATE;
436
437 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
438 $self->{line_prev} = $self->{line};
439 $self->{column_prev} = $self->{column};
440 $self->{column}++;
441 $self->{nc}
442 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
443 } else {
444 $self->{set_nc}->($self);
445 }
446
447 redo A;
448 } else {
449
450 #
451 }
452 } elsif ($self->{nc} == 0x002D) { # -
453 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
454 if ($self->{s_kwd} eq '<!-') {
455
456 $self->{escape} = 1; # unless $self->{escape};
457 $self->{s_kwd} = '--';
458 #
459 } elsif ($self->{s_kwd} eq '-') {
460
461 $self->{s_kwd} = '--';
462 #
463 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
464
465 $self->{s_kwd} .= '-';
466 #
467 } else {
468
469 $self->{s_kwd} = '-';
470 #
471 }
472 }
473
474 #
475 } elsif ($self->{nc} == 0x0021) { # !
476 if (length $self->{s_kwd}) {
477
478 $self->{s_kwd} .= '!';
479 #
480 } else {
481
482 #$self->{s_kwd} = '';
483 #
484 }
485 #
486 } elsif ($self->{nc} == 0x003C) { # <
487 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
488 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
489 not $self->{escape})) {
490
491 $self->{state} = TAG_OPEN_STATE;
492
493 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
494 $self->{line_prev} = $self->{line};
495 $self->{column_prev} = $self->{column};
496 $self->{column}++;
497 $self->{nc}
498 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
499 } else {
500 $self->{set_nc}->($self);
501 }
502
503 redo A;
504 } else {
505
506 $self->{s_kwd} = '';
507 #
508 }
509 } elsif ($self->{nc} == 0x003E) { # >
510 if ($self->{escape} and
511 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
512 if ($self->{s_kwd} eq '--') {
513
514 delete $self->{escape};
515 #
516 } else {
517
518 #
519 }
520 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
521
522 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
523 line => $self->{line_prev},
524 column => $self->{column_prev} - 1);
525 #
526 } else {
527
528 #
529 }
530
531 $self->{s_kwd} = '';
532 #
533 } elsif ($self->{nc} == 0x005D) { # ]
534 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
535
536 $self->{s_kwd} .= ']';
537 } elsif ($self->{s_kwd} eq ']]') {
538
539 #
540 } else {
541
542 $self->{s_kwd} = '';
543 }
544 #
545 } elsif ($self->{nc} == -1) {
546
547 $self->{s_kwd} = '';
548 return ({type => END_OF_FILE_TOKEN,
549 line => $self->{line}, column => $self->{column}});
550 last A; ## TODO: ok?
551 } else {
552
553 $self->{s_kwd} = '';
554 #
555 }
556
557 # Anything else
558 my $token = {type => CHARACTER_TOKEN,
559 data => chr $self->{nc},
560 line => $self->{line}, column => $self->{column},
561 };
562 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
563 length $token->{data})) {
564 $self->{s_kwd} = '';
565 }
566
567 ## Stay in the data state.
568 if (not $self->{is_xml} and
569 $self->{content_model} == PCDATA_CONTENT_MODEL) {
570
571 $self->{state} = PCDATA_STATE;
572 } else {
573
574 ## Stay in the state.
575 }
576
577 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
578 $self->{line_prev} = $self->{line};
579 $self->{column_prev} = $self->{column};
580 $self->{column}++;
581 $self->{nc}
582 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
583 } else {
584 $self->{set_nc}->($self);
585 }
586
587 return ($token);
588 redo A;
589 } elsif ($self->{state} == TAG_OPEN_STATE) {
590 ## XML5: "tag state".
591
592 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
593 if ($self->{nc} == 0x002F) { # /
594
595
596 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
597 $self->{line_prev} = $self->{line};
598 $self->{column_prev} = $self->{column};
599 $self->{column}++;
600 $self->{nc}
601 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
602 } else {
603 $self->{set_nc}->($self);
604 }
605
606 $self->{state} = CLOSE_TAG_OPEN_STATE;
607 redo A;
608 } elsif ($self->{nc} == 0x0021) { # !
609
610 $self->{s_kwd} = $self->{escaped} ? '' : '<';
611 #
612 } else {
613
614 $self->{s_kwd} = '';
615 #
616 }
617
618 ## reconsume
619 $self->{state} = DATA_STATE;
620 return ({type => CHARACTER_TOKEN, data => '<',
621 line => $self->{line_prev},
622 column => $self->{column_prev},
623 });
624 redo A;
625 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
626 if ($self->{nc} == 0x0021) { # !
627
628 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
629
630 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
631 $self->{line_prev} = $self->{line};
632 $self->{column_prev} = $self->{column};
633 $self->{column}++;
634 $self->{nc}
635 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
636 } else {
637 $self->{set_nc}->($self);
638 }
639
640 redo A;
641 } elsif ($self->{nc} == 0x002F) { # /
642
643 $self->{state} = CLOSE_TAG_OPEN_STATE;
644
645 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
646 $self->{line_prev} = $self->{line};
647 $self->{column_prev} = $self->{column};
648 $self->{column}++;
649 $self->{nc}
650 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
651 } else {
652 $self->{set_nc}->($self);
653 }
654
655 redo A;
656 } elsif (0x0041 <= $self->{nc} and
657 $self->{nc} <= 0x005A) { # A..Z
658
659 $self->{ct}
660 = {type => START_TAG_TOKEN,
661 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
662 line => $self->{line_prev},
663 column => $self->{column_prev}};
664 $self->{state} = TAG_NAME_STATE;
665
666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
667 $self->{line_prev} = $self->{line};
668 $self->{column_prev} = $self->{column};
669 $self->{column}++;
670 $self->{nc}
671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
672 } else {
673 $self->{set_nc}->($self);
674 }
675
676 redo A;
677 } elsif (0x0061 <= $self->{nc} and
678 $self->{nc} <= 0x007A) { # a..z
679
680 $self->{ct} = {type => START_TAG_TOKEN,
681 tag_name => chr ($self->{nc}),
682 line => $self->{line_prev},
683 column => $self->{column_prev}};
684 $self->{state} = TAG_NAME_STATE;
685
686 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
687 $self->{line_prev} = $self->{line};
688 $self->{column_prev} = $self->{column};
689 $self->{column}++;
690 $self->{nc}
691 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
692 } else {
693 $self->{set_nc}->($self);
694 }
695
696 redo A;
697 } elsif ($self->{nc} == 0x003E) { # >
698
699 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
700 line => $self->{line_prev},
701 column => $self->{column_prev});
702 $self->{state} = DATA_STATE;
703 $self->{s_kwd} = '';
704
705 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
706 $self->{line_prev} = $self->{line};
707 $self->{column_prev} = $self->{column};
708 $self->{column}++;
709 $self->{nc}
710 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
711 } else {
712 $self->{set_nc}->($self);
713 }
714
715
716 return ({type => CHARACTER_TOKEN, data => '<>',
717 line => $self->{line_prev},
718 column => $self->{column_prev},
719 });
720
721 redo A;
722 } elsif ($self->{nc} == 0x003F) { # ?
723 if ($self->{is_xml}) {
724
725 $self->{state} = PI_STATE;
726
727 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
728 $self->{line_prev} = $self->{line};
729 $self->{column_prev} = $self->{column};
730 $self->{column}++;
731 $self->{nc}
732 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
733 } else {
734 $self->{set_nc}->($self);
735 }
736
737 redo A;
738 } else {
739
740 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
741 line => $self->{line_prev},
742 column => $self->{column_prev});
743 $self->{state} = BOGUS_COMMENT_STATE;
744 $self->{ct} = {type => COMMENT_TOKEN, data => '',
745 line => $self->{line_prev},
746 column => $self->{column_prev},
747 };
748 ## $self->{nc} is intentionally left as is
749 redo A;
750 }
751 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
752
753 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
754 line => $self->{line_prev},
755 column => $self->{column_prev});
756 $self->{state} = DATA_STATE;
757 $self->{s_kwd} = '';
758 ## reconsume
759
760 return ({type => CHARACTER_TOKEN, data => '<',
761 line => $self->{line_prev},
762 column => $self->{column_prev},
763 });
764
765 redo A;
766 } else {
767 ## XML5: "<:" is a parse error.
768
769 $self->{ct} = {type => START_TAG_TOKEN,
770 tag_name => chr ($self->{nc}),
771 line => $self->{line_prev},
772 column => $self->{column_prev}};
773 $self->{state} = TAG_NAME_STATE;
774
775 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
776 $self->{line_prev} = $self->{line};
777 $self->{column_prev} = $self->{column};
778 $self->{column}++;
779 $self->{nc}
780 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
781 } else {
782 $self->{set_nc}->($self);
783 }
784
785 redo A;
786 }
787 } else {
788 die "$0: $self->{content_model} in tag open";
789 }
790 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
791 ## NOTE: The "close tag open state" in the spec is implemented as
792 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
793
794 ## XML5: "end tag state".
795
796 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
797 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
798 if (defined $self->{last_stag_name}) {
799 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
800 $self->{kwd} = '';
801 ## Reconsume.
802 redo A;
803 } else {
804 ## No start tag token has ever been emitted
805 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
806
807 $self->{state} = DATA_STATE;
808 $self->{s_kwd} = '';
809 ## Reconsume.
810 return ({type => CHARACTER_TOKEN, data => '</',
811 line => $l, column => $c,
812 });
813 redo A;
814 }
815 }
816
817 if (0x0041 <= $self->{nc} and
818 $self->{nc} <= 0x005A) { # A..Z
819
820 $self->{ct}
821 = {type => END_TAG_TOKEN,
822 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
823 line => $l, column => $c};
824 $self->{state} = TAG_NAME_STATE;
825
826 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
827 $self->{line_prev} = $self->{line};
828 $self->{column_prev} = $self->{column};
829 $self->{column}++;
830 $self->{nc}
831 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
832 } else {
833 $self->{set_nc}->($self);
834 }
835
836 redo A;
837 } elsif (0x0061 <= $self->{nc} and
838 $self->{nc} <= 0x007A) { # a..z
839
840 $self->{ct} = {type => END_TAG_TOKEN,
841 tag_name => chr ($self->{nc}),
842 line => $l, column => $c};
843 $self->{state} = TAG_NAME_STATE;
844
845 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
846 $self->{line_prev} = $self->{line};
847 $self->{column_prev} = $self->{column};
848 $self->{column}++;
849 $self->{nc}
850 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
851 } else {
852 $self->{set_nc}->($self);
853 }
854
855 redo A;
856 } elsif ($self->{nc} == 0x003E) { # >
857 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
858 line => $self->{line_prev}, ## "<" in "</>"
859 column => $self->{column_prev} - 1);
860 $self->{state} = DATA_STATE;
861 $self->{s_kwd} = '';
862 if ($self->{is_xml}) {
863
864 ## XML5: No parse error.
865
866 ## NOTE: This parser raises a parse error, since it supports
867 ## XML1, not XML5.
868
869 ## NOTE: A short end tag token.
870 my $ct = {type => END_TAG_TOKEN,
871 tag_name => '',
872 line => $self->{line_prev},
873 column => $self->{column_prev} - 1,
874 };
875
876 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
877 $self->{line_prev} = $self->{line};
878 $self->{column_prev} = $self->{column};
879 $self->{column}++;
880 $self->{nc}
881 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
882 } else {
883 $self->{set_nc}->($self);
884 }
885
886 return ($ct);
887 } else {
888
889
890 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
891 $self->{line_prev} = $self->{line};
892 $self->{column_prev} = $self->{column};
893 $self->{column}++;
894 $self->{nc}
895 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
896 } else {
897 $self->{set_nc}->($self);
898 }
899
900 }
901 redo A;
902 } elsif ($self->{nc} == -1) {
903
904 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
905 $self->{s_kwd} = '';
906 $self->{state} = DATA_STATE;
907 # reconsume
908
909 return ({type => CHARACTER_TOKEN, data => '</',
910 line => $l, column => $c,
911 });
912
913 redo A;
914 } elsif (not $self->{is_xml} or
915 $is_space->{$self->{nc}}) {
916
917 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
918 line => $self->{line_prev}, # "<" of "</"
919 column => $self->{column_prev} - 1);
920 $self->{state} = BOGUS_COMMENT_STATE;
921 $self->{ct} = {type => COMMENT_TOKEN, data => '',
922 line => $self->{line_prev}, # "<" of "</"
923 column => $self->{column_prev} - 1,
924 };
925 ## NOTE: $self->{nc} is intentionally left as is.
926 ## Although the "anything else" case of the spec not explicitly
927 ## states that the next input character is to be reconsumed,
928 ## it will be included to the |data| of the comment token
929 ## generated from the bogus end tag, as defined in the
930 ## "bogus comment state" entry.
931 redo A;
932 } else {
933 ## XML5: "</:" is a parse error.
934
935 $self->{ct} = {type => END_TAG_TOKEN,
936 tag_name => chr ($self->{nc}),
937 line => $l, column => $c};
938 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
939
940 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
941 $self->{line_prev} = $self->{line};
942 $self->{column_prev} = $self->{column};
943 $self->{column}++;
944 $self->{nc}
945 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
946 } else {
947 $self->{set_nc}->($self);
948 }
949
950 redo A;
951 }
952 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
953 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
954 if (length $ch) {
955 my $CH = $ch;
956 $ch =~ tr/a-z/A-Z/;
957 my $nch = chr $self->{nc};
958 if ($nch eq $ch or $nch eq $CH) {
959
960 ## Stay in the state.
961 $self->{kwd} .= $nch;
962
963 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
964 $self->{line_prev} = $self->{line};
965 $self->{column_prev} = $self->{column};
966 $self->{column}++;
967 $self->{nc}
968 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
969 } else {
970 $self->{set_nc}->($self);
971 }
972
973 redo A;
974 } else {
975
976 $self->{state} = DATA_STATE;
977 $self->{s_kwd} = '';
978 ## Reconsume.
979 return ({type => CHARACTER_TOKEN,
980 data => '</' . $self->{kwd},
981 line => $self->{line_prev},
982 column => $self->{column_prev} - 1 - length $self->{kwd},
983 });
984 redo A;
985 }
986 } else { # after "<{tag-name}"
987 unless ($is_space->{$self->{nc}} or
988 {
989 0x003E => 1, # >
990 0x002F => 1, # /
991 -1 => 1, # EOF
992 }->{$self->{nc}}) {
993
994 ## Reconsume.
995 $self->{state} = DATA_STATE;
996 $self->{s_kwd} = '';
997 return ({type => CHARACTER_TOKEN,
998 data => '</' . $self->{kwd},
999 line => $self->{line_prev},
1000 column => $self->{column_prev} - 1 - length $self->{kwd},
1001 });
1002 redo A;
1003 } else {
1004
1005 $self->{ct}
1006 = {type => END_TAG_TOKEN,
1007 tag_name => $self->{last_stag_name},
1008 line => $self->{line_prev},
1009 column => $self->{column_prev} - 1 - length $self->{kwd}};
1010 $self->{state} = TAG_NAME_STATE;
1011 ## Reconsume.
1012 redo A;
1013 }
1014 }
1015 } elsif ($self->{state} == TAG_NAME_STATE) {
1016 if ($is_space->{$self->{nc}}) {
1017
1018 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1019
1020 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1021 $self->{line_prev} = $self->{line};
1022 $self->{column_prev} = $self->{column};
1023 $self->{column}++;
1024 $self->{nc}
1025 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1026 } else {
1027 $self->{set_nc}->($self);
1028 }
1029
1030 redo A;
1031 } elsif ($self->{nc} == 0x003E) { # >
1032 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1033
1034 $self->{last_stag_name} = $self->{ct}->{tag_name};
1035 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1036 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1037 #if ($self->{ct}->{attributes}) {
1038 # ## NOTE: This should never be reached.
1039 # !!! cp (36);
1040 # !!! parse-error (type => 'end tag attribute');
1041 #} else {
1042
1043 #}
1044 } else {
1045 die "$0: $self->{ct}->{type}: Unknown token type";
1046 }
1047 $self->{state} = DATA_STATE;
1048 $self->{s_kwd} = '';
1049
1050 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1051 $self->{line_prev} = $self->{line};
1052 $self->{column_prev} = $self->{column};
1053 $self->{column}++;
1054 $self->{nc}
1055 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1056 } else {
1057 $self->{set_nc}->($self);
1058 }
1059
1060
1061 return ($self->{ct}); # start tag or end tag
1062
1063 redo A;
1064 } elsif (0x0041 <= $self->{nc} and
1065 $self->{nc} <= 0x005A) { # A..Z
1066
1067 $self->{ct}->{tag_name}
1068 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1069 # start tag or end tag
1070 ## Stay in this state
1071
1072 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1073 $self->{line_prev} = $self->{line};
1074 $self->{column_prev} = $self->{column};
1075 $self->{column}++;
1076 $self->{nc}
1077 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1078 } else {
1079 $self->{set_nc}->($self);
1080 }
1081
1082 redo A;
1083 } elsif ($self->{nc} == -1) {
1084 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1085 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1086
1087 $self->{last_stag_name} = $self->{ct}->{tag_name};
1088 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1089 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1090 #if ($self->{ct}->{attributes}) {
1091 # ## NOTE: This state should never be reached.
1092 # !!! cp (40);
1093 # !!! parse-error (type => 'end tag attribute');
1094 #} else {
1095
1096 #}
1097 } else {
1098 die "$0: $self->{ct}->{type}: Unknown token type";
1099 }
1100 $self->{state} = DATA_STATE;
1101 $self->{s_kwd} = '';
1102 # reconsume
1103
1104 return ($self->{ct}); # start tag or end tag
1105
1106 redo A;
1107 } elsif ($self->{nc} == 0x002F) { # /
1108
1109 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1110
1111 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1112 $self->{line_prev} = $self->{line};
1113 $self->{column_prev} = $self->{column};
1114 $self->{column}++;
1115 $self->{nc}
1116 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1117 } else {
1118 $self->{set_nc}->($self);
1119 }
1120
1121 redo A;
1122 } else {
1123
1124 $self->{ct}->{tag_name} .= chr $self->{nc};
1125 # start tag or end tag
1126 ## Stay in the state
1127
1128 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1129 $self->{line_prev} = $self->{line};
1130 $self->{column_prev} = $self->{column};
1131 $self->{column}++;
1132 $self->{nc}
1133 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1134 } else {
1135 $self->{set_nc}->($self);
1136 }
1137
1138 redo A;
1139 }
1140 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1141 ## XML5: "Tag attribute name before state".
1142
1143 if ($is_space->{$self->{nc}}) {
1144
1145 ## Stay in the state
1146
1147 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1148 $self->{line_prev} = $self->{line};
1149 $self->{column_prev} = $self->{column};
1150 $self->{column}++;
1151 $self->{nc}
1152 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1153 } else {
1154 $self->{set_nc}->($self);
1155 }
1156
1157 redo A;
1158 } elsif ($self->{nc} == 0x003E) { # >
1159 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1160
1161 $self->{last_stag_name} = $self->{ct}->{tag_name};
1162 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1163 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1164 if ($self->{ct}->{attributes}) {
1165
1166 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1167 } else {
1168
1169 }
1170 } else {
1171 die "$0: $self->{ct}->{type}: Unknown token type";
1172 }
1173 $self->{state} = DATA_STATE;
1174 $self->{s_kwd} = '';
1175
1176 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1177 $self->{line_prev} = $self->{line};
1178 $self->{column_prev} = $self->{column};
1179 $self->{column}++;
1180 $self->{nc}
1181 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1182 } else {
1183 $self->{set_nc}->($self);
1184 }
1185
1186
1187 return ($self->{ct}); # start tag or end tag
1188
1189 redo A;
1190 } elsif (0x0041 <= $self->{nc} and
1191 $self->{nc} <= 0x005A) { # A..Z
1192
1193 $self->{ca}
1194 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1195 value => '',
1196 line => $self->{line}, column => $self->{column}};
1197 $self->{state} = ATTRIBUTE_NAME_STATE;
1198
1199 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1200 $self->{line_prev} = $self->{line};
1201 $self->{column_prev} = $self->{column};
1202 $self->{column}++;
1203 $self->{nc}
1204 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1205 } else {
1206 $self->{set_nc}->($self);
1207 }
1208
1209 redo A;
1210 } elsif ($self->{nc} == 0x002F) { # /
1211
1212 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1213
1214 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1215 $self->{line_prev} = $self->{line};
1216 $self->{column_prev} = $self->{column};
1217 $self->{column}++;
1218 $self->{nc}
1219 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1220 } else {
1221 $self->{set_nc}->($self);
1222 }
1223
1224 redo A;
1225 } elsif ($self->{nc} == -1) {
1226 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1227 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1228
1229 $self->{last_stag_name} = $self->{ct}->{tag_name};
1230 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1231 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1232 if ($self->{ct}->{attributes}) {
1233
1234 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1235 } else {
1236
1237 }
1238 } else {
1239 die "$0: $self->{ct}->{type}: Unknown token type";
1240 }
1241 $self->{state} = DATA_STATE;
1242 $self->{s_kwd} = '';
1243 # reconsume
1244
1245 return ($self->{ct}); # start tag or end tag
1246
1247 redo A;
1248 } else {
1249 if ({
1250 0x0022 => 1, # "
1251 0x0027 => 1, # '
1252 0x003C => 1, # <
1253 0x003D => 1, # =
1254 }->{$self->{nc}}) {
1255
1256 ## XML5: Not a parse error.
1257 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1258 } else {
1259
1260 ## XML5: ":" raises a parse error and is ignored.
1261 }
1262 $self->{ca}
1263 = {name => chr ($self->{nc}),
1264 value => '',
1265 line => $self->{line}, column => $self->{column}};
1266 $self->{state} = ATTRIBUTE_NAME_STATE;
1267
1268 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1269 $self->{line_prev} = $self->{line};
1270 $self->{column_prev} = $self->{column};
1271 $self->{column}++;
1272 $self->{nc}
1273 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1274 } else {
1275 $self->{set_nc}->($self);
1276 }
1277
1278 redo A;
1279 }
1280 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1281 ## XML5: "Tag attribute name state".
1282
## $before_leave: commit the attribute currently being built
## ($self->{ca}) to the current tag token ($self->{ct}).  It is invoked
## by every branch of this state that finishes the attribute name
## (white space, "=", ">", "/" and EOF), before the state is changed.
##
## If the token already has an attribute with the same name, a
## 'duplicate attribute' parse error is reported at the position
## recorded in $self->{ca} and the new attribute is not stored.
## Otherwise the attribute is stored under its name and is given the
## next value of the token's |last_index| counter as its |index|.
my $before_leave = sub {
  my $attr_name = $self->{ca}->{name};

  unless (exists $self->{ct}->{attributes}->{$attr_name}) {
    ## First attribute with this name on the start tag or end tag.
    $self->{ct}->{attributes}->{$attr_name} = $self->{ca};
    return $self->{ca}->{index} = ++$self->{ct}->{last_index};
  }

  ## Same name seen before: report it and discard $self->{ca} (MUST).
  return $self->{parse_error}->(
    level => $self->{level}->{must},
    type => 'duplicate attribute',
    text => $attr_name,
    line => $self->{ca}->{line},
    column => $self->{ca}->{column},
  );
}; # $before_leave
1296
1297 if ($is_space->{$self->{nc}}) {
1298
1299 $before_leave->();
1300 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1301
1302 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1303 $self->{line_prev} = $self->{line};
1304 $self->{column_prev} = $self->{column};
1305 $self->{column}++;
1306 $self->{nc}
1307 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1308 } else {
1309 $self->{set_nc}->($self);
1310 }
1311
1312 redo A;
1313 } elsif ($self->{nc} == 0x003D) { # =
1314
1315 $before_leave->();
1316 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1317
1318 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1319 $self->{line_prev} = $self->{line};
1320 $self->{column_prev} = $self->{column};
1321 $self->{column}++;
1322 $self->{nc}
1323 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1324 } else {
1325 $self->{set_nc}->($self);
1326 }
1327
1328 redo A;
1329 } elsif ($self->{nc} == 0x003E) { # >
1330 if ($self->{is_xml}) {
1331
1332 ## XML5: Not a parse error.
1333 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1334 } else {
1335
1336 }
1337
1338 $before_leave->();
1339 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1340
1341 $self->{last_stag_name} = $self->{ct}->{tag_name};
1342 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1343
1344 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1345 if ($self->{ct}->{attributes}) {
1346 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1347 }
1348 } else {
1349 die "$0: $self->{ct}->{type}: Unknown token type";
1350 }
1351 $self->{state} = DATA_STATE;
1352 $self->{s_kwd} = '';
1353
1354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1355 $self->{line_prev} = $self->{line};
1356 $self->{column_prev} = $self->{column};
1357 $self->{column}++;
1358 $self->{nc}
1359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1360 } else {
1361 $self->{set_nc}->($self);
1362 }
1363
1364
1365 return ($self->{ct}); # start tag or end tag
1366
1367 redo A;
1368 } elsif (0x0041 <= $self->{nc} and
1369 $self->{nc} <= 0x005A) { # A..Z
1370
1371 $self->{ca}->{name}
1372 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1373 ## Stay in the state
1374
1375 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1376 $self->{line_prev} = $self->{line};
1377 $self->{column_prev} = $self->{column};
1378 $self->{column}++;
1379 $self->{nc}
1380 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1381 } else {
1382 $self->{set_nc}->($self);
1383 }
1384
1385 redo A;
1386 } elsif ($self->{nc} == 0x002F) { # /
1387 if ($self->{is_xml}) {
1388
1389 ## XML5: Not a parse error.
1390 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1391 } else {
1392
1393 }
1394
1395 $before_leave->();
1396 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1397
1398 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1399 $self->{line_prev} = $self->{line};
1400 $self->{column_prev} = $self->{column};
1401 $self->{column}++;
1402 $self->{nc}
1403 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1404 } else {
1405 $self->{set_nc}->($self);
1406 }
1407
1408 redo A;
1409 } elsif ($self->{nc} == -1) {
1410 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1411 $before_leave->();
1412 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1413
1414 $self->{last_stag_name} = $self->{ct}->{tag_name};
1415 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1416 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1417 if ($self->{ct}->{attributes}) {
1418
1419 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1420 } else {
1421 ## NOTE: This state should never be reached.
1422
1423 }
1424 } else {
1425 die "$0: $self->{ct}->{type}: Unknown token type";
1426 }
1427 $self->{state} = DATA_STATE;
1428 $self->{s_kwd} = '';
1429 # reconsume
1430
1431 return ($self->{ct}); # start tag or end tag
1432
1433 redo A;
1434 } else {
1435 if ({
1436 0x0022 => 1, # "
1437 0x0027 => 1, # '
1438 0x003C => 1, # <
1439 }->{$self->{nc}}) {
1440
1441 ## XML5: Not a parse error.
1442 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1443 } else {
1444
1445 }
1446 $self->{ca}->{name} .= chr ($self->{nc});
1447 ## Stay in the state
1448
1449 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1450 $self->{line_prev} = $self->{line};
1451 $self->{column_prev} = $self->{column};
1452 $self->{column}++;
1453 $self->{nc}
1454 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1455 } else {
1456 $self->{set_nc}->($self);
1457 }
1458
1459 redo A;
1460 }
1461 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1462 ## XML5: "Tag attribute name after state".
1463
1464 if ($is_space->{$self->{nc}}) {
1465
1466 ## Stay in the state
1467
1468 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1469 $self->{line_prev} = $self->{line};
1470 $self->{column_prev} = $self->{column};
1471 $self->{column}++;
1472 $self->{nc}
1473 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1474 } else {
1475 $self->{set_nc}->($self);
1476 }
1477
1478 redo A;
1479 } elsif ($self->{nc} == 0x003D) { # =
1480
1481 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1482
1483 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1484 $self->{line_prev} = $self->{line};
1485 $self->{column_prev} = $self->{column};
1486 $self->{column}++;
1487 $self->{nc}
1488 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1489 } else {
1490 $self->{set_nc}->($self);
1491 }
1492
1493 redo A;
1494 } elsif ($self->{nc} == 0x003E) { # >
1495 if ($self->{is_xml}) {
1496
1497 ## XML5: Not a parse error.
1498 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1499 } else {
1500
1501 }
1502
1503 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1504
1505 $self->{last_stag_name} = $self->{ct}->{tag_name};
1506 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1507 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1508 if ($self->{ct}->{attributes}) {
1509
1510 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1511 } else {
1512 ## NOTE: This state should never be reached.
1513
1514 }
1515 } else {
1516 die "$0: $self->{ct}->{type}: Unknown token type";
1517 }
1518 $self->{state} = DATA_STATE;
1519 $self->{s_kwd} = '';
1520
1521 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1522 $self->{line_prev} = $self->{line};
1523 $self->{column_prev} = $self->{column};
1524 $self->{column}++;
1525 $self->{nc}
1526 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1527 } else {
1528 $self->{set_nc}->($self);
1529 }
1530
1531
1532 return ($self->{ct}); # start tag or end tag
1533
1534 redo A;
1535 } elsif (0x0041 <= $self->{nc} and
1536 $self->{nc} <= 0x005A) { # A..Z
1537
1538 $self->{ca}
1539 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1540 value => '',
1541 line => $self->{line}, column => $self->{column}};
1542 $self->{state} = ATTRIBUTE_NAME_STATE;
1543
1544 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1545 $self->{line_prev} = $self->{line};
1546 $self->{column_prev} = $self->{column};
1547 $self->{column}++;
1548 $self->{nc}
1549 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1550 } else {
1551 $self->{set_nc}->($self);
1552 }
1553
1554 redo A;
1555 } elsif ($self->{nc} == 0x002F) { # /
1556 if ($self->{is_xml}) {
1557
1558 ## XML5: Not a parse error.
1559 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1560 } else {
1561
1562 }
1563
1564 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1565
1566 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1567 $self->{line_prev} = $self->{line};
1568 $self->{column_prev} = $self->{column};
1569 $self->{column}++;
1570 $self->{nc}
1571 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1572 } else {
1573 $self->{set_nc}->($self);
1574 }
1575
1576 redo A;
1577 } elsif ($self->{nc} == -1) {
1578 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1579 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1580
1581 $self->{last_stag_name} = $self->{ct}->{tag_name};
1582 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1583 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1584 if ($self->{ct}->{attributes}) {
1585
1586 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1587 } else {
1588 ## NOTE: This state should never be reached.
1589
1590 }
1591 } else {
1592 die "$0: $self->{ct}->{type}: Unknown token type";
1593 }
1594 $self->{s_kwd} = '';
1595 $self->{state} = DATA_STATE;
1596 # reconsume
1597
1598 return ($self->{ct}); # start tag or end tag
1599
1600 redo A;
1601 } else {
1602 if ($self->{is_xml}) {
1603
1604 ## XML5: Not a parse error.
1605 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1606 } else {
1607
1608 }
1609
1610 if ({
1611 0x0022 => 1, # "
1612 0x0027 => 1, # '
1613 0x003C => 1, # <
1614 }->{$self->{nc}}) {
1615
1616 ## XML5: Not a parse error.
1617 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1618 } else {
1619
1620 }
1621 $self->{ca}
1622 = {name => chr ($self->{nc}),
1623 value => '',
1624 line => $self->{line}, column => $self->{column}};
1625 $self->{state} = ATTRIBUTE_NAME_STATE;
1626
1627 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1628 $self->{line_prev} = $self->{line};
1629 $self->{column_prev} = $self->{column};
1630 $self->{column}++;
1631 $self->{nc}
1632 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1633 } else {
1634 $self->{set_nc}->($self);
1635 }
1636
1637 redo A;
1638 }
1639 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1640 ## XML5: "Tag attribute value before state".
1641
1642 if ($is_space->{$self->{nc}}) {
1643
1644 ## Stay in the state
1645
1646 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1647 $self->{line_prev} = $self->{line};
1648 $self->{column_prev} = $self->{column};
1649 $self->{column}++;
1650 $self->{nc}
1651 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1652 } else {
1653 $self->{set_nc}->($self);
1654 }
1655
1656 redo A;
1657 } elsif ($self->{nc} == 0x0022) { # "
1658
1659 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1660
1661 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1662 $self->{line_prev} = $self->{line};
1663 $self->{column_prev} = $self->{column};
1664 $self->{column}++;
1665 $self->{nc}
1666 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1667 } else {
1668 $self->{set_nc}->($self);
1669 }
1670
1671 redo A;
1672 } elsif ($self->{nc} == 0x0026) { # &
1673
1674 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1675 ## reconsume
1676 redo A;
1677 } elsif ($self->{nc} == 0x0027) { # '
1678
1679 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1680
1681 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1682 $self->{line_prev} = $self->{line};
1683 $self->{column_prev} = $self->{column};
1684 $self->{column}++;
1685 $self->{nc}
1686 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1687 } else {
1688 $self->{set_nc}->($self);
1689 }
1690
1691 redo A;
1692 } elsif ($self->{nc} == 0x003E) { # >
1693 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1694 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1695
1696 $self->{last_stag_name} = $self->{ct}->{tag_name};
1697 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1698 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1699 if ($self->{ct}->{attributes}) {
1700
1701 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1702 } else {
1703 ## NOTE: This state should never be reached.
1704
1705 }
1706 } else {
1707 die "$0: $self->{ct}->{type}: Unknown token type";
1708 }
1709 $self->{state} = DATA_STATE;
1710 $self->{s_kwd} = '';
1711
1712 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1713 $self->{line_prev} = $self->{line};
1714 $self->{column_prev} = $self->{column};
1715 $self->{column}++;
1716 $self->{nc}
1717 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1718 } else {
1719 $self->{set_nc}->($self);
1720 }
1721
1722
1723 return ($self->{ct}); # start tag or end tag
1724
1725 redo A;
1726 } elsif ($self->{nc} == -1) {
1727 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1728 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1729
1730 $self->{last_stag_name} = $self->{ct}->{tag_name};
1731 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1732 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1733 if ($self->{ct}->{attributes}) {
1734
1735 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1736 } else {
1737 ## NOTE: This state should never be reached.
1738
1739 }
1740 } else {
1741 die "$0: $self->{ct}->{type}: Unknown token type";
1742 }
1743 $self->{state} = DATA_STATE;
1744 $self->{s_kwd} = '';
1745 ## reconsume
1746
1747 return ($self->{ct}); # start tag or end tag
1748
1749 redo A;
1750 } else {
1751 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1752
1753 ## XML5: Not a parse error.
1754 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1755 } elsif ($self->{is_xml}) {
1756
1757 ## XML5: No parse error.
1758 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1759 } else {
1760
1761 }
1762 $self->{ca}->{value} .= chr ($self->{nc});
1763 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1764
1765 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1766 $self->{line_prev} = $self->{line};
1767 $self->{column_prev} = $self->{column};
1768 $self->{column}++;
1769 $self->{nc}
1770 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1771 } else {
1772 $self->{set_nc}->($self);
1773 }
1774
1775 redo A;
1776 }
1777 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1778 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1779 ## ATTLIST attribute value double quoted state".
1780
1781 if ($self->{nc} == 0x0022) { # "
1782 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1783
1784 ## XML5: "DOCTYPE ATTLIST name after state".
1785 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1786 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1787 } else {
1788
1789 ## XML5: "Tag attribute name before state".
1790 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1791 }
1792
1793 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1794 $self->{line_prev} = $self->{line};
1795 $self->{column_prev} = $self->{column};
1796 $self->{column}++;
1797 $self->{nc}
1798 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1799 } else {
1800 $self->{set_nc}->($self);
1801 }
1802
1803 redo A;
1804 } elsif ($self->{nc} == 0x0026) { # &
1805
1806 ## XML5: Not defined yet.
1807
1808 ## NOTE: In the spec, the tokenizer is switched to the
1809 ## "entity in attribute value state". In this implementation, the
1810 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1811 ## implementation of the "consume a character reference" algorithm.
1812 $self->{prev_state} = $self->{state};
1813 $self->{entity_add} = 0x0022; # "
1814 $self->{state} = ENTITY_STATE;
1815
1816 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1817 $self->{line_prev} = $self->{line};
1818 $self->{column_prev} = $self->{column};
1819 $self->{column}++;
1820 $self->{nc}
1821 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1822 } else {
1823 $self->{set_nc}->($self);
1824 }
1825
1826 redo A;
1827 } elsif ($self->{is_xml} and
1828 $is_space->{$self->{nc}}) {
1829
1830 $self->{ca}->{value} .= ' ';
1831 ## Stay in the state.
1832
1833 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1834 $self->{line_prev} = $self->{line};
1835 $self->{column_prev} = $self->{column};
1836 $self->{column}++;
1837 $self->{nc}
1838 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1839 } else {
1840 $self->{set_nc}->($self);
1841 }
1842
1843 redo A;
1844 } elsif ($self->{nc} == -1) {
1845 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1846 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1847
1848 $self->{last_stag_name} = $self->{ct}->{tag_name};
1849
1850 $self->{state} = DATA_STATE;
1851 $self->{s_kwd} = '';
1852 ## reconsume
1853 return ($self->{ct}); # start tag
1854 redo A;
1855 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1856 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1857 if ($self->{ct}->{attributes}) {
1858
1859 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1860 } else {
1861 ## NOTE: This state should never be reached.
1862
1863 }
1864
1865 $self->{state} = DATA_STATE;
1866 $self->{s_kwd} = '';
1867 ## reconsume
1868 return ($self->{ct}); # end tag
1869 redo A;
1870 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1871 ## XML5: No parse error above; not defined yet.
1872 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1873 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1874 ## Reconsume.
1875 return ($self->{ct}); # ATTLIST
1876 redo A;
1877 } else {
1878 die "$0: $self->{ct}->{type}: Unknown token type";
1879 }
1880 } else {
1881 ## XML5 [ATTLIST]: Not defined yet.
1882 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1883
1884 ## XML5: Not a parse error.
1885 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1886 } else {
1887
1888 }
1889 $self->{ca}->{value} .= chr ($self->{nc});
1890 $self->{read_until}->($self->{ca}->{value},
1891 qq["&<\x09\x0C\x20],
1892 length $self->{ca}->{value});
1893
1894 ## Stay in the state
1895
1896 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1897 $self->{line_prev} = $self->{line};
1898 $self->{column_prev} = $self->{column};
1899 $self->{column}++;
1900 $self->{nc}
1901 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1902 } else {
1903 $self->{set_nc}->($self);
1904 }
1905
1906 redo A;
1907 }
1908 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1909 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1910 ## ATTLIST attribute value single quoted state".
1911
1912 if ($self->{nc} == 0x0027) { # '
1913 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1914
1915 ## XML5: "DOCTYPE ATTLIST name after state".
1916 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1917 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1918 } else {
1919
1920 ## XML5: "Before attribute name state" (sic).
1921 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1922 }
1923
1924 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1925 $self->{line_prev} = $self->{line};
1926 $self->{column_prev} = $self->{column};
1927 $self->{column}++;
1928 $self->{nc}
1929 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1930 } else {
1931 $self->{set_nc}->($self);
1932 }
1933
1934 redo A;
1935 } elsif ($self->{nc} == 0x0026) { # &
1936
1937 ## XML5: Not defined yet.
1938
1939 ## NOTE: In the spec, the tokenizer is switched to the
1940 ## "entity in attribute value state". In this implementation, the
1941 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1942 ## implementation of the "consume a character reference" algorithm.
1943 $self->{entity_add} = 0x0027; # '
1944 $self->{prev_state} = $self->{state};
1945 $self->{state} = ENTITY_STATE;
1946
1947 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1948 $self->{line_prev} = $self->{line};
1949 $self->{column_prev} = $self->{column};
1950 $self->{column}++;
1951 $self->{nc}
1952 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1953 } else {
1954 $self->{set_nc}->($self);
1955 }
1956
1957 redo A;
1958 } elsif ($self->{is_xml} and
1959 $is_space->{$self->{nc}}) {
1960
1961 $self->{ca}->{value} .= ' ';
1962 ## Stay in the state.
1963
1964 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1965 $self->{line_prev} = $self->{line};
1966 $self->{column_prev} = $self->{column};
1967 $self->{column}++;
1968 $self->{nc}
1969 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1970 } else {
1971 $self->{set_nc}->($self);
1972 }
1973
1974 redo A;
1975 } elsif ($self->{nc} == -1) {
1976 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1977 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1978
1979 $self->{last_stag_name} = $self->{ct}->{tag_name};
1980
1981 $self->{state} = DATA_STATE;
1982 $self->{s_kwd} = '';
1983 ## reconsume
1984 return ($self->{ct}); # start tag
1985 redo A;
1986 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1987 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1988 if ($self->{ct}->{attributes}) {
1989
1990 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1991 } else {
1992 ## NOTE: This state should never be reached.
1993
1994 }
1995
1996 $self->{state} = DATA_STATE;
1997 $self->{s_kwd} = '';
1998 ## reconsume
1999 return ($self->{ct}); # end tag
2000 redo A;
2001 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2002 ## XML5: No parse error above; not defined yet.
2003 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2004 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2005 ## Reconsume.
2006 return ($self->{ct}); # ATTLIST
2007 redo A;
2008 } else {
2009 die "$0: $self->{ct}->{type}: Unknown token type";
2010 }
2011 } else {
2012 ## XML5 [ATTLIST]: Not defined yet.
2013 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2014
2015 ## XML5: Not a parse error.
2016 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2017 } else {
2018
2019 }
2020 $self->{ca}->{value} .= chr ($self->{nc});
2021 $self->{read_until}->($self->{ca}->{value},
2022 qq['&<\x09\x0C\x20],
2023 length $self->{ca}->{value});
2024
2025 ## Stay in the state
2026
2027 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2028 $self->{line_prev} = $self->{line};
2029 $self->{column_prev} = $self->{column};
2030 $self->{column}++;
2031 $self->{nc}
2032 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2033 } else {
2034 $self->{set_nc}->($self);
2035 }
2036
2037 redo A;
2038 }
2039 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2040 ## XML5: "Tag attribute value unquoted state".
2041
2042 if ($is_space->{$self->{nc}}) {
2043 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2044
2045 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2046 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2047 } else {
2048
2049 ## XML5: "Tag attribute name before state".
2050 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2051 }
2052
2053 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2054 $self->{line_prev} = $self->{line};
2055 $self->{column_prev} = $self->{column};
2056 $self->{column}++;
2057 $self->{nc}
2058 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2059 } else {
2060 $self->{set_nc}->($self);
2061 }
2062
2063 redo A;
2064 } elsif ($self->{nc} == 0x0026) { # &
2065
2066
2067 ## XML5: Not defined yet.
2068
2069 ## NOTE: In the spec, the tokenizer is switched to the
2070 ## "entity in attribute value state". In this implementation, the
2071 ## tokenizer is switched to the |ENTITY_STATE|, which is an
2072 ## implementation of the "consume a character reference" algorithm.
2073 $self->{entity_add} = -1;
2074 $self->{prev_state} = $self->{state};
2075 $self->{state} = ENTITY_STATE;
2076
2077 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2078 $self->{line_prev} = $self->{line};
2079 $self->{column_prev} = $self->{column};
2080 $self->{column}++;
2081 $self->{nc}
2082 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2083 } else {
2084 $self->{set_nc}->($self);
2085 }
2086
2087 redo A;
2088 } elsif ($self->{nc} == 0x003E) { # >
2089 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2090
2091 $self->{last_stag_name} = $self->{ct}->{tag_name};
2092
2093 $self->{state} = DATA_STATE;
2094 $self->{s_kwd} = '';
2095
2096 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2097 $self->{line_prev} = $self->{line};
2098 $self->{column_prev} = $self->{column};
2099 $self->{column}++;
2100 $self->{nc}
2101 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2102 } else {
2103 $self->{set_nc}->($self);
2104 }
2105
2106 return ($self->{ct}); # start tag
2107 redo A;
2108 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2109 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2110 if ($self->{ct}->{attributes}) {
2111
2112 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2113 } else {
2114 ## NOTE: This state should never be reached.
2115
2116 }
2117
2118 $self->{state} = DATA_STATE;
2119 $self->{s_kwd} = '';
2120
2121 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2122 $self->{line_prev} = $self->{line};
2123 $self->{column_prev} = $self->{column};
2124 $self->{column}++;
2125 $self->{nc}
2126 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2127 } else {
2128 $self->{set_nc}->($self);
2129 }
2130
2131 return ($self->{ct}); # end tag
2132 redo A;
2133 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2134 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2135 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2136
2137 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2138 $self->{line_prev} = $self->{line};
2139 $self->{column_prev} = $self->{column};
2140 $self->{column}++;
2141 $self->{nc}
2142 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2143 } else {
2144 $self->{set_nc}->($self);
2145 }
2146
2147 return ($self->{ct}); # ATTLIST
2148 redo A;
2149 } else {
2150 die "$0: $self->{ct}->{type}: Unknown token type";
2151 }
2152 } elsif ($self->{nc} == -1) {
2153 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2154
2155 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2156 $self->{last_stag_name} = $self->{ct}->{tag_name};
2157
2158 $self->{state} = DATA_STATE;
2159 $self->{s_kwd} = '';
2160 ## reconsume
2161 return ($self->{ct}); # start tag
2162 redo A;
2163 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2164 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2165 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2166 if ($self->{ct}->{attributes}) {
2167
2168 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2169 } else {
2170 ## NOTE: This state should never be reached.
2171
2172 }
2173
2174 $self->{state} = DATA_STATE;
2175 $self->{s_kwd} = '';
2176 ## reconsume
2177 return ($self->{ct}); # end tag
2178 redo A;
2179 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2180 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2181 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2182 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2183 ## Reconsume.
2184 return ($self->{ct}); # ATTLIST
2185 redo A;
2186 } else {
2187 die "$0: $self->{ct}->{type}: Unknown token type";
2188 }
2189 } else {
2190 if ({
2191 0x0022 => 1, # "
2192 0x0027 => 1, # '
2193 0x003D => 1, # =
2194 0x003C => 1, # <
2195 }->{$self->{nc}}) {
2196
2197 ## XML5: Not a parse error.
2198 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2199 } else {
2200
2201 }
2202 $self->{ca}->{value} .= chr ($self->{nc});
2203 $self->{read_until}->($self->{ca}->{value},
2204 qq["'=& \x09\x0C>],
2205 length $self->{ca}->{value});
2206
2207 ## Stay in the state
2208
2209 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2210 $self->{line_prev} = $self->{line};
2211 $self->{column_prev} = $self->{column};
2212 $self->{column}++;
2213 $self->{nc}
2214 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2215 } else {
2216 $self->{set_nc}->($self);
2217 }
2218
2219 redo A;
2220 }
2221 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2222 if ($is_space->{$self->{nc}}) {
2223
2224 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2225
2226 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2227 $self->{line_prev} = $self->{line};
2228 $self->{column_prev} = $self->{column};
2229 $self->{column}++;
2230 $self->{nc}
2231 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2232 } else {
2233 $self->{set_nc}->($self);
2234 }
2235
2236 redo A;
2237 } elsif ($self->{nc} == 0x003E) { # >
2238 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2239
2240 $self->{last_stag_name} = $self->{ct}->{tag_name};
2241 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2242 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2243 if ($self->{ct}->{attributes}) {
2244
2245 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2246 } else {
2247 ## NOTE: This state should never be reached.
2248
2249 }
2250 } else {
2251 die "$0: $self->{ct}->{type}: Unknown token type";
2252 }
2253 $self->{state} = DATA_STATE;
2254 $self->{s_kwd} = '';
2255
2256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2257 $self->{line_prev} = $self->{line};
2258 $self->{column_prev} = $self->{column};
2259 $self->{column}++;
2260 $self->{nc}
2261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2262 } else {
2263 $self->{set_nc}->($self);
2264 }
2265
2266
2267 return ($self->{ct}); # start tag or end tag
2268
2269 redo A;
2270 } elsif ($self->{nc} == 0x002F) { # /
2271
2272 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2273
2274 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2275 $self->{line_prev} = $self->{line};
2276 $self->{column_prev} = $self->{column};
2277 $self->{column}++;
2278 $self->{nc}
2279 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2280 } else {
2281 $self->{set_nc}->($self);
2282 }
2283
2284 redo A;
2285 } elsif ($self->{nc} == -1) {
2286 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2287 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2288
2289 $self->{last_stag_name} = $self->{ct}->{tag_name};
2290 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2291 if ($self->{ct}->{attributes}) {
2292
2293 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2294 } else {
2295 ## NOTE: This state should never be reached.
2296
2297 }
2298 } else {
2299 die "$0: $self->{ct}->{type}: Unknown token type";
2300 }
2301 $self->{state} = DATA_STATE;
2302 $self->{s_kwd} = '';
2303 ## Reconsume.
2304 return ($self->{ct}); # start tag or end tag
2305 redo A;
2306 } else {
2307
2308 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2309 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2310 ## reconsume
2311 redo A;
2312 }
2313 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2314 ## XML5: "Empty tag state".
2315
2316 if ($self->{nc} == 0x003E) { # >
2317 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2318
2319 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2320 ## TODO: Different type than slash in start tag
2321 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2322 if ($self->{ct}->{attributes}) {
2323
2324 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2325 } else {
2326
2327 }
2328 ## TODO: Test |<title></title/>|
2329 } else {
2330
2331 $self->{self_closing} = 1;
2332 }
2333
2334 $self->{state} = DATA_STATE;
2335 $self->{s_kwd} = '';
2336
2337 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2338 $self->{line_prev} = $self->{line};
2339 $self->{column_prev} = $self->{column};
2340 $self->{column}++;
2341 $self->{nc}
2342 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2343 } else {
2344 $self->{set_nc}->($self);
2345 }
2346
2347
2348 return ($self->{ct}); # start tag or end tag
2349
2350 redo A;
2351 } elsif ($self->{nc} == -1) {
2352 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2353 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2354
2355 $self->{last_stag_name} = $self->{ct}->{tag_name};
2356 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2357 if ($self->{ct}->{attributes}) {
2358
2359 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2360 } else {
2361 ## NOTE: This state should never be reached.
2362
2363 }
2364 } else {
2365 die "$0: $self->{ct}->{type}: Unknown token type";
2366 }
2367 ## XML5: "Tag attribute name before state".
2368 $self->{state} = DATA_STATE;
2369 $self->{s_kwd} = '';
2370 ## Reconsume.
2371 return ($self->{ct}); # start tag or end tag
2372 redo A;
2373 } else {
2374
2375 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2376 ## TODO: This error type is wrong.
2377 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2378 ## Reconsume.
2379 redo A;
2380 }
2381 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2382 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2383
2384 ## NOTE: Unlike spec's "bogus comment state", this implementation
2385 ## consumes characters on a one-by-one basis.
2386
2387 if ($self->{nc} == 0x003E) { # >
2388 if ($self->{in_subset}) {
2389
2390 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2391 } else {
2392
2393 $self->{state} = DATA_STATE;
2394 $self->{s_kwd} = '';
2395 }
2396
2397 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2398 $self->{line_prev} = $self->{line};
2399 $self->{column_prev} = $self->{column};
2400 $self->{column}++;
2401 $self->{nc}
2402 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2403 } else {
2404 $self->{set_nc}->($self);
2405 }
2406
2407
2408 return ($self->{ct}); # comment
2409 redo A;
2410 } elsif ($self->{nc} == -1) {
2411 if ($self->{in_subset}) {
2412
2413 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2414 } else {
2415
2416 $self->{state} = DATA_STATE;
2417 $self->{s_kwd} = '';
2418 }
2419 ## reconsume
2420
2421 return ($self->{ct}); # comment
2422 redo A;
2423 } else {
2424
2425 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2426 $self->{read_until}->($self->{ct}->{data},
2427 q[>],
2428 length $self->{ct}->{data});
2429
2430 ## Stay in the state.
2431
2432 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2433 $self->{line_prev} = $self->{line};
2434 $self->{column_prev} = $self->{column};
2435 $self->{column}++;
2436 $self->{nc}
2437 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2438 } else {
2439 $self->{set_nc}->($self);
2440 }
2441
2442 redo A;
2443 }
2444 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2445 ## XML5: "Markup declaration state".
2446
2447 if ($self->{nc} == 0x002D) { # -
2448
2449 $self->{state} = MD_HYPHEN_STATE;
2450
2451 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2452 $self->{line_prev} = $self->{line};
2453 $self->{column_prev} = $self->{column};
2454 $self->{column}++;
2455 $self->{nc}
2456 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2457 } else {
2458 $self->{set_nc}->($self);
2459 }
2460
2461 redo A;
2462 } elsif ($self->{nc} == 0x0044 or # D
2463 $self->{nc} == 0x0064) { # d
2464 ## ASCII case-insensitive.
2465
2466 $self->{state} = MD_DOCTYPE_STATE;
2467 $self->{kwd} = chr $self->{nc};
2468
2469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2470 $self->{line_prev} = $self->{line};
2471 $self->{column_prev} = $self->{column};
2472 $self->{column}++;
2473 $self->{nc}
2474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2475 } else {
2476 $self->{set_nc}->($self);
2477 }
2478
2479 redo A;
2480 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2481 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2482 $self->{is_xml}) and
2483 $self->{nc} == 0x005B) { # [
2484
2485 $self->{state} = MD_CDATA_STATE;
2486 $self->{kwd} = '[';
2487
2488 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2489 $self->{line_prev} = $self->{line};
2490 $self->{column_prev} = $self->{column};
2491 $self->{column}++;
2492 $self->{nc}
2493 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2494 } else {
2495 $self->{set_nc}->($self);
2496 }
2497
2498 redo A;
2499 } else {
2500
2501 }
2502
2503 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2504 line => $self->{line_prev},
2505 column => $self->{column_prev} - 1);
2506 ## Reconsume.
2507 $self->{state} = BOGUS_COMMENT_STATE;
2508 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2509 line => $self->{line_prev},
2510 column => $self->{column_prev} - 1,
2511 };
2512 redo A;
2513 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2514 if ($self->{nc} == 0x002D) { # -
2515
2516 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2517 line => $self->{line_prev},
2518 column => $self->{column_prev} - 2,
2519 };
2520 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2521
2522 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2523 $self->{line_prev} = $self->{line};
2524 $self->{column_prev} = $self->{column};
2525 $self->{column}++;
2526 $self->{nc}
2527 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2528 } else {
2529 $self->{set_nc}->($self);
2530 }
2531
2532 redo A;
2533 } else {
2534
2535 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2536 line => $self->{line_prev},
2537 column => $self->{column_prev} - 2);
2538 $self->{state} = BOGUS_COMMENT_STATE;
2539 ## Reconsume.
2540 $self->{ct} = {type => COMMENT_TOKEN,
2541 data => '-',
2542 line => $self->{line_prev},
2543 column => $self->{column_prev} - 2,
2544 };
2545 redo A;
2546 }
2547 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2548 ## ASCII case-insensitive.
2549 if ($self->{nc} == [
2550 undef,
2551 0x004F, # O
2552 0x0043, # C
2553 0x0054, # T
2554 0x0059, # Y
2555 0x0050, # P
2556 ]->[length $self->{kwd}] or
2557 $self->{nc} == [
2558 undef,
2559 0x006F, # o
2560 0x0063, # c
2561 0x0074, # t
2562 0x0079, # y
2563 0x0070, # p
2564 ]->[length $self->{kwd}]) {
2565
2566 ## Stay in the state.
2567 $self->{kwd} .= chr $self->{nc};
2568
2569 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2570 $self->{line_prev} = $self->{line};
2571 $self->{column_prev} = $self->{column};
2572 $self->{column}++;
2573 $self->{nc}
2574 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2575 } else {
2576 $self->{set_nc}->($self);
2577 }
2578
2579 redo A;
2580 } elsif ((length $self->{kwd}) == 6 and
2581 ($self->{nc} == 0x0045 or # E
2582 $self->{nc} == 0x0065)) { # e
2583 if ($self->{is_xml} and
2584 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2585
2586 ## XML5: case-sensitive.
2587 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2588 text => 'DOCTYPE',
2589 line => $self->{line_prev},
2590 column => $self->{column_prev} - 5);
2591 } else {
2592
2593 }
2594 $self->{state} = DOCTYPE_STATE;
2595 $self->{ct} = {type => DOCTYPE_TOKEN,
2596 quirks => 1,
2597 line => $self->{line_prev},
2598 column => $self->{column_prev} - 7,
2599 };
2600
2601 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2602 $self->{line_prev} = $self->{line};
2603 $self->{column_prev} = $self->{column};
2604 $self->{column}++;
2605 $self->{nc}
2606 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2607 } else {
2608 $self->{set_nc}->($self);
2609 }
2610
2611 redo A;
2612 } else {
2613
2614 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2615 line => $self->{line_prev},
2616 column => $self->{column_prev} - 1 - length $self->{kwd});
2617 $self->{state} = BOGUS_COMMENT_STATE;
2618 ## Reconsume.
2619 $self->{ct} = {type => COMMENT_TOKEN,
2620 data => $self->{kwd},
2621 line => $self->{line_prev},
2622 column => $self->{column_prev} - 1 - length $self->{kwd},
2623 };
2624 redo A;
2625 }
2626 } elsif ($self->{state} == MD_CDATA_STATE) {
2627 if ($self->{nc} == {
2628 '[' => 0x0043, # C
2629 '[C' => 0x0044, # D
2630 '[CD' => 0x0041, # A
2631 '[CDA' => 0x0054, # T
2632 '[CDAT' => 0x0041, # A
2633 }->{$self->{kwd}}) {
2634
2635 ## Stay in the state.
2636 $self->{kwd} .= chr $self->{nc};
2637
2638 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2639 $self->{line_prev} = $self->{line};
2640 $self->{column_prev} = $self->{column};
2641 $self->{column}++;
2642 $self->{nc}
2643 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2644 } else {
2645 $self->{set_nc}->($self);
2646 }
2647
2648 redo A;
2649 } elsif ($self->{kwd} eq '[CDATA' and
2650 $self->{nc} == 0x005B) { # [
2651 if ($self->{is_xml} and
2652 not $self->{tainted} and
2653 @{$self->{open_elements} or []} == 0) {
2654
2655 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2656 line => $self->{line_prev},
2657 column => $self->{column_prev} - 7);
2658 $self->{tainted} = 1;
2659 } else {
2660
2661 }
2662
2663 $self->{ct} = {type => CHARACTER_TOKEN,
2664 data => '',
2665 line => $self->{line_prev},
2666 column => $self->{column_prev} - 7};
2667 $self->{state} = CDATA_SECTION_STATE;
2668
2669 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2670 $self->{line_prev} = $self->{line};
2671 $self->{column_prev} = $self->{column};
2672 $self->{column}++;
2673 $self->{nc}
2674 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2675 } else {
2676 $self->{set_nc}->($self);
2677 }
2678
2679 redo A;
2680 } else {
2681
2682 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2683 line => $self->{line_prev},
2684 column => $self->{column_prev} - 1 - length $self->{kwd});
2685 $self->{state} = BOGUS_COMMENT_STATE;
2686 ## Reconsume.
2687 $self->{ct} = {type => COMMENT_TOKEN,
2688 data => $self->{kwd},
2689 line => $self->{line_prev},
2690 column => $self->{column_prev} - 1 - length $self->{kwd},
2691 };
2692 redo A;
2693 }
2694 } elsif ($self->{state} == COMMENT_START_STATE) {
2695 if ($self->{nc} == 0x002D) { # -
2696
2697 $self->{state} = COMMENT_START_DASH_STATE;
2698
2699 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2700 $self->{line_prev} = $self->{line};
2701 $self->{column_prev} = $self->{column};
2702 $self->{column}++;
2703 $self->{nc}
2704 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2705 } else {
2706 $self->{set_nc}->($self);
2707 }
2708
2709 redo A;
2710 } elsif ($self->{nc} == 0x003E) { # >
2711 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2712 if ($self->{in_subset}) {
2713
2714 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2715 } else {
2716
2717 $self->{state} = DATA_STATE;
2718 $self->{s_kwd} = '';
2719 }
2720
2721 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2722 $self->{line_prev} = $self->{line};
2723 $self->{column_prev} = $self->{column};
2724 $self->{column}++;
2725 $self->{nc}
2726 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2727 } else {
2728 $self->{set_nc}->($self);
2729 }
2730
2731
2732 return ($self->{ct}); # comment
2733
2734 redo A;
2735 } elsif ($self->{nc} == -1) {
2736 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2737 if ($self->{in_subset}) {
2738
2739 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2740 } else {
2741
2742 $self->{state} = DATA_STATE;
2743 $self->{s_kwd} = '';
2744 }
2745 ## reconsume
2746
2747 return ($self->{ct}); # comment
2748
2749 redo A;
2750 } else {
2751
2752 $self->{ct}->{data} # comment
2753 .= chr ($self->{nc});
2754 $self->{state} = COMMENT_STATE;
2755
2756 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2757 $self->{line_prev} = $self->{line};
2758 $self->{column_prev} = $self->{column};
2759 $self->{column}++;
2760 $self->{nc}
2761 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2762 } else {
2763 $self->{set_nc}->($self);
2764 }
2765
2766 redo A;
2767 }
2768 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2769 if ($self->{nc} == 0x002D) { # -
2770
2771 $self->{state} = COMMENT_END_STATE;
2772
2773 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2774 $self->{line_prev} = $self->{line};
2775 $self->{column_prev} = $self->{column};
2776 $self->{column}++;
2777 $self->{nc}
2778 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2779 } else {
2780 $self->{set_nc}->($self);
2781 }
2782
2783 redo A;
2784 } elsif ($self->{nc} == 0x003E) { # >
2785 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2786 if ($self->{in_subset}) {
2787
2788 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2789 } else {
2790
2791 $self->{state} = DATA_STATE;
2792 $self->{s_kwd} = '';
2793 }
2794
2795 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2796 $self->{line_prev} = $self->{line};
2797 $self->{column_prev} = $self->{column};
2798 $self->{column}++;
2799 $self->{nc}
2800 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2801 } else {
2802 $self->{set_nc}->($self);
2803 }
2804
2805
2806 return ($self->{ct}); # comment
2807
2808 redo A;
2809 } elsif ($self->{nc} == -1) {
2810 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2811 if ($self->{in_subset}) {
2812
2813 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2814 } else {
2815
2816 $self->{state} = DATA_STATE;
2817 $self->{s_kwd} = '';
2818 }
2819 ## reconsume
2820
2821 return ($self->{ct}); # comment
2822
2823 redo A;
2824 } else {
2825
2826 $self->{ct}->{data} # comment
2827 .= '-' . chr ($self->{nc});
2828 $self->{state} = COMMENT_STATE;
2829
2830 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2831 $self->{line_prev} = $self->{line};
2832 $self->{column_prev} = $self->{column};
2833 $self->{column}++;
2834 $self->{nc}
2835 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2836 } else {
2837 $self->{set_nc}->($self);
2838 }
2839
2840 redo A;
2841 }
2842 } elsif ($self->{state} == COMMENT_STATE) {
2843 ## XML5: "Comment state" and "DOCTYPE comment state".
2844
2845 if ($self->{nc} == 0x002D) { # -
2846
2847 $self->{state} = COMMENT_END_DASH_STATE;
2848
2849 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2850 $self->{line_prev} = $self->{line};
2851 $self->{column_prev} = $self->{column};
2852 $self->{column}++;
2853 $self->{nc}
2854 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2855 } else {
2856 $self->{set_nc}->($self);
2857 }
2858
2859 redo A;
2860 } elsif ($self->{nc} == -1) {
2861 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2862 if ($self->{in_subset}) {
2863
2864 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2865 } else {
2866
2867 $self->{state} = DATA_STATE;
2868 $self->{s_kwd} = '';
2869 }
2870 ## reconsume
2871
2872 return ($self->{ct}); # comment
2873
2874 redo A;
2875 } else {
2876
2877 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2878 $self->{read_until}->($self->{ct}->{data},
2879 q[-],
2880 length $self->{ct}->{data});
2881
2882 ## Stay in the state
2883
2884 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2885 $self->{line_prev} = $self->{line};
2886 $self->{column_prev} = $self->{column};
2887 $self->{column}++;
2888 $self->{nc}
2889 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2890 } else {
2891 $self->{set_nc}->($self);
2892 }
2893
2894 redo A;
2895 }
2896 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2897 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2898
2899 if ($self->{nc} == 0x002D) { # -
2900
2901 $self->{state} = COMMENT_END_STATE;
2902
2903 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2904 $self->{line_prev} = $self->{line};
2905 $self->{column_prev} = $self->{column};
2906 $self->{column}++;
2907 $self->{nc}
2908 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2909 } else {
2910 $self->{set_nc}->($self);
2911 }
2912
2913 redo A;
2914 } elsif ($self->{nc} == -1) {
2915 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2916 if ($self->{in_subset}) {
2917
2918 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2919 } else {
2920
2921 $self->{state} = DATA_STATE;
2922 $self->{s_kwd} = '';
2923 }
2924 ## reconsume
2925
2926 return ($self->{ct}); # comment
2927
2928 redo A;
2929 } else {
2930
2931 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2932 $self->{state} = COMMENT_STATE;
2933
2934 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2935 $self->{line_prev} = $self->{line};
2936 $self->{column_prev} = $self->{column};
2937 $self->{column}++;
2938 $self->{nc}
2939 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2940 } else {
2941 $self->{set_nc}->($self);
2942 }
2943
2944 redo A;
2945 }
2946 } elsif ($self->{state} == COMMENT_END_STATE or
2947 $self->{state} == COMMENT_END_BANG_STATE) {
2948 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2949 ## (No comment end bang state.)
2950
2951 if ($self->{nc} == 0x003E) { # >
2952 if ($self->{in_subset}) {
2953
2954 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955 } else {
2956
2957 $self->{state} = DATA_STATE;
2958 $self->{s_kwd} = '';
2959 }
2960
2961 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2962 $self->{line_prev} = $self->{line};
2963 $self->{column_prev} = $self->{column};
2964 $self->{column}++;
2965 $self->{nc}
2966 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2967 } else {
2968 $self->{set_nc}->($self);
2969 }
2970
2971
2972 return ($self->{ct}); # comment
2973
2974 redo A;
2975 } elsif ($self->{nc} == 0x002D) { # -
2976 if ($self->{state} == COMMENT_END_BANG_STATE) {
2977
2978 $self->{ct}->{data} .= '--!'; # comment
2979 $self->{state} = COMMENT_END_DASH_STATE;
2980 } else {
2981
2982 ## XML5: Not a parse error.
2983 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2984 line => $self->{line_prev},
2985 column => $self->{column_prev});
2986 $self->{ct}->{data} .= '-'; # comment
2987 ## Stay in the state
2988 }
2989
2990 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2991 $self->{line_prev} = $self->{line};
2992 $self->{column_prev} = $self->{column};
2993 $self->{column}++;
2994 $self->{nc}
2995 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2996 } else {
2997 $self->{set_nc}->($self);
2998 }
2999
3000 redo A;
3001 } elsif ($self->{nc} == 0x0021 and # !
3002 $self->{state} != COMMENT_END_BANG_STATE) {
3003 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3004 $self->{state} = COMMENT_END_BANG_STATE;
3005
3006 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3007 $self->{line_prev} = $self->{line};
3008 $self->{column_prev} = $self->{column};
3009 $self->{column}++;
3010 $self->{nc}
3011 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3012 } else {
3013 $self->{set_nc}->($self);
3014 }
3015
3016 redo A;
3017 } elsif ($self->{nc} == -1) {
3018 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3019 if ($self->{in_subset}) {
3020
3021 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3022 } else {
3023
3024 $self->{state} = DATA_STATE;
3025 $self->{s_kwd} = '';
3026 }
3027 ## Reconsume.
3028
3029 return ($self->{ct}); # comment
3030
3031 redo A;
3032 } else {
3033
3034 if ($self->{state} == COMMENT_END_BANG_STATE) {
3035 $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3036 } else {
3037 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3038 }
3039 $self->{state} = COMMENT_STATE;
3040
3041 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3042 $self->{line_prev} = $self->{line};
3043 $self->{column_prev} = $self->{column};
3044 $self->{column}++;
3045 $self->{nc}
3046 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3047 } else {
3048 $self->{set_nc}->($self);
3049 }
3050
3051 redo A;
3052 }
3053 } elsif ($self->{state} == DOCTYPE_STATE) {
3054 if ($is_space->{$self->{nc}}) {
3055
3056 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3057
3058 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3059 $self->{line_prev} = $self->{line};
3060 $self->{column_prev} = $self->{column};
3061 $self->{column}++;
3062 $self->{nc}
3063 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3064 } else {
3065 $self->{set_nc}->($self);
3066 }
3067
3068 redo A;
3069 } elsif ($self->{nc} == -1) {
3070
3071 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3072 $self->{ct}->{quirks} = 1;
3073
3074 $self->{state} = DATA_STATE;
3075 ## Reconsume.
3076 return ($self->{ct}); # DOCTYPE (quirks)
3077
3078 redo A;
3079 } else {
3080
3081 ## XML5: Switch to the bogus comment state.
3082 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3083 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3084 ## reconsume
3085 redo A;
3086 }
3087 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3088 ## XML5: "DOCTYPE root name before state".
3089
3090 if ($is_space->{$self->{nc}}) {
3091
3092 ## Stay in the state
3093
3094 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3095 $self->{line_prev} = $self->{line};
3096 $self->{column_prev} = $self->{column};
3097 $self->{column}++;
3098 $self->{nc}
3099 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3100 } else {
3101 $self->{set_nc}->($self);
3102 }
3103
3104 redo A;
3105 } elsif ($self->{nc} == 0x003E) { # >
3106
3107 ## XML5: No parse error.
3108 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3109 $self->{state} = DATA_STATE;
3110 $self->{s_kwd} = '';
3111
3112 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3113 $self->{line_prev} = $self->{line};
3114 $self->{column_prev} = $self->{column};
3115 $self->{column}++;
3116 $self->{nc}
3117 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3118 } else {
3119 $self->{set_nc}->($self);
3120 }
3121
3122
3123 return ($self->{ct}); # DOCTYPE (quirks)
3124
3125 redo A;
3126 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3127
3128 $self->{ct}->{name} # DOCTYPE
3129 = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3130 delete $self->{ct}->{quirks};
3131 $self->{state} = DOCTYPE_NAME_STATE;
3132
3133 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3134 $self->{line_prev} = $self->{line};
3135 $self->{column_prev} = $self->{column};
3136 $self->{column}++;
3137 $self->{nc}
3138 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3139 } else {
3140 $self->{set_nc}->($self);
3141 }
3142
3143 redo A;
3144 } elsif ($self->{nc} == -1) {
3145
3146 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3147 $self->{state} = DATA_STATE;
3148 $self->{s_kwd} = '';
3149 ## reconsume
3150
3151 return ($self->{ct}); # DOCTYPE (quirks)
3152
3153 redo A;
3154 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3155
3156 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3157 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3158 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3159 $self->{in_subset} = 1;
3160
3161 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3162 $self->{line_prev} = $self->{line};
3163 $self->{column_prev} = $self->{column};
3164 $self->{column}++;
3165 $self->{nc}
3166 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3167 } else {
3168 $self->{set_nc}->($self);
3169 }
3170
3171 return ($self->{ct}); # DOCTYPE
3172 redo A;
3173 } else {
3174
3175 $self->{ct}->{name} = chr $self->{nc};
3176 delete $self->{ct}->{quirks};
3177 $self->{state} = DOCTYPE_NAME_STATE;
3178
3179 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3180 $self->{line_prev} = $self->{line};
3181 $self->{column_prev} = $self->{column};
3182 $self->{column}++;
3183 $self->{nc}
3184 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3185 } else {
3186 $self->{set_nc}->($self);
3187 }
3188
3189 redo A;
3190 }
3191 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3192 ## XML5: "DOCTYPE root name state".
3193
3194 ## ISSUE: Redundant "First," in the spec.
3195
3196 if ($is_space->{$self->{nc}}) {
3197
3198 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3199
3200 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3201 $self->{line_prev} = $self->{line};
3202 $self->{column_prev} = $self->{column};
3203 $self->{column}++;
3204 $self->{nc}
3205 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3206 } else {
3207 $self->{set_nc}->($self);
3208 }
3209
3210 redo A;
3211 } elsif ($self->{nc} == 0x003E) { # >
3212
3213 $self->{state} = DATA_STATE;
3214 $self->{s_kwd} = '';
3215
3216 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3217 $self->{line_prev} = $self->{line};
3218 $self->{column_prev} = $self->{column};
3219 $self->{column}++;
3220 $self->{nc}
3221 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3222 } else {
3223 $self->{set_nc}->($self);
3224 }
3225
3226
3227 return ($self->{ct}); # DOCTYPE
3228
3229 redo A;
3230 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3231
3232 $self->{ct}->{name} # DOCTYPE
3233 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3234 delete $self->{ct}->{quirks};
3235 ## Stay in the state.
3236
3237 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238 $self->{line_prev} = $self->{line};
3239 $self->{column_prev} = $self->{column};
3240 $self->{column}++;
3241 $self->{nc}
3242 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3243 } else {
3244 $self->{set_nc}->($self);
3245 }
3246
3247 redo A;
3248 } elsif ($self->{nc} == -1) {
3249
3250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3251 $self->{state} = DATA_STATE;
3252 $self->{s_kwd} = '';
3253 ## reconsume
3254
3255 $self->{ct}->{quirks} = 1;
3256 return ($self->{ct}); # DOCTYPE
3257
3258 redo A;
3259 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3260
3261 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3262 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3263 $self->{in_subset} = 1;
3264
3265 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3266 $self->{line_prev} = $self->{line};
3267 $self->{column_prev} = $self->{column};
3268 $self->{column}++;
3269 $self->{nc}
3270 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3271 } else {
3272 $self->{set_nc}->($self);
3273 }
3274
3275 return ($self->{ct}); # DOCTYPE
3276 redo A;
3277 } else {
3278
3279 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3280 ## Stay in the state.
3281
3282 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283 $self->{line_prev} = $self->{line};
3284 $self->{column_prev} = $self->{column};
3285 $self->{column}++;
3286 $self->{nc}
3287 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288 } else {
3289 $self->{set_nc}->($self);
3290 }
3291
3292 redo A;
3293 }
3294 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3295 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3296 ## state", but implemented differently.
3297
3298 if ($is_space->{$self->{nc}}) {
3299
3300 ## Stay in the state
3301
3302 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3303 $self->{line_prev} = $self->{line};
3304 $self->{column_prev} = $self->{column};
3305 $self->{column}++;
3306 $self->{nc}
3307 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3308 } else {
3309 $self->{set_nc}->($self);
3310 }
3311
3312 redo A;
3313 } elsif ($self->{nc} == 0x003E) { # >
3314 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3315
3316 $self->{state} = DATA_STATE;
3317 $self->{s_kwd} = '';
3318 } else {
3319
3320 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3321 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3322 }
3323
3324
3325 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3326 $self->{line_prev} = $self->{line};
3327 $self->{column_prev} = $self->{column};
3328 $self->{column}++;
3329 $self->{nc}
3330 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3331 } else {
3332 $self->{set_nc}->($self);
3333 }
3334
3335 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3336 redo A;
3337 } elsif ($self->{nc} == -1) {
3338 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3339
3340 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3341 $self->{state} = DATA_STATE;
3342 $self->{s_kwd} = '';
3343 $self->{ct}->{quirks} = 1;
3344 } else {
3345
3346 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3347 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3348 }
3349
3350 ## Reconsume.
3351 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3352 redo A;
3353 } elsif ($self->{nc} == 0x0050 or # P
3354 $self->{nc} == 0x0070) { # p
3355
3356 $self->{state} = PUBLIC_STATE;
3357 $self->{kwd} = chr $self->{nc};
3358
3359 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3360 $self->{line_prev} = $self->{line};
3361 $self->{column_prev} = $self->{column};
3362 $self->{column}++;
3363 $self->{nc}
3364 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3365 } else {
3366 $self->{set_nc}->($self);
3367 }
3368
3369 redo A;
3370 } elsif ($self->{nc} == 0x0053 or # S
3371 $self->{nc} == 0x0073) { # s
3372
3373 $self->{state} = SYSTEM_STATE;
3374 $self->{kwd} = chr $self->{nc};
3375
3376 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3377 $self->{line_prev} = $self->{line};
3378 $self->{column_prev} = $self->{column};
3379 $self->{column}++;
3380 $self->{nc}
3381 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3382 } else {
3383 $self->{set_nc}->($self);
3384 }
3385
3386 redo A;
3387 } elsif ($self->{nc} == 0x0022 and # "
3388 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3389 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3390
3391 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3392 $self->{ct}->{value} = ''; # ENTITY
3393
3394 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3395 $self->{line_prev} = $self->{line};
3396 $self->{column_prev} = $self->{column};
3397 $self->{column}++;
3398 $self->{nc}
3399 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3400 } else {
3401 $self->{set_nc}->($self);
3402 }
3403
3404 redo A;
3405 } elsif ($self->{nc} == 0x0027 and # '
3406 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3407 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3408
3409 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3410 $self->{ct}->{value} = ''; # ENTITY
3411
3412 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3413 $self->{line_prev} = $self->{line};
3414 $self->{column_prev} = $self->{column};
3415 $self->{column}++;
3416 $self->{nc}
3417 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3418 } else {
3419 $self->{set_nc}->($self);
3420 }
3421
3422 redo A;
3423 } elsif ($self->{is_xml} and
3424 $self->{ct}->{type} == DOCTYPE_TOKEN and
3425 $self->{nc} == 0x005B) { # [
3426
3427 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3428 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3429 $self->{in_subset} = 1;
3430
3431 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3432 $self->{line_prev} = $self->{line};
3433 $self->{column_prev} = $self->{column};
3434 $self->{column}++;
3435 $self->{nc}
3436 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3437 } else {
3438 $self->{set_nc}->($self);
3439 }
3440
3441 return ($self->{ct}); # DOCTYPE
3442 redo A;
3443 } else {
3444 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3445
3446 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3447
3448 $self->{ct}->{quirks} = 1;
3449 $self->{state} = BOGUS_DOCTYPE_STATE;
3450 } else {
3451
3452 $self->{state} = BOGUS_MD_STATE;
3453 }
3454
3455
3456 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3457 $self->{line_prev} = $self->{line};
3458 $self->{column_prev} = $self->{column};
3459 $self->{column}++;
3460 $self->{nc}
3461 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3462 } else {
3463 $self->{set_nc}->($self);
3464 }
3465
3466 redo A;
3467 }
3468 } elsif ($self->{state} == PUBLIC_STATE) {
3469 ## ASCII case-insensitive
3470 if ($self->{nc} == [
3471 undef,
3472 0x0055, # U
3473 0x0042, # B
3474 0x004C, # L
3475 0x0049, # I
3476 ]->[length $self->{kwd}] or
3477 $self->{nc} == [
3478 undef,
3479 0x0075, # u
3480 0x0062, # b
3481 0x006C, # l
3482 0x0069, # i
3483 ]->[length $self->{kwd}]) {
3484
3485 ## Stay in the state.
3486 $self->{kwd} .= chr $self->{nc};
3487
3488 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3489 $self->{line_prev} = $self->{line};
3490 $self->{column_prev} = $self->{column};
3491 $self->{column}++;
3492 $self->{nc}
3493 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3494 } else {
3495 $self->{set_nc}->($self);
3496 }
3497
3498 redo A;
3499 } elsif ((length $self->{kwd}) == 5 and
3500 ($self->{nc} == 0x0043 or # C
3501 $self->{nc} == 0x0063)) { # c
3502 if ($self->{is_xml} and
3503 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3504
3505 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3506 text => 'PUBLIC',
3507 line => $self->{line_prev},
3508 column => $self->{column_prev} - 4);
3509 } else {
3510
3511 }
3512 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3513
3514 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3515 $self->{line_prev} = $self->{line};
3516 $self->{column_prev} = $self->{column};
3517 $self->{column}++;
3518 $self->{nc}
3519 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3520 } else {
3521 $self->{set_nc}->($self);
3522 }
3523
3524 redo A;
3525 } else {
3526 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3527 line => $self->{line_prev},
3528 column => $self->{column_prev} + 1 - length $self->{kwd});
3529 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3530
3531 $self->{ct}->{quirks} = 1;
3532 $self->{state} = BOGUS_DOCTYPE_STATE;
3533 } else {
3534
3535 $self->{state} = BOGUS_MD_STATE;
3536 }
3537 ## Reconsume.
3538 redo A;
3539 }
3540 } elsif ($self->{state} == SYSTEM_STATE) {
3541 ## ASCII case-insensitive
3542 if ($self->{nc} == [
3543 undef,
3544 0x0059, # Y
3545 0x0053, # S
3546 0x0054, # T
3547 0x0045, # E
3548 ]->[length $self->{kwd}] or
3549 $self->{nc} == [
3550 undef,
3551 0x0079, # y
3552 0x0073, # s
3553 0x0074, # t
3554 0x0065, # e
3555 ]->[length $self->{kwd}]) {
3556
3557 ## Stay in the state.
3558 $self->{kwd} .= chr $self->{nc};
3559
3560 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3561 $self->{line_prev} = $self->{line};
3562 $self->{column_prev} = $self->{column};
3563 $self->{column}++;
3564 $self->{nc}
3565 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3566 } else {
3567 $self->{set_nc}->($self);
3568 }
3569
3570 redo A;
3571 } elsif ((length $self->{kwd}) == 5 and
3572 ($self->{nc} == 0x004D or # M
3573 $self->{nc} == 0x006D)) { # m
3574 if ($self->{is_xml} and
3575 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3576
3577 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3578 text => 'SYSTEM',
3579 line => $self->{line_prev},
3580 column => $self->{column_prev} - 4);
3581 } else {
3582
3583 }
3584 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3585
3586 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3587 $self->{line_prev} = $self->{line};
3588 $self->{column_prev} = $self->{column};
3589 $self->{column}++;
3590 $self->{nc}
3591 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3592 } else {
3593 $self->{set_nc}->($self);
3594 }
3595
3596 redo A;
3597 } else {
3598 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3599 line => $self->{line_prev},
3600 column => $self->{column_prev} + 1 - length $self->{kwd});
3601 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3602
3603 $self->{ct}->{quirks} = 1;
3604 $self->{state} = BOGUS_DOCTYPE_STATE;
3605 } else {
3606
3607 $self->{state} = BOGUS_MD_STATE;
3608 }
3609 ## Reconsume.
3610 redo A;
3611 }
3612 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3613 if ($is_space->{$self->{nc}}) {
3614
3615 ## Stay in the state
3616
3617 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3618 $self->{line_prev} = $self->{line};
3619 $self->{column_prev} = $self->{column};
3620 $self->{column}++;
3621 $self->{nc}
3622 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3623 } else {
3624 $self->{set_nc}->($self);
3625 }
3626
3627 redo A;
3628 } elsif ($self->{nc} eq 0x0022) { # "
3629
3630 $self->{ct}->{pubid} = ''; # DOCTYPE/ENTITY/NOTATION
3631 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3632
3633 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3634 $self->{line_prev} = $self->{line};
3635 $self->{column_prev} = $self->{column};
3636 $self->{column}++;
3637 $self->{nc}
3638 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3639 } else {
3640 $self->{set_nc}->($self);
3641 }
3642
3643 redo A;
3644 } elsif ($self->{nc} eq 0x0027) { # '
3645
3646 $self->{ct}->{pubid} = ''; # DOCTYPE/ENTITY/NOTATION
3647 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3648
3649 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3650 $self->{line_prev} = $self->{line};
3651 $self->{column_prev} = $self->{column};
3652 $self->{column}++;
3653 $self->{nc}
3654 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3655 } else {
3656 $self->{set_nc}->($self);
3657 }
3658
3659 redo A;
3660 } elsif ($self->{nc} eq 0x003E) { # >
3661 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3662
3663 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3664
3665 $self->{state} = DATA_STATE;
3666 $self->{s_kwd} = '';
3667 $self->{ct}->{quirks} = 1;
3668 } else {
3669
3670 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3671 }
3672
3673
3674 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3675 $self->{line_prev} = $self->{line};
3676 $self->{column_prev} = $self->{column};
3677 $self->{column}++;
3678 $self->{nc}
3679 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3680 } else {
3681 $self->{set_nc}->($self);
3682 }
3683
3684 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3685 redo A;
3686 } elsif ($self->{nc} == -1) {
3687 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3688
3689 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3690 $self->{state} = DATA_STATE;
3691 $self->{s_kwd} = '';
3692 $self->{ct}->{quirks} = 1;
3693 } else {
3694
3695 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3696 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3697 }
3698
3699 ## reconsume
3700 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3701 redo A;
3702 } elsif ($self->{is_xml} and
3703 $self->{ct}->{type} == DOCTYPE_TOKEN and
3704 $self->{nc} == 0x005B) { # [
3705
3706 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3707 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3708 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3709 $self->{in_subset} = 1;
3710
3711 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3712 $self->{line_prev} = $self->{line};
3713 $self->{column_prev} = $self->{column};
3714 $self->{column}++;
3715 $self->{nc}
3716 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3717 } else {
3718 $self->{set_nc}->($self);
3719 }
3720
3721 return ($self->{ct}); # DOCTYPE
3722 redo A;
3723 } else {
3724 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3725
3726 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3727
3728 $self->{ct}->{quirks} = 1;
3729 $self->{state} = BOGUS_DOCTYPE_STATE;
3730 } else {
3731
3732 $self->{state} = BOGUS_MD_STATE;
3733 }
3734
3735
3736 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3737 $self->{line_prev} = $self->{line};
3738 $self->{column_prev} = $self->{column};
3739 $self->{column}++;
3740 $self->{nc}
3741 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3742 } else {
3743 $self->{set_nc}->($self);
3744 }
3745
3746 redo A;
3747 }
3748 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3749 if ($self->{nc} == 0x0022) { # "
3750
3751 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3752
3753 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3754 $self->{line_prev} = $self->{line};
3755 $self->{column_prev} = $self->{column};
3756 $self->{column}++;
3757 $self->{nc}
3758 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3759 } else {
3760 $self->{set_nc}->($self);
3761 }
3762
3763 redo A;
3764 } elsif ($self->{nc} == 0x003E) { # >
3765 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3766
3767 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3768
3769 $self->{state} = DATA_STATE;
3770 $self->{s_kwd} = '';
3771 $self->{ct}->{quirks} = 1;
3772 } else {
3773
3774 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3775 }
3776
3777
3778 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3779 $self->{line_prev} = $self->{line};
3780 $self->{column_prev} = $self->{column};
3781 $self->{column}++;
3782 $self->{nc}
3783 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3784 } else {
3785 $self->{set_nc}->($self);
3786 }
3787
3788 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3789 redo A;
3790 } elsif ($self->{nc} == -1) {
3791 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3792
3793 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3794
3795 $self->{state} = DATA_STATE;
3796 $self->{s_kwd} = '';
3797 $self->{ct}->{quirks} = 1;
3798 } else {
3799
3800 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3801 }
3802
3803 ## Reconsume.
3804 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3805 redo A;
3806 } else {
3807
3808 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3809 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3810 length $self->{ct}->{pubid});
3811
3812 ## Stay in the state
3813
3814 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3815 $self->{line_prev} = $self->{line};
3816 $self->{column_prev} = $self->{column};
3817 $self->{column}++;
3818 $self->{nc}
3819 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3820 } else {
3821 $self->{set_nc}->($self);
3822 }
3823
3824 redo A;
3825 }
3826 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3827 if ($self->{nc} == 0x0027) { # '
3828
3829 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3830
3831 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3832 $self->{line_prev} = $self->{line};
3833 $self->{column_prev} = $self->{column};
3834 $self->{column}++;
3835 $self->{nc}
3836 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3837 } else {
3838 $self->{set_nc}->($self);
3839 }
3840
3841 redo A;
3842 } elsif ($self->{nc} == 0x003E) { # >
3843 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3844
3845 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3846
3847 $self->{state} = DATA_STATE;
3848 $self->{s_kwd} = '';
3849 $self->{ct}->{quirks} = 1;
3850 } else {
3851
3852 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3853 }
3854
3855
3856 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3857 $self->{line_prev} = $self->{line};
3858 $self->{column_prev} = $self->{column};
3859 $self->{column}++;
3860 $self->{nc}
3861 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3862 } else {
3863 $self->{set_nc}->($self);
3864 }
3865
3866 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3867 redo A;
3868 } elsif ($self->{nc} == -1) {
3869 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3870
3871 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3872
3873 $self->{state} = DATA_STATE;
3874 $self->{s_kwd} = '';
3875 $self->{ct}->{quirks} = 1;
3876 } else {
3877
3878 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3879 }
3880
3881 ## reconsume
3882 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3883 redo A;
3884 } else {
3885
3886 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3887 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3888 length $self->{ct}->{pubid});
3889
3890 ## Stay in the state
3891
3892 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3893 $self->{line_prev} = $self->{line};
3894 $self->{column_prev} = $self->{column};
3895 $self->{column}++;
3896 $self->{nc}
3897 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3898 } else {
3899 $self->{set_nc}->($self);
3900 }
3901
3902 redo A;
3903 }
3904 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3905 if ($is_space->{$self->{nc}}) {
3906
3907 ## Stay in the state
3908
3909 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3910 $self->{line_prev} = $self->{line};
3911 $self->{column_prev} = $self->{column};
3912 $self->{column}++;
3913 $self->{nc}
3914 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3915 } else {
3916 $self->{set_nc}->($self);
3917 }
3918
3919 redo A;
3920 } elsif ($self->{nc} == 0x0022) { # "
3921
3922 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3923 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3924
3925 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3926 $self->{line_prev} = $self->{line};
3927 $self->{column_prev} = $self->{column};
3928 $self->{column}++;
3929 $self->{nc}
3930 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3931 } else {
3932 $self->{set_nc}->($self);
3933 }
3934
3935 redo A;
3936 } elsif ($self->{nc} == 0x0027) { # '
3937
3938 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3939 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3940
3941 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3942 $self->{line_prev} = $self->{line};
3943 $self->{column_prev} = $self->{column};
3944 $self->{column}++;
3945 $self->{nc}
3946 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3947 } else {
3948 $self->{set_nc}->($self);
3949 }
3950
3951 redo A;
3952 } elsif ($self->{nc} == 0x003E) { # >
3953 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3954 if ($self->{is_xml}) {
3955
3956 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3957 } else {
3958
3959 }
3960 $self->{state} = DATA_STATE;
3961 $self->{s_kwd} = '';
3962 } else {
3963 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3964
3965 } else {
3966
3967 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3968 }
3969 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3970 }
3971
3972
3973 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3974 $self->{line_prev} = $self->{line};
3975 $self->{column_prev} = $self->{column};
3976 $self->{column}++;
3977 $self->{nc}
3978 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3979 } else {
3980 $self->{set_nc}->($self);
3981 }
3982
3983 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3984 redo A;
3985 } elsif ($self->{nc} == -1) {
3986 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3987
3988 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3989
3990 $self->{state} = DATA_STATE;
3991 $self->{s_kwd} = '';
3992 $self->{ct}->{quirks} = 1;
3993 } else {
3994 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3995 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3996 }
3997
3998 ## reconsume
3999 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4000 redo A;
4001 } elsif ($self->{is_xml} and
4002 $self->{ct}->{type} == DOCTYPE_TOKEN and
4003 $self->{nc} == 0x005B) { # [
4004
4005 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4006 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4007 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4008 $self->{in_subset} = 1;
4009
4010 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4011 $self->{line_prev} = $self->{line};
4012 $self->{column_prev} = $self->{column};
4013 $self->{column}++;
4014 $self->{nc}
4015 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4016 } else {
4017 $self->{set_nc}->($self);
4018 }
4019
4020 return ($self->{ct}); # DOCTYPE
4021 redo A;
4022 } else {
4023 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
4024
4025 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4026
4027 $self->{ct}->{quirks} = 1;
4028 $self->{state} = BOGUS_DOCTYPE_STATE;
4029 } else {
4030
4031 $self->{state} = BOGUS_MD_STATE;
4032 }
4033
4034
4035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036 $self->{line_prev} = $self->{line};
4037 $self->{column_prev} = $self->{column};
4038 $self->{column}++;
4039 $self->{nc}
4040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041 } else {
4042 $self->{set_nc}->($self);
4043 }
4044
4045 redo A;
4046 }
4047 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4048 if ($is_space->{$self->{nc}}) {
4049
4050 ## Stay in the state
4051
4052 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4053 $self->{line_prev} = $self->{line};
4054 $self->{column_prev} = $self->{column};
4055 $self->{column}++;
4056 $self->{nc}
4057 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4058 } else {
4059 $self->{set_nc}->($self);
4060 }
4061
4062 redo A;
4063 } elsif ($self->{nc} == 0x0022) { # "
4064
4065 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4066 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4067
4068 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4069 $self->{line_prev} = $self->{line};
4070 $self->{column_prev} = $self->{column};
4071 $self->{column}++;
4072 $self->{nc}
4073 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4074 } else {
4075 $self->{set_nc}->($self);
4076 }
4077
4078 redo A;
4079 } elsif ($self->{nc} == 0x0027) { # '
4080
4081 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4082 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4083
4084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4085 $self->{line_prev} = $self->{line};
4086 $self->{column_prev} = $self->{column};
4087 $self->{column}++;
4088 $self->{nc}
4089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4090 } else {
4091 $self->{set_nc}->($self);
4092 }
4093
4094 redo A;
4095 } elsif ($self->{nc} == 0x003E) { # >
4096 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4097
4098 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4099 $self->{line_prev} = $self->{line};
4100 $self->{column_prev} = $self->{column};
4101 $self->{column}++;
4102 $self->{nc}
4103 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4104 } else {
4105 $self->{set_nc}->($self);
4106 }
4107
4108
4109 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4110
4111 $self->{state} = DATA_STATE;
4112 $self->{s_kwd} = '';
4113 $self->{ct}->{quirks} = 1;
4114 } else {
4115
4116 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4117 }
4118
4119 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4120 redo A;
4121 } elsif ($self->{nc} == -1) {
4122 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4123
4124 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4125 $self->{state} = DATA_STATE;
4126 $self->{s_kwd} = '';
4127 $self->{ct}->{quirks} = 1;
4128 } else {
4129
4130 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4131 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4132 }
4133
4134 ## reconsume
4135 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4136 redo A;
4137 } elsif ($self->{is_xml} and
4138 $self->{ct}->{type} == DOCTYPE_TOKEN and
4139 $self->{nc} == 0x005B) { # [
4140
4141 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4142
4143 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4144 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4145 $self->{in_subset} = 1;
4146
4147 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4148 $self->{line_prev} = $self->{line};
4149 $self->{column_prev} = $self->{column};
4150 $self->{column}++;
4151 $self->{nc}
4152 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4153 } else {
4154 $self->{set_nc}->($self);
4155 }
4156
4157 return ($self->{ct}); # DOCTYPE
4158 redo A;
4159 } else {
4160 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4161
4162 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4163
4164 $self->{ct}->{quirks} = 1;
4165 $self->{state} = BOGUS_DOCTYPE_STATE;
4166 } else {
4167
4168 $self->{state} = BOGUS_MD_STATE;
4169 }
4170
4171
4172 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4173 $self->{line_prev} = $self->{line};
4174 $self->{column_prev} = $self->{column};
4175 $self->{column}++;
4176 $self->{nc}
4177 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4178 } else {
4179 $self->{set_nc}->($self);
4180 }
4181
4182 redo A;
4183 }
4184 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4185 if ($self->{nc} == 0x0022) { # "
4186
4187 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4188
4189 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4190 $self->{line_prev} = $self->{line};
4191 $self->{column_prev} = $self->{column};
4192 $self->{column}++;
4193 $self->{nc}
4194 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4195 } else {
4196 $self->{set_nc}->($self);
4197 }
4198
4199 redo A;
4200 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4201 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4202
4203 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4204
4205 $self->{state} = DATA_STATE;
4206 $self->{s_kwd} = '';
4207 $self->{ct}->{quirks} = 1;
4208 } else {
4209
4210 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4211 }
4212
4213
4214 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4215 $self->{line_prev} = $self->{line};
4216 $self->{column_prev} = $self->{column};
4217 $self->{column}++;
4218 $self->{nc}
4219 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4220 } else {
4221 $self->{set_nc}->($self);
4222 }
4223
4224 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4225 redo A;
4226 } elsif ($self->{nc} == -1) {
4227 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4228
4229 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4230
4231 $self->{state} = DATA_STATE;
4232 $self->{s_kwd} = '';
4233 $self->{ct}->{quirks} = 1;
4234 } else {
4235
4236 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4237 }
4238
4239 ## reconsume
4240 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4241 redo A;
4242 } else {
4243
4244 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4245 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4246 length $self->{ct}->{sysid});
4247
4248 ## Stay in the state
4249
4250 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4251 $self->{line_prev} = $self->{line};
4252 $self->{column_prev} = $self->{column};
4253 $self->{column}++;
4254 $self->{nc}
4255 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4256 } else {
4257 $self->{set_nc}->($self);
4258 }
4259
4260 redo A;
4261 }
4262 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4263 if ($self->{nc} == 0x0027) { # '
4264
4265 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4266
4267 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4268 $self->{line_prev} = $self->{line};
4269 $self->{column_prev} = $self->{column};
4270 $self->{column}++;
4271 $self->{nc}
4272 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4273 } else {
4274 $self->{set_nc}->($self);
4275 }
4276
4277 redo A;
4278 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4279
4280 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4281
4282 $self->{state} = DATA_STATE;
4283 $self->{s_kwd} = '';
4284
4285 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4286 $self->{line_prev} = $self->{line};
4287 $self->{column_prev} = $self->{column};
4288 $self->{column}++;
4289 $self->{nc}
4290 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4291 } else {
4292 $self->{set_nc}->($self);
4293 }
4294
4295
4296 $self->{ct}->{quirks} = 1;
4297 return ($self->{ct}); # DOCTYPE
4298
4299 redo A;
4300 } elsif ($self->{nc} == -1) {
4301 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4302
4303 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4304
4305 $self->{state} = DATA_STATE;
4306 $self->{s_kwd} = '';
4307 $self->{ct}->{quirks} = 1;
4308 } else {
4309
4310 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4311 }
4312
4313 ## reconsume
4314 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4315 redo A;
4316 } else {
4317
4318 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4319 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4320 length $self->{ct}->{sysid});
4321
4322 ## Stay in the state
4323
4324 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4325 $self->{line_prev} = $self->{line};
4326 $self->{column_prev} = $self->{column};
4327 $self->{column}++;
4328 $self->{nc}
4329 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4330 } else {
4331 $self->{set_nc}->($self);
4332 }
4333
4334 redo A;
4335 }
4336 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4337 if ($is_space->{$self->{nc}}) {
4338 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4339
4340 $self->{state} = BEFORE_NDATA_STATE;
4341 } else {
4342
4343 ## Stay in the state
4344 }
4345
4346 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4347 $self->{line_prev} = $self->{line};
4348 $self->{column_prev} = $self->{column};
4349 $self->{column}++;
4350 $self->{nc}
4351 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4352 } else {
4353 $self->{set_nc}->($self);
4354 }
4355
4356 redo A;
4357 } elsif ($self->{nc} == 0x003E) { # >
4358 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4359
4360 $self->{state} = DATA_STATE;
4361 $self->{s_kwd} = '';
4362 } else {
4363
4364 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4365 }
4366
4367
4368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4369 $self->{line_prev} = $self->{line};
4370 $self->{column_prev} = $self->{column};
4371 $self->{column}++;
4372 $self->{nc}
4373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4374 } else {
4375 $self->{set_nc}->($self);
4376 }
4377
4378 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4379 redo A;
4380 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4381 ($self->{nc} == 0x004E or # N
4382 $self->{nc} == 0x006E)) { # n
4383
4384 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4385 $self->{state} = NDATA_STATE;
4386 $self->{kwd} = chr $self->{nc};
4387
4388 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4389 $self->{line_prev} = $self->{line};
4390 $self->{column_prev} = $self->{column};
4391 $self->{column}++;
4392 $self->{nc}
4393 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4394 } else {
4395 $self->{set_nc}->($self);
4396 }
4397
4398 redo A;
4399 } elsif ($self->{nc} == -1) {
4400 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4401
4402 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4403 $self->{state} = DATA_STATE;
4404 $self->{s_kwd} = '';
4405 $self->{ct}->{quirks} = 1;
4406 } else {
4407
4408 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4409 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4410 }
4411
4412 ## reconsume
4413 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4414 redo A;
4415 } elsif ($self->{is_xml} and
4416 $self->{ct}->{type} == DOCTYPE_TOKEN and
4417 $self->{nc} == 0x005B) { # [
4418
4419 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4420 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4421 $self->{in_subset} = 1;
4422
4423 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4424 $self->{line_prev} = $self->{line};
4425 $self->{column_prev} = $self->{column};
4426 $self->{column}++;
4427 $self->{nc}
4428 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4429 } else {
4430 $self->{set_nc}->($self);
4431 }
4432
4433 return ($self->{ct}); # DOCTYPE
4434 redo A;
4435 } else {
4436 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4437
4438 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4439
4440 #$self->{ct}->{quirks} = 1;
4441 $self->{state} = BOGUS_DOCTYPE_STATE;
4442 } else {
4443
4444 $self->{state} = BOGUS_MD_STATE;
4445 }
4446
4447
4448 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4449 $self->{line_prev} = $self->{line};
4450 $self->{column_prev} = $self->{column};
4451 $self->{column}++;
4452 $self->{nc}
4453 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4454 } else {
4455 $self->{set_nc}->($self);
4456 }
4457
4458 redo A;
4459 }
4460 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4461 if ($is_space->{$self->{nc}}) {
4462
4463 ## Stay in the state.
4464
4465 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4466 $self->{line_prev} = $self->{line};
4467 $self->{column_prev} = $self->{column};
4468 $self->{column}++;
4469 $self->{nc}
4470 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4471 } else {
4472 $self->{set_nc}->($self);
4473 }
4474
4475 redo A;
4476 } elsif ($self->{nc} == 0x003E) { # >
4477
4478 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4479
4480 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4481 $self->{line_prev} = $self->{line};
4482 $self->{column_prev} = $self->{column};
4483 $self->{column}++;
4484 $self->{nc}
4485 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4486 } else {
4487 $self->{set_nc}->($self);
4488 }
4489
4490 return ($self->{ct}); # ENTITY
4491 redo A;
4492 } elsif ($self->{nc} == 0x004E or # N
4493 $self->{nc} == 0x006E) { # n
4494
4495 $self->{state} = NDATA_STATE;
4496 $self->{kwd} = chr $self->{nc};
4497
4498 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499 $self->{line_prev} = $self->{line};
4500 $self->{column_prev} = $self->{column};
4501 $self->{column}++;
4502 $self->{nc}
4503 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504 } else {
4505 $self->{set_nc}->($self);
4506 }
4507
4508 redo A;
4509 } elsif ($self->{nc} == -1) {
4510
4511 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4512 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4513 ## reconsume
4514 return ($self->{ct}); # ENTITY
4515 redo A;
4516 } else {
4517
4518 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4519 $self->{state} = BOGUS_MD_STATE;
4520
4521 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4522 $self->{line_prev} = $self->{line};
4523 $self->{column_prev} = $self->{column};
4524 $self->{column}++;
4525 $self->{nc}
4526 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4527 } else {
4528 $self->{set_nc}->($self);
4529 }
4530
4531 redo A;
4532 }
4533 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4534 if ($self->{nc} == 0x003E) { # >
4535
4536 $self->{state} = DATA_STATE;
4537 $self->{s_kwd} = '';
4538
4539 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4540 $self->{line_prev} = $self->{line};
4541 $self->{column_prev} = $self->{column};
4542 $self->{column}++;
4543 $self->{nc}
4544 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4545 } else {
4546 $self->{set_nc}->($self);
4547 }
4548
4549
4550 return ($self->{ct}); # DOCTYPE
4551
4552 redo A;
4553 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4554
4555 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4556 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4557 $self->{in_subset} = 1;
4558
4559 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4560 $self->{line_prev} = $self->{line};
4561 $self->{column_prev} = $self->{column};
4562 $self->{column}++;
4563 $self->{nc}
4564 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4565 } else {
4566 $self->{set_nc}->($self);
4567 }
4568
4569 return ($self->{ct}); # DOCTYPE
4570 redo A;
4571 } elsif ($self->{nc} == -1) {
4572
4573 $self->{state} = DATA_STATE;
4574 $self->{s_kwd} = '';
4575 ## reconsume
4576
4577 return ($self->{ct}); # DOCTYPE
4578
4579 redo A;
4580 } else {
4581
4582 my $s = '';
4583 $self->{read_until}->($s, q{>[}, 0);
4584
4585 ## Stay in the state
4586
4587 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4588 $self->{line_prev} = $self->{line};
4589 $self->{column_prev} = $self->{column};
4590 $self->{column}++;
4591 $self->{nc}
4592 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4593 } else {
4594 $self->{set_nc}->($self);
4595 }
4596
4597 redo A;
4598 }
4599 } elsif ($self->{state} == CDATA_SECTION_STATE) {
4600 ## NOTE: "CDATA section state" in the spec is jointly implemented
4601 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4602 ## and |CDATA_SECTION_MSE2_STATE|.
4603
4604 ## XML5: "CDATA state".
4605
4606 if ($self->{nc} == 0x005D) { # ]
4607
4608 $self->{state} = CDATA_SECTION_MSE1_STATE;
4609
4610 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4611 $self->{line_prev} = $self->{line};
4612 $self->{column_prev} = $self->{column};
4613 $self->{column}++;
4614 $self->{nc}
4615 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4616 } else {
4617 $self->{set_nc}->($self);
4618 }
4619
4620 redo A;
4621 } elsif ($self->{nc} == -1) {
4622 if ($self->{is_xml}) {
4623
4624 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4625 } else {
4626
4627 }
4628
4629 $self->{state} = DATA_STATE;
4630 $self->{s_kwd} = '';
4631 ## Reconsume.
4632 if (length $self->{ct}->{data}) { # character
4633
4634 return ($self->{ct}); # character
4635 } else {
4636
4637 ## No token to emit. $self->{ct} is discarded.
4638 }
4639 redo A;
4640 } else {
4641
4642 $self->{ct}->{data} .= chr $self->{nc};
4643 $self->{read_until}->($self->{ct}->{data},
4644 q<]>,
4645 length $self->{ct}->{data});
4646
4647 ## Stay in the state.
4648
4649 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4650 $self->{line_prev} = $self->{line};
4651 $self->{column_prev} = $self->{column};
4652 $self->{column}++;
4653 $self->{nc}
4654 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4655 } else {
4656 $self->{set_nc}->($self);
4657 }
4658
4659 redo A;
4660 }
4661
4662 ## ISSUE: "text tokens" in spec.
4663 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4664 ## XML5: "CDATA bracket state".
4665
4666 if ($self->{nc} == 0x005D) { # ]
4667
4668 $self->{state} = CDATA_SECTION_MSE2_STATE;
4669
4670 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4671 $self->{line_prev} = $self->{line};
4672 $self->{column_prev} = $self->{column};
4673 $self->{column}++;
4674 $self->{nc}
4675 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4676 } else {
4677 $self->{set_nc}->($self);
4678 }
4679
4680 redo A;
4681 } else {
4682
4683 ## XML5: If EOF, "]" is not appended and changed to the data state.
4684 $self->{ct}->{data} .= ']';
4685 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4686 ## Reconsume.
4687 redo A;
4688 }
4689 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4690 ## XML5: "CDATA end state".
4691
4692 if ($self->{nc} == 0x003E) { # >
4693 $self->{state} = DATA_STATE;
4694 $self->{s_kwd} = '';
4695
4696 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4697 $self->{line_prev} = $self->{line};
4698 $self->{column_prev} = $self->{column};
4699 $self->{column}++;
4700 $self->{nc}
4701 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4702 } else {
4703 $self->{set_nc}->($self);
4704 }
4705
4706 if (length $self->{ct}->{data}) { # character
4707
4708 return ($self->{ct}); # character
4709 } else {
4710
4711 ## No token to emit. $self->{ct} is discarded.
4712 }
4713 redo A;
4714 } elsif ($self->{nc} == 0x005D) { # ]
4715 # character
4716 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4717 ## Stay in the state.
4718
4719 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4720 $self->{line_prev} = $self->{line};
4721 $self->{column_prev} = $self->{column};
4722 $self->{column}++;
4723 $self->{nc}
4724 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4725 } else {
4726 $self->{set_nc}->($self);
4727 }
4728
4729 redo A;
4730 } else {
4731
4732 $self->{ct}->{data} .= ']]'; # character
4733 $self->{state} = CDATA_SECTION_STATE;
4734 ## Reconsume. ## XML5: Emit.
4735 redo A;
4736 }
4737 } elsif ($self->{state} == ENTITY_STATE) {
4738 if ($is_space->{$self->{nc}} or
4739 {
4740 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4741 $self->{entity_add} => 1,
4742 }->{$self->{nc}}) {
4743 if ($self->{is_xml}) {
4744
4745 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4746 line => $self->{line_prev},
4747 column => $self->{column_prev}
4748 + ($self->{nc} == -1 ? 1 : 0));
4749 } else {
4750
4751 ## No error
4752 }
4753 ## Don't consume
4754 ## Return nothing.
4755 #
4756 } elsif ($self->{nc} == 0x0023) { # #
4757
4758 $self->{state} = ENTITY_HASH_STATE;
4759 $self->{kwd} = '#';
4760
4761 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4762 $self->{line_prev} = $self->{line};
4763 $self->{column_prev} = $self->{column};
4764 $self->{column}++;
4765 $self->{nc}
4766 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4767 } else {
4768 $self->{set_nc}->($self);
4769 }
4770
4771 redo A;
4772 } elsif ($self->{is_xml} or
4773 (0x0041 <= $self->{nc} and
4774 $self->{nc} <= 0x005A) or # A..Z
4775 (0x0061 <= $self->{nc} and
4776 $self->{nc} <= 0x007A)) { # a..z
4777
4778 require Whatpm::_NamedEntityList;
4779 $self->{state} = ENTITY_NAME_STATE;
4780 $self->{kwd} = chr $self->{nc};
4781 $self->{entity__value} = $self->{kwd};
4782 $self->{entity__match} = 0;
4783
4784 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4785 $self->{line_prev} = $self->{line};
4786 $self->{column_prev} = $self->{column};
4787 $self->{column}++;
4788 $self->{nc}
4789 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4790 } else {
4791 $self->{set_nc}->($self);
4792 }
4793
4794 redo A;
4795 } else {
4796
4797 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4798 ## Return nothing.
4799 #
4800 }
4801
4802 ## NOTE: No character is consumed by the "consume a character
4803 ## reference" algorithm. In other words, there is an "&" character
4804 ## that does not introduce a character reference, which would be
4805 ## appended to the parent element or the attribute value in later
4806 ## process of the tokenizer.
4807
4808 if ($self->{prev_state} == DATA_STATE) {
4809
4810 $self->{state} = $self->{prev_state};
4811 $self->{s_kwd} = '';
4812 ## Reconsume.
4813 return ({type => CHARACTER_TOKEN, data => '&',
4814 line => $self->{line_prev},
4815 column => $self->{column_prev},
4816 });
4817 redo A;
4818 } else {
4819
4820 $self->{ca}->{value} .= '&';
4821 $self->{state} = $self->{prev_state};
4822 $self->{s_kwd} = '';
4823 ## Reconsume.
4824 redo A;
4825 }
4826 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4827 if ($self->{nc} == 0x0078) { # x
4828
4829 $self->{state} = HEXREF_X_STATE;
4830 $self->{kwd} .= chr $self->{nc};
4831
4832 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4833 $self->{line_prev} = $self->{line};
4834 $self->{column_prev} = $self->{column};
4835 $self->{column}++;
4836 $self->{nc}
4837 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4838 } else {
4839 $self->{set_nc}->($self);
4840 }
4841
4842 redo A;
4843 } elsif ($self->{nc} == 0x0058) { # X
4844
4845 if ($self->{is_xml}) {
4846 $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4847 }
4848 $self->{state} = HEXREF_X_STATE;
4849 $self->{kwd} .= chr $self->{nc};
4850
4851 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4852 $self->{line_prev} = $self->{line};
4853 $self->{column_prev} = $self->{column};
4854 $self->{column}++;
4855 $self->{nc}
4856 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4857 } else {
4858 $self->{set_nc}->($self);
4859 }
4860
4861 redo A;
4862 } elsif (0x0030 <= $self->{nc} and
4863 $self->{nc} <= 0x0039) { # 0..9
4864
4865 $self->{state} = NCR_NUM_STATE;
4866 $self->{kwd} = $self->{nc} - 0x0030;
4867
4868 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4869 $self->{line_prev} = $self->{line};
4870 $self->{column_prev} = $self->{column};
4871 $self->{column}++;
4872 $self->{nc}
4873 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4874 } else {
4875 $self->{set_nc}->($self);
4876 }
4877
4878 redo A;
4879 } else {
4880 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4881 line => $self->{line_prev},
4882 column => $self->{column_prev} - 1);
4883
4884 ## NOTE: According to the spec algorithm, nothing is returned,
4885 ## and then "&#" is appended to the parent element or the attribute
4886 ## value in the later processing.
4887
4888 if ($self->{prev_state} == DATA_STATE) {
4889
4890 $self->{state} = $self->{prev_state};
4891 $self->{s_kwd} = '';
4892 ## Reconsume.
4893 return ({type => CHARACTER_TOKEN,
4894 data => '&#',
4895 line => $self->{line_prev},
4896 column => $self->{column_prev} - 1,
4897 });
4898 redo A;
4899 } else {
4900
4901 $self->{ca}->{value} .= '&#';
4902 $self->{state} = $self->{prev_state};
4903 $self->{s_kwd} = '';
4904 ## Reconsume.
4905 redo A;
4906 }
4907 }
4908 } elsif ($self->{state} == NCR_NUM_STATE) {
4909 if (0x0030 <= $self->{nc} and
4910 $self->{nc} <= 0x0039) { # 0..9
4911
4912 $self->{kwd} *= 10;
4913 $self->{kwd} += $self->{nc} - 0x0030;
4914
4915 ## Stay in the state.
4916
4917 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4918 $self->{line_prev} = $self->{line};
4919 $self->{column_prev} = $self->{column};
4920 $self->{column}++;
4921 $self->{nc}
4922 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4923 } else {
4924 $self->{set_nc}->($self);
4925 }
4926
4927 redo A;
4928 } elsif ($self->{nc} == 0x003B) { # ;
4929
4930
4931 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4932 $self->{line_prev} = $self->{line};
4933 $self->{column_prev} = $self->{column};
4934 $self->{column}++;
4935 $self->{nc}
4936 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4937 } else {
4938 $self->{set_nc}->($self);
4939 }
4940
4941 #
4942 } else {
4943
4944 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4945 ## Reconsume.
4946 #
4947 }
4948
4949 my $code = $self->{kwd};
4950 my $l = $self->{line_prev};
4951 my $c = $self->{column_prev};
4952 if ((not $self->{is_xml} and $charref_map->{$code}) or
4953 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4954 ($self->{is_xml} and $code == 0x0000)) {
4955
4956 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4957 text => (sprintf 'U+%04X', $code),
4958 line => $l, column => $c);
4959 $code = $charref_map->{$code};
4960 } elsif ($code > 0x10FFFF) {
4961
4962 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4963 text => (sprintf 'U-%08X', $code),
4964 line => $l, column => $c);
4965 $code = 0xFFFD;
4966 }
4967
4968 if ($self->{prev_state} == DATA_STATE) {
4969
4970 $self->{state} = $self->{prev_state};
4971 $self->{s_kwd} = '';
4972 ## Reconsume.
4973 return ({type => CHARACTER_TOKEN, data => chr $code,
4974 has_reference => 1,
4975 line => $l, column => $c,
4976 });
4977 redo A;
4978 } else {
4979
4980 $self->{ca}->{value} .= chr $code;
4981 $self->{ca}->{has_reference} = 1;
4982 $self->{state} = $self->{prev_state};
4983 $self->{s_kwd} = '';
4984 ## Reconsume.
4985 redo A;
4986 }
4987 } elsif ($self->{state} == HEXREF_X_STATE) {
4988 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4989 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4990 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4991 # 0..9, A..F, a..f
4992
4993 $self->{state} = HEXREF_HEX_STATE;
4994 $self->{kwd} = 0;
4995 ## Reconsume.
4996 redo A;
4997 } else {
4998 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4999 line => $self->{line_prev},
5000 column => $self->{column_prev} - 2);
5001
5002 ## NOTE: According to the spec algorithm, nothing is returned,
5003 ## and then "&#" followed by "X" or "x" is appended to the parent
5004 ## element or the attribute value in the later processing.
5005
5006 if ($self->{prev_state} == DATA_STATE) {
5007
5008 $self->{state} = $self->{prev_state};
5009 $self->{s_kwd} = '';
5010 ## Reconsume.
5011 return ({type => CHARACTER_TOKEN,
5012 data => '&' . $self->{kwd},
5013 line => $self->{line_prev},
5014 column => $self->{column_prev} - length $self->{kwd},
5015 });
5016 redo A;
5017 } else {
5018
5019 $self->{ca}->{value} .= '&' . $self->{kwd};
5020 $self->{state} = $self->{prev_state};
5021 $self->{s_kwd} = '';
5022 ## Reconsume.
5023 redo A;
5024 }
5025 }
5026 } elsif ($self->{state} == HEXREF_HEX_STATE) {
5027 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
5028 # 0..9
5029
5030 $self->{kwd} *= 0x10;
5031 $self->{kwd} += $self->{nc} - 0x0030;
5032 ## Stay in the state.
5033
5034 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5035 $self->{line_prev} = $self->{line};
5036 $self->{column_prev} = $self->{column};
5037 $self->{column}++;
5038 $self->{nc}
5039 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5040 } else {
5041 $self->{set_nc}->($self);
5042 }
5043
5044 redo A;
5045 } elsif (0x0061 <= $self->{nc} and
5046 $self->{nc} <= 0x0066) { # a..f
5047
5048 $self->{kwd} *= 0x10;
5049 $self->{kwd} += $self->{nc} - 0x0060 + 9;
5050 ## Stay in the state.
5051
5052 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5053 $self->{line_prev} = $self->{line};
5054 $self->{column_prev} = $self->{column};
5055 $self->{column}++;
5056 $self->{nc}
5057 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5058 } else {
5059 $self->{set_nc}->($self);
5060 }
5061
5062 redo A;
5063 } elsif (0x0041 <= $self->{nc} and
5064 $self->{nc} <= 0x0046) { # A..F
5065
5066 $self->{kwd} *= 0x10;
5067 $self->{kwd} += $self->{nc} - 0x0040 + 9;
5068 ## Stay in the state.
5069
5070 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5071 $self->{line_prev} = $self->{line};
5072 $self->{column_prev} = $self->{column};
5073 $self->{column}++;
5074 $self->{nc}
5075 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5076 } else {
5077 $self->{set_nc}->($self);
5078 }
5079
5080 redo A;
5081 } elsif ($self->{nc} == 0x003B) { # ;
5082
5083
5084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5085 $self->{line_prev} = $self->{line};
5086 $self->{column_prev} = $self->{column};
5087 $self->{column}++;
5088 $self->{nc}
5089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5090 } else {
5091 $self->{set_nc}->($self);
5092 }
5093
5094 #
5095 } else {
5096
5097 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5098 line => $self->{line},
5099 column => $self->{column});
5100 ## Reconsume.
5101 #
5102 }
5103
5104 my $code = $self->{kwd};
5105 my $l = $self->{line_prev};
5106 my $c = $self->{column_prev};
5107 if ((not $self->{is_xml} and $charref_map->{$code}) or
5108 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5109 ($self->{is_xml} and $code == 0x0000)) {
5110
5111 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5112 text => (sprintf 'U+%04X', $code),
5113 line => $l, column => $c);
5114 $code = $charref_map->{$code};
5115 } elsif ($code > 0x10FFFF) {
5116
5117 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5118 text => (sprintf 'U-%08X', $code),
5119 line => $l, column => $c);
5120 $code = 0xFFFD;
5121 }
5122
5123 if ($self->{prev_state} == DATA_STATE) {
5124
5125 $self->{state} = $self->{prev_state};
5126 $self->{s_kwd} = '';
5127 ## Reconsume.
5128 return ({type => CHARACTER_TOKEN, data => chr $code,
5129 has_reference => 1,
5130 line => $l, column => $c,
5131 });
5132 redo A;
5133 } else {
5134
5135 $self->{ca}->{value} .= chr $code;
5136 $self->{ca}->{has_reference} = 1;
5137 $self->{state} = $self->{prev_state};
5138 $self->{s_kwd} = '';
5139 ## Reconsume.
5140 redo A;
5141 }
5142 } elsif ($self->{state} == ENTITY_NAME_STATE) {
5143 if ((0x0041 <= $self->{nc} and # A
5144 $self->{nc} <= 0x005A) or # Z
5145 (0x0061 <= $self->{nc} and # a
5146 $self->{nc} <= 0x007A) or # z
5147 (0x0030 <= $self->{nc} and # 0
5148 $self->{nc} <= 0x0039) or # 9
5149 $self->{nc} == 0x003B or # ;
5150 ($self->{is_xml} and
5151 not ($is_space->{$self->{nc}} or
5152 {
5153 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5154 $self->{entity_add} => 1,
5155 }->{$self->{nc}}))) {
5156 our $EntityChar;
5157 $self->{kwd} .= chr $self->{nc};
5158 if (defined $EntityChar->{$self->{kwd}} or
5159 $self->{ge}->{$self->{kwd}}) {
5160 if ($self->{nc} == 0x003B) { # ;
5161 if (defined $self->{ge}->{$self->{kwd}}) {
5162 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5163
5164 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5165 } else {
5166 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5167
5168 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5169 value => $self->{kwd});
5170 } else {
5171
5172 }
5173 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5174 }
5175 } else {
5176 if ($self->{is_xml}) {
5177
5178 $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5179 value => $self->{kwd},
5180 level => {
5181 'amp;' => $self->{level}->{warn},
5182 'quot;' => $self->{level}->{warn},
5183 'lt;' => $self->{level}->{warn},
5184 'gt;' => $self->{level}->{warn},
5185 'apos;' => $self->{level}->{warn},
5186 }->{$self->{kwd}} ||
5187 $self->{level}->{must});
5188 } else {
5189
5190 }
5191 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5192 }
5193 $self->{entity__match} = 1;
5194
5195 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5196 $self->{line_prev} = $self->{line};
5197 $self->{column_prev} = $self->{column};
5198 $self->{column}++;
5199 $self->{nc}
5200 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5201 } else {
5202 $self->{set_nc}->($self);
5203 }
5204
5205 #
5206 } else {
5207
5208 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5209 $self->{entity__match} = -1;
5210 ## Stay in the state.
5211
5212 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5213 $self->{line_prev} = $self->{line};
5214 $self->{column_prev} = $self->{column};
5215 $self->{column}++;
5216 $self->{nc}
5217 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5218 } else {
5219 $self->{set_nc}->($self);
5220 }
5221
5222 redo A;
5223 }
5224 } else {
5225
5226 $self->{entity__value} .= chr $self->{nc};
5227 $self->{entity__match} *= 2;
5228 ## Stay in the state.
5229
5230 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5231 $self->{line_prev} = $self->{line};
5232 $self->{column_prev} = $self->{column};
5233 $self->{column}++;
5234 $self->{nc}
5235 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5236 } else {
5237 $self->{set_nc}->($self);
5238 }
5239
5240 redo A;
5241 }
5242 }
5243
5244 my $data;
5245 my $has_ref;
5246 if ($self->{entity__match} > 0) {
5247
5248 $data = $self->{entity__value};
5249 $has_ref = 1;
5250 #
5251 } elsif ($self->{entity__match} < 0) {
5252 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5253 if ($self->{prev_state} != DATA_STATE and # in attribute
5254 $self->{entity__match} < -1) {
5255
5256 $data = '&' . $self->{kwd};
5257 #
5258 } else {
5259
5260 $data = $self->{entity__value};
5261 $has_ref = 1;
5262 #
5263 }
5264 } else {
5265
5266 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5267 line => $self->{line_prev},
5268 column => $self->{column_prev} - length $self->{kwd});
5269 $data = '&' . $self->{kwd};
5270 #
5271 }
5272
5273 ## NOTE: In these cases, when a character reference is found,
5274 ## it is consumed and a character token is returned, or, otherwise,
5275 ## nothing is consumed and returned, according to the spec algorithm.
5276 ## In this implementation, anything that has been examined by the
5277 ## tokenizer is appended to the parent element or the attribute value
5278 ## as string, either literal string when no character reference or
5279 ## entity-replaced string otherwise, in this stage, since any characters
5280 ## that would not be consumed are appended in the data state or in an
5281 ## appropriate attribute value state anyway.
5282
5283 if ($self->{prev_state} == DATA_STATE) {
5284
5285 $self->{state} = $self->{prev_state};
5286 $self->{s_kwd} = '';
5287 ## Reconsume.
5288 return ({type => CHARACTER_TOKEN,
5289 data => $data,
5290 has_reference => $has_ref,
5291 line => $self->{line_prev},
5292 column => $self->{column_prev} + 1 - length $self->{kwd},
5293 });
5294 redo A;
5295 } else {
5296
5297 $self->{ca}->{value} .= $data;
5298 $self->{ca}->{has_reference} = 1 if $has_ref;
5299 $self->{state} = $self->{prev_state};
5300 $self->{s_kwd} = '';
5301 ## Reconsume.
5302 redo A;
5303 }
5304
5305 ## XML-only states
5306
5307 } elsif ($self->{state} == PI_STATE) {
5308 ## XML5: "Pi state" and "DOCTYPE pi state".
5309
5310 if ($is_space->{$self->{nc}} or
5311 $self->{nc} == 0x003F or # ?
5312 $self->{nc} == -1) {
5313 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5314 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5315 ## "DOCTYPE pi state": Parse error, switch to the "data
5316 ## state".
5317 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5318 line => $self->{line_prev},
5319 column => $self->{column_prev}
5320 - 1 * ($self->{nc} != -1));
5321 $self->{state} = BOGUS_COMMENT_STATE;
5322 ## Reconsume.
5323 $self->{ct} = {type => COMMENT_TOKEN,
5324 data => '?',
5325 line => $self->{line_prev},
5326 column => $self->{column_prev}
5327 - 1 * ($self->{nc} != -1),
5328 };
5329 redo A;
5330 } else {
5331 ## XML5: "DOCTYPE pi state": Stay in the state.
5332 $self->{ct} = {type => PI_TOKEN,
5333 target => chr $self->{nc},
5334 data => '',
5335 line => $self->{line_prev},
5336 column => $self->{column_prev} - 1,
5337 };
5338 $self->{state} = PI_TARGET_STATE;
5339
5340 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5341 $self->{line_prev} = $self->{line};
5342 $self->{column_prev} = $self->{column};
5343 $self->{column}++;
5344 $self->{nc}
5345 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5346 } else {
5347 $self->{set_nc}->($self);
5348 }
5349
5350 redo A;
5351 }
5352 } elsif ($self->{state} == PI_TARGET_STATE) {
5353 if ($is_space->{$self->{nc}}) {
5354 $self->{state} = PI_TARGET_AFTER_STATE;
5355
5356 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5357 $self->{line_prev} = $self->{line};
5358 $self->{column_prev} = $self->{column};
5359 $self->{column}++;
5360 $self->{nc}
5361 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5362 } else {
5363 $self->{set_nc}->($self);
5364 }
5365
5366 redo A;
5367 } elsif ($self->{nc} == -1) {
5368 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5369 if ($self->{in_subset}) {
5370 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5371 } else {
5372 $self->{state} = DATA_STATE;
5373 $self->{s_kwd} = '';
5374 }
5375 ## Reconsume.
5376 return ($self->{ct}); # pi
5377 redo A;
5378 } elsif ($self->{nc} == 0x003F) { # ?
5379 $self->{state} = PI_AFTER_STATE;
5380
5381 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5382 $self->{line_prev} = $self->{line};
5383 $self->{column_prev} = $self->{column};
5384 $self->{column}++;
5385 $self->{nc}
5386 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5387 } else {
5388 $self->{set_nc}->($self);
5389 }
5390
5391 redo A;
5392 } else {
5393 ## XML5: typo ("tag name" -> "target")
5394 $self->{ct}->{target} .= chr $self->{nc}; # pi
5395
5396 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5397 $self->{line_prev} = $self->{line};
5398 $self->{column_prev} = $self->{column};
5399 $self->{column}++;
5400 $self->{nc}
5401 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5402 } else {
5403 $self->{set_nc}->($self);
5404 }
5405
5406 redo A;
5407 }
5408 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5409 if ($is_space->{$self->{nc}}) {
5410 ## Stay in the state.
5411
5412 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5413 $self->{line_prev} = $self->{line};
5414 $self->{column_prev} = $self->{column};
5415 $self->{column}++;
5416 $self->{nc}
5417 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5418 } else {
5419 $self->{set_nc}->($self);
5420 }
5421
5422 redo A;
5423 } else {
5424 $self->{state} = PI_DATA_STATE;
5425 ## Reprocess.
5426 redo A;
5427 }
5428 } elsif ($self->{state} == PI_DATA_STATE) {
5429 if ($self->{nc} == 0x003F) { # ?
5430 $self->{state} = PI_DATA_AFTER_STATE;
5431
5432 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5433 $self->{line_prev} = $self->{line};
5434 $self->{column_prev} = $self->{column};
5435 $self->{column}++;
5436 $self->{nc}
5437 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5438 } else {
5439 $self->{set_nc}->($self);
5440 }
5441
5442 redo A;
5443 } elsif ($self->{nc} == -1) {
5444 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5445 if ($self->{in_subset}) {
5446 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5447 } else {
5448 $self->{state} = DATA_STATE;
5449 $self->{s_kwd} = '';
5450 }
5451 ## Reprocess.
5452 return ($self->{ct}); # pi
5453 redo A;
5454 } else {
5455 $self->{ct}->{data} .= chr $self->{nc}; # pi
5456 $self->{read_until}->($self->{ct}->{data}, q[?],
5457 length $self->{ct}->{data});
5458 ## Stay in the state.
5459
5460 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5461 $self->{line_prev} = $self->{line};
5462 $self->{column_prev} = $self->{column};
5463 $self->{column}++;
5464 $self->{nc}
5465 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5466 } else {
5467 $self->{set_nc}->($self);
5468 }
5469
5470 ## The next input character was consumed above; process it in this state.
5471 redo A;
5472 }
5473 } elsif ($self->{state} == PI_AFTER_STATE) {
5474 ## XML5: Part of "Pi after state".
5475
5476 if ($self->{nc} == 0x003E) { # >
5477 if ($self->{in_subset}) {
5478 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5479 } else {
5480 $self->{state} = DATA_STATE;
5481 $self->{s_kwd} = '';
5482 }
5483
5484 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5485 $self->{line_prev} = $self->{line};
5486 $self->{column_prev} = $self->{column};
5487 $self->{column}++;
5488 $self->{nc}
5489 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5490 } else {
5491 $self->{set_nc}->($self);
5492 }
5493
5494 return ($self->{ct}); # pi
5495 redo A;
5496 } elsif ($self->{nc} == 0x003F) { # ?
5497 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5498 line => $self->{line_prev},
5499 column => $self->{column_prev}); ## XML5: no error
5500 $self->{ct}->{data} .= '?';
5501 $self->{state} = PI_DATA_AFTER_STATE;
5502
5503 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5504 $self->{line_prev} = $self->{line};
5505 $self->{column_prev} = $self->{column};
5506 $self->{column}++;
5507 $self->{nc}
5508 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5509 } else {
5510 $self->{set_nc}->($self);
5511 }
5512
5513 redo A;
5514 } else {
5515 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5516 line => $self->{line_prev},
5517 column => $self->{column_prev}
5518 + 1 * ($self->{nc} == -1)); ## XML5: no error
5519 $self->{ct}->{data} .= '?'; ## XML5: not appended
5520 $self->{state} = PI_DATA_STATE;
5521 ## Reprocess.
5522 redo A;
5523 }
5524 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5525 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5526
5527 if ($self->{nc} == 0x003E) { # >
5528 if ($self->{in_subset}) {
5529 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5530 } else {
5531 $self->{state} = DATA_STATE;
5532 $self->{s_kwd} = '';
5533 }
5534
5535 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5536 $self->{line_prev} = $self->{line};
5537 $self->{column_prev} = $self->{column};
5538 $self->{column}++;
5539 $self->{nc}
5540 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5541 } else {
5542 $self->{set_nc}->($self);
5543 }
5544
5545 return ($self->{ct}); # pi
5546 redo A;
5547 } elsif ($self->{nc} == 0x003F) { # ?
5548 $self->{ct}->{data} .= '?';
5549 ## Stay in the state.
5550
5551 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5552 $self->{line_prev} = $self->{line};
5553 $self->{column_prev} = $self->{column};
5554 $self->{column}++;
5555 $self->{nc}
5556 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5557 } else {
5558 $self->{set_nc}->($self);
5559 }
5560
5561 redo A;
5562 } else {
5563 $self->{ct}->{data} .= '?'; ## XML5: not appended
5564 $self->{state} = PI_DATA_STATE;
5565 ## Reprocess.
5566 redo A;
5567 }
5568
5569 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5570 if ($self->{nc} == 0x003C) { # <
5571 $self->{state} = DOCTYPE_TAG_STATE;
5572
5573 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5574 $self->{line_prev} = $self->{line};
5575 $self->{column_prev} = $self->{column};
5576 $self->{column}++;
5577 $self->{nc}
5578 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5579 } else {
5580 $self->{set_nc}->($self);
5581 }
5582
5583 redo A;
5584 } elsif ($self->{nc} == 0x0025) { # %
5585 ## XML5: Not defined yet.
5586
5587 ## TODO:
5588
5589 if (not $self->{stop_processing} and
5590 not $self->{document}->xml_standalone) {
5591 $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5592 level => $self->{level}->{info});
5593 $self->{stop_processing} = 1;
5594 }
5595
5596
5597 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5598 $self->{line_prev} = $self->{line};
5599 $self->{column_prev} = $self->{column};
5600 $self->{column}++;
5601 $self->{nc}
5602 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5603 } else {
5604 $self->{set_nc}->($self);
5605 }
5606
5607 redo A;
5608 } elsif ($self->{nc} == 0x005D) { # ]
5609 delete $self->{in_subset};
5610 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5611
5612 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5613 $self->{line_prev} = $self->{line};
5614 $self->{column_prev} = $self->{column};
5615 $self->{column}++;
5616 $self->{nc}
5617 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5618 } else {
5619 $self->{set_nc}->($self);
5620 }
5621
5622 redo A;
5623 } elsif ($is_space->{$self->{nc}}) {
5624 ## Stay in the state.
5625
5626 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5627 $self->{line_prev} = $self->{line};
5628 $self->{column_prev} = $self->{column};
5629 $self->{column}++;
5630 $self->{nc}
5631 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5632 } else {
5633 $self->{set_nc}->($self);
5634 }
5635
5636 redo A;
5637 } elsif ($self->{nc} == -1) {
5638 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5639 delete $self->{in_subset};
5640 $self->{state} = DATA_STATE;
5641 $self->{s_kwd} = '';
5642 ## Reconsume.
5643 return ({type => END_OF_DOCTYPE_TOKEN});
5644 redo A;
5645 } else {
5646 unless ($self->{internal_subset_tainted}) {
5647 ## XML5: No parse error.
5648 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5649 $self->{internal_subset_tainted} = 1;
5650 }
5651 ## Stay in the state.
5652
5653 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5654 $self->{line_prev} = $self->{line};
5655 $self->{column_prev} = $self->{column};
5656 $self->{column}++;
5657 $self->{nc}
5658 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5659 } else {
5660 $self->{set_nc}->($self);
5661 }
5662
5663 redo A;
5664 }
5665 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5666 if ($self->{nc} == 0x003E) { # >
5667 $self->{state} = DATA_STATE;
5668 $self->{s_kwd} = '';
5669
5670 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5671 $self->{line_prev} = $self->{line};
5672 $self->{column_prev} = $self->{column};
5673 $self->{column}++;
5674 $self->{nc}
5675 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5676 } else {
5677 $self->{set_nc}->($self);
5678 }
5679
5680 return ({type => END_OF_DOCTYPE_TOKEN});
5681 redo A;
5682 } elsif ($self->{nc} == -1) {
5683 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5684 $self->{state} = DATA_STATE;
5685 $self->{s_kwd} = '';
5686 ## Reconsume.
5687 return ({type => END_OF_DOCTYPE_TOKEN});
5688 redo A;
5689 } else {
5690 ## XML5: No parse error and stay in the state.
5691 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5692
5693 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5694
5695 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5696 $self->{line_prev} = $self->{line};
5697 $self->{column_prev} = $self->{column};
5698 $self->{column}++;
5699 $self->{nc}
5700 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5701 } else {
5702 $self->{set_nc}->($self);
5703 }
5704
5705 redo A;
5706 }
5707 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5708 if ($self->{nc} == 0x003E) { # >
5709 $self->{state} = DATA_STATE;
5710 $self->{s_kwd} = '';
5711
5712 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5713 $self->{line_prev} = $self->{line};
5714 $self->{column_prev} = $self->{column};
5715 $self->{column}++;
5716 $self->{nc}
5717 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5718 } else {
5719 $self->{set_nc}->($self);
5720 }
5721
5722 return ({type => END_OF_DOCTYPE_TOKEN});
5723 redo A;
5724 } elsif ($self->{nc} == -1) {
5725 $self->{state} = DATA_STATE;
5726 $self->{s_kwd} = '';
5727 ## Reconsume.
5728 return ({type => END_OF_DOCTYPE_TOKEN});
5729 redo A;
5730 } else {
5731 ## Stay in the state.
5732
5733 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5734 $self->{line_prev} = $self->{line};
5735 $self->{column_prev} = $self->{column};
5736 $self->{column}++;
5737 $self->{nc}
5738 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5739 } else {
5740 $self->{set_nc}->($self);
5741 }
5742
5743 redo A;
5744 }
5745 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5746 if ($self->{nc} == 0x0021) { # !
5747 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5748
5749 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5750 $self->{line_prev} = $self->{line};
5751 $self->{column_prev} = $self->{column};
5752 $self->{column}++;
5753 $self->{nc}
5754 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5755 } else {
5756 $self->{set_nc}->($self);
5757 }
5758
5759 redo A;
5760 } elsif ($self->{nc} == 0x003F) { # ?
5761 $self->{state} = PI_STATE;
5762
5763 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5764 $self->{line_prev} = $self->{line};
5765 $self->{column_prev} = $self->{column};
5766 $self->{column}++;
5767 $self->{nc}
5768 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5769 } else {
5770 $self->{set_nc}->($self);
5771 }
5772
5773 redo A;
5774 } elsif ($self->{nc} == -1) {
5775 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5776 $self->{state} = DATA_STATE;
5777 $self->{s_kwd} = '';
5778 ## Reconsume.
5779 redo A;
5780 } else {
5781 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5782 line => $self->{line_prev},
5783 column => $self->{column_prev});
5784 $self->{state} = BOGUS_COMMENT_STATE;
5785 $self->{ct} = {type => COMMENT_TOKEN,
5786 data => '',
5787 }; ## NOTE: Will be discarded.
5788
5789 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5790 $self->{line_prev} = $self->{line};
5791 $self->{column_prev} = $self->{column};
5792 $self->{column}++;
5793 $self->{nc}
5794 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5795 } else {
5796 $self->{set_nc}->($self);
5797 }
5798
5799 redo A;
5800 }
5801 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5802 ## XML5: "DOCTYPE markup declaration state".
5803
5804 if ($self->{nc} == 0x002D) { # -
5805 $self->{state} = MD_HYPHEN_STATE;
5806
5807 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5808 $self->{line_prev} = $self->{line};
5809 $self->{column_prev} = $self->{column};
5810 $self->{column}++;
5811 $self->{nc}
5812 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5813 } else {
5814 $self->{set_nc}->($self);
5815 }
5816
5817 redo A;
5818 } elsif ($self->{nc} == 0x0045 or # E
5819 $self->{nc} == 0x0065) { # e
5820 $self->{state} = MD_E_STATE;
5821 $self->{kwd} = chr $self->{nc};
5822
5823 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5824 $self->{line_prev} = $self->{line};
5825 $self->{column_prev} = $self->{column};
5826 $self->{column}++;
5827 $self->{nc}
5828 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5829 } else {
5830 $self->{set_nc}->($self);
5831 }
5832
5833 redo A;
5834 } elsif ($self->{nc} == 0x0041 or # A
5835 $self->{nc} == 0x0061) { # a
5836 $self->{state} = MD_ATTLIST_STATE;
5837 $self->{kwd} = chr $self->{nc};
5838
5839 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5840 $self->{line_prev} = $self->{line};
5841 $self->{column_prev} = $self->{column};
5842 $self->{column}++;
5843 $self->{nc}
5844 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5845 } else {
5846 $self->{set_nc}->($self);
5847 }
5848
5849 redo A;
5850 } elsif ($self->{nc} == 0x004E or # N
5851 $self->{nc} == 0x006E) { # n
5852 $self->{state} = MD_NOTATION_STATE;
5853 $self->{kwd} = chr $self->{nc};
5854
5855 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5856 $self->{line_prev} = $self->{line};
5857 $self->{column_prev} = $self->{column};
5858 $self->{column}++;
5859 $self->{nc}
5860 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5861 } else {
5862 $self->{set_nc}->($self);
5863 }
5864
5865 redo A;
5866 } else {
5867 #
5868 }
5869
5870 ## XML5: No parse error.
5871 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5872 line => $self->{line_prev},
5873 column => $self->{column_prev} - 1);
5874 ## Reconsume.
5875 $self->{state} = BOGUS_COMMENT_STATE;
5876 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5877 redo A;
5878 } elsif ($self->{state} == MD_E_STATE) {
5879 if ($self->{nc} == 0x004E or # N
5880 $self->{nc} == 0x006E) { # n
5881 $self->{state} = MD_ENTITY_STATE;
5882 $self->{kwd} .= chr $self->{nc};
5883
5884 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5885 $self->{line_prev} = $self->{line};
5886 $self->{column_prev} = $self->{column};
5887 $self->{column}++;
5888 $self->{nc}
5889 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5890 } else {
5891 $self->{set_nc}->($self);
5892 }
5893
5894 redo A;
5895 } elsif ($self->{nc} == 0x004C or # L
5896 $self->{nc} == 0x006C) { # l
5897 ## XML5: <!ELEMENT> not supported.
5898 $self->{state} = MD_ELEMENT_STATE;
5899 $self->{kwd} .= chr $self->{nc};
5900
5901 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5902 $self->{line_prev} = $self->{line};
5903 $self->{column_prev} = $self->{column};
5904 $self->{column}++;
5905 $self->{nc}
5906 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5907 } else {
5908 $self->{set_nc}->($self);
5909 }
5910
5911 redo A;
5912 } else {
5913 ## XML5: No parse error.
5914 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5915 line => $self->{line_prev},
5916 column => $self->{column_prev} - 2
5917 + 1 * ($self->{nc} == -1));
5918 ## Reconsume.
5919 $self->{state} = BOGUS_COMMENT_STATE;
5920 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5921 redo A;
5922 }
5923 } elsif ($self->{state} == MD_ENTITY_STATE) {
5924 if ($self->{nc} == [
5925 undef,
5926 undef,
5927 0x0054, # T
5928 0x0049, # I
5929 0x0054, # T
5930 ]->[length $self->{kwd}] or
5931 $self->{nc} == [
5932 undef,
5933 undef,
5934 0x0074, # t
5935 0x0069, # i
5936 0x0074, # t
5937 ]->[length $self->{kwd}]) {
5938 ## Stay in the state.
5939 $self->{kwd} .= chr $self->{nc};
5940
5941 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5942 $self->{line_prev} = $self->{line};
5943 $self->{column_prev} = $self->{column};
5944 $self->{column}++;
5945 $self->{nc}
5946 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5947 } else {
5948 $self->{set_nc}->($self);
5949 }
5950
5951 redo A;
5952 } elsif ((length $self->{kwd}) == 5 and
5953 ($self->{nc} == 0x0059 or # Y
5954 $self->{nc} == 0x0079)) { # y
5955 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5956 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5957 text => 'ENTITY',
5958 line => $self->{line_prev},
5959 column => $self->{column_prev} - 4);
5960 }
5961 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5962 line => $self->{line_prev},
5963 column => $self->{column_prev} - 6};
5964 $self->{state} = DOCTYPE_MD_STATE;
5965
5966 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5967 $self->{line_prev} = $self->{line};
5968 $self->{column_prev} = $self->{column};
5969 $self->{column}++;
5970 $self->{nc}
5971 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5972 } else {
5973 $self->{set_nc}->($self);
5974 }
5975
5976 redo A;
5977 } else {
5978 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5979 line => $self->{line_prev},
5980 column => $self->{column_prev} - 1
5981 - (length $self->{kwd})
5982 + 1 * ($self->{nc} == -1));
5983 $self->{state} = BOGUS_COMMENT_STATE;
5984 ## Reconsume.
5985 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5986 redo A;
5987 }
5988 } elsif ($self->{state} == MD_ELEMENT_STATE) {
5989 if ($self->{nc} == [
5990 undef,
5991 undef,
5992 0x0045, # E
5993 0x004D, # M
5994 0x0045, # E
5995 0x004E, # N
5996 ]->[length $self->{kwd}] or
5997 $self->{nc} == [
5998 undef,
5999 undef,
6000 0x0065, # e
6001 0x006D, # m
6002 0x0065, # e
6003 0x006E, # n
6004 ]->[length $self->{kwd}]) {
6005 ## Stay in the state.
6006 $self->{kwd} .= chr $self->{nc};
6007
6008 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6009 $self->{line_prev} = $self->{line};
6010 $self->{column_prev} = $self->{column};
6011 $self->{column}++;
6012 $self->{nc}
6013 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6014 } else {
6015 $self->{set_nc}->($self);
6016 }
6017
6018 redo A;
6019 } elsif ((length $self->{kwd}) == 6 and
6020 ($self->{nc} == 0x0054 or # T
6021 $self->{nc} == 0x0074)) { # t
6022 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
6023 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6024 text => 'ELEMENT',
6025 line => $self->{line_prev},
6026 column => $self->{column_prev} - 5);
6027 }
6028 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6029 line => $self->{line_prev},
6030 column => $self->{column_prev} - 7};
6031 $self->{state} = DOCTYPE_MD_STATE;
6032
6033 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6034 $self->{line_prev} = $self->{line};
6035 $self->{column_prev} = $self->{column};
6036 $self->{column}++;
6037 $self->{nc}
6038 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6039 } else {
6040 $self->{set_nc}->($self);
6041 }
6042
6043 redo A;
6044 } else {
6045 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6046 line => $self->{line_prev},
6047 column => $self->{column_prev} - 1
6048 - (length $self->{kwd})
6049 + 1 * ($self->{nc} == -1));
6050 $self->{state} = BOGUS_COMMENT_STATE;
6051 ## Reconsume.
6052 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6053 redo A;
6054 }
6055 } elsif ($self->{state} == MD_ATTLIST_STATE) {
6056 if ($self->{nc} == [
6057 undef,
6058 0x0054, # T
6059 0x0054, # T
6060 0x004C, # L
6061 0x0049, # I
6062 0x0053, # S
6063 ]->[length $self->{kwd}] or
6064 $self->{nc} == [
6065 undef,
6066 0x0074, # t
6067 0x0074, # t
6068 0x006C, # l
6069 0x0069, # i
6070 0x0073, # s
6071 ]->[length $self->{kwd}]) {
6072 ## Stay in the state.
6073 $self->{kwd} .= chr $self->{nc};
6074
6075 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6076 $self->{line_prev} = $self->{line};
6077 $self->{column_prev} = $self->{column};
6078 $self->{column}++;
6079 $self->{nc}
6080 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6081 } else {
6082 $self->{set_nc}->($self);
6083 }
6084
6085 redo A;
6086 } elsif ((length $self->{kwd}) == 6 and
6087 ($self->{nc} == 0x0054 or # T
6088 $self->{nc} == 0x0074)) { # t
6089 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6090 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6091 text => 'ATTLIST',
6092 line => $self->{line_prev},
6093 column => $self->{column_prev} - 5);
6094 }
6095 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6096 attrdefs => [],
6097 line => $self->{line_prev},
6098 column => $self->{column_prev} - 7};
6099 $self->{state} = DOCTYPE_MD_STATE;
6100
6101 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6102 $self->{line_prev} = $self->{line};
6103 $self->{column_prev} = $self->{column};
6104 $self->{column}++;
6105 $self->{nc}
6106 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6107 } else {
6108 $self->{set_nc}->($self);
6109 }
6110
6111 redo A;
6112 } else {
6113 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6114 line => $self->{line_prev},
6115 column => $self->{column_prev} - 1
6116 - (length $self->{kwd})
6117 + 1 * ($self->{nc} == -1));
6118 $self->{state} = BOGUS_COMMENT_STATE;
6119 ## Reconsume.
6120 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6121 redo A;
6122 }
6123 } elsif ($self->{state} == MD_NOTATION_STATE) {
6124 if ($self->{nc} == [
6125 undef,
6126 0x004F, # O
6127 0x0054, # T
6128 0x0041, # A
6129 0x0054, # T
6130 0x0049, # I
6131 0x004F, # O
6132 ]->[length $self->{kwd}] or
6133 $self->{nc} == [
6134 undef,
6135 0x006F, # o
6136 0x0074, # t
6137 0x0061, # a
6138 0x0074, # t
6139 0x0069, # i
6140 0x006F, # o
6141 ]->[length $self->{kwd}]) {
6142 ## Stay in the state.
6143 $self->{kwd} .= chr $self->{nc};
6144
6145 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6146 $self->{line_prev} = $self->{line};
6147 $self->{column_prev} = $self->{column};
6148 $self->{column}++;
6149 $self->{nc}
6150 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6151 } else {
6152 $self->{set_nc}->($self);
6153 }
6154
6155 redo A;
6156 } elsif ((length $self->{kwd}) == 7 and
6157 ($self->{nc} == 0x004E or # N
6158 $self->{nc} == 0x006E)) { # n
6159 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6160 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6161 text => 'NOTATION',
6162 line => $self->{line_prev},
6163 column => $self->{column_prev} - 6);
6164 }
6165 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6166 line => $self->{line_prev},
6167 column => $self->{column_prev} - 8};
6168 $self->{state} = DOCTYPE_MD_STATE;
6169
6170 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6171 $self->{line_prev} = $self->{line};
6172 $self->{column_prev} = $self->{column};
6173 $self->{column}++;
6174 $self->{nc}
6175 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6176 } else {
6177 $self->{set_nc}->($self);
6178 }
6179
6180 redo A;
6181 } else {
6182 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6183 line => $self->{line_prev},
6184 column => $self->{column_prev} - 1
6185 - (length $self->{kwd})
6186 + 1 * ($self->{nc} == -1));
6187 $self->{state} = BOGUS_COMMENT_STATE;
6188 ## Reconsume.
6189 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6190 redo A;
6191 }
6192 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6193 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6194 ## "DOCTYPE NOTATION state".
6195
6196 if ($is_space->{$self->{nc}}) {
6197 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6198 $self->{state} = BEFORE_MD_NAME_STATE;
6199
6200 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6201 $self->{line_prev} = $self->{line};
6202 $self->{column_prev} = $self->{column};
6203 $self->{column}++;
6204 $self->{nc}
6205 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6206 } else {
6207 $self->{set_nc}->($self);
6208 }
6209
6210 redo A;
6211 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6212 $self->{nc} == 0x0025) { # %
6213 ## XML5: Switch to the "DOCTYPE bogus comment state".
6214 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6215 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6216
6217 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6218 $self->{line_prev} = $self->{line};
6219 $self->{column_prev} = $self->{column};
6220 $self->{column}++;
6221 $self->{nc}
6222 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6223 } else {
6224 $self->{set_nc}->($self);
6225 }
6226
6227 redo A;
6228 } elsif ($self->{nc} == -1) {
6229 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6230 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6231 ## Reconsume.
6232 redo A;
6233 } elsif ($self->{nc} == 0x003E) { # >
6234 ## XML5: Switch to the "DOCTYPE bogus comment state".
6235 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6236 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6237
6238 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6239 $self->{line_prev} = $self->{line};
6240 $self->{column_prev} = $self->{column};
6241 $self->{column}++;
6242 $self->{nc}
6243 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6244 } else {
6245 $self->{set_nc}->($self);
6246 }
6247
6248 redo A;
6249 } else {
6250 ## XML5: Switch to the "DOCTYPE bogus comment state".
6251 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6252 $self->{state} = BEFORE_MD_NAME_STATE;
6253 redo A;
6254 }
6255 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6256 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6257 ## before state", "DOCTYPE ATTLIST name before state".
##
## Handler for the position just before the name of a markup
## declaration.  Dispatch on the next input character $self->{nc}:
##   - space: stay in this state and consume.
##   - "%" while the current token is a GENERAL_ENTITY_TOKEN: switch
##     to DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE and consume.
##   - ">": 'no md name' parse error; switch to
##     DOCTYPE_INTERNAL_SUBSET_STATE and consume.  No token is
##     returned from this branch.
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and reconsume.
##   - anything else: append the character to $self->{ct}->{name} and
##     switch to MD_NAME_STATE.
## "Consume" is the repeated if/else below: take the next character
## from char_buffer when char_buffer_pos is still inside it (updating
## line_prev, column_prev and column), otherwise call the set_nc
## callback.
6258
6259 if ($is_space->{$self->{nc}}) {
6260 ## Stay in the state.
6261
6262 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6263 $self->{line_prev} = $self->{line};
6264 $self->{column_prev} = $self->{column};
6265 $self->{column}++;
6266 $self->{nc}
6267 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6268 } else {
6269 $self->{set_nc}->($self);
6270 }
6271
6272 redo A;
6273 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6274 $self->{nc} == 0x0025) { # %
6275 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6276
6277 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6278 $self->{line_prev} = $self->{line};
6279 $self->{column_prev} = $self->{column};
6280 $self->{column}++;
6281 $self->{nc}
6282 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6283 } else {
6284 $self->{set_nc}->($self);
6285 }
6286
6287 redo A;
6288 } elsif ($self->{nc} == 0x003E) { # >
6289 ## XML5: Same as "Anything else".
6290 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6291 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6292
6293 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6294 $self->{line_prev} = $self->{line};
6295 $self->{column_prev} = $self->{column};
6296 $self->{column}++;
6297 $self->{nc}
6298 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6299 } else {
6300 $self->{set_nc}->($self);
6301 }
6302
6303 redo A;
6304 } elsif ($self->{nc} == -1) {
6305 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6306 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6307 ## Reconsume.
6308 redo A;
6309 } else {
6310 ## XML5: [ATTLIST] Not defined yet.
6311 $self->{ct}->{name} .= chr $self->{nc};
6312 $self->{state} = MD_NAME_STATE;
6313
6314 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6315 $self->{line_prev} = $self->{line};
6316 $self->{column_prev} = $self->{column};
6317 $self->{column}++;
6318 $self->{nc}
6319 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6320 } else {
6321 $self->{set_nc}->($self);
6322 }
6323
6324 redo A;
6325 }
6326 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
## Handler for the character that follows a "%" seen while the
## current token was a GENERAL_ENTITY_TOKEN.  Dispatch on the next
## input character $self->{nc}:
##   - space: retype the current token as PARAMETER_ENTITY_TOKEN, go
##     back to BEFORE_MD_NAME_STATE and consume.
##   - ">": 'no md name' parse error; switch to
##     DOCTYPE_INTERNAL_SUBSET_STATE and consume.  No token is
##     returned from this branch.
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and reconsume.
##   - anything else: 'no space after ENTITY percent' parse error;
##     replace the current token with an empty COMMENT_TOKEN and
##     reconsume the character in BOGUS_COMMENT_STATE.
## "Consume" is the repeated if/else below: take the next character
## from char_buffer when char_buffer_pos is still inside it, otherwise
## call the set_nc callback.
6327 if ($is_space->{$self->{nc}}) {
6328 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6329 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6330 $self->{state} = BEFORE_MD_NAME_STATE;
6331
6332 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6333 $self->{line_prev} = $self->{line};
6334 $self->{column_prev} = $self->{column};
6335 $self->{column}++;
6336 $self->{nc}
6337 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6338 } else {
6339 $self->{set_nc}->($self);
6340 }
6341
6342 redo A;
6343 } elsif ($self->{nc} == 0x003E) { # >
6344 ## XML5: Same as "Anything else".
6345 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6346 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6347
6348 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6349 $self->{line_prev} = $self->{line};
6350 $self->{column_prev} = $self->{column};
6351 $self->{column}++;
6352 $self->{nc}
6353 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6354 } else {
6355 $self->{set_nc}->($self);
6356 }
6357
6358 redo A;
6359 } elsif ($self->{nc} == -1) {
6360 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6361 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6362 ## Reconsume.
6363 redo A;
6364 } else {
6365 ## XML5: No parse error.
6366 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6367 $self->{state} = BOGUS_COMMENT_STATE;
6368 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6369 ## Reconsume.
6370 redo A;
6371 }
6372 } elsif ($self->{state} == MD_NAME_STATE) {
6373 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
##
## Handler that accumulates the name of a markup declaration in
## $self->{ct}->{name}.  Dispatch on the next input character
## $self->{nc}:
##   - space: the name is complete.  The next state depends on the
##     token type: ATTLIST_TOKEN -> DOCTYPE_ATTLIST_NAME_AFTER_STATE,
##     ELEMENT_TOKEN -> AFTER_ELEMENT_NAME_STATE, any other type ->
##     AFTER_DOCTYPE_NAME_STATE.  Then consume.
##   - ">": 'no md def' parse error unless the token is an
##     ATTLIST_TOKEN; switch to DOCTYPE_INTERNAL_SUBSET_STATE, consume
##     and return the current token.
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and return the
##     current token without consuming.
##   - anything else: append the character to the name and stay.
## The "redo A;" that follows each "return" is never reached.
6374
6375 if ($is_space->{$self->{nc}}) {
6376 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6377 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6378 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6379 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6380 } else { # ENTITY/NOTATION
6381 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6382 }
6383
6384 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6385 $self->{line_prev} = $self->{line};
6386 $self->{column_prev} = $self->{column};
6387 $self->{column}++;
6388 $self->{nc}
6389 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6390 } else {
6391 $self->{set_nc}->($self);
6392 }
6393
6394 redo A;
6395 } elsif ($self->{nc} == 0x003E) { # >
6396 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6397 #
6398 } else {
6399 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6400 }
6401 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6402
6403 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6404 $self->{line_prev} = $self->{line};
6405 $self->{column_prev} = $self->{column};
6406 $self->{column}++;
6407 $self->{nc}
6408 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6409 } else {
6410 $self->{set_nc}->($self);
6411 }
6412
6413 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6414 redo A;
6415 } elsif ($self->{nc} == -1) {
6416 ## XML5: [ATTLIST] No parse error.
6417 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6418 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6419 ## Reconsume.
6420 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6421 redo A;
6422 } else {
6423 ## XML5: [ATTLIST] Not defined yet.
6424 $self->{ct}->{name} .= chr $self->{nc};
6425 ## Stay in the state.
6426
6427 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6428 $self->{line_prev} = $self->{line};
6429 $self->{column_prev} = $self->{column};
6430 $self->{column}++;
6431 $self->{nc}
6432 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6433 } else {
6434 $self->{set_nc}->($self);
6435 }
6436
6437 redo A;
6438 }
6439 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
## Handler for the position after the name in an ATTLIST
## declaration.  Dispatch on the next input character $self->{nc}:
##   - space: stay in this state and consume.
##   - ">": switch to DOCTYPE_INTERNAL_SUBSET_STATE, consume and
##     return the current token.
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and return the
##     current token without consuming.
##   - anything else: start a new attribute definition in
##     $self->{ca} (name = this character, empty tokens list, current
##     line/column) and switch to
##     DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE, then consume.
## The "redo A;" that follows each "return" is never reached.
6440 if ($is_space->{$self->{nc}}) {
6441 ## Stay in the state.
6442
6443 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6444 $self->{line_prev} = $self->{line};
6445 $self->{column_prev} = $self->{column};
6446 $self->{column}++;
6447 $self->{nc}
6448 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6449 } else {
6450 $self->{set_nc}->($self);
6451 }
6452
6453 redo A;
6454 } elsif ($self->{nc} == 0x003E) { # >
6455 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6456
6457 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6458 $self->{line_prev} = $self->{line};
6459 $self->{column_prev} = $self->{column};
6460 $self->{column}++;
6461 $self->{nc}
6462 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6463 } else {
6464 $self->{set_nc}->($self);
6465 }
6466
6467 return ($self->{ct}); # ATTLIST
6468 redo A;
6469 } elsif ($self->{nc} == -1) {
6470 ## XML5: No parse error.
6471 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6472 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6473 return ($self->{ct});
6474 redo A;
6475 } else {
6476 ## XML5: Not defined yet.
6477 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6478 tokens => [],
6479 line => $self->{line}, column => $self->{column}};
6480 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6481
6482 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6483 $self->{line_prev} = $self->{line};
6484 $self->{column_prev} = $self->{column};
6485 $self->{column}++;
6486 $self->{nc}
6487 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6488 } else {
6489 $self->{set_nc}->($self);
6490 }
6491
6492 redo A;
6493 }
6494 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
## Handler that accumulates an attribute definition's name in
## $self->{ca}->{name}.  Dispatch on the next input character
## $self->{nc}:
##   - space: switch to DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE
##     and consume.
##   - ">": 'no attr type' parse error; switch to
##     DOCTYPE_INTERNAL_SUBSET_STATE, consume and return the current
##     token.
##   - "(": 'no space before paren' parse error; switch to
##     BEFORE_ALLOWED_TOKEN_STATE and consume.
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and return the
##     current token.
##     NOTE(review): this branch runs the consume step even though
##     the current character is -1 -- confirm that is intended.
##   - anything else: append the character to the name and stay.
## The "redo A;" that follows each "return" is never reached.
6495 if ($is_space->{$self->{nc}}) {
6496 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6497
6498 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6499 $self->{line_prev} = $self->{line};
6500 $self->{column_prev} = $self->{column};
6501 $self->{column}++;
6502 $self->{nc}
6503 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6504 } else {
6505 $self->{set_nc}->($self);
6506 }
6507
6508 redo A;
6509 } elsif ($self->{nc} == 0x003E) { # >
6510 ## XML5: Same as "anything else".
6511 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6512 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6513
6514 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6515 $self->{line_prev} = $self->{line};
6516 $self->{column_prev} = $self->{column};
6517 $self->{column}++;
6518 $self->{nc}
6519 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6520 } else {
6521 $self->{set_nc}->($self);
6522 }
6523
6524 return ($self->{ct}); # ATTLIST
6525 redo A;
6526 } elsif ($self->{nc} == 0x0028) { # (
6527 ## XML5: Same as "anything else".
6528 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6529 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6530
6531 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6532 $self->{line_prev} = $self->{line};
6533 $self->{column_prev} = $self->{column};
6534 $self->{column}++;
6535 $self->{nc}
6536 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6537 } else {
6538 $self->{set_nc}->($self);
6539 }
6540
6541 redo A;
6542 } elsif ($self->{nc} == -1) {
6543 ## XML5: No parse error.
6544 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6545 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6546
6547 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6548 $self->{line_prev} = $self->{line};
6549 $self->{column_prev} = $self->{column};
6550 $self->{column}++;
6551 $self->{nc}
6552 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6553 } else {
6554 $self->{set_nc}->($self);
6555 }
6556
6557 return ($self->{ct}); # ATTLIST
6558 redo A;
6559 } else {
6560 ## XML5: Not defined yet.
6561 $self->{ca}->{name} .= chr $self->{nc};
6562 ## Stay in the state.
6563
6564 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6565 $self->{line_prev} = $self->{line};
6566 $self->{column_prev} = $self->{column};
6567 $self->{column}++;
6568 $self->{nc}
6569 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6570 } else {
6571 $self->{set_nc}->($self);
6572 }
6573
6574 redo A;
6575 }
6576 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
## Handler for the position after an attribute definition's name.
## Dispatch on the next input character $self->{nc}:
##   - space: stay in this state and consume.
##   - ">": 'no attr type' parse error; switch to
##     DOCTYPE_INTERNAL_SUBSET_STATE, consume and return the current
##     token.
##   - "(": switch to BEFORE_ALLOWED_TOKEN_STATE and consume (no
##     parse error in this state).
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and return the
##     current token.
##     NOTE(review): this branch runs the consume step even though
##     the current character is -1 -- confirm that is intended.
##   - anything else: first character of the attribute type; stored
##     in $self->{ca}->{type}, then
##     DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE.
## The "redo A;" that follows each "return" is never reached.
6577 if ($is_space->{$self->{nc}}) {
6578 ## Stay in the state.
6579
6580 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6581 $self->{line_prev} = $self->{line};
6582 $self->{column_prev} = $self->{column};
6583 $self->{column}++;
6584 $self->{nc}
6585 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6586 } else {
6587 $self->{set_nc}->($self);
6588 }
6589
6590 redo A;
6591 } elsif ($self->{nc} == 0x003E) { # >
6592 ## XML5: Same as "anything else".
6593 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6594 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6595
6596 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6597 $self->{line_prev} = $self->{line};
6598 $self->{column_prev} = $self->{column};
6599 $self->{column}++;
6600 $self->{nc}
6601 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6602 } else {
6603 $self->{set_nc}->($self);
6604 }
6605
6606 return ($self->{ct}); # ATTLIST
6607 redo A;
6608 } elsif ($self->{nc} == 0x0028) { # (
6609 ## XML5: Same as "anything else".
6610 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6611
6612 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6613 $self->{line_prev} = $self->{line};
6614 $self->{column_prev} = $self->{column};
6615 $self->{column}++;
6616 $self->{nc}
6617 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6618 } else {
6619 $self->{set_nc}->($self);
6620 }
6621
6622 redo A;
6623 } elsif ($self->{nc} == -1) {
6624 ## XML5: No parse error.
6625 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6626 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6627
6628 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6629 $self->{line_prev} = $self->{line};
6630 $self->{column_prev} = $self->{column};
6631 $self->{column}++;
6632 $self->{nc}
6633 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6634 } else {
6635 $self->{set_nc}->($self);
6636 }
6637
6638 return ($self->{ct});
6639 redo A;
6640 } else {
6641 ## XML5: Not defined yet.
6642 $self->{ca}->{type} = chr $self->{nc};
6643 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6644
6645 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6646 $self->{line_prev} = $self->{line};
6647 $self->{column_prev} = $self->{column};
6648 $self->{column}++;
6649 $self->{nc}
6650 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6651 } else {
6652 $self->{set_nc}->($self);
6653 }
6654
6655 redo A;
6656 }
6657 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
## Handler that accumulates an attribute definition's type keyword
## in $self->{ca}->{type}.  Dispatch on the next input character
## $self->{nc}:
##   - space: switch to DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE
##     and consume.
##   - "#": 'no space before default value' parse error; switch to
##     DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE and consume.
##   - '"' or "'": same parse error; initialise $self->{ca}->{value}
##     to '' and switch to the matching
##     ATTRIBUTE_VALUE_{DOUBLE,SINGLE}_QUOTED_STATE, then consume.
##   - ">": 'no attr default' parse error; switch to
##     DOCTYPE_INTERNAL_SUBSET_STATE, consume and return the current
##     token.
##   - "(": 'no space before paren' parse error; switch to
##     BEFORE_ALLOWED_TOKEN_STATE and consume.
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and return the
##     current token.
##     NOTE(review): this branch runs the consume step even though
##     the current character is -1 -- confirm that is intended.
##   - anything else: append the character to the type and stay.
## The "redo A;" that follows each "return" is never reached.
6658 if ($is_space->{$self->{nc}}) {
6659 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6660
6661 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6662 $self->{line_prev} = $self->{line};
6663 $self->{column_prev} = $self->{column};
6664 $self->{column}++;
6665 $self->{nc}
6666 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6667 } else {
6668 $self->{set_nc}->($self);
6669 }
6670
6671 redo A;
6672 } elsif ($self->{nc} == 0x0023) { # #
6673 ## XML5: Same as "anything else".
6674 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6675 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6676
6677 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6678 $self->{line_prev} = $self->{line};
6679 $self->{column_prev} = $self->{column};
6680 $self->{column}++;
6681 $self->{nc}
6682 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6683 } else {
6684 $self->{set_nc}->($self);
6685 }
6686
6687 redo A;
6688 } elsif ($self->{nc} == 0x0022) { # "
6689 ## XML5: Same as "anything else".
6690 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6691 $self->{ca}->{value} = '';
6692 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6693
6694 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6695 $self->{line_prev} = $self->{line};
6696 $self->{column_prev} = $self->{column};
6697 $self->{column}++;
6698 $self->{nc}
6699 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6700 } else {
6701 $self->{set_nc}->($self);
6702 }
6703
6704 redo A;
6705 } elsif ($self->{nc} == 0x0027) { # '
6706 ## XML5: Same as "anything else".
6707 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6708 $self->{ca}->{value} = '';
6709 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6710
6711 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6712 $self->{line_prev} = $self->{line};
6713 $self->{column_prev} = $self->{column};
6714 $self->{column}++;
6715 $self->{nc}
6716 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6717 } else {
6718 $self->{set_nc}->($self);
6719 }
6720
6721 redo A;
6722 } elsif ($self->{nc} == 0x003E) { # >
6723 ## XML5: Same as "anything else".
6724 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6725 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6726
6727 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6728 $self->{line_prev} = $self->{line};
6729 $self->{column_prev} = $self->{column};
6730 $self->{column}++;
6731 $self->{nc}
6732 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6733 } else {
6734 $self->{set_nc}->($self);
6735 }
6736
6737 return ($self->{ct}); # ATTLIST
6738 redo A;
6739 } elsif ($self->{nc} == 0x0028) { # (
6740 ## XML5: Same as "anything else".
6741 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6742 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6743
6744 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6745 $self->{line_prev} = $self->{line};
6746 $self->{column_prev} = $self->{column};
6747 $self->{column}++;
6748 $self->{nc}
6749 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6750 } else {
6751 $self->{set_nc}->($self);
6752 }
6753
6754 redo A;
6755 } elsif ($self->{nc} == -1) {
6756 ## XML5: No parse error.
6757 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6758 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6759
6760 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6761 $self->{line_prev} = $self->{line};
6762 $self->{column_prev} = $self->{column};
6763 $self->{column}++;
6764 $self->{nc}
6765 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6766 } else {
6767 $self->{set_nc}->($self);
6768 }
6769
6770 return ($self->{ct});
6771 redo A;
6772 } else {
6773 ## XML5: Not defined yet.
6774 $self->{ca}->{type} .= chr $self->{nc};
6775 ## Stay in the state.
6776
6777 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6778 $self->{line_prev} = $self->{line};
6779 $self->{column_prev} = $self->{column};
6780 $self->{column}++;
6781 $self->{nc}
6782 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6783 } else {
6784 $self->{set_nc}->($self);
6785 }
6786
6787 redo A;
6788 }
6789 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
## Handler for the position after an attribute definition's type
## keyword.  Dispatch on the next input character $self->{nc}:
##   - space: stay in this state and consume.
##   - "(": switch to BEFORE_ALLOWED_TOKEN_STATE and consume.
##   - "#": switch to
##     DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE and consume.
##   - '"' or "'": initialise $self->{ca}->{value} to '' and switch
##     to the matching ATTRIBUTE_VALUE_{DOUBLE,SINGLE}_QUOTED_STATE,
##     then consume.
##   - ">": 'no attr default' parse error; switch to
##     DOCTYPE_INTERNAL_SUBSET_STATE, consume and return the current
##     token.
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and return the
##     current token.
##     NOTE(review): this branch runs the consume step even though
##     the current character is -1 -- confirm that is intended.
##   - anything else: 'unquoted attr value' parse error; initialise
##     the value to '' and reconsume the character in
##     ATTRIBUTE_VALUE_UNQUOTED_STATE.
## The "redo A;" that follows each "return" is never reached.
6790 if ($is_space->{$self->{nc}}) {
6791 ## Stay in the state.
6792
6793 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6794 $self->{line_prev} = $self->{line};
6795 $self->{column_prev} = $self->{column};
6796 $self->{column}++;
6797 $self->{nc}
6798 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6799 } else {
6800 $self->{set_nc}->($self);
6801 }
6802
6803 redo A;
6804 } elsif ($self->{nc} == 0x0028) { # (
6805 ## XML5: Same as "anything else".
6806 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6807
6808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6809 $self->{line_prev} = $self->{line};
6810 $self->{column_prev} = $self->{column};
6811 $self->{column}++;
6812 $self->{nc}
6813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6814 } else {
6815 $self->{set_nc}->($self);
6816 }
6817
6818 redo A;
6819 } elsif ($self->{nc} == 0x0023) { # #
6820 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6821
6822 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6823 $self->{line_prev} = $self->{line};
6824 $self->{column_prev} = $self->{column};
6825 $self->{column}++;
6826 $self->{nc}
6827 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6828 } else {
6829 $self->{set_nc}->($self);
6830 }
6831
6832 redo A;
6833 } elsif ($self->{nc} == 0x0022) { # "
6834 ## XML5: Same as "anything else".
6835 $self->{ca}->{value} = '';
6836 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6837
6838 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6839 $self->{line_prev} = $self->{line};
6840 $self->{column_prev} = $self->{column};
6841 $self->{column}++;
6842 $self->{nc}
6843 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6844 } else {
6845 $self->{set_nc}->($self);
6846 }
6847
6848 redo A;
6849 } elsif ($self->{nc} == 0x0027) { # '
6850 ## XML5: Same as "anything else".
6851 $self->{ca}->{value} = '';
6852 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6853
6854 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6855 $self->{line_prev} = $self->{line};
6856 $self->{column_prev} = $self->{column};
6857 $self->{column}++;
6858 $self->{nc}
6859 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6860 } else {
6861 $self->{set_nc}->($self);
6862 }
6863
6864 redo A;
6865 } elsif ($self->{nc} == 0x003E) { # >
6866 ## XML5: Same as "anything else".
6867 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6868 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6869
6870 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6871 $self->{line_prev} = $self->{line};
6872 $self->{column_prev} = $self->{column};
6873 $self->{column}++;
6874 $self->{nc}
6875 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6876 } else {
6877 $self->{set_nc}->($self);
6878 }
6879
6880 return ($self->{ct}); # ATTLIST
6881 redo A;
6882 } elsif ($self->{nc} == -1) {
6883 ## XML5: No parse error.
6884 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6885 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6886
6887 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6888 $self->{line_prev} = $self->{line};
6889 $self->{column_prev} = $self->{column};
6890 $self->{column}++;
6891 $self->{nc}
6892 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6893 } else {
6894 $self->{set_nc}->($self);
6895 }
6896
6897 return ($self->{ct});
6898 redo A;
6899 } else {
6900 ## XML5: Switch to the "DOCTYPE bogus comment state".
6901 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6902 $self->{ca}->{value} = '';
6903 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6904 ## Reconsume.
6905 redo A;
6906 }
6907 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
## Handler for the position before one entry of a parenthesised
## allowed-token list (entered after "(" or "|").  Dispatch on the
## next input character $self->{nc}:
##   - space: stay in this state and consume.
##   - "|": 'empty allowed token' parse error; stay and consume.
##   - ")": 'empty allowed token' parse error; switch to
##     AFTER_ALLOWED_TOKENS_STATE and consume.
##   - ">": 'unclosed allowed tokens' parse error; switch to
##     DOCTYPE_INTERNAL_SUBSET_STATE, consume and return the current
##     token.
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and return the
##     current token.
##     NOTE(review): this branch runs the consume step even though
##     the current character is -1 -- confirm that is intended.
##   - anything else: push a new entry starting with this character
##     onto @{$self->{ca}->{tokens}} and switch to
##     ALLOWED_TOKEN_STATE, then consume.
## The "redo A;" that follows each "return" is never reached.
6908 if ($is_space->{$self->{nc}}) {
6909 ## Stay in the state.
6910
6911 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6912 $self->{line_prev} = $self->{line};
6913 $self->{column_prev} = $self->{column};
6914 $self->{column}++;
6915 $self->{nc}
6916 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6917 } else {
6918 $self->{set_nc}->($self);
6919 }
6920
6921 redo A;
6922 } elsif ($self->{nc} == 0x007C) { # |
6923 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6924 ## Stay in the state.
6925
6926 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6927 $self->{line_prev} = $self->{line};
6928 $self->{column_prev} = $self->{column};
6929 $self->{column}++;
6930 $self->{nc}
6931 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6932 } else {
6933 $self->{set_nc}->($self);
6934 }
6935
6936 redo A;
6937 } elsif ($self->{nc} == 0x0029) { # )
6938 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6939 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6940
6941 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6942 $self->{line_prev} = $self->{line};
6943 $self->{column_prev} = $self->{column};
6944 $self->{column}++;
6945 $self->{nc}
6946 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6947 } else {
6948 $self->{set_nc}->($self);
6949 }
6950
6951 redo A;
6952 } elsif ($self->{nc} == 0x003E) { # >
6953 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6954 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6955
6956 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6957 $self->{line_prev} = $self->{line};
6958 $self->{column_prev} = $self->{column};
6959 $self->{column}++;
6960 $self->{nc}
6961 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6962 } else {
6963 $self->{set_nc}->($self);
6964 }
6965
6966 return ($self->{ct}); # ATTLIST
6967 redo A;
6968 } elsif ($self->{nc} == -1) {
6969 ## XML5: No parse error.
6970 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6971 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6972
6973 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6974 $self->{line_prev} = $self->{line};
6975 $self->{column_prev} = $self->{column};
6976 $self->{column}++;
6977 $self->{nc}
6978 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6979 } else {
6980 $self->{set_nc}->($self);
6981 }
6982
6983 return ($self->{ct});
6984 redo A;
6985 } else {
6986 push @{$self->{ca}->{tokens}}, chr $self->{nc};
6987 $self->{state} = ALLOWED_TOKEN_STATE;
6988
6989 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6990 $self->{line_prev} = $self->{line};
6991 $self->{column_prev} = $self->{column};
6992 $self->{column}++;
6993 $self->{nc}
6994 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6995 } else {
6996 $self->{set_nc}->($self);
6997 }
6998
6999 redo A;
7000 }
7001 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
## Handler that accumulates one entry of the allowed-token list in
## the last element of @{$self->{ca}->{tokens}}.  Dispatch on the
## next input character $self->{nc}:
##   - space: switch to AFTER_ALLOWED_TOKEN_STATE and consume.
##   - "|": switch to BEFORE_ALLOWED_TOKEN_STATE and consume.
##   - ")": switch to AFTER_ALLOWED_TOKENS_STATE and consume.
##   - ">": 'unclosed allowed tokens' parse error; switch to
##     DOCTYPE_INTERNAL_SUBSET_STATE, consume and return the current
##     token.
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and return the
##     current token.
##     NOTE(review): this branch runs the consume step even though
##     the current character is -1 -- confirm that is intended.
##   - anything else: append the character to the last entry and stay.
## The "redo A;" that follows each "return" is never reached.
7002 if ($is_space->{$self->{nc}}) {
7003 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7004
7005 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7006 $self->{line_prev} = $self->{line};
7007 $self->{column_prev} = $self->{column};
7008 $self->{column}++;
7009 $self->{nc}
7010 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7011 } else {
7012 $self->{set_nc}->($self);
7013 }
7014
7015 redo A;
7016 } elsif ($self->{nc} == 0x007C) { # |
7017 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7018
7019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7020 $self->{line_prev} = $self->{line};
7021 $self->{column_prev} = $self->{column};
7022 $self->{column}++;
7023 $self->{nc}
7024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7025 } else {
7026 $self->{set_nc}->($self);
7027 }
7028
7029 redo A;
7030 } elsif ($self->{nc} == 0x0029) { # )
7031 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7032
7033 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7034 $self->{line_prev} = $self->{line};
7035 $self->{column_prev} = $self->{column};
7036 $self->{column}++;
7037 $self->{nc}
7038 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7039 } else {
7040 $self->{set_nc}->($self);
7041 }
7042
7043 redo A;
7044 } elsif ($self->{nc} == 0x003E) { # >
7045 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7046 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7047
7048 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7049 $self->{line_prev} = $self->{line};
7050 $self->{column_prev} = $self->{column};
7051 $self->{column}++;
7052 $self->{nc}
7053 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7054 } else {
7055 $self->{set_nc}->($self);
7056 }
7057
7058 return ($self->{ct}); # ATTLIST
7059 redo A;
7060 } elsif ($self->{nc} == -1) {
7061 ## XML5: No parse error.
7062 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7063 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7064
7065 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7066 $self->{line_prev} = $self->{line};
7067 $self->{column_prev} = $self->{column};
7068 $self->{column}++;
7069 $self->{nc}
7070 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7071 } else {
7072 $self->{set_nc}->($self);
7073 }
7074
7075 return ($self->{ct});
7076 redo A;
7077 } else {
7078 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7079 ## Stay in the state.
7080
7081 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7082 $self->{line_prev} = $self->{line};
7083 $self->{column_prev} = $self->{column};
7084 $self->{column}++;
7085 $self->{nc}
7086 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7087 } else {
7088 $self->{set_nc}->($self);
7089 }
7090
7091 redo A;
7092 }
7093 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
## Handler for the position after whitespace that followed an
## allowed-token entry.  Dispatch on the next input character
## $self->{nc}:
##   - space: stay in this state and consume.
##   - "|": switch to BEFORE_ALLOWED_TOKEN_STATE and consume.
##   - ")": switch to AFTER_ALLOWED_TOKENS_STATE and consume.
##   - ">": 'unclosed allowed tokens' parse error; switch to
##     DOCTYPE_INTERNAL_SUBSET_STATE, consume and return the current
##     token.
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and return the
##     current token.
##     NOTE(review): this branch runs the consume step even though
##     the current character is -1 -- confirm that is intended.
##   - anything else: 'space in allowed token' parse error, reported
##     at line_prev/column_prev; append a single space plus this
##     character to the last entry (however many spaces were skipped)
##     and go back to ALLOWED_TOKEN_STATE, then consume.
## The "redo A;" that follows each "return" is never reached.
7094 if ($is_space->{$self->{nc}}) {
7095 ## Stay in the state.
7096
7097 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7098 $self->{line_prev} = $self->{line};
7099 $self->{column_prev} = $self->{column};
7100 $self->{column}++;
7101 $self->{nc}
7102 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7103 } else {
7104 $self->{set_nc}->($self);
7105 }
7106
7107 redo A;
7108 } elsif ($self->{nc} == 0x007C) { # |
7109 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7110
7111 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7112 $self->{line_prev} = $self->{line};
7113 $self->{column_prev} = $self->{column};
7114 $self->{column}++;
7115 $self->{nc}
7116 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7117 } else {
7118 $self->{set_nc}->($self);
7119 }
7120
7121 redo A;
7122 } elsif ($self->{nc} == 0x0029) { # )
7123 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7124
7125 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7126 $self->{line_prev} = $self->{line};
7127 $self->{column_prev} = $self->{column};
7128 $self->{column}++;
7129 $self->{nc}
7130 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7131 } else {
7132 $self->{set_nc}->($self);
7133 }
7134
7135 redo A;
7136 } elsif ($self->{nc} == 0x003E) { # >
7137 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7138 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7139
7140 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7141 $self->{line_prev} = $self->{line};
7142 $self->{column_prev} = $self->{column};
7143 $self->{column}++;
7144 $self->{nc}
7145 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7146 } else {
7147 $self->{set_nc}->($self);
7148 }
7149
7150 return ($self->{ct}); # ATTLIST
7151 redo A;
7152 } elsif ($self->{nc} == -1) {
7153 ## XML5: No parse error.
7154 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7155 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7156
7157 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7158 $self->{line_prev} = $self->{line};
7159 $self->{column_prev} = $self->{column};
7160 $self->{column}++;
7161 $self->{nc}
7162 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7163 } else {
7164 $self->{set_nc}->($self);
7165 }
7166
7167 return ($self->{ct});
7168 redo A;
7169 } else {
7170 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7171 line => $self->{line_prev},
7172 column => $self->{column_prev});
7173 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7174 $self->{state} = ALLOWED_TOKEN_STATE;
7175
7176 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7177 $self->{line_prev} = $self->{line};
7178 $self->{column_prev} = $self->{column};
7179 $self->{column}++;
7180 $self->{nc}
7181 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7182 } else {
7183 $self->{set_nc}->($self);
7184 }
7185
7186 redo A;
7187 }
7188 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
## Handler for the position right after the ")" that closes an
## allowed-token list.  Dispatch on the next input character
## $self->{nc}:
##   - space: switch to BEFORE_ATTR_DEFAULT_STATE and consume.
##   - "#": 'no space before default value' parse error; switch to
##     DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE and consume.
##   - '"' or "'": same parse error; initialise $self->{ca}->{value}
##     to '' and switch to the matching
##     ATTRIBUTE_VALUE_{DOUBLE,SINGLE}_QUOTED_STATE, then consume.
##   - ">": 'no attr default' parse error; switch to
##     DOCTYPE_INTERNAL_SUBSET_STATE, consume and return the current
##     token.
##   - -1 (apparently the end-of-input marker): 'unclosed md' parse
##     error; switch to DOCTYPE_INTERNAL_SUBSET_STATE and return the
##     current token.
##     NOTE(review): this branch runs the consume step even though
##     the current character is -1 -- confirm that is intended.
##   - anything else: 'unquoted attr value' parse error; reconsume
##     the character in ATTRIBUTE_VALUE_UNQUOTED_STATE.
##     NOTE(review): unlike the quoted branches above, this branch
##     does not set $self->{ca}->{value} to '' first -- confirm
##     that is intended.
## The "redo A;" that follows each "return" is never reached.
7189 if ($is_space->{$self->{nc}}) {
7190 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7191
7192 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7193 $self->{line_prev} = $self->{line};
7194 $self->{column_prev} = $self->{column};
7195 $self->{column}++;
7196 $self->{nc}
7197 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7198 } else {
7199 $self->{set_nc}->($self);
7200 }
7201
7202 redo A;
7203 } elsif ($self->{nc} == 0x0023) { # #
7204 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7205 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7206
7207 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7208 $self->{line_prev} = $self->{line};
7209 $self->{column_prev} = $self->{column};
7210 $self->{column}++;
7211 $self->{nc}
7212 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7213 } else {
7214 $self->{set_nc}->($self);
7215 }
7216
7217 redo A;
7218 } elsif ($self->{nc} == 0x0022) { # "
7219 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7220 $self->{ca}->{value} = '';
7221 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7222
7223 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7224 $self->{line_prev} = $self->{line};
7225 $self->{column_prev} = $self->{column};
7226 $self->{column}++;
7227 $self->{nc}
7228 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7229 } else {
7230 $self->{set_nc}->($self);
7231 }
7232
7233 redo A;
7234 } elsif ($self->{nc} == 0x0027) { # '
7235 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7236 $self->{ca}->{value} = '';
7237 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7238
7239 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7240 $self->{line_prev} = $self->{line};
7241 $self->{column_prev} = $self->{column};
7242 $self->{column}++;
7243 $self->{nc}
7244 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7245 } else {
7246 $self->{set_nc}->($self);
7247 }
7248
7249 redo A;
7250 } elsif ($self->{nc} == 0x003E) { # >
7251 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7252 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7253
7254 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7255 $self->{line_prev} = $self->{line};
7256 $self->{column_prev} = $self->{column};
7257 $self->{column}++;
7258 $self->{nc}
7259 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7260 } else {
7261 $self->{set_nc}->($self);
7262 }
7263
7264 return ($self->{ct}); # ATTLIST
7265 redo A;
7266 } elsif ($self->{nc} == -1) {
7267 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7268 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7269
7270 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7271 $self->{line_prev} = $self->{line};
7272 $self->{column_prev} = $self->{column};
7273 $self->{column}++;
7274 $self->{nc}
7275 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7276 } else {
7277 $self->{set_nc}->($self);
7278 }
7279
7280 return ($self->{ct});
7281 redo A;
7282 } else {
7283 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7284 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7285 ## Reconsume.
7286 redo A;
7287 }
7288 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7289 if ($is_space->{$self->{nc}}) {
7290 ## Stay in the state.
7291
7292 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7293 $self->{line_prev} = $self->{line};
7294 $self->{column_prev} = $self->{column};
7295 $self->{column}++;
7296 $self->{nc}
7297 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7298 } else {
7299 $self->{set_nc}->($self);
7300 }
7301
7302 redo A;
7303 } elsif ($self->{nc} == 0x0023) { # #
7304 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7305
7306 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7307 $self->{line_prev} = $self->{line};
7308 $self->{column_prev} = $self->{column};
7309 $self->{column}++;
7310 $self->{nc}
7311 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7312 } else {
7313 $self->{set_nc}->($self);
7314 }
7315
7316 redo A;
7317 } elsif ($self->{nc} == 0x0022) { # "
7318 $self->{ca}->{value} = '';
7319 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7320
7321 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7322 $self->{line_prev} = $self->{line};
7323 $self->{column_prev} = $self->{column};
7324 $self->{column}++;
7325 $self->{nc}
7326 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7327 } else {
7328 $self->{set_nc}->($self);
7329 }
7330
7331 redo A;
7332 } elsif ($self->{nc} == 0x0027) { # '
7333 $self->{ca}->{value} = '';
7334 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7335
7336 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7337 $self->{line_prev} = $self->{line};
7338 $self->{column_prev} = $self->{column};
7339 $self->{column}++;
7340 $self->{nc}
7341 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7342 } else {
7343 $self->{set_nc}->($self);
7344 }
7345
7346 redo A;
7347 } elsif ($self->{nc} == 0x003E) { # >
7348 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7349 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7350
7351 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7352 $self->{line_prev} = $self->{line};
7353 $self->{column_prev} = $self->{column};
7354 $self->{column}++;
7355 $self->{nc}
7356 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7357 } else {
7358 $self->{set_nc}->($self);
7359 }
7360
7361 return ($self->{ct}); # ATTLIST
7362 redo A;
7363 } elsif ($self->{nc} == -1) {
7364 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7365 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7366
7367 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7368 $self->{line_prev} = $self->{line};
7369 $self->{column_prev} = $self->{column};
7370 $self->{column}++;
7371 $self->{nc}
7372 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7373 } else {
7374 $self->{set_nc}->($self);
7375 }
7376
7377 return ($self->{ct});
7378 redo A;
7379 } else {
7380 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7381 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7382 ## Reconsume.
7383 redo A;
7384 }
7385 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7386 if ($is_space->{$self->{nc}}) {
7387 ## XML5: No parse error.
7388 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7389 $self->{state} = BOGUS_MD_STATE;
7390 ## Reconsume.
7391 redo A;
7392 } elsif ($self->{nc} == 0x0022) { # "
7393 ## XML5: Same as "anything else".
7394 $self->{ca}->{value} = '';
7395 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7396
7397 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7398 $self->{line_prev} = $self->{line};
7399 $self->{column_prev} = $self->{column};
7400 $self->{column}++;
7401 $self->{nc}
7402 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7403 } else {
7404 $self->{set_nc}->($self);
7405 }
7406
7407 redo A;
7408 } elsif ($self->{nc} == 0x0027) { # '
7409 ## XML5: Same as "anything else".
7410 $self->{ca}->{value} = '';
7411 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7412
7413 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7414 $self->{line_prev} = $self->{line};
7415 $self->{column_prev} = $self->{column};
7416 $self->{column}++;
7417 $self->{nc}
7418 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7419 } else {
7420 $self->{set_nc}->($self);
7421 }
7422
7423 redo A;
7424 } elsif ($self->{nc} == 0x003E) { # >
7425 ## XML5: Same as "anything else".
7426 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7427 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7428
7429 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7430 $self->{line_prev} = $self->{line};
7431 $self->{column_prev} = $self->{column};
7432 $self->{column}++;
7433 $self->{nc}
7434 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7435 } else {
7436 $self->{set_nc}->($self);
7437 }
7438
7439 return ($self->{ct}); # ATTLIST
7440 redo A;
7441 } elsif ($self->{nc} == -1) {
7442 ## XML5: No parse error.
7443 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7444 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7445
7446 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7447 $self->{line_prev} = $self->{line};
7448 $self->{column_prev} = $self->{column};
7449 $self->{column}++;
7450 $self->{nc}
7451 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7452 } else {
7453 $self->{set_nc}->($self);
7454 }
7455
7456 return ($self->{ct});
7457 redo A;
7458 } else {
7459 $self->{ca}->{default} = chr $self->{nc};
7460 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7461
7462 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7463 $self->{line_prev} = $self->{line};
7464 $self->{column_prev} = $self->{column};
7465 $self->{column}++;
7466 $self->{nc}
7467 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7468 } else {
7469 $self->{set_nc}->($self);
7470 }
7471
7472 redo A;
7473 }
7474 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7475 if ($is_space->{$self->{nc}}) {
7476 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7477
7478 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7479 $self->{line_prev} = $self->{line};
7480 $self->{column_prev} = $self->{column};
7481 $self->{column}++;
7482 $self->{nc}
7483 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7484 } else {
7485 $self->{set_nc}->($self);
7486 }
7487
7488 redo A;
7489 } elsif ($self->{nc} == 0x0022) { # "
7490 ## XML5: Same as "anything else".
7491 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7492 $self->{ca}->{value} = '';
7493 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7494
7495 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7496 $self->{line_prev} = $self->{line};
7497 $self->{column_prev} = $self->{column};
7498 $self->{column}++;
7499 $self->{nc}
7500 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7501 } else {
7502 $self->{set_nc}->($self);
7503 }
7504
7505 redo A;
7506 } elsif ($self->{nc} == 0x0027) { # '
7507 ## XML5: Same as "anything else".
7508 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7509 $self->{ca}->{value} = '';
7510 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7511
7512 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7513 $self->{line_prev} = $self->{line};
7514 $self->{column_prev} = $self->{column};
7515 $self->{column}++;
7516 $self->{nc}
7517 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7518 } else {
7519 $self->{set_nc}->($self);
7520 }
7521
7522 redo A;
7523 } elsif ($self->{nc} == 0x003E) { # >
7524 ## XML5: Same as "anything else".
7525 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7526 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7527
7528 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7529 $self->{line_prev} = $self->{line};
7530 $self->{column_prev} = $self->{column};
7531 $self->{column}++;
7532 $self->{nc}
7533 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7534 } else {
7535 $self->{set_nc}->($self);
7536 }
7537
7538 return ($self->{ct}); # ATTLIST
7539 redo A;
7540 } elsif ($self->{nc} == -1) {
7541 ## XML5: No parse error.
7542 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7543 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7544 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7545
7546 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7547 $self->{line_prev} = $self->{line};
7548 $self->{column_prev} = $self->{column};
7549 $self->{column}++;
7550 $self->{nc}
7551 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7552 } else {
7553 $self->{set_nc}->($self);
7554 }
7555
7556 return ($self->{ct});
7557 redo A;
7558 } else {
7559 $self->{ca}->{default} .= chr $self->{nc};
7560 ## Stay in the state.
7561
7562 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7563 $self->{line_prev} = $self->{line};
7564 $self->{column_prev} = $self->{column};
7565 $self->{column}++;
7566 $self->{nc}
7567 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7568 } else {
7569 $self->{set_nc}->($self);
7570 }
7571
7572 redo A;
7573 }
7574 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7575 if ($is_space->{$self->{nc}}) {
7576 ## Stay in the state.
7577
7578 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7579 $self->{line_prev} = $self->{line};
7580 $self->{column_prev} = $self->{column};
7581 $self->{column}++;
7582 $self->{nc}
7583 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7584 } else {
7585 $self->{set_nc}->($self);
7586 }
7587
7588 redo A;
7589 } elsif ($self->{nc} == 0x0022) { # "
7590 $self->{ca}->{value} = '';
7591 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7592
7593 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7594 $self->{line_prev} = $self->{line};
7595 $self->{column_prev} = $self->{column};
7596 $self->{column}++;
7597 $self->{nc}
7598 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7599 } else {
7600 $self->{set_nc}->($self);
7601 }
7602
7603 redo A;
7604 } elsif ($self->{nc} == 0x0027) { # '
7605 $self->{ca}->{value} = '';
7606 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7607
7608 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7609 $self->{line_prev} = $self->{line};
7610 $self->{column_prev} = $self->{column};
7611 $self->{column}++;
7612 $self->{nc}
7613 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7614 } else {
7615 $self->{set_nc}->($self);
7616 }
7617
7618 redo A;
7619 } elsif ($self->{nc} == 0x003E) { # >
7620 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7621 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7622
7623 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7624 $self->{line_prev} = $self->{line};
7625 $self->{column_prev} = $self->{column};
7626 $self->{column}++;
7627 $self->{nc}
7628 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7629 } else {
7630 $self->{set_nc}->($self);
7631 }
7632
7633 return ($self->{ct}); # ATTLIST
7634 redo A;
7635 } elsif ($self->{nc} == -1) {
7636 ## XML5: No parse error.
7637 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7638 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7639 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7640
7641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7642 $self->{line_prev} = $self->{line};
7643 $self->{column_prev} = $self->{column};
7644 $self->{column}++;
7645 $self->{nc}
7646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7647 } else {
7648 $self->{set_nc}->($self);
7649 }
7650
7651 return ($self->{ct});
7652 redo A;
7653 } else {
7654 ## XML5: Not defined yet.
7655 if ($self->{ca}->{default} eq 'FIXED') {
7656 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7657 } else {
7658 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7659 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7660 }
7661 ## Reconsume.
7662 redo A;
7663 }
7664 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7665 if ($is_space->{$self->{nc}} or
7666 $self->{nc} == -1 or
7667 $self->{nc} == 0x003E) { # >
7668 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7669 ## Reconsume.
7670 redo A;
7671 } else {
7672 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7673 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7674 ## Reconsume.
7675 redo A;
7676 }
7677 } elsif ($self->{state} == NDATA_STATE) {
7678 ## ASCII case-insensitive
7679 if ($self->{nc} == [
7680 undef,
7681 0x0044, # D
7682 0x0041, # A
7683 0x0054, # T
7684 ]->[length $self->{kwd}] or
7685 $self->{nc} == [
7686 undef,
7687 0x0064, # d
7688 0x0061, # a
7689 0x0074, # t
7690 ]->[length $self->{kwd}]) {
7691
7692 ## Stay in the state.
7693 $self->{kwd} .= chr $self->{nc};
7694
7695 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7696 $self->{line_prev} = $self->{line};
7697 $self->{column_prev} = $self->{column};
7698 $self->{column}++;
7699 $self->{nc}
7700 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7701 } else {
7702 $self->{set_nc}->($self);
7703 }
7704
7705 redo A;
7706 } elsif ((length $self->{kwd}) == 4 and
7707 ($self->{nc} == 0x0041 or # A
7708 $self->{nc} == 0x0061)) { # a
7709 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7710
7711 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7712 text => 'NDATA',
7713 line => $self->{line_prev},
7714 column => $self->{column_prev} - 4);
7715 } else {
7716
7717 }
7718 $self->{state} = AFTER_NDATA_STATE;
7719
7720 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7721 $self->{line_prev} = $self->{line};
7722 $self->{column_prev} = $self->{column};
7723 $self->{column}++;
7724 $self->{nc}
7725 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7726 } else {
7727 $self->{set_nc}->($self);
7728 }
7729
7730 redo A;
7731 } else {
7732 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7733 line => $self->{line_prev},
7734 column => $self->{column_prev} + 1
7735 - length $self->{kwd});
7736
7737 $self->{state} = BOGUS_MD_STATE;
7738 ## Reconsume.
7739 redo A;
7740 }
7741 } elsif ($self->{state} == AFTER_NDATA_STATE) {
7742 if ($is_space->{$self->{nc}}) {
7743 $self->{state} = BEFORE_NOTATION_NAME_STATE;
7744
7745 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7746 $self->{line_prev} = $self->{line};
7747 $self->{column_prev} = $self->{column};
7748 $self->{column}++;
7749 $self->{nc}
7750 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7751 } else {
7752 $self->{set_nc}->($self);
7753 }
7754
7755 redo A;
7756 } elsif ($self->{nc} == 0x003E) { # >
7757 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7758 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7759
7760 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7761 $self->{line_prev} = $self->{line};
7762 $self->{column_prev} = $self->{column};
7763 $self->{column}++;
7764 $self->{nc}
7765 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7766 } else {
7767 $self->{set_nc}->($self);
7768 }
7769
7770 return ($self->{ct}); # ENTITY
7771 redo A;
7772 } elsif ($self->{nc} == -1) {
7773 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7774 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7775
7776 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7777 $self->{line_prev} = $self->{line};
7778 $self->{column_prev} = $self->{column};
7779 $self->{column}++;
7780 $self->{nc}
7781 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7782 } else {
7783 $self->{set_nc}->($self);
7784 }
7785
7786 return ($self->{ct}); # ENTITY
7787 redo A;
7788 } else {
7789 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7790 line => $self->{line_prev},
7791 column => $self->{column_prev} + 1
7792 - length $self->{kwd});
7793 $self->{state} = BOGUS_MD_STATE;
7794 ## Reconsume.
7795 redo A;
7796 }
7797 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7798 if ($is_space->{$self->{nc}}) {
7799 ## Stay in the state.
7800
7801 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7802 $self->{line_prev} = $self->{line};
7803 $self->{column_prev} = $self->{column};
7804 $self->{column}++;
7805 $self->{nc}
7806 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7807 } else {
7808 $self->{set_nc}->($self);
7809 }
7810
7811 redo A;
7812 } elsif ($self->{nc} == 0x003E) { # >
7813 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7814 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7815
7816 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7817 $self->{line_prev} = $self->{line};
7818 $self->{column_prev} = $self->{column};
7819 $self->{column}++;
7820 $self->{nc}
7821 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7822 } else {
7823 $self->{set_nc}->($self);
7824 }
7825
7826 return ($self->{ct}); # ENTITY
7827 redo A;
7828 } elsif ($self->{nc} == -1) {
7829 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7830 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7831
7832 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7833 $self->{line_prev} = $self->{line};
7834 $self->{column_prev} = $self->{column};
7835 $self->{column}++;
7836 $self->{nc}
7837 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7838 } else {
7839 $self->{set_nc}->($self);
7840 }
7841
7842 return ($self->{ct}); # ENTITY
7843 redo A;
7844 } else {
7845 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7846 $self->{state} = NOTATION_NAME_STATE;
7847
7848 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7849 $self->{line_prev} = $self->{line};
7850 $self->{column_prev} = $self->{column};
7851 $self->{column}++;
7852 $self->{nc}
7853 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7854 } else {
7855 $self->{set_nc}->($self);
7856 }
7857
7858 redo A;
7859 }
7860 } elsif ($self->{state} == NOTATION_NAME_STATE) {
7861 if ($is_space->{$self->{nc}}) {
7862 $self->{state} = AFTER_MD_DEF_STATE;
7863
7864 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7865 $self->{line_prev} = $self->{line};
7866 $self->{column_prev} = $self->{column};
7867 $self->{column}++;
7868 $self->{nc}
7869 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7870 } else {
7871 $self->{set_nc}->($self);
7872 }
7873
7874 redo A;
7875 } elsif ($self->{nc} == 0x003E) { # >
7876 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7877
7878 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7879 $self->{line_prev} = $self->{line};
7880 $self->{column_prev} = $self->{column};
7881 $self->{column}++;
7882 $self->{nc}
7883 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7884 } else {
7885 $self->{set_nc}->($self);
7886 }
7887
7888 return ($self->{ct}); # ENTITY
7889 redo A;
7890 } elsif ($self->{nc} == -1) {
7891 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7892 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7893
7894 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7895 $self->{line_prev} = $self->{line};
7896 $self->{column_prev} = $self->{column};
7897 $self->{column}++;
7898 $self->{nc}
7899 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7900 } else {
7901 $self->{set_nc}->($self);
7902 }
7903
7904 return ($self->{ct}); # ENTITY
7905 redo A;
7906 } else {
7907 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7908 ## Stay in the state.
7909
7910 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7911 $self->{line_prev} = $self->{line};
7912 $self->{column_prev} = $self->{column};
7913 $self->{column}++;
7914 $self->{nc}
7915 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7916 } else {
7917 $self->{set_nc}->($self);
7918 }
7919
7920 redo A;
7921 }
7922 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7923 if ($self->{nc} == 0x0022) { # "
7924 $self->{state} = AFTER_MD_DEF_STATE;
7925
7926 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7927 $self->{line_prev} = $self->{line};
7928 $self->{column_prev} = $self->{column};
7929 $self->{column}++;
7930 $self->{nc}
7931 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7932 } else {
7933 $self->{set_nc}->($self);
7934 }
7935
7936 redo A;
7937 } elsif ($self->{nc} == 0x0026) { # &
7938 $self->{prev_state} = $self->{state};
7939 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7940 $self->{entity_add} = 0x0022; # "
7941
7942 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7943 $self->{line_prev} = $self->{line};
7944 $self->{column_prev} = $self->{column};
7945 $self->{column}++;
7946 $self->{nc}
7947 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7948 } else {
7949 $self->{set_nc}->($self);
7950 }
7951
7952 redo A;
7953 ## TODO: %
7954 } elsif ($self->{nc} == -1) {
7955 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7956 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7957 ## Reconsume.
7958 return ($self->{ct}); # ENTITY
7959 redo A;
7960 } else {
7961 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7962
7963 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7964 $self->{line_prev} = $self->{line};
7965 $self->{column_prev} = $self->{column};
7966 $self->{column}++;
7967 $self->{nc}
7968 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7969 } else {
7970 $self->{set_nc}->($self);
7971 }
7972
7973 redo A;
7974 }
7975 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7976 if ($self->{nc} == 0x0027) { # '
7977 $self->{state} = AFTER_MD_DEF_STATE;
7978
7979 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7980 $self->{line_prev} = $self->{line};
7981 $self->{column_prev} = $self->{column};
7982 $self->{column}++;
7983 $self->{nc}
7984 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7985 } else {
7986 $self->{set_nc}->($self);
7987 }
7988
7989 redo A;
7990 } elsif ($self->{nc} == 0x0026) { # &
7991 $self->{prev_state} = $self->{state};
7992 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7993 $self->{entity_add} = 0x0027; # '
7994
7995 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7996 $self->{line_prev} = $self->{line};
7997 $self->{column_prev} = $self->{column};
7998 $self->{column}++;
7999 $self->{nc}
8000 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8001 } else {
8002 $self->{set_nc}->($self);
8003 }
8004
8005 redo A;
8006 ## TODO: %
8007 } elsif ($self->{nc} == -1) {
8008 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8009 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8010 ## Reconsume.
8011 return ($self->{ct}); # ENTITY
8012 redo A;
8013 } else {
8014 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8015
8016 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8017 $self->{line_prev} = $self->{line};
8018 $self->{column_prev} = $self->{column};
8019 $self->{column}++;
8020 $self->{nc}
8021 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8022 } else {
8023 $self->{set_nc}->($self);
8024 }
8025
8026 redo A;
8027 }
8028 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8029 if ($is_space->{$self->{nc}} or
8030 {
8031 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8032 $self->{entity_add} => 1,
8033 }->{$self->{nc}}) {
8034 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8035 line => $self->{line_prev},
8036 column => $self->{column_prev}
8037 + ($self->{nc} == -1 ? 1 : 0));
8038 ## Don't consume
8039 ## Return nothing.
8040 #
8041 } elsif ($self->{nc} == 0x0023) { # #
8042 $self->{ca} = $self->{ct};
8043 $self->{state} = ENTITY_HASH_STATE;
8044 $self->{kwd} = '#';
8045
8046 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8047 $self->{line_prev} = $self->{line};
8048 $self->{column_prev} = $self->{column};
8049 $self->{column}++;
8050 $self->{nc}
8051 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8052 } else {
8053 $self->{set_nc}->($self);
8054 }
8055
8056 redo A;
8057 } else {
8058 #
8059 }
8060
8061 $self->{ct}->{value} .= '&';
8062 $self->{state} = $self->{prev_state};
8063 ## Reconsume.
8064 redo A;
8065 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8066 if ($is_space->{$self->{nc}}) {
8067 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8068
8069 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8070 $self->{line_prev} = $self->{line};
8071 $self->{column_prev} = $self->{column};
8072 $self->{column}++;
8073 $self->{nc}
8074 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8075 } else {
8076 $self->{set_nc}->($self);
8077 }
8078
8079 redo A;
8080 } elsif ($self->{nc} == 0x0028) { # (
8081 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8082 $self->{ct}->{content} = ['('];
8083 $self->{group_depth} = 1;
8084
8085 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8086 $self->{line_prev} = $self->{line};
8087 $self->{column_prev} = $self->{column};
8088 $self->{column}++;
8089 $self->{nc}
8090 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8091 } else {
8092 $self->{set_nc}->($self);
8093 }
8094
8095 redo A;
8096 } elsif ($self->{nc} == 0x003E) { # >
8097 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8098 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8099
8100 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8101 $self->{line_prev} = $self->{line};
8102 $self->{column_prev} = $self->{column};
8103 $self->{column}++;
8104 $self->{nc}
8105 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8106 } else {
8107 $self->{set_nc}->($self);
8108 }
8109
8110 return ($self->{ct}); # ELEMENT
8111 redo A;
8112 } elsif ($self->{nc} == -1) {
8113 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8114 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8115
8116 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8117 $self->{line_prev} = $self->{line};
8118 $self->{column_prev} = $self->{column};
8119 $self->{column}++;
8120 $self->{nc}
8121 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8122 } else {
8123 $self->{set_nc}->($self);
8124 }
8125
8126 return ($self->{ct}); # ELEMENT
8127 redo A;
8128 } else {
8129 $self->{ct}->{content} = [chr $self->{nc}];
8130 $self->{state} = CONTENT_KEYWORD_STATE;
8131
8132 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8133 $self->{line_prev} = $self->{line};
8134 $self->{column_prev} = $self->{column};
8135 $self->{column}++;
8136 $self->{nc}
8137 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8138 } else {
8139 $self->{set_nc}->($self);
8140 }
8141
8142 redo A;
8143 }
8144 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8145 if ($is_space->{$self->{nc}}) {
8146 $self->{state} = AFTER_MD_DEF_STATE;
8147
8148 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8149 $self->{line_prev} = $self->{line};
8150 $self->{column_prev} = $self->{column};
8151 $self->{column}++;
8152 $self->{nc}
8153 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8154 } else {
8155 $self->{set_nc}->($self);
8156 }
8157
8158 redo A;
8159 } elsif ($self->{nc} == 0x003E) { # >
8160 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8161
8162 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8163 $self->{line_prev} = $self->{line};
8164 $self->{column_prev} = $self->{column};
8165 $self->{column}++;
8166 $self->{nc}
8167 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8168 } else {
8169 $self->{set_nc}->($self);
8170 }
8171
8172 return ($self->{ct}); # ELEMENT
8173 redo A;
8174 } elsif ($self->{nc} == -1) {
8175 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8176 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8177
8178 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8179 $self->{line_prev} = $self->{line};
8180 $self->{column_prev} = $self->{column};
8181 $self->{column}++;
8182 $self->{nc}
8183 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8184 } else {
8185 $self->{set_nc}->($self);
8186 }
8187
8188 return ($self->{ct}); # ELEMENT
8189 redo A;
8190 } else {
8191 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8192 ## Stay in the state.
8193
8194 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8195 $self->{line_prev} = $self->{line};
8196 $self->{column_prev} = $self->{column};
8197 $self->{column}++;
8198 $self->{nc}
8199 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8200 } else {
8201 $self->{set_nc}->($self);
8202 }
8203
8204 redo A;
8205 }
8206 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8207 if ($is_space->{$self->{nc}}) {
8208 ## Stay in the state.
8209
8210 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8211 $self->{line_prev} = $self->{line};
8212 $self->{column_prev} = $self->{column};
8213 $self->{column}++;
8214 $self->{nc}
8215 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8216 } else {
8217 $self->{set_nc}->($self);
8218 }
8219
8220 redo A;
8221 } elsif ($self->{nc} == 0x0028) { # (
8222 $self->{group_depth}++;
8223 push @{$self->{ct}->{content}}, chr $self->{nc};
8224 ## Stay in the state.
8225
8226 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8227 $self->{line_prev} = $self->{line};
8228 $self->{column_prev} = $self->{column};
8229 $self->{column}++;
8230 $self->{nc}
8231 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8232 } else {
8233 $self->{set_nc}->($self);
8234 }
8235
8236 redo A;
8237 } elsif ($self->{nc} == 0x007C or # |
8238 $self->{nc} == 0x002C) { # ,
8239 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8240 ## Stay in the state.
8241
8242 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8243 $self->{line_prev} = $self->{line};
8244 $self->{column_prev} = $self->{column};
8245 $self->{column}++;
8246 $self->{nc}
8247 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8248 } else {
8249 $self->{set_nc}->($self);
8250 }
8251
8252 redo A;
8253 } elsif ($self->{nc} == 0x0029) { # )
8254 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8255 push @{$self->{ct}->{content}}, chr $self->{nc};
8256 $self->{group_depth}--;
8257 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8258
8259 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8260 $self->{line_prev} = $self->{line};
8261 $self->{column_prev} = $self->{column};
8262 $self->{column}++;
8263 $self->{nc}
8264 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8265 } else {
8266 $self->{set_nc}->($self);
8267 }
8268
8269 redo A;
8270 } elsif ($self->{nc} == 0x003E) { # >
8271 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8272 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8273 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8274
8275 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8276 $self->{line_prev} = $self->{line};
8277 $self->{column_prev} = $self->{column};
8278 $self->{column}++;
8279 $self->{nc}
8280 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8281 } else {
8282 $self->{set_nc}->($self);
8283 }
8284
8285 return ($self->{ct}); # ELEMENT
8286 redo A;
8287 } elsif ($self->{nc} == -1) {
8288 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8289 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8290 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8291
8292 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8293 $self->{line_prev} = $self->{line};
8294 $self->{column_prev} = $self->{column};
8295 $self->{column}++;
8296 $self->{nc}
8297 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8298 } else {
8299 $self->{set_nc}->($self);
8300 }
8301
8302 return ($self->{ct}); # ELEMENT
8303 redo A;
8304 } else {
8305 push @{$self->{ct}->{content}}, chr $self->{nc};
8306 $self->{state} = CM_ELEMENT_NAME_STATE;
8307
8308 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8309 $self->{line_prev} = $self->{line};
8310 $self->{column_prev} = $self->{column};
8311 $self->{column}++;
8312 $self->{nc}
8313 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8314 } else {
8315 $self->{set_nc}->($self);
8316 }
8317
8318 redo A;
8319 }
8320 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8321 if ($is_space->{$self->{nc}}) {
8322 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8323
8324 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8325 $self->{line_prev} = $self->{line};
8326 $self->{column_prev} = $self->{column};
8327 $self->{column}++;
8328 $self->{nc}
8329 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8330 } else {
8331 $self->{set_nc}->($self);
8332 }
8333
8334 redo A;
8335 } elsif ($self->{nc} == 0x002A or # *
8336 $self->{nc} == 0x002B or # +
8337 $self->{nc} == 0x003F) { # ?
8338 push @{$self->{ct}->{content}}, chr $self->{nc};
8339 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8340
8341 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8342 $self->{line_prev} = $self->{line};
8343 $self->{column_prev} = $self->{column};
8344 $self->{column}++;
8345 $self->{nc}
8346 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8347 } else {
8348 $self->{set_nc}->($self);
8349 }
8350
8351 redo A;
8352 } elsif ($self->{nc} == 0x007C or # |
8353 $self->{nc} == 0x002C) { # ,
8354 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8355 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8356
8357 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8358 $self->{line_prev} = $self->{line};
8359 $self->{column_prev} = $self->{column};
8360 $self->{column}++;
8361 $self->{nc}
8362 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8363 } else {
8364 $self->{set_nc}->($self);
8365 }
8366
8367 redo A;
8368 } elsif ($self->{nc} == 0x0029) { # )
8369 $self->{group_depth}--;
8370 push @{$self->{ct}->{content}}, chr $self->{nc};
8371 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8372
8373 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8374 $self->{line_prev} = $self->{line};
8375 $self->{column_prev} = $self->{column};
8376 $self->{column}++;
8377 $self->{nc}
8378 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8379 } else {
8380 $self->{set_nc}->($self);
8381 }
8382
8383 redo A;
8384 } elsif ($self->{nc} == 0x003E) { # >
8385 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8386 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8387 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8388
8389 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8390 $self->{line_prev} = $self->{line};
8391 $self->{column_prev} = $self->{column};
8392 $self->{column}++;
8393 $self->{nc}
8394 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8395 } else {
8396 $self->{set_nc}->($self);
8397 }
8398
8399 return ($self->{ct}); # ELEMENT
8400 redo A;
8401 } elsif ($self->{nc} == -1) {
8402 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8403 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8404 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8405
8406 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8407 $self->{line_prev} = $self->{line};
8408 $self->{column_prev} = $self->{column};
8409 $self->{column}++;
8410 $self->{nc}
8411 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8412 } else {
8413 $self->{set_nc}->($self);
8414 }
8415
8416 return ($self->{ct}); # ELEMENT
8417 redo A;
8418 } else {
8419 $self->{ct}->{content}->[-1] .= chr $self->{nc};
8420 ## Stay in the state.
8421
8422 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8423 $self->{line_prev} = $self->{line};
8424 $self->{column_prev} = $self->{column};
8425 $self->{column}++;
8426 $self->{nc}
8427 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8428 } else {
8429 $self->{set_nc}->($self);
8430 }
8431
8432 redo A;
8433 }
8434 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8435 if ($is_space->{$self->{nc}}) {
8436 ## Stay in the state.
8437
8438 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8439 $self->{line_prev} = $self->{line};
8440 $self->{column_prev} = $self->{column};
8441 $self->{column}++;
8442 $self->{nc}
8443 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8444 } else {
8445 $self->{set_nc}->($self);
8446 }
8447
8448 redo A;
8449 } elsif ($self->{nc} == 0x007C or # |
8450 $self->{nc} == 0x002C) { # ,
8451 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8452 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8453
8454 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8455 $self->{line_prev} = $self->{line};
8456 $self->{column_prev} = $self->{column};
8457 $self->{column}++;
8458 $self->{nc}
8459 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8460 } else {
8461 $self->{set_nc}->($self);
8462 }
8463
8464 redo A;
8465 } elsif ($self->{nc} == 0x0029) { # )
8466 $self->{group_depth}--;
8467 push @{$self->{ct}->{content}}, chr $self->{nc};
8468 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8469
8470 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8471 $self->{line_prev} = $self->{line};
8472 $self->{column_prev} = $self->{column};
8473 $self->{column}++;
8474 $self->{nc}
8475 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8476 } else {
8477 $self->{set_nc}->($self);
8478 }
8479
8480 redo A;
8481 } elsif ($self->{nc} == 0x003E) { # >
8482 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8483 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8484 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8485
8486 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8487 $self->{line_prev} = $self->{line};
8488 $self->{column_prev} = $self->{column};
8489 $self->{column}++;
8490 $self->{nc}
8491 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8492 } else {
8493 $self->{set_nc}->($self);
8494 }
8495
8496 return ($self->{ct}); # ELEMENT
8497 redo A;
8498 } elsif ($self->{nc} == -1) {
8499 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8500 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8501 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8502
8503 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8504 $self->{line_prev} = $self->{line};
8505 $self->{column_prev} = $self->{column};
8506 $self->{column}++;
8507 $self->{nc}
8508 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8509 } else {
8510 $self->{set_nc}->($self);
8511 }
8512
8513 return ($self->{ct}); # ELEMENT
8514 redo A;
8515 } else {
8516 $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8517 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8518 $self->{state} = BOGUS_MD_STATE;
8519
8520 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8521 $self->{line_prev} = $self->{line};
8522 $self->{column_prev} = $self->{column};
8523 $self->{column}++;
8524 $self->{nc}
8525 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8526 } else {
8527 $self->{set_nc}->($self);
8528 }
8529
8530 redo A;
8531 }
8532 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8533 if ($is_space->{$self->{nc}}) {
8534 if ($self->{group_depth}) {
8535 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8536 } else {
8537 $self->{state} = AFTER_MD_DEF_STATE;
8538 }
8539
8540 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8541 $self->{line_prev} = $self->{line};
8542 $self->{column_prev} = $self->{column};
8543 $self->{column}++;
8544 $self->{nc}
8545 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8546 } else {
8547 $self->{set_nc}->($self);
8548 }
8549
8550 redo A;
8551 } elsif ($self->{nc} == 0x002A or # *
8552 $self->{nc} == 0x002B or # +
8553 $self->{nc} == 0x003F) { # ?
8554 push @{$self->{ct}->{content}}, chr $self->{nc};
8555 if ($self->{group_depth}) {
8556 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8557 } else {
8558 $self->{state} = AFTER_MD_DEF_STATE;
8559 }
8560
8561 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8562 $self->{line_prev} = $self->{line};
8563 $self->{column_prev} = $self->{column};
8564 $self->{column}++;
8565 $self->{nc}
8566 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8567 } else {
8568 $self->{set_nc}->($self);
8569 }
8570
8571 redo A;
8572 } elsif ($self->{nc} == 0x0029) { # )
8573 if ($self->{group_depth}) {
8574 $self->{group_depth}--;
8575 push @{$self->{ct}->{content}}, chr $self->{nc};
8576 ## Stay in the state.
8577
8578 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8579 $self->{line_prev} = $self->{line};
8580 $self->{column_prev} = $self->{column};
8581 $self->{column}++;
8582 $self->{nc}
8583 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8584 } else {
8585 $self->{set_nc}->($self);
8586 }
8587
8588 redo A;
8589 } else {
8590 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8591 $self->{state} = BOGUS_MD_STATE;
8592 ## Reconsume.
8593 redo A;
8594 }
8595 } elsif ($self->{nc} == 0x003E) { # >
8596 if ($self->{group_depth}) {
8597 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8598 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8599 }
8600 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8601
8602 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8603 $self->{line_prev} = $self->{line};
8604 $self->{column_prev} = $self->{column};
8605 $self->{column}++;
8606 $self->{nc}
8607 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8608 } else {
8609 $self->{set_nc}->($self);
8610 }
8611
8612 return ($self->{ct}); # ELEMENT
8613 redo A;
8614 } elsif ($self->{nc} == -1) {
8615 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8616 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8617 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8618
8619 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8620 $self->{line_prev} = $self->{line};
8621 $self->{column_prev} = $self->{column};
8622 $self->{column}++;
8623 $self->{nc}
8624 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8625 } else {
8626 $self->{set_nc}->($self);
8627 }
8628
8629 return ($self->{ct}); # ELEMENT
8630 redo A;
8631 } else {
8632 if ($self->{group_depth}) {
8633 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8634 } else {
8635 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8636 $self->{state} = BOGUS_MD_STATE;
8637 }
8638 ## Reconsume.
8639 redo A;
8640 }
8641 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8642 if ($is_space->{$self->{nc}}) {
8643 ## Stay in the state.
8644
8645 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8646 $self->{line_prev} = $self->{line};
8647 $self->{column_prev} = $self->{column};
8648 $self->{column}++;
8649 $self->{nc}
8650 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8651 } else {
8652 $self->{set_nc}->($self);
8653 }
8654
8655 redo A;
8656 } elsif ($self->{nc} == 0x003E) { # >
8657 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8658
8659 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8660 $self->{line_prev} = $self->{line};
8661 $self->{column_prev} = $self->{column};
8662 $self->{column}++;
8663 $self->{nc}
8664 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8665 } else {
8666 $self->{set_nc}->($self);
8667 }
8668
8669 return ($self->{ct}); # ENTITY/ELEMENT
8670 redo A;
8671 } elsif ($self->{nc} == -1) {
8672 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8673 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8674
8675 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8676 $self->{line_prev} = $self->{line};
8677 $self->{column_prev} = $self->{column};
8678 $self->{column}++;
8679 $self->{nc}
8680 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8681 } else {
8682 $self->{set_nc}->($self);
8683 }
8684
8685 return ($self->{ct}); # ENTITY/ELEMENT
8686 redo A;
8687 } else {
8688 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8689 $self->{state} = BOGUS_MD_STATE;
8690 ## Reconsume.
8691 redo A;
8692 }
8693 } elsif ($self->{state} == BOGUS_MD_STATE) {
8694 if ($self->{nc} == 0x003E) { # >
8695 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8696
8697 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8698 $self->{line_prev} = $self->{line};
8699 $self->{column_prev} = $self->{column};
8700 $self->{column}++;
8701 $self->{nc}
8702 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8703 } else {
8704 $self->{set_nc}->($self);
8705 }
8706
8707 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8708 redo A;
8709 } elsif ($self->{nc} == -1) {
8710 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8711 ## Reconsume.
8712 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8713 redo A;
8714 } else {
8715 ## Stay in the state.
8716
8717 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8718 $self->{line_prev} = $self->{line};
8719 $self->{column_prev} = $self->{column};
8720 $self->{column}++;
8721 $self->{nc}
8722 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8723 } else {
8724 $self->{set_nc}->($self);
8725 }
8726
8727 redo A;
8728 }
8729 } else {
8730 die "$0: $self->{state}: Unknown state";
8731 }
8732 } # A
8733
8734 die "$0: _get_next_token: unexpected case";
8735 } # _get_next_token
8736
8737 1;
8738 ## $Date: 2009/08/16 05:24:47 $
8739

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24