/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.17 - (show annotations) (download)
Sun Oct 19 04:39:25 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.16: +113 -42 lines
++ whatpm/t/xml/ChangeLog	19 Oct 2008 04:38:53 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* notations-1.dat, notations-1.dat: Tests on lowercase markup
	declaration keywords are added.

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 04:37:30 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src (_get_next_token): Make keywords 'ENTITY',
	'ELEMENT', 'ATTLIST', and 'NOTATION' ASCII case-insensitive.

## Module header for the tokenizer: declares the package, derives
## $VERSION from the CVS "Revision" keyword, and registers the token type
## constants with Exporter.
1 package Whatpm::HTML::Tokenizer;
2 use strict;
## The digit runs in the Revision keyword are collected into @r.  The
## format is "%d." followed by one "%02d" per remaining component, so the
## keyword value "1.16" produces the string "1.16".
3 our $VERSION=do{my @r=(q$Revision: 1.16 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
## The Exporter setup is done inside BEGIN so that @ISA, @EXPORT_OK and
## %EXPORT_TAGS already exist at compile time; the BEGIN-time |import|
## call in package Whatpm::HTML later in this file depends on that.
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
## Names that may be imported individually on request.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 );
25
## The |:token| tag exports the same fourteen names as @EXPORT_OK; the two
## lists have to be kept in sync by hand.
26 our %EXPORT_TAGS = (
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 }
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49
## Token type codes, stored in a token's |type| field.  Each one is a sub
## with an empty prototype that returns a fixed integer (1 to 14).  These
## are the same fourteen names that the BEGIN block above lists in
## @EXPORT_OK and in the |:token| export tag.
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 }
52 sub START_TAG_TOKEN () { 3 }
53 sub END_TAG_TOKEN () { 4 }
54 sub END_OF_FILE_TOKEN () { 5 }
55 sub CHARACTER_TOKEN () { 6 }
56 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} set
71 ## to an empty string.
72
## From this point on, code is compiled into package Whatpm::HTML rather
## than Whatpm::HTML::Tokenizer.
73 package Whatpm::HTML;
74
## Import the token type constants into this package at compile time so
## that the code below can use them as bare names.
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78
## Each flag occupies one bit; a content model value is the bitwise OR of
## the kinds of markup that are recognized inside character data.
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82
## The four content models built from the flags above:
##   PLAINTEXT = 0b000, CDATA = 0b010, RCDATA = 0b011, PCDATA = 0b101.
83 sub PLAINTEXT_CONTENT_MODEL () { 0 }
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87
88 ## Tokenizer states
89
## Numeric identifiers for the tokenizer states; the current one is kept
## in $self->{state}.  The values run from 0 to 85.  Values 1 and 12 are
## not assigned because the subs that would use them are commented out.
## Where a constant carries a trailing comment naming a spec state, this
## implementation splits that spec state into several states.
90 sub DATA_STATE () { 0 }
91 #sub ENTITY_DATA_STATE () { 1 }
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
108 sub COMMENT_END_DASH_STATE () { 18 }
109 sub BOGUS_COMMENT_STATE () { 19 }
110 sub DOCTYPE_STATE () { 20 }
111 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112 sub DOCTYPE_NAME_STATE () { 22 }
113 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122 sub BOGUS_DOCTYPE_STATE () { 32 }
123 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124 sub SELF_CLOSING_START_TAG_STATE () { 34 }
125 sub CDATA_SECTION_STATE () { 35 }
126 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134 ## NOTE: "Entity data state", "entity in attribute value state", and
135 ## "consume a character reference" algorithm are jointly implemented
136 ## using the following six states:
137 sub ENTITY_STATE () { 44 }
138 sub ENTITY_HASH_STATE () { 45 }
139 sub NCR_NUM_STATE () { 46 }
140 sub HEXREF_X_STATE () { 47 }
141 sub HEXREF_HEX_STATE () { 48 }
142 sub ENTITY_NAME_STATE () { 49 }
## PCDATA_STATE is not one of the six character reference states above.
143 sub PCDATA_STATE () { 50 } # "data state" in the spec
144
145 ## XML-only states
146 sub PI_STATE () { 51 }
147 sub PI_TARGET_STATE () { 52 }
148 sub PI_TARGET_AFTER_STATE () { 53 }
149 sub PI_DATA_STATE () { 54 }
150 sub PI_AFTER_STATE () { 55 }
151 sub PI_DATA_AFTER_STATE () { 56 }
152 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155 sub DOCTYPE_TAG_STATE () { 60 }
156 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157 sub MD_ATTLIST_STATE () { 62 }
158 sub MD_E_STATE () { 63 }
159 sub MD_ELEMENT_STATE () { 64 }
160 sub MD_ENTITY_STATE () { 65 }
161 sub MD_NOTATION_STATE () { 66 }
162 sub DOCTYPE_MD_STATE () { 67 }
163 sub BEFORE_MD_NAME_STATE () { 68 }
164 sub MD_NAME_STATE () { 69 }
165 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172 sub ALLOWED_TOKEN_STATE () { 77 }
173 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 sub BOGUS_MD_STATE () { 85 }
181
182 ## Tree constructor state constants (see Whatpm::HTML for the full
183 ## list and descriptions)
184
## Both constants evaluate to the same number, 2048 (only bit 11 set);
## the underscore in the second literal is just a digit separator.
## NOTE(review): whether the two are meant to share a value cannot be
## told from this file alone; check against the definitions in
## Whatpm::HTML.
185 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
186 sub FOREIGN_EL () { 0b1_00000000000 }
187
188 ## Character reference mappings
189
## Table from a code point to the code point that replaces it.  Keys and
## values are both integers.
## NOTE(review): the code that consults this table is not in this part of
## the file; presumably it is applied to numeric character references.
##
## The literal entries map 0x0D to 0x0A and give every value in
## 0x80..0x9F a replacement.  Those replacements appear to follow the
## Windows-1252 layout (e.g. 0x80 => U+20AC); the slots 0x81, 0x8D, 0x8F,
## 0x90 and 0x9D map to U+FFFD instead.
190 my $charref_map = {
191 0x0D => 0x000A,
192 0x80 => 0x20AC,
193 0x81 => 0xFFFD,
194 0x82 => 0x201A,
195 0x83 => 0x0192,
196 0x84 => 0x201E,
197 0x85 => 0x2026,
198 0x86 => 0x2020,
199 0x87 => 0x2021,
200 0x88 => 0x02C6,
201 0x89 => 0x2030,
202 0x8A => 0x0160,
203 0x8B => 0x2039,
204 0x8C => 0x0152,
205 0x8D => 0xFFFD,
206 0x8E => 0x017D,
207 0x8F => 0xFFFD,
208 0x90 => 0xFFFD,
209 0x91 => 0x2018,
210 0x92 => 0x2019,
211 0x93 => 0x201C,
212 0x94 => 0x201D,
213 0x95 => 0x2022,
214 0x96 => 0x2013,
215 0x97 => 0x2014,
216 0x98 => 0x02DC,
217 0x99 => 0x2122,
218 0x9A => 0x0161,
219 0x9B => 0x203A,
220 0x9C => 0x0153,
221 0x9D => 0xFFFD,
222 0x9E => 0x017E,
223 0x9F => 0x0178,
224 }; # $charref_map
## Every code point in the list below is additionally mapped to U+FFFD:
## 0x00..0x1F except 0x09, 0x0A, 0x0C and 0x0D; 0x7F; the range
## 0xD800..0xDFFF; the range 0xFDD0..0xFDDF (the ISSUE note records that
## the end of this range, 0xFDDF versus 0xFDEF, is an open question); and
## the two code points ending in FFFE and FFFF in each of the ranges
## 0x0000..0xFFFF through 0x100000..0x10FFFF.
225 $charref_map->{$_} = 0xFFFD
226 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
227 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
228 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
229 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
230 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
231 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
232 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
233
234 ## Implementations MUST act as if they used the state machine in the spec.
235
## Resets the tokenizer fields on the object and reads the first input
## character.
##
## Argument: the object itself ($self), the only parameter.
## Precondition: $self->{set_nc} must already be a code reference (see
## the NOTE below on fields set by |new|); it is called once with $self.
236 sub _initialize_tokenizer ($) {
237 my $self = shift;
238
239 ## NOTE: Fields set by |new| constructor:
240 #$self->{level}
241 #$self->{set_nc}
242 #$self->{parse_error}
243 #$self->{is_xml} (if XML)
244
245 $self->{state} = DATA_STATE; # MUST
246 $self->{s_kwd} = ''; # Data state keyword
247 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
248 #$self->{entity__value}; # initialized when used
249 #$self->{entity__match}; # initialized when used
250 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
251 undef $self->{ct}; # current token
252 undef $self->{ca}; # current attribute
253 undef $self->{last_stag_name}; # last emitted start tag name
254 #$self->{prev_state}; # initialized when used
255 delete $self->{self_closing};
## Start with an empty character buffer and the read position at its
## beginning.
256 $self->{char_buffer} = '';
257 $self->{char_buffer_pos} = 0;
## -1 is the value that |_get_next_token| treats as end of input.
258 $self->{nc} = -1; # next input character
259 #$self->{next_nc}
260
## Read the first input character.  The buffer was reset to the empty
## string with position 0 just above, so the condition below is false at
## this point and it is the |else| branch, the |set_nc| callback, that
## runs.
261 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
262 $self->{line_prev} = $self->{line};
263 $self->{column_prev} = $self->{column};
264 $self->{column}++;
265 $self->{nc}
266 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
267 } else {
268 $self->{set_nc}->($self);
269 }
270
## Queue of already-built tokens; |_get_next_token| returns entries from
## this array before it reads any further input.
271 $self->{token} = [];
272 # $self->{escape}
273 } # _initialize_tokenizer
274
275 ## A token has:
276 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
277 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
278 ## ->{name} (DOCTYPE_TOKEN)
279 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
280 ## ->{target} (PI_TOKEN)
281 ## ->{pubid} (DOCTYPE_TOKEN)
282 ## ->{sysid} (DOCTYPE_TOKEN)
283 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
284 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
285 ## ->{name}
286 ## ->{value}
287 ## ->{has_reference} == 1 or 0
288 ## ->{index}: Index of the attribute in a tag.
289 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
290 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
291 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
292 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
293
294 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
295 ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
296 ## while the token is pushed back to the stack.
297
298 ## An emitted token MUST immediately be handled by the tree construction stage.
299
300 ## Before each step, UA MAY check to see if either one of the scripts in
301 ## "list of scripts that will execute as soon as possible" or the first
302 ## script in the "list of scripts that will execute asynchronously",
303 ## has completed loading. If one has, then it MUST be executed
304 ## and removed from the list.
305
306 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
307 ## (This requirement was dropped from HTML5 spec, unfortunately.)
308
## Lookup table keyed by code point: the value is true for the characters
## the tokenizer treats as white space.  It is consulted as
## |$is_space->{$self->{nc}}|.  The entries for VT (0x000B) and CR
## (0x000D) are commented out, so a lookup for either of them yields
## undef (false).
309 my $is_space = {
310 0x0009 => 1, # CHARACTER TABULATION (HT)
311 0x000A => 1, # LINE FEED (LF)
312 #0x000B => 0, # LINE TABULATION (VT)
313 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
314 #0x000D => 1, # CARRIAGE RETURN (CR)
315 0x0020 => 1, # SPACE (SP)
316 };
317
318 sub _get_next_token ($) {
319 my $self = shift;
320
321 if ($self->{self_closing}) {
322 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
323 ## NOTE: The |self_closing| flag is only set by start tag token.
324 ## In addition, when a start tag token is emitted, it is always set to
325 ## |ct|.
326 delete $self->{self_closing};
327 }
328
329 if (@{$self->{token}}) {
330 $self->{self_closing} = $self->{token}->[0]->{self_closing};
331 return shift @{$self->{token}};
332 }
333
334 A: {
335 if ($self->{state} == PCDATA_STATE) {
336 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
337
338 if ($self->{nc} == 0x0026) { # &
339
340 ## NOTE: In the spec, the tokenizer is switched to the
341 ## "entity data state". In this implementation, the tokenizer
342 ## is switched to the |ENTITY_STATE|, which is an implementation
343 ## of the "consume a character reference" algorithm.
344 $self->{entity_add} = -1;
345 $self->{prev_state} = DATA_STATE;
346 $self->{state} = ENTITY_STATE;
347
348 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
349 $self->{line_prev} = $self->{line};
350 $self->{column_prev} = $self->{column};
351 $self->{column}++;
352 $self->{nc}
353 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
354 } else {
355 $self->{set_nc}->($self);
356 }
357
358 redo A;
359 } elsif ($self->{nc} == 0x003C) { # <
360
361 $self->{state} = TAG_OPEN_STATE;
362
363 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
364 $self->{line_prev} = $self->{line};
365 $self->{column_prev} = $self->{column};
366 $self->{column}++;
367 $self->{nc}
368 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
369 } else {
370 $self->{set_nc}->($self);
371 }
372
373 redo A;
374 } elsif ($self->{nc} == -1) {
375
376 return ({type => END_OF_FILE_TOKEN,
377 line => $self->{line}, column => $self->{column}});
378 last A; ## TODO: ok?
379 } else {
380
381 #
382 }
383
384 # Anything else
385 my $token = {type => CHARACTER_TOKEN,
386 data => chr $self->{nc},
387 line => $self->{line}, column => $self->{column},
388 };
389 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
390
391 ## Stay in the state.
392
393 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
394 $self->{line_prev} = $self->{line};
395 $self->{column_prev} = $self->{column};
396 $self->{column}++;
397 $self->{nc}
398 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
399 } else {
400 $self->{set_nc}->($self);
401 }
402
403 return ($token);
404 redo A;
405 } elsif ($self->{state} == DATA_STATE) {
406 $self->{s_kwd} = '' unless defined $self->{s_kwd};
407 if ($self->{nc} == 0x0026) { # &
408 $self->{s_kwd} = '';
409 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
410 not $self->{escape}) {
411
412 ## NOTE: In the spec, the tokenizer is switched to the
413 ## "entity data state". In this implementation, the tokenizer
414 ## is switched to the |ENTITY_STATE|, which is an implementation
415 ## of the "consume a character reference" algorithm.
416 $self->{entity_add} = -1;
417 $self->{prev_state} = DATA_STATE;
418 $self->{state} = ENTITY_STATE;
419
420 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
421 $self->{line_prev} = $self->{line};
422 $self->{column_prev} = $self->{column};
423 $self->{column}++;
424 $self->{nc}
425 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
426 } else {
427 $self->{set_nc}->($self);
428 }
429
430 redo A;
431 } else {
432
433 #
434 }
435 } elsif ($self->{nc} == 0x002D) { # -
436 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
437 if ($self->{s_kwd} eq '<!-') {
438
439 $self->{escape} = 1; # unless $self->{escape};
440 $self->{s_kwd} = '--';
441 #
442 } elsif ($self->{s_kwd} eq '-') {
443
444 $self->{s_kwd} = '--';
445 #
446 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
447
448 $self->{s_kwd} .= '-';
449 #
450 } else {
451
452 $self->{s_kwd} = '-';
453 #
454 }
455 }
456
457 #
458 } elsif ($self->{nc} == 0x0021) { # !
459 if (length $self->{s_kwd}) {
460
461 $self->{s_kwd} .= '!';
462 #
463 } else {
464
465 #$self->{s_kwd} = '';
466 #
467 }
468 #
469 } elsif ($self->{nc} == 0x003C) { # <
470 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
471 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
472 not $self->{escape})) {
473
474 $self->{state} = TAG_OPEN_STATE;
475
476 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
477 $self->{line_prev} = $self->{line};
478 $self->{column_prev} = $self->{column};
479 $self->{column}++;
480 $self->{nc}
481 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
482 } else {
483 $self->{set_nc}->($self);
484 }
485
486 redo A;
487 } else {
488
489 $self->{s_kwd} = '';
490 #
491 }
492 } elsif ($self->{nc} == 0x003E) { # >
493 if ($self->{escape} and
494 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
495 if ($self->{s_kwd} eq '--') {
496
497 delete $self->{escape};
498 #
499 } else {
500
501 #
502 }
503 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
504
505 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
506 line => $self->{line_prev},
507 column => $self->{column_prev} - 1);
508 #
509 } else {
510
511 #
512 }
513
514 $self->{s_kwd} = '';
515 #
516 } elsif ($self->{nc} == 0x005D) { # ]
517 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
518
519 $self->{s_kwd} .= ']';
520 } elsif ($self->{s_kwd} eq ']]') {
521
522 #
523 } else {
524
525 $self->{s_kwd} = '';
526 }
527 #
528 } elsif ($self->{nc} == -1) {
529
530 $self->{s_kwd} = '';
531 return ({type => END_OF_FILE_TOKEN,
532 line => $self->{line}, column => $self->{column}});
533 last A; ## TODO: ok?
534 } else {
535
536 $self->{s_kwd} = '';
537 #
538 }
539
540 # Anything else
541 my $token = {type => CHARACTER_TOKEN,
542 data => chr $self->{nc},
543 line => $self->{line}, column => $self->{column},
544 };
545 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
546 length $token->{data})) {
547 $self->{s_kwd} = '';
548 }
549
550 ## Stay in the data state.
551 if (not $self->{is_xml} and
552 $self->{content_model} == PCDATA_CONTENT_MODEL) {
553
554 $self->{state} = PCDATA_STATE;
555 } else {
556
557 ## Stay in the state.
558 }
559
560 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
561 $self->{line_prev} = $self->{line};
562 $self->{column_prev} = $self->{column};
563 $self->{column}++;
564 $self->{nc}
565 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
566 } else {
567 $self->{set_nc}->($self);
568 }
569
570 return ($token);
571 redo A;
572 } elsif ($self->{state} == TAG_OPEN_STATE) {
573 ## XML5: "tag state".
574
575 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
576 if ($self->{nc} == 0x002F) { # /
577
578
579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
580 $self->{line_prev} = $self->{line};
581 $self->{column_prev} = $self->{column};
582 $self->{column}++;
583 $self->{nc}
584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
585 } else {
586 $self->{set_nc}->($self);
587 }
588
589 $self->{state} = CLOSE_TAG_OPEN_STATE;
590 redo A;
591 } elsif ($self->{nc} == 0x0021) { # !
592
593 $self->{s_kwd} = $self->{escaped} ? '' : '<';
594 #
595 } else {
596
597 $self->{s_kwd} = '';
598 #
599 }
600
601 ## reconsume
602 $self->{state} = DATA_STATE;
603 return ({type => CHARACTER_TOKEN, data => '<',
604 line => $self->{line_prev},
605 column => $self->{column_prev},
606 });
607 redo A;
608 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
609 if ($self->{nc} == 0x0021) { # !
610
611 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
612
613 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
614 $self->{line_prev} = $self->{line};
615 $self->{column_prev} = $self->{column};
616 $self->{column}++;
617 $self->{nc}
618 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
619 } else {
620 $self->{set_nc}->($self);
621 }
622
623 redo A;
624 } elsif ($self->{nc} == 0x002F) { # /
625
626 $self->{state} = CLOSE_TAG_OPEN_STATE;
627
628 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
629 $self->{line_prev} = $self->{line};
630 $self->{column_prev} = $self->{column};
631 $self->{column}++;
632 $self->{nc}
633 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
634 } else {
635 $self->{set_nc}->($self);
636 }
637
638 redo A;
639 } elsif (0x0041 <= $self->{nc} and
640 $self->{nc} <= 0x005A) { # A..Z
641
642 $self->{ct}
643 = {type => START_TAG_TOKEN,
644 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
645 line => $self->{line_prev},
646 column => $self->{column_prev}};
647 $self->{state} = TAG_NAME_STATE;
648
649 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
650 $self->{line_prev} = $self->{line};
651 $self->{column_prev} = $self->{column};
652 $self->{column}++;
653 $self->{nc}
654 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
655 } else {
656 $self->{set_nc}->($self);
657 }
658
659 redo A;
660 } elsif (0x0061 <= $self->{nc} and
661 $self->{nc} <= 0x007A) { # a..z
662
663 $self->{ct} = {type => START_TAG_TOKEN,
664 tag_name => chr ($self->{nc}),
665 line => $self->{line_prev},
666 column => $self->{column_prev}};
667 $self->{state} = TAG_NAME_STATE;
668
669 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
670 $self->{line_prev} = $self->{line};
671 $self->{column_prev} = $self->{column};
672 $self->{column}++;
673 $self->{nc}
674 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
675 } else {
676 $self->{set_nc}->($self);
677 }
678
679 redo A;
680 } elsif ($self->{nc} == 0x003E) { # >
681
682 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
683 line => $self->{line_prev},
684 column => $self->{column_prev});
685 $self->{state} = DATA_STATE;
686 $self->{s_kwd} = '';
687
688 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
689 $self->{line_prev} = $self->{line};
690 $self->{column_prev} = $self->{column};
691 $self->{column}++;
692 $self->{nc}
693 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
694 } else {
695 $self->{set_nc}->($self);
696 }
697
698
699 return ({type => CHARACTER_TOKEN, data => '<>',
700 line => $self->{line_prev},
701 column => $self->{column_prev},
702 });
703
704 redo A;
705 } elsif ($self->{nc} == 0x003F) { # ?
706 if ($self->{is_xml}) {
707
708 $self->{state} = PI_STATE;
709
710 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
711 $self->{line_prev} = $self->{line};
712 $self->{column_prev} = $self->{column};
713 $self->{column}++;
714 $self->{nc}
715 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
716 } else {
717 $self->{set_nc}->($self);
718 }
719
720 redo A;
721 } else {
722
723 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
724 line => $self->{line_prev},
725 column => $self->{column_prev});
726 $self->{state} = BOGUS_COMMENT_STATE;
727 $self->{ct} = {type => COMMENT_TOKEN, data => '',
728 line => $self->{line_prev},
729 column => $self->{column_prev},
730 };
731 ## $self->{nc} is intentionally left as is
732 redo A;
733 }
734 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
735
736 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
737 line => $self->{line_prev},
738 column => $self->{column_prev});
739 $self->{state} = DATA_STATE;
740 $self->{s_kwd} = '';
741 ## reconsume
742
743 return ({type => CHARACTER_TOKEN, data => '<',
744 line => $self->{line_prev},
745 column => $self->{column_prev},
746 });
747
748 redo A;
749 } else {
750 ## XML5: "<:" is a parse error.
751
752 $self->{ct} = {type => START_TAG_TOKEN,
753 tag_name => chr ($self->{nc}),
754 line => $self->{line_prev},
755 column => $self->{column_prev}};
756 $self->{state} = TAG_NAME_STATE;
757
758 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
759 $self->{line_prev} = $self->{line};
760 $self->{column_prev} = $self->{column};
761 $self->{column}++;
762 $self->{nc}
763 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
764 } else {
765 $self->{set_nc}->($self);
766 }
767
768 redo A;
769 }
770 } else {
771 die "$0: $self->{content_model} in tag open";
772 }
773 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
774 ## NOTE: The "close tag open state" in the spec is implemented as
775 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
776
777 ## XML5: "end tag state".
778
779 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
780 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
781 if (defined $self->{last_stag_name}) {
782 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
783 $self->{kwd} = '';
784 ## Reconsume.
785 redo A;
786 } else {
787 ## No start tag token has ever been emitted
788 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
789
790 $self->{state} = DATA_STATE;
791 $self->{s_kwd} = '';
792 ## Reconsume.
793 return ({type => CHARACTER_TOKEN, data => '</',
794 line => $l, column => $c,
795 });
796 redo A;
797 }
798 }
799
800 if (0x0041 <= $self->{nc} and
801 $self->{nc} <= 0x005A) { # A..Z
802
803 $self->{ct}
804 = {type => END_TAG_TOKEN,
805 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
806 line => $l, column => $c};
807 $self->{state} = TAG_NAME_STATE;
808
809 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
810 $self->{line_prev} = $self->{line};
811 $self->{column_prev} = $self->{column};
812 $self->{column}++;
813 $self->{nc}
814 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
815 } else {
816 $self->{set_nc}->($self);
817 }
818
819 redo A;
820 } elsif (0x0061 <= $self->{nc} and
821 $self->{nc} <= 0x007A) { # a..z
822
823 $self->{ct} = {type => END_TAG_TOKEN,
824 tag_name => chr ($self->{nc}),
825 line => $l, column => $c};
826 $self->{state} = TAG_NAME_STATE;
827
828 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
829 $self->{line_prev} = $self->{line};
830 $self->{column_prev} = $self->{column};
831 $self->{column}++;
832 $self->{nc}
833 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
834 } else {
835 $self->{set_nc}->($self);
836 }
837
838 redo A;
839 } elsif ($self->{nc} == 0x003E) { # >
840 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
841 line => $self->{line_prev}, ## "<" in "</>"
842 column => $self->{column_prev} - 1);
843 $self->{state} = DATA_STATE;
844 $self->{s_kwd} = '';
845 if ($self->{is_xml}) {
846
847 ## XML5: No parse error.
848
849 ## NOTE: This parser raises a parse error, since it supports
850 ## XML1, not XML5.
851
852 ## NOTE: A short end tag token.
853 my $ct = {type => END_TAG_TOKEN,
854 tag_name => '',
855 line => $self->{line_prev},
856 column => $self->{column_prev} - 1,
857 };
858
859 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
860 $self->{line_prev} = $self->{line};
861 $self->{column_prev} = $self->{column};
862 $self->{column}++;
863 $self->{nc}
864 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
865 } else {
866 $self->{set_nc}->($self);
867 }
868
869 return ($ct);
870 } else {
871
872
873 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
874 $self->{line_prev} = $self->{line};
875 $self->{column_prev} = $self->{column};
876 $self->{column}++;
877 $self->{nc}
878 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
879 } else {
880 $self->{set_nc}->($self);
881 }
882
883 }
884 redo A;
885 } elsif ($self->{nc} == -1) {
886
887 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
888 $self->{s_kwd} = '';
889 $self->{state} = DATA_STATE;
890 # reconsume
891
892 return ({type => CHARACTER_TOKEN, data => '</',
893 line => $l, column => $c,
894 });
895
896 redo A;
897 } elsif (not $self->{is_xml} or
898 $is_space->{$self->{nc}}) {
899
900 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
901 line => $self->{line_prev}, # "<" of "</"
902 column => $self->{column_prev} - 1);
903 $self->{state} = BOGUS_COMMENT_STATE;
904 $self->{ct} = {type => COMMENT_TOKEN, data => '',
905 line => $self->{line_prev}, # "<" of "</"
906 column => $self->{column_prev} - 1,
907 };
908 ## NOTE: $self->{nc} is intentionally left as is.
909 ## Although the "anything else" case of the spec not explicitly
910 ## states that the next input character is to be reconsumed,
911 ## it will be included to the |data| of the comment token
912 ## generated from the bogus end tag, as defined in the
913 ## "bogus comment state" entry.
914 redo A;
915 } else {
916 ## XML5: "</:" is a parse error.
917
918 $self->{ct} = {type => END_TAG_TOKEN,
919 tag_name => chr ($self->{nc}),
920 line => $l, column => $c};
921 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
922
923 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
924 $self->{line_prev} = $self->{line};
925 $self->{column_prev} = $self->{column};
926 $self->{column}++;
927 $self->{nc}
928 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
929 } else {
930 $self->{set_nc}->($self);
931 }
932
933 redo A;
934 }
935 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
936 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
937 if (length $ch) {
938 my $CH = $ch;
939 $ch =~ tr/a-z/A-Z/;
940 my $nch = chr $self->{nc};
941 if ($nch eq $ch or $nch eq $CH) {
942
943 ## Stay in the state.
944 $self->{kwd} .= $nch;
945
946 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
947 $self->{line_prev} = $self->{line};
948 $self->{column_prev} = $self->{column};
949 $self->{column}++;
950 $self->{nc}
951 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
952 } else {
953 $self->{set_nc}->($self);
954 }
955
956 redo A;
957 } else {
958
959 $self->{state} = DATA_STATE;
960 $self->{s_kwd} = '';
961 ## Reconsume.
962 return ({type => CHARACTER_TOKEN,
963 data => '</' . $self->{kwd},
964 line => $self->{line_prev},
965 column => $self->{column_prev} - 1 - length $self->{kwd},
966 });
967 redo A;
968 }
969 } else { # after "<{tag-name}"
970 unless ($is_space->{$self->{nc}} or
971 {
972 0x003E => 1, # >
973 0x002F => 1, # /
974 -1 => 1, # EOF
975 }->{$self->{nc}}) {
976
977 ## Reconsume.
978 $self->{state} = DATA_STATE;
979 $self->{s_kwd} = '';
980 return ({type => CHARACTER_TOKEN,
981 data => '</' . $self->{kwd},
982 line => $self->{line_prev},
983 column => $self->{column_prev} - 1 - length $self->{kwd},
984 });
985 redo A;
986 } else {
987
988 $self->{ct}
989 = {type => END_TAG_TOKEN,
990 tag_name => $self->{last_stag_name},
991 line => $self->{line_prev},
992 column => $self->{column_prev} - 1 - length $self->{kwd}};
993 $self->{state} = TAG_NAME_STATE;
994 ## Reconsume.
995 redo A;
996 }
997 }
998 } elsif ($self->{state} == TAG_NAME_STATE) {
999 if ($is_space->{$self->{nc}}) {
1000
1001 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1002
1003 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1004 $self->{line_prev} = $self->{line};
1005 $self->{column_prev} = $self->{column};
1006 $self->{column}++;
1007 $self->{nc}
1008 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1009 } else {
1010 $self->{set_nc}->($self);
1011 }
1012
1013 redo A;
1014 } elsif ($self->{nc} == 0x003E) { # >
1015 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1016
1017 $self->{last_stag_name} = $self->{ct}->{tag_name};
1018 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1019 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1020 #if ($self->{ct}->{attributes}) {
1021 # ## NOTE: This should never be reached.
1022 # !!! cp (36);
1023 # !!! parse-error (type => 'end tag attribute');
1024 #} else {
1025
1026 #}
1027 } else {
1028 die "$0: $self->{ct}->{type}: Unknown token type";
1029 }
1030 $self->{state} = DATA_STATE;
1031 $self->{s_kwd} = '';
1032
1033 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1034 $self->{line_prev} = $self->{line};
1035 $self->{column_prev} = $self->{column};
1036 $self->{column}++;
1037 $self->{nc}
1038 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1039 } else {
1040 $self->{set_nc}->($self);
1041 }
1042
1043
1044 return ($self->{ct}); # start tag or end tag
1045
1046 redo A;
1047 } elsif (0x0041 <= $self->{nc} and
1048 $self->{nc} <= 0x005A) { # A..Z
1049
1050 $self->{ct}->{tag_name}
1051 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1052 # start tag or end tag
1053 ## Stay in this state
1054
1055 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1056 $self->{line_prev} = $self->{line};
1057 $self->{column_prev} = $self->{column};
1058 $self->{column}++;
1059 $self->{nc}
1060 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1061 } else {
1062 $self->{set_nc}->($self);
1063 }
1064
1065 redo A;
1066 } elsif ($self->{nc} == -1) {
1067 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1068 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1069
1070 $self->{last_stag_name} = $self->{ct}->{tag_name};
1071 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1072 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1073 #if ($self->{ct}->{attributes}) {
1074 # ## NOTE: This state should never be reached.
1075 # !!! cp (40);
1076 # !!! parse-error (type => 'end tag attribute');
1077 #} else {
1078
1079 #}
1080 } else {
1081 die "$0: $self->{ct}->{type}: Unknown token type";
1082 }
1083 $self->{state} = DATA_STATE;
1084 $self->{s_kwd} = '';
1085 # reconsume
1086
1087 return ($self->{ct}); # start tag or end tag
1088
1089 redo A;
1090 } elsif ($self->{nc} == 0x002F) { # /
1091
1092 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1093
1094 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1095 $self->{line_prev} = $self->{line};
1096 $self->{column_prev} = $self->{column};
1097 $self->{column}++;
1098 $self->{nc}
1099 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1100 } else {
1101 $self->{set_nc}->($self);
1102 }
1103
1104 redo A;
1105 } else {
1106
1107 $self->{ct}->{tag_name} .= chr $self->{nc};
1108 # start tag or end tag
1109 ## Stay in the state
1110
1111 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1112 $self->{line_prev} = $self->{line};
1113 $self->{column_prev} = $self->{column};
1114 $self->{column}++;
1115 $self->{nc}
1116 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1117 } else {
1118 $self->{set_nc}->($self);
1119 }
1120
1121 redo A;
1122 }
1123 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1124 ## XML5: "Tag attribute name before state".
1125
1126 if ($is_space->{$self->{nc}}) {
1127
1128 ## Stay in the state
1129
1130 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1131 $self->{line_prev} = $self->{line};
1132 $self->{column_prev} = $self->{column};
1133 $self->{column}++;
1134 $self->{nc}
1135 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1136 } else {
1137 $self->{set_nc}->($self);
1138 }
1139
1140 redo A;
1141 } elsif ($self->{nc} == 0x003E) { # >
1142 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1143
1144 $self->{last_stag_name} = $self->{ct}->{tag_name};
1145 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1146 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1147 if ($self->{ct}->{attributes}) {
1148
1149 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1150 } else {
1151
1152 }
1153 } else {
1154 die "$0: $self->{ct}->{type}: Unknown token type";
1155 }
1156 $self->{state} = DATA_STATE;
1157 $self->{s_kwd} = '';
1158
1159 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1160 $self->{line_prev} = $self->{line};
1161 $self->{column_prev} = $self->{column};
1162 $self->{column}++;
1163 $self->{nc}
1164 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1165 } else {
1166 $self->{set_nc}->($self);
1167 }
1168
1169
1170 return ($self->{ct}); # start tag or end tag
1171
1172 redo A;
1173 } elsif (0x0041 <= $self->{nc} and
1174 $self->{nc} <= 0x005A) { # A..Z
1175
1176 $self->{ca}
1177 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1178 value => '',
1179 line => $self->{line}, column => $self->{column}};
1180 $self->{state} = ATTRIBUTE_NAME_STATE;
1181
1182 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1183 $self->{line_prev} = $self->{line};
1184 $self->{column_prev} = $self->{column};
1185 $self->{column}++;
1186 $self->{nc}
1187 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1188 } else {
1189 $self->{set_nc}->($self);
1190 }
1191
1192 redo A;
1193 } elsif ($self->{nc} == 0x002F) { # /
1194
1195 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1196
1197 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1198 $self->{line_prev} = $self->{line};
1199 $self->{column_prev} = $self->{column};
1200 $self->{column}++;
1201 $self->{nc}
1202 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1203 } else {
1204 $self->{set_nc}->($self);
1205 }
1206
1207 redo A;
1208 } elsif ($self->{nc} == -1) {
1209 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1210 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1211
1212 $self->{last_stag_name} = $self->{ct}->{tag_name};
1213 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1214 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1215 if ($self->{ct}->{attributes}) {
1216
1217 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1218 } else {
1219
1220 }
1221 } else {
1222 die "$0: $self->{ct}->{type}: Unknown token type";
1223 }
1224 $self->{state} = DATA_STATE;
1225 $self->{s_kwd} = '';
1226 # reconsume
1227
1228 return ($self->{ct}); # start tag or end tag
1229
1230 redo A;
1231 } else {
1232 if ({
1233 0x0022 => 1, # "
1234 0x0027 => 1, # '
1235 0x003D => 1, # =
1236 }->{$self->{nc}}) {
1237
1238 ## XML5: Not a parse error.
1239 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1240 } else {
1241
1242 ## XML5: ":" raises a parse error and is ignored.
1243 }
1244 $self->{ca}
1245 = {name => chr ($self->{nc}),
1246 value => '',
1247 line => $self->{line}, column => $self->{column}};
1248 $self->{state} = ATTRIBUTE_NAME_STATE;
1249
1250 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1251 $self->{line_prev} = $self->{line};
1252 $self->{column_prev} = $self->{column};
1253 $self->{column}++;
1254 $self->{nc}
1255 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1256 } else {
1257 $self->{set_nc}->($self);
1258 }
1259
1260 redo A;
1261 }
1262 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1263 ## XML5: "Tag attribute name state".
1264
1265 my $before_leave = sub { ## Commit the attribute being built ($self->{ca}) to the current token ($self->{ct}); every branch that leaves this state calls it first.
1266 if (exists $self->{ct}->{attributes} # start tag or end tag
1267 ->{$self->{ca}->{name}}) { # MUST
1268 ## Duplicate name: report the error at the line/column recorded in $self->{ca}; the attribute already stored under that name is left untouched.
1269 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1270 ## Discard $self->{ca} # MUST
1271 } else {
1272 ## First occurrence of this name: store the attribute keyed by its name and give it the next value of the token's |last_index| counter.
1273 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1274 = $self->{ca};
1275 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1276 }
1277 }; # $before_leave -- NOTE: the nested |exists| test autovivifies $self->{ct}->{attributes}, so that slot holds a hash reference after any call, even on the duplicate path.
1278
1279 if ($is_space->{$self->{nc}}) {
1280
1281 $before_leave->();
1282 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1283
1284 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1285 $self->{line_prev} = $self->{line};
1286 $self->{column_prev} = $self->{column};
1287 $self->{column}++;
1288 $self->{nc}
1289 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1290 } else {
1291 $self->{set_nc}->($self);
1292 }
1293
1294 redo A;
1295 } elsif ($self->{nc} == 0x003D) { # =
1296
1297 $before_leave->();
1298 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1299
1300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301 $self->{line_prev} = $self->{line};
1302 $self->{column_prev} = $self->{column};
1303 $self->{column}++;
1304 $self->{nc}
1305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306 } else {
1307 $self->{set_nc}->($self);
1308 }
1309
1310 redo A;
1311 } elsif ($self->{nc} == 0x003E) { # >
1312 if ($self->{is_xml}) {
1313
1314 ## XML5: Not a parse error.
1315 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1316 } else {
1317
1318 }
1319
1320 $before_leave->();
1321 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1322
1323 $self->{last_stag_name} = $self->{ct}->{tag_name};
1324 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1325
1326 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1327 if ($self->{ct}->{attributes}) {
1328 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1329 }
1330 } else {
1331 die "$0: $self->{ct}->{type}: Unknown token type";
1332 }
1333 $self->{state} = DATA_STATE;
1334 $self->{s_kwd} = '';
1335
1336 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1337 $self->{line_prev} = $self->{line};
1338 $self->{column_prev} = $self->{column};
1339 $self->{column}++;
1340 $self->{nc}
1341 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1342 } else {
1343 $self->{set_nc}->($self);
1344 }
1345
1346
1347 return ($self->{ct}); # start tag or end tag
1348
1349 redo A;
1350 } elsif (0x0041 <= $self->{nc} and
1351 $self->{nc} <= 0x005A) { # A..Z
1352
1353 $self->{ca}->{name}
1354 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1355 ## Stay in the state
1356
1357 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1358 $self->{line_prev} = $self->{line};
1359 $self->{column_prev} = $self->{column};
1360 $self->{column}++;
1361 $self->{nc}
1362 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1363 } else {
1364 $self->{set_nc}->($self);
1365 }
1366
1367 redo A;
1368 } elsif ($self->{nc} == 0x002F) { # /
1369 if ($self->{is_xml}) {
1370
1371 ## XML5: Not a parse error.
1372 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1373 } else {
1374
1375 }
1376
1377 $before_leave->();
1378 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1379
1380 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1381 $self->{line_prev} = $self->{line};
1382 $self->{column_prev} = $self->{column};
1383 $self->{column}++;
1384 $self->{nc}
1385 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1386 } else {
1387 $self->{set_nc}->($self);
1388 }
1389
1390 redo A;
1391 } elsif ($self->{nc} == -1) {
1392 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1393 $before_leave->();
1394 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1395
1396 $self->{last_stag_name} = $self->{ct}->{tag_name};
1397 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1398 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1399 if ($self->{ct}->{attributes}) {
1400
1401 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1402 } else {
1403 ## NOTE: This state should never be reached.
1404
1405 }
1406 } else {
1407 die "$0: $self->{ct}->{type}: Unknown token type";
1408 }
1409 $self->{state} = DATA_STATE;
1410 $self->{s_kwd} = '';
1411 # reconsume
1412
1413 return ($self->{ct}); # start tag or end tag
1414
1415 redo A;
1416 } else {
1417 if ($self->{nc} == 0x0022 or # "
1418 $self->{nc} == 0x0027) { # '
1419
1420 ## XML5: Not a parse error.
1421 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1422 } else {
1423
1424 }
1425 $self->{ca}->{name} .= chr ($self->{nc});
1426 ## Stay in the state
1427
1428 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1429 $self->{line_prev} = $self->{line};
1430 $self->{column_prev} = $self->{column};
1431 $self->{column}++;
1432 $self->{nc}
1433 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1434 } else {
1435 $self->{set_nc}->($self);
1436 }
1437
1438 redo A;
1439 }
1440 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1441 ## XML5: "Tag attribute name after state".
1442
1443 if ($is_space->{$self->{nc}}) {
1444
1445 ## Stay in the state
1446
1447 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1448 $self->{line_prev} = $self->{line};
1449 $self->{column_prev} = $self->{column};
1450 $self->{column}++;
1451 $self->{nc}
1452 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1453 } else {
1454 $self->{set_nc}->($self);
1455 }
1456
1457 redo A;
1458 } elsif ($self->{nc} == 0x003D) { # =
1459
1460 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1461
1462 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1463 $self->{line_prev} = $self->{line};
1464 $self->{column_prev} = $self->{column};
1465 $self->{column}++;
1466 $self->{nc}
1467 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1468 } else {
1469 $self->{set_nc}->($self);
1470 }
1471
1472 redo A;
1473 } elsif ($self->{nc} == 0x003E) { # >
1474 if ($self->{is_xml}) {
1475
1476 ## XML5: Not a parse error.
1477 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1478 } else {
1479
1480 }
1481
1482 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1483
1484 $self->{last_stag_name} = $self->{ct}->{tag_name};
1485 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1486 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1487 if ($self->{ct}->{attributes}) {
1488
1489 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1490 } else {
1491 ## NOTE: This state should never be reached.
1492
1493 }
1494 } else {
1495 die "$0: $self->{ct}->{type}: Unknown token type";
1496 }
1497 $self->{state} = DATA_STATE;
1498 $self->{s_kwd} = '';
1499
1500 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1501 $self->{line_prev} = $self->{line};
1502 $self->{column_prev} = $self->{column};
1503 $self->{column}++;
1504 $self->{nc}
1505 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1506 } else {
1507 $self->{set_nc}->($self);
1508 }
1509
1510
1511 return ($self->{ct}); # start tag or end tag
1512
1513 redo A;
1514 } elsif (0x0041 <= $self->{nc} and
1515 $self->{nc} <= 0x005A) { # A..Z
1516
1517 $self->{ca}
1518 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1519 value => '',
1520 line => $self->{line}, column => $self->{column}};
1521 $self->{state} = ATTRIBUTE_NAME_STATE;
1522
1523 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1524 $self->{line_prev} = $self->{line};
1525 $self->{column_prev} = $self->{column};
1526 $self->{column}++;
1527 $self->{nc}
1528 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1529 } else {
1530 $self->{set_nc}->($self);
1531 }
1532
1533 redo A;
1534 } elsif ($self->{nc} == 0x002F) { # /
1535 if ($self->{is_xml}) {
1536
1537 ## XML5: Not a parse error.
1538 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1539 } else {
1540
1541 }
1542
1543 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1544
1545 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1546 $self->{line_prev} = $self->{line};
1547 $self->{column_prev} = $self->{column};
1548 $self->{column}++;
1549 $self->{nc}
1550 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1551 } else {
1552 $self->{set_nc}->($self);
1553 }
1554
1555 redo A;
1556 } elsif ($self->{nc} == -1) {
1557 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1558 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1559
1560 $self->{last_stag_name} = $self->{ct}->{tag_name};
1561 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1562 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1563 if ($self->{ct}->{attributes}) {
1564
1565 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1566 } else {
1567 ## NOTE: This state should never be reached.
1568
1569 }
1570 } else {
1571 die "$0: $self->{ct}->{type}: Unknown token type";
1572 }
1573 $self->{s_kwd} = '';
1574 $self->{state} = DATA_STATE;
1575 # reconsume
1576
1577 return ($self->{ct}); # start tag or end tag
1578
1579 redo A;
1580 } else {
1581 if ($self->{is_xml}) {
1582
1583 ## XML5: Not a parse error.
1584 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1585 } else {
1586
1587 }
1588
1589 if ($self->{nc} == 0x0022 or # "
1590 $self->{nc} == 0x0027) { # '
1591
1592 ## XML5: Not a parse error.
1593 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1594 } else {
1595
1596 }
1597 $self->{ca}
1598 = {name => chr ($self->{nc}),
1599 value => '',
1600 line => $self->{line}, column => $self->{column}};
1601 $self->{state} = ATTRIBUTE_NAME_STATE;
1602
1603 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1604 $self->{line_prev} = $self->{line};
1605 $self->{column_prev} = $self->{column};
1606 $self->{column}++;
1607 $self->{nc}
1608 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1609 } else {
1610 $self->{set_nc}->($self);
1611 }
1612
1613 redo A;
1614 }
1615 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1616 ## XML5: "Tag attribute value before state".
1617
1618 if ($is_space->{$self->{nc}}) {
1619
1620 ## Stay in the state
1621
1622 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1623 $self->{line_prev} = $self->{line};
1624 $self->{column_prev} = $self->{column};
1625 $self->{column}++;
1626 $self->{nc}
1627 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1628 } else {
1629 $self->{set_nc}->($self);
1630 }
1631
1632 redo A;
1633 } elsif ($self->{nc} == 0x0022) { # "
1634
1635 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1636
1637 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1638 $self->{line_prev} = $self->{line};
1639 $self->{column_prev} = $self->{column};
1640 $self->{column}++;
1641 $self->{nc}
1642 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1643 } else {
1644 $self->{set_nc}->($self);
1645 }
1646
1647 redo A;
1648 } elsif ($self->{nc} == 0x0026) { # &
1649
1650 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1651 ## reconsume
1652 redo A;
1653 } elsif ($self->{nc} == 0x0027) { # '
1654
1655 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1656
1657 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1658 $self->{line_prev} = $self->{line};
1659 $self->{column_prev} = $self->{column};
1660 $self->{column}++;
1661 $self->{nc}
1662 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1663 } else {
1664 $self->{set_nc}->($self);
1665 }
1666
1667 redo A;
1668 } elsif ($self->{nc} == 0x003E) { # >
1669 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1670 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1671
1672 $self->{last_stag_name} = $self->{ct}->{tag_name};
1673 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1674 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1675 if ($self->{ct}->{attributes}) {
1676
1677 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1678 } else {
1679 ## NOTE: This state should never be reached.
1680
1681 }
1682 } else {
1683 die "$0: $self->{ct}->{type}: Unknown token type";
1684 }
1685 $self->{state} = DATA_STATE;
1686 $self->{s_kwd} = '';
1687
1688 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1689 $self->{line_prev} = $self->{line};
1690 $self->{column_prev} = $self->{column};
1691 $self->{column}++;
1692 $self->{nc}
1693 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1694 } else {
1695 $self->{set_nc}->($self);
1696 }
1697
1698
1699 return ($self->{ct}); # start tag or end tag
1700
1701 redo A;
1702 } elsif ($self->{nc} == -1) {
1703 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1704 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1705
1706 $self->{last_stag_name} = $self->{ct}->{tag_name};
1707 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1708 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1709 if ($self->{ct}->{attributes}) {
1710
1711 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1712 } else {
1713 ## NOTE: This state should never be reached.
1714
1715 }
1716 } else {
1717 die "$0: $self->{ct}->{type}: Unknown token type";
1718 }
1719 $self->{state} = DATA_STATE;
1720 $self->{s_kwd} = '';
1721 ## reconsume
1722
1723 return ($self->{ct}); # start tag or end tag
1724
1725 redo A;
1726 } else {
1727 if ($self->{nc} == 0x003D) { # =
1728
1729 ## XML5: Not a parse error.
1730 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1731 } elsif ($self->{is_xml}) {
1732
1733 ## XML5: No parse error.
1734 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1735 } else {
1736
1737 }
1738 $self->{ca}->{value} .= chr ($self->{nc});
1739 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1740
1741 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1742 $self->{line_prev} = $self->{line};
1743 $self->{column_prev} = $self->{column};
1744 $self->{column}++;
1745 $self->{nc}
1746 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1747 } else {
1748 $self->{set_nc}->($self);
1749 }
1750
1751 redo A;
1752 }
1753 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1754 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1755 ## ATTLIST attribute value double quoted state".
1756
1757 if ($self->{nc} == 0x0022) { # "
1758 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1759
1760 ## XML5: "DOCTYPE ATTLIST name after state".
1761 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1762 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1763 } else {
1764
1765 ## XML5: "Tag attribute name before state".
1766 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1767 }
1768
1769 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1770 $self->{line_prev} = $self->{line};
1771 $self->{column_prev} = $self->{column};
1772 $self->{column}++;
1773 $self->{nc}
1774 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1775 } else {
1776 $self->{set_nc}->($self);
1777 }
1778
1779 redo A;
1780 } elsif ($self->{nc} == 0x0026) { # &
1781
1782 ## XML5: Not defined yet.
1783
1784 ## NOTE: In the spec, the tokenizer is switched to the
1785 ## "entity in attribute value state". In this implementation, the
1786 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1787 ## implementation of the "consume a character reference" algorithm.
1788 $self->{prev_state} = $self->{state};
1789 $self->{entity_add} = 0x0022; # "
1790 $self->{state} = ENTITY_STATE;
1791
1792 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1793 $self->{line_prev} = $self->{line};
1794 $self->{column_prev} = $self->{column};
1795 $self->{column}++;
1796 $self->{nc}
1797 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1798 } else {
1799 $self->{set_nc}->($self);
1800 }
1801
1802 redo A;
1803 } elsif ($self->{nc} == -1) {
1804 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1805 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1806
1807 $self->{last_stag_name} = $self->{ct}->{tag_name};
1808
1809 $self->{state} = DATA_STATE;
1810 $self->{s_kwd} = '';
1811 ## reconsume
1812 return ($self->{ct}); # start tag
1813 redo A;
1814 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1815 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1816 if ($self->{ct}->{attributes}) {
1817
1818 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1819 } else {
1820 ## NOTE: This state should never be reached.
1821
1822 }
1823
1824 $self->{state} = DATA_STATE;
1825 $self->{s_kwd} = '';
1826 ## reconsume
1827 return ($self->{ct}); # end tag
1828 redo A;
1829 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1830 ## XML5: No parse error above; not defined yet.
1831 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1832 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1833 ## Reconsume.
1834 return ($self->{ct}); # ATTLIST
1835 redo A;
1836 } else {
1837 die "$0: $self->{ct}->{type}: Unknown token type";
1838 }
1839 } else {
1840 ## XML5 [ATTLIST]: Not defined yet.
1841 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1842
1843 ## XML5: Not a parse error.
1844 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1845 } else {
1846
1847 }
1848 $self->{ca}->{value} .= chr ($self->{nc});
1849 $self->{read_until}->($self->{ca}->{value},
1850 q["&<],
1851 length $self->{ca}->{value});
1852
1853 ## Stay in the state
1854
1855 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1856 $self->{line_prev} = $self->{line};
1857 $self->{column_prev} = $self->{column};
1858 $self->{column}++;
1859 $self->{nc}
1860 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1861 } else {
1862 $self->{set_nc}->($self);
1863 }
1864
1865 redo A;
1866 }
1867 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1868 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1869 ## ATTLIST attribute value single quoted state".
1870
1871 if ($self->{nc} == 0x0027) { # '
1872 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1873
1874 ## XML5: "DOCTYPE ATTLIST name after state".
1875 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1876 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1877 } else {
1878
1879 ## XML5: "Before attribute name state" (sic).
1880 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1881 }
1882
1883 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1884 $self->{line_prev} = $self->{line};
1885 $self->{column_prev} = $self->{column};
1886 $self->{column}++;
1887 $self->{nc}
1888 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1889 } else {
1890 $self->{set_nc}->($self);
1891 }
1892
1893 redo A;
1894 } elsif ($self->{nc} == 0x0026) { # &
1895
1896 ## XML5: Not defined yet.
1897
1898 ## NOTE: In the spec, the tokenizer is switched to the
1899 ## "entity in attribute value state". In this implementation, the
1900 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1901 ## implementation of the "consume a character reference" algorithm.
1902 $self->{entity_add} = 0x0027; # '
1903 $self->{prev_state} = $self->{state};
1904 $self->{state} = ENTITY_STATE;
1905
1906 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1907 $self->{line_prev} = $self->{line};
1908 $self->{column_prev} = $self->{column};
1909 $self->{column}++;
1910 $self->{nc}
1911 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1912 } else {
1913 $self->{set_nc}->($self);
1914 }
1915
1916 redo A;
1917 } elsif ($self->{nc} == -1) {
1918 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1919 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1920
1921 $self->{last_stag_name} = $self->{ct}->{tag_name};
1922
1923 $self->{state} = DATA_STATE;
1924 $self->{s_kwd} = '';
1925 ## reconsume
1926 return ($self->{ct}); # start tag
1927 redo A;
1928 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1929 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1930 if ($self->{ct}->{attributes}) {
1931
1932 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1933 } else {
1934 ## NOTE: This state should never be reached.
1935
1936 }
1937
1938 $self->{state} = DATA_STATE;
1939 $self->{s_kwd} = '';
1940 ## reconsume
1941 return ($self->{ct}); # end tag
1942 redo A;
1943 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1944 ## XML5: No parse error above; not defined yet.
1945 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1946 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1947 ## Reconsume.
1948 return ($self->{ct}); # ATTLIST
1949 redo A;
1950 } else {
1951 die "$0: $self->{ct}->{type}: Unknown token type";
1952 }
1953 } else {
1954 ## XML5 [ATTLIST]: Not defined yet.
1955 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1956
1957 ## XML5: Not a parse error.
1958 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1959 } else {
1960
1961 }
1962 $self->{ca}->{value} .= chr ($self->{nc});
1963 $self->{read_until}->($self->{ca}->{value},
1964 q['&<],
1965 length $self->{ca}->{value});
1966
1967 ## Stay in the state
1968
1969 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1970 $self->{line_prev} = $self->{line};
1971 $self->{column_prev} = $self->{column};
1972 $self->{column}++;
1973 $self->{nc}
1974 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1975 } else {
1976 $self->{set_nc}->($self);
1977 }
1978
1979 redo A;
1980 }
1981 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1982 ## XML5: "Tag attribute value unquoted state".
1983
1984 if ($is_space->{$self->{nc}}) {
1985 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1986
1987 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1988 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1989 } else {
1990
1991 ## XML5: "Tag attribute name before state".
1992 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1993 }
1994
1995 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1996 $self->{line_prev} = $self->{line};
1997 $self->{column_prev} = $self->{column};
1998 $self->{column}++;
1999 $self->{nc}
2000 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2001 } else {
2002 $self->{set_nc}->($self);
2003 }
2004
2005 redo A;
2006 } elsif ($self->{nc} == 0x0026) { # &
2007
2008
2009 ## XML5: Not defined yet.
2010
2011 ## NOTE: In the spec, the tokenizer is switched to the
2012 ## "entity in attribute value state". In this implementation, the
2013 ## tokenizer is switched to the |ENTITY_STATE|, which is an
2014 ## implementation of the "consume a character reference" algorithm.
2015 $self->{entity_add} = -1;
2016 $self->{prev_state} = $self->{state};
2017 $self->{state} = ENTITY_STATE;
2018
2019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2020 $self->{line_prev} = $self->{line};
2021 $self->{column_prev} = $self->{column};
2022 $self->{column}++;
2023 $self->{nc}
2024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2025 } else {
2026 $self->{set_nc}->($self);
2027 }
2028
2029 redo A;
2030 } elsif ($self->{nc} == 0x003E) { # >
2031 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2032
2033 $self->{last_stag_name} = $self->{ct}->{tag_name};
2034
2035 $self->{state} = DATA_STATE;
2036 $self->{s_kwd} = '';
2037
2038 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2039 $self->{line_prev} = $self->{line};
2040 $self->{column_prev} = $self->{column};
2041 $self->{column}++;
2042 $self->{nc}
2043 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2044 } else {
2045 $self->{set_nc}->($self);
2046 }
2047
2048 return ($self->{ct}); # start tag
2049 redo A;
2050 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2051 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2052 if ($self->{ct}->{attributes}) {
2053
2054 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2055 } else {
2056 ## NOTE: This state should never be reached.
2057
2058 }
2059
2060 $self->{state} = DATA_STATE;
2061 $self->{s_kwd} = '';
2062
2063 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2064 $self->{line_prev} = $self->{line};
2065 $self->{column_prev} = $self->{column};
2066 $self->{column}++;
2067 $self->{nc}
2068 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2069 } else {
2070 $self->{set_nc}->($self);
2071 }
2072
2073 return ($self->{ct}); # end tag
2074 redo A;
2075 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2076 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2077 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2078
2079 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2080 $self->{line_prev} = $self->{line};
2081 $self->{column_prev} = $self->{column};
2082 $self->{column}++;
2083 $self->{nc}
2084 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2085 } else {
2086 $self->{set_nc}->($self);
2087 }
2088
2089 return ($self->{ct}); # ATTLIST
2090 redo A;
2091 } else {
2092 die "$0: $self->{ct}->{type}: Unknown token type";
2093 }
2094 } elsif ($self->{nc} == -1) {
2095 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2096
2097 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2098 $self->{last_stag_name} = $self->{ct}->{tag_name};
2099
2100 $self->{state} = DATA_STATE;
2101 $self->{s_kwd} = '';
2102 ## reconsume
2103 return ($self->{ct}); # start tag
2104 redo A;
2105 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2106 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2107 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2108 if ($self->{ct}->{attributes}) {
2109
2110 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2111 } else {
2112 ## NOTE: This state should never be reached.
2113
2114 }
2115
2116 $self->{state} = DATA_STATE;
2117 $self->{s_kwd} = '';
2118 ## reconsume
2119 return ($self->{ct}); # end tag
2120 redo A;
2121 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2122 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2123 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2124 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2125 ## Reconsume.
2126 return ($self->{ct}); # ATTLIST
2127 redo A;
2128 } else {
2129 die "$0: $self->{ct}->{type}: Unknown token type";
2130 }
2131 } else {
2132 if ({
2133 0x0022 => 1, # "
2134 0x0027 => 1, # '
2135 0x003D => 1, # =
2136 }->{$self->{nc}}) {
2137
2138 ## XML5: Not a parse error.
2139 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2140 } else {
2141
2142 }
2143 $self->{ca}->{value} .= chr ($self->{nc});
2144 $self->{read_until}->($self->{ca}->{value},
2145 q["'=& >],
2146 length $self->{ca}->{value});
2147
2148 ## Stay in the state
2149
2150 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2151 $self->{line_prev} = $self->{line};
2152 $self->{column_prev} = $self->{column};
2153 $self->{column}++;
2154 $self->{nc}
2155 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2156 } else {
2157 $self->{set_nc}->($self);
2158 }
2159
2160 redo A;
2161 }
2162 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2163 if ($is_space->{$self->{nc}}) {
2164
2165 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2166
2167 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2168 $self->{line_prev} = $self->{line};
2169 $self->{column_prev} = $self->{column};
2170 $self->{column}++;
2171 $self->{nc}
2172 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2173 } else {
2174 $self->{set_nc}->($self);
2175 }
2176
2177 redo A;
2178 } elsif ($self->{nc} == 0x003E) { # >
2179 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2180
2181 $self->{last_stag_name} = $self->{ct}->{tag_name};
2182 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2183 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2184 if ($self->{ct}->{attributes}) {
2185
2186 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2187 } else {
2188 ## NOTE: This state should never be reached.
2189
2190 }
2191 } else {
2192 die "$0: $self->{ct}->{type}: Unknown token type";
2193 }
2194 $self->{state} = DATA_STATE;
2195 $self->{s_kwd} = '';
2196
2197 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2198 $self->{line_prev} = $self->{line};
2199 $self->{column_prev} = $self->{column};
2200 $self->{column}++;
2201 $self->{nc}
2202 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2203 } else {
2204 $self->{set_nc}->($self);
2205 }
2206
2207
2208 return ($self->{ct}); # start tag or end tag
2209
2210 redo A;
2211 } elsif ($self->{nc} == 0x002F) { # /
2212
2213 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2214
2215 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2216 $self->{line_prev} = $self->{line};
2217 $self->{column_prev} = $self->{column};
2218 $self->{column}++;
2219 $self->{nc}
2220 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2221 } else {
2222 $self->{set_nc}->($self);
2223 }
2224
2225 redo A;
2226 } elsif ($self->{nc} == -1) {
2227 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2228 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2229
2230 $self->{last_stag_name} = $self->{ct}->{tag_name};
2231 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2232 if ($self->{ct}->{attributes}) {
2233
2234 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2235 } else {
2236 ## NOTE: This state should never be reached.
2237
2238 }
2239 } else {
2240 die "$0: $self->{ct}->{type}: Unknown token type";
2241 }
2242 $self->{state} = DATA_STATE;
2243 $self->{s_kwd} = '';
2244 ## Reconsume.
2245 return ($self->{ct}); # start tag or end tag
2246 redo A;
2247 } else {
2248
2249 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2250 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2251 ## reconsume
2252 redo A;
2253 }
2254 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2255 ## XML5: "Empty tag state".
2256
2257 if ($self->{nc} == 0x003E) { # >
2258 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2259
2260 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2261 ## TODO: Different type than slash in start tag
2262 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2263 if ($self->{ct}->{attributes}) {
2264
2265 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2266 } else {
2267
2268 }
2269 ## TODO: Test |<title></title/>|
2270 } else {
2271
2272 $self->{self_closing} = 1;
2273 }
2274
2275 $self->{state} = DATA_STATE;
2276 $self->{s_kwd} = '';
2277
2278 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2279 $self->{line_prev} = $self->{line};
2280 $self->{column_prev} = $self->{column};
2281 $self->{column}++;
2282 $self->{nc}
2283 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2284 } else {
2285 $self->{set_nc}->($self);
2286 }
2287
2288
2289 return ($self->{ct}); # start tag or end tag
2290
2291 redo A;
2292 } elsif ($self->{nc} == -1) {
2293 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2294 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2295
2296 $self->{last_stag_name} = $self->{ct}->{tag_name};
2297 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2298 if ($self->{ct}->{attributes}) {
2299
2300 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2301 } else {
2302 ## NOTE: This state should never be reached.
2303
2304 }
2305 } else {
2306 die "$0: $self->{ct}->{type}: Unknown token type";
2307 }
2308 ## XML5: "Tag attribute name before state".
2309 $self->{state} = DATA_STATE;
2310 $self->{s_kwd} = '';
2311 ## Reconsume.
2312 return ($self->{ct}); # start tag or end tag
2313 redo A;
2314 } else {
2315
2316 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2317 ## TODO: This error type is wrong.
2318 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2319 ## Reconsume.
2320 redo A;
2321 }
2322 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2323 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2324
2325 ## NOTE: Unlike spec's "bogus comment state", this implementation
2326 ## consumes characters on a one-by-one basis.
2327
2328 if ($self->{nc} == 0x003E) { # >
2329 if ($self->{in_subset}) {
2330
2331 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2332 } else {
2333
2334 $self->{state} = DATA_STATE;
2335 $self->{s_kwd} = '';
2336 }
2337
2338 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2339 $self->{line_prev} = $self->{line};
2340 $self->{column_prev} = $self->{column};
2341 $self->{column}++;
2342 $self->{nc}
2343 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2344 } else {
2345 $self->{set_nc}->($self);
2346 }
2347
2348
2349 return ($self->{ct}); # comment
2350 redo A;
2351 } elsif ($self->{nc} == -1) {
2352 if ($self->{in_subset}) {
2353
2354 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2355 } else {
2356
2357 $self->{state} = DATA_STATE;
2358 $self->{s_kwd} = '';
2359 }
2360 ## reconsume
2361
2362 return ($self->{ct}); # comment
2363 redo A;
2364 } else {
2365
2366 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2367 $self->{read_until}->($self->{ct}->{data},
2368 q[>],
2369 length $self->{ct}->{data});
2370
2371 ## Stay in the state.
2372
2373 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2374 $self->{line_prev} = $self->{line};
2375 $self->{column_prev} = $self->{column};
2376 $self->{column}++;
2377 $self->{nc}
2378 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2379 } else {
2380 $self->{set_nc}->($self);
2381 }
2382
2383 redo A;
2384 }
2385 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2386 ## XML5: "Markup declaration state".
2387
2388 if ($self->{nc} == 0x002D) { # -
2389
2390 $self->{state} = MD_HYPHEN_STATE;
2391
2392 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2393 $self->{line_prev} = $self->{line};
2394 $self->{column_prev} = $self->{column};
2395 $self->{column}++;
2396 $self->{nc}
2397 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2398 } else {
2399 $self->{set_nc}->($self);
2400 }
2401
2402 redo A;
2403 } elsif ($self->{nc} == 0x0044 or # D
2404 $self->{nc} == 0x0064) { # d
2405 ## ASCII case-insensitive.
2406
2407 $self->{state} = MD_DOCTYPE_STATE;
2408 $self->{kwd} = chr $self->{nc};
2409
2410 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2411 $self->{line_prev} = $self->{line};
2412 $self->{column_prev} = $self->{column};
2413 $self->{column}++;
2414 $self->{nc}
2415 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2416 } else {
2417 $self->{set_nc}->($self);
2418 }
2419
2420 redo A;
2421 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2422 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2423 $self->{is_xml}) and
2424 $self->{nc} == 0x005B) { # [
2425
2426 $self->{state} = MD_CDATA_STATE;
2427 $self->{kwd} = '[';
2428
2429 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2430 $self->{line_prev} = $self->{line};
2431 $self->{column_prev} = $self->{column};
2432 $self->{column}++;
2433 $self->{nc}
2434 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2435 } else {
2436 $self->{set_nc}->($self);
2437 }
2438
2439 redo A;
2440 } else {
2441
2442 }
2443
2444 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2445 line => $self->{line_prev},
2446 column => $self->{column_prev} - 1);
2447 ## Reconsume.
2448 $self->{state} = BOGUS_COMMENT_STATE;
2449 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2450 line => $self->{line_prev},
2451 column => $self->{column_prev} - 1,
2452 };
2453 redo A;
2454 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2455 if ($self->{nc} == 0x002D) { # -
2456
2457 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2458 line => $self->{line_prev},
2459 column => $self->{column_prev} - 2,
2460 };
2461 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2462
2463 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2464 $self->{line_prev} = $self->{line};
2465 $self->{column_prev} = $self->{column};
2466 $self->{column}++;
2467 $self->{nc}
2468 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2469 } else {
2470 $self->{set_nc}->($self);
2471 }
2472
2473 redo A;
2474 } else {
2475
2476 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2477 line => $self->{line_prev},
2478 column => $self->{column_prev} - 2);
2479 $self->{state} = BOGUS_COMMENT_STATE;
2480 ## Reconsume.
2481 $self->{ct} = {type => COMMENT_TOKEN,
2482 data => '-',
2483 line => $self->{line_prev},
2484 column => $self->{column_prev} - 2,
2485 };
2486 redo A;
2487 }
2488 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2489 ## ASCII case-insensitive.
2490 if ($self->{nc} == [
2491 undef,
2492 0x004F, # O
2493 0x0043, # C
2494 0x0054, # T
2495 0x0059, # Y
2496 0x0050, # P
2497 ]->[length $self->{kwd}] or
2498 $self->{nc} == [
2499 undef,
2500 0x006F, # o
2501 0x0063, # c
2502 0x0074, # t
2503 0x0079, # y
2504 0x0070, # p
2505 ]->[length $self->{kwd}]) {
2506
2507 ## Stay in the state.
2508 $self->{kwd} .= chr $self->{nc};
2509
2510 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2511 $self->{line_prev} = $self->{line};
2512 $self->{column_prev} = $self->{column};
2513 $self->{column}++;
2514 $self->{nc}
2515 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2516 } else {
2517 $self->{set_nc}->($self);
2518 }
2519
2520 redo A;
2521 } elsif ((length $self->{kwd}) == 6 and
2522 ($self->{nc} == 0x0045 or # E
2523 $self->{nc} == 0x0065)) { # e
2524 if ($self->{is_xml} and
2525 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2526
2527 ## XML5: case-sensitive.
2528 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2529 text => 'DOCTYPE',
2530 line => $self->{line_prev},
2531 column => $self->{column_prev} - 5);
2532 } else {
2533
2534 }
2535 $self->{state} = DOCTYPE_STATE;
2536 $self->{ct} = {type => DOCTYPE_TOKEN,
2537 quirks => 1,
2538 line => $self->{line_prev},
2539 column => $self->{column_prev} - 7,
2540 };
2541
2542 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2543 $self->{line_prev} = $self->{line};
2544 $self->{column_prev} = $self->{column};
2545 $self->{column}++;
2546 $self->{nc}
2547 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2548 } else {
2549 $self->{set_nc}->($self);
2550 }
2551
2552 redo A;
2553 } else {
2554
2555 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2556 line => $self->{line_prev},
2557 column => $self->{column_prev} - 1 - length $self->{kwd});
2558 $self->{state} = BOGUS_COMMENT_STATE;
2559 ## Reconsume.
2560 $self->{ct} = {type => COMMENT_TOKEN,
2561 data => $self->{kwd},
2562 line => $self->{line_prev},
2563 column => $self->{column_prev} - 1 - length $self->{kwd},
2564 };
2565 redo A;
2566 }
2567 } elsif ($self->{state} == MD_CDATA_STATE) {
2568 if ($self->{nc} == {
2569 '[' => 0x0043, # C
2570 '[C' => 0x0044, # D
2571 '[CD' => 0x0041, # A
2572 '[CDA' => 0x0054, # T
2573 '[CDAT' => 0x0041, # A
2574 }->{$self->{kwd}}) {
2575
2576 ## Stay in the state.
2577 $self->{kwd} .= chr $self->{nc};
2578
2579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2580 $self->{line_prev} = $self->{line};
2581 $self->{column_prev} = $self->{column};
2582 $self->{column}++;
2583 $self->{nc}
2584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2585 } else {
2586 $self->{set_nc}->($self);
2587 }
2588
2589 redo A;
2590 } elsif ($self->{kwd} eq '[CDATA' and
2591 $self->{nc} == 0x005B) { # [
2592 if ($self->{is_xml} and
2593 not $self->{tainted} and
2594 @{$self->{open_elements} or []} == 0) {
2595
2596 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2597 line => $self->{line_prev},
2598 column => $self->{column_prev} - 7);
2599 $self->{tainted} = 1;
2600 } else {
2601
2602 }
2603
2604 $self->{ct} = {type => CHARACTER_TOKEN,
2605 data => '',
2606 line => $self->{line_prev},
2607 column => $self->{column_prev} - 7};
2608 $self->{state} = CDATA_SECTION_STATE;
2609
2610 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2611 $self->{line_prev} = $self->{line};
2612 $self->{column_prev} = $self->{column};
2613 $self->{column}++;
2614 $self->{nc}
2615 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2616 } else {
2617 $self->{set_nc}->($self);
2618 }
2619
2620 redo A;
2621 } else {
2622
2623 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2624 line => $self->{line_prev},
2625 column => $self->{column_prev} - 1 - length $self->{kwd});
2626 $self->{state} = BOGUS_COMMENT_STATE;
2627 ## Reconsume.
2628 $self->{ct} = {type => COMMENT_TOKEN,
2629 data => $self->{kwd},
2630 line => $self->{line_prev},
2631 column => $self->{column_prev} - 1 - length $self->{kwd},
2632 };
2633 redo A;
2634 }
2635 } elsif ($self->{state} == COMMENT_START_STATE) {
2636 if ($self->{nc} == 0x002D) { # -
2637
2638 $self->{state} = COMMENT_START_DASH_STATE;
2639
2640 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2641 $self->{line_prev} = $self->{line};
2642 $self->{column_prev} = $self->{column};
2643 $self->{column}++;
2644 $self->{nc}
2645 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2646 } else {
2647 $self->{set_nc}->($self);
2648 }
2649
2650 redo A;
2651 } elsif ($self->{nc} == 0x003E) { # >
2652 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2653 if ($self->{in_subset}) {
2654
2655 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2656 } else {
2657
2658 $self->{state} = DATA_STATE;
2659 $self->{s_kwd} = '';
2660 }
2661
2662 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2663 $self->{line_prev} = $self->{line};
2664 $self->{column_prev} = $self->{column};
2665 $self->{column}++;
2666 $self->{nc}
2667 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2668 } else {
2669 $self->{set_nc}->($self);
2670 }
2671
2672
2673 return ($self->{ct}); # comment
2674
2675 redo A;
2676 } elsif ($self->{nc} == -1) {
2677 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2678 if ($self->{in_subset}) {
2679
2680 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2681 } else {
2682
2683 $self->{state} = DATA_STATE;
2684 $self->{s_kwd} = '';
2685 }
2686 ## reconsume
2687
2688 return ($self->{ct}); # comment
2689
2690 redo A;
2691 } else {
2692
2693 $self->{ct}->{data} # comment
2694 .= chr ($self->{nc});
2695 $self->{state} = COMMENT_STATE;
2696
2697 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2698 $self->{line_prev} = $self->{line};
2699 $self->{column_prev} = $self->{column};
2700 $self->{column}++;
2701 $self->{nc}
2702 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2703 } else {
2704 $self->{set_nc}->($self);
2705 }
2706
2707 redo A;
2708 }
2709 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2710 if ($self->{nc} == 0x002D) { # -
2711
2712 $self->{state} = COMMENT_END_STATE;
2713
2714 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2715 $self->{line_prev} = $self->{line};
2716 $self->{column_prev} = $self->{column};
2717 $self->{column}++;
2718 $self->{nc}
2719 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2720 } else {
2721 $self->{set_nc}->($self);
2722 }
2723
2724 redo A;
2725 } elsif ($self->{nc} == 0x003E) { # >
2726 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2727 if ($self->{in_subset}) {
2728
2729 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2730 } else {
2731
2732 $self->{state} = DATA_STATE;
2733 $self->{s_kwd} = '';
2734 }
2735
2736 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2737 $self->{line_prev} = $self->{line};
2738 $self->{column_prev} = $self->{column};
2739 $self->{column}++;
2740 $self->{nc}
2741 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2742 } else {
2743 $self->{set_nc}->($self);
2744 }
2745
2746
2747 return ($self->{ct}); # comment
2748
2749 redo A;
2750 } elsif ($self->{nc} == -1) {
2751 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2752 if ($self->{in_subset}) {
2753
2754 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2755 } else {
2756
2757 $self->{state} = DATA_STATE;
2758 $self->{s_kwd} = '';
2759 }
2760 ## reconsume
2761
2762 return ($self->{ct}); # comment
2763
2764 redo A;
2765 } else {
2766
2767 $self->{ct}->{data} # comment
2768 .= '-' . chr ($self->{nc});
2769 $self->{state} = COMMENT_STATE;
2770
2771 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2772 $self->{line_prev} = $self->{line};
2773 $self->{column_prev} = $self->{column};
2774 $self->{column}++;
2775 $self->{nc}
2776 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2777 } else {
2778 $self->{set_nc}->($self);
2779 }
2780
2781 redo A;
2782 }
2783 } elsif ($self->{state} == COMMENT_STATE) {
2784 ## XML5: "Comment state" and "DOCTYPE comment state".
2785
2786 if ($self->{nc} == 0x002D) { # -
2787
2788 $self->{state} = COMMENT_END_DASH_STATE;
2789
2790 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2791 $self->{line_prev} = $self->{line};
2792 $self->{column_prev} = $self->{column};
2793 $self->{column}++;
2794 $self->{nc}
2795 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2796 } else {
2797 $self->{set_nc}->($self);
2798 }
2799
2800 redo A;
2801 } elsif ($self->{nc} == -1) {
2802 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2803 if ($self->{in_subset}) {
2804
2805 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2806 } else {
2807
2808 $self->{state} = DATA_STATE;
2809 $self->{s_kwd} = '';
2810 }
2811 ## reconsume
2812
2813 return ($self->{ct}); # comment
2814
2815 redo A;
2816 } else {
2817
2818 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2819 $self->{read_until}->($self->{ct}->{data},
2820 q[-],
2821 length $self->{ct}->{data});
2822
2823 ## Stay in the state
2824
2825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2826 $self->{line_prev} = $self->{line};
2827 $self->{column_prev} = $self->{column};
2828 $self->{column}++;
2829 $self->{nc}
2830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2831 } else {
2832 $self->{set_nc}->($self);
2833 }
2834
2835 redo A;
2836 }
2837 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2838 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2839
2840 if ($self->{nc} == 0x002D) { # -
2841
2842 $self->{state} = COMMENT_END_STATE;
2843
2844 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2845 $self->{line_prev} = $self->{line};
2846 $self->{column_prev} = $self->{column};
2847 $self->{column}++;
2848 $self->{nc}
2849 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2850 } else {
2851 $self->{set_nc}->($self);
2852 }
2853
2854 redo A;
2855 } elsif ($self->{nc} == -1) {
2856 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2857 if ($self->{in_subset}) {
2858
2859 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2860 } else {
2861
2862 $self->{state} = DATA_STATE;
2863 $self->{s_kwd} = '';
2864 }
2865 ## reconsume
2866
2867 return ($self->{ct}); # comment
2868
2869 redo A;
2870 } else {
2871
2872 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2873 $self->{state} = COMMENT_STATE;
2874
2875 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2876 $self->{line_prev} = $self->{line};
2877 $self->{column_prev} = $self->{column};
2878 $self->{column}++;
2879 $self->{nc}
2880 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2881 } else {
2882 $self->{set_nc}->($self);
2883 }
2884
2885 redo A;
2886 }
2887 } elsif ($self->{state} == COMMENT_END_STATE) {
2888 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2889
2890 if ($self->{nc} == 0x003E) { # >
2891 if ($self->{in_subset}) {
2892
2893 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2894 } else {
2895
2896 $self->{state} = DATA_STATE;
2897 $self->{s_kwd} = '';
2898 }
2899
2900 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2901 $self->{line_prev} = $self->{line};
2902 $self->{column_prev} = $self->{column};
2903 $self->{column}++;
2904 $self->{nc}
2905 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2906 } else {
2907 $self->{set_nc}->($self);
2908 }
2909
2910
2911 return ($self->{ct}); # comment
2912
2913 redo A;
2914 } elsif ($self->{nc} == 0x002D) { # -
2915
2916 ## XML5: Not a parse error.
2917 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2918 line => $self->{line_prev},
2919 column => $self->{column_prev});
2920 $self->{ct}->{data} .= '-'; # comment
2921 ## Stay in the state
2922
2923 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2924 $self->{line_prev} = $self->{line};
2925 $self->{column_prev} = $self->{column};
2926 $self->{column}++;
2927 $self->{nc}
2928 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2929 } else {
2930 $self->{set_nc}->($self);
2931 }
2932
2933 redo A;
2934 } elsif ($self->{nc} == -1) {
2935 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2936 if ($self->{in_subset}) {
2937
2938 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2939 } else {
2940
2941 $self->{state} = DATA_STATE;
2942 $self->{s_kwd} = '';
2943 }
2944 ## reconsume
2945
2946 return ($self->{ct}); # comment
2947
2948 redo A;
2949 } else {
2950
2951 ## XML5: Not a parse error.
2952 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2953 line => $self->{line_prev},
2954 column => $self->{column_prev});
2955 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2956 $self->{state} = COMMENT_STATE;
2957
2958 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2959 $self->{line_prev} = $self->{line};
2960 $self->{column_prev} = $self->{column};
2961 $self->{column}++;
2962 $self->{nc}
2963 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2964 } else {
2965 $self->{set_nc}->($self);
2966 }
2967
2968 redo A;
2969 }
2970 } elsif ($self->{state} == DOCTYPE_STATE) {
2971 if ($is_space->{$self->{nc}}) {
2972
2973 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2974
2975 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2976 $self->{line_prev} = $self->{line};
2977 $self->{column_prev} = $self->{column};
2978 $self->{column}++;
2979 $self->{nc}
2980 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2981 } else {
2982 $self->{set_nc}->($self);
2983 }
2984
2985 redo A;
2986 } else {
2987
2988 ## XML5: Unless EOF, switch to the bogus comment state.
2989 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2990 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2991 ## reconsume
2992 redo A;
2993 }
2994 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2995 ## XML5: "DOCTYPE root name before state".
2996
2997 if ($is_space->{$self->{nc}}) {
2998
2999 ## Stay in the state
3000
3001 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3002 $self->{line_prev} = $self->{line};
3003 $self->{column_prev} = $self->{column};
3004 $self->{column}++;
3005 $self->{nc}
3006 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3007 } else {
3008 $self->{set_nc}->($self);
3009 }
3010
3011 redo A;
3012 } elsif ($self->{nc} == 0x003E) { # >
3013
3014 ## XML5: No parse error.
3015 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3016 $self->{state} = DATA_STATE;
3017 $self->{s_kwd} = '';
3018
3019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3020 $self->{line_prev} = $self->{line};
3021 $self->{column_prev} = $self->{column};
3022 $self->{column}++;
3023 $self->{nc}
3024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3025 } else {
3026 $self->{set_nc}->($self);
3027 }
3028
3029
3030 return ($self->{ct}); # DOCTYPE (quirks)
3031
3032 redo A;
3033 } elsif ($self->{nc} == -1) {
3034
3035 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3036 $self->{state} = DATA_STATE;
3037 $self->{s_kwd} = '';
3038 ## reconsume
3039
3040 return ($self->{ct}); # DOCTYPE (quirks)
3041
3042 redo A;
3043 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3044
3045 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3046 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3047 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3048 $self->{in_subset} = 1;
3049
3050 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3051 $self->{line_prev} = $self->{line};
3052 $self->{column_prev} = $self->{column};
3053 $self->{column}++;
3054 $self->{nc}
3055 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3056 } else {
3057 $self->{set_nc}->($self);
3058 }
3059
3060 return ($self->{ct}); # DOCTYPE
3061 redo A;
3062 } else {
3063
3064 $self->{ct}->{name} = chr $self->{nc};
3065 delete $self->{ct}->{quirks};
3066 $self->{state} = DOCTYPE_NAME_STATE;
3067
3068 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3069 $self->{line_prev} = $self->{line};
3070 $self->{column_prev} = $self->{column};
3071 $self->{column}++;
3072 $self->{nc}
3073 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3074 } else {
3075 $self->{set_nc}->($self);
3076 }
3077
3078 redo A;
3079 }
3080 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3081 ## XML5: "DOCTYPE root name state".
3082
3083 ## ISSUE: Redundant "First," in the spec.
3084
3085 if ($is_space->{$self->{nc}}) {
3086
3087 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3088
3089 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3090 $self->{line_prev} = $self->{line};
3091 $self->{column_prev} = $self->{column};
3092 $self->{column}++;
3093 $self->{nc}
3094 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3095 } else {
3096 $self->{set_nc}->($self);
3097 }
3098
3099 redo A;
3100 } elsif ($self->{nc} == 0x003E) { # >
3101
3102 $self->{state} = DATA_STATE;
3103 $self->{s_kwd} = '';
3104
3105 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106 $self->{line_prev} = $self->{line};
3107 $self->{column_prev} = $self->{column};
3108 $self->{column}++;
3109 $self->{nc}
3110 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111 } else {
3112 $self->{set_nc}->($self);
3113 }
3114
3115
3116 return ($self->{ct}); # DOCTYPE
3117
3118 redo A;
3119 } elsif ($self->{nc} == -1) {
3120
3121 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3122 $self->{state} = DATA_STATE;
3123 $self->{s_kwd} = '';
3124 ## reconsume
3125
3126 $self->{ct}->{quirks} = 1;
3127 return ($self->{ct}); # DOCTYPE
3128
3129 redo A;
3130 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3131
3132 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3133 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3134 $self->{in_subset} = 1;
3135
3136 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137 $self->{line_prev} = $self->{line};
3138 $self->{column_prev} = $self->{column};
3139 $self->{column}++;
3140 $self->{nc}
3141 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142 } else {
3143 $self->{set_nc}->($self);
3144 }
3145
3146 return ($self->{ct}); # DOCTYPE
3147 redo A;
3148 } else {
3149
3150 $self->{ct}->{name}
3151 .= chr ($self->{nc}); # DOCTYPE
3152 ## Stay in the state
3153
3154 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3155 $self->{line_prev} = $self->{line};
3156 $self->{column_prev} = $self->{column};
3157 $self->{column}++;
3158 $self->{nc}
3159 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3160 } else {
3161 $self->{set_nc}->($self);
3162 }
3163
3164 redo A;
3165 }
3166 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3167 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3168 ## state", but implemented differently.
3169
3170 if ($is_space->{$self->{nc}}) {
3171
3172 ## Stay in the state
3173
3174 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3175 $self->{line_prev} = $self->{line};
3176 $self->{column_prev} = $self->{column};
3177 $self->{column}++;
3178 $self->{nc}
3179 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3180 } else {
3181 $self->{set_nc}->($self);
3182 }
3183
3184 redo A;
3185 } elsif ($self->{nc} == 0x003E) { # >
3186 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3187
3188 $self->{state} = DATA_STATE;
3189 $self->{s_kwd} = '';
3190 } else {
3191
3192 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3193 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3194 }
3195
3196
3197 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3198 $self->{line_prev} = $self->{line};
3199 $self->{column_prev} = $self->{column};
3200 $self->{column}++;
3201 $self->{nc}
3202 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3203 } else {
3204 $self->{set_nc}->($self);
3205 }
3206
3207 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3208 redo A;
3209 } elsif ($self->{nc} == -1) {
3210 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3211
3212 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3213 $self->{state} = DATA_STATE;
3214 $self->{s_kwd} = '';
3215 $self->{ct}->{quirks} = 1;
3216 } else {
3217
3218 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3219 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3220 }
3221
3222 ## Reconsume.
3223 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3224 redo A;
3225 } elsif ($self->{nc} == 0x0050 or # P
3226 $self->{nc} == 0x0070) { # p
3227
3228 $self->{state} = PUBLIC_STATE;
3229 $self->{kwd} = chr $self->{nc};
3230
3231 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3232 $self->{line_prev} = $self->{line};
3233 $self->{column_prev} = $self->{column};
3234 $self->{column}++;
3235 $self->{nc}
3236 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3237 } else {
3238 $self->{set_nc}->($self);
3239 }
3240
3241 redo A;
3242 } elsif ($self->{nc} == 0x0053 or # S
3243 $self->{nc} == 0x0073) { # s
3244
3245 $self->{state} = SYSTEM_STATE;
3246 $self->{kwd} = chr $self->{nc};
3247
3248 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3249 $self->{line_prev} = $self->{line};
3250 $self->{column_prev} = $self->{column};
3251 $self->{column}++;
3252 $self->{nc}
3253 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3254 } else {
3255 $self->{set_nc}->($self);
3256 }
3257
3258 redo A;
3259 ## TODO: " and ' for ENTITY
3260 } elsif ($self->{is_xml} and
3261 $self->{ct}->{type} == DOCTYPE_TOKEN and
3262 $self->{nc} == 0x005B) { # [
3263
3264 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3265 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3266 $self->{in_subset} = 1;
3267
3268 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3269 $self->{line_prev} = $self->{line};
3270 $self->{column_prev} = $self->{column};
3271 $self->{column}++;
3272 $self->{nc}
3273 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3274 } else {
3275 $self->{set_nc}->($self);
3276 }
3277
3278 return ($self->{ct}); # DOCTYPE
3279 redo A;
3280 } else {
3281 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3282
3283 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3284
3285 $self->{ct}->{quirks} = 1;
3286 $self->{state} = BOGUS_DOCTYPE_STATE;
3287 } else {
3288
3289 $self->{state} = BOGUS_MD_STATE;
3290 }
3291
3292
3293 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3294 $self->{line_prev} = $self->{line};
3295 $self->{column_prev} = $self->{column};
3296 $self->{column}++;
3297 $self->{nc}
3298 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3299 } else {
3300 $self->{set_nc}->($self);
3301 }
3302
3303 redo A;
3304 }
3305 } elsif ($self->{state} == PUBLIC_STATE) {
3306 ## ASCII case-insensitive
3307 if ($self->{nc} == [
3308 undef,
3309 0x0055, # U
3310 0x0042, # B
3311 0x004C, # L
3312 0x0049, # I
3313 ]->[length $self->{kwd}] or
3314 $self->{nc} == [
3315 undef,
3316 0x0075, # u
3317 0x0062, # b
3318 0x006C, # l
3319 0x0069, # i
3320 ]->[length $self->{kwd}]) {
3321
3322 ## Stay in the state.
3323 $self->{kwd} .= chr $self->{nc};
3324
3325 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3326 $self->{line_prev} = $self->{line};
3327 $self->{column_prev} = $self->{column};
3328 $self->{column}++;
3329 $self->{nc}
3330 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3331 } else {
3332 $self->{set_nc}->($self);
3333 }
3334
3335 redo A;
3336 } elsif ((length $self->{kwd}) == 5 and
3337 ($self->{nc} == 0x0043 or # C
3338 $self->{nc} == 0x0063)) { # c
3339 if ($self->{is_xml} and
3340 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3341
3342 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3343 text => 'PUBLIC',
3344 line => $self->{line_prev},
3345 column => $self->{column_prev} - 4);
3346 } else {
3347
3348 }
3349 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3350
3351 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3352 $self->{line_prev} = $self->{line};
3353 $self->{column_prev} = $self->{column};
3354 $self->{column}++;
3355 $self->{nc}
3356 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3357 } else {
3358 $self->{set_nc}->($self);
3359 }
3360
3361 redo A;
3362 } else {
3363 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3364 line => $self->{line_prev},
3365 column => $self->{column_prev} + 1 - length $self->{kwd});
3366 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3367
3368 $self->{ct}->{quirks} = 1;
3369 $self->{state} = BOGUS_DOCTYPE_STATE;
3370 } else {
3371
3372 $self->{state} = BOGUS_MD_STATE;
3373 }
3374 ## Reconsume.
3375 redo A;
3376 }
3377 } elsif ($self->{state} == SYSTEM_STATE) {
3378 ## ASCII case-insensitive
3379 if ($self->{nc} == [
3380 undef,
3381 0x0059, # Y
3382 0x0053, # S
3383 0x0054, # T
3384 0x0045, # E
3385 ]->[length $self->{kwd}] or
3386 $self->{nc} == [
3387 undef,
3388 0x0079, # y
3389 0x0073, # s
3390 0x0074, # t
3391 0x0065, # e
3392 ]->[length $self->{kwd}]) {
3393
3394 ## Stay in the state.
3395 $self->{kwd} .= chr $self->{nc};
3396
3397 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3398 $self->{line_prev} = $self->{line};
3399 $self->{column_prev} = $self->{column};
3400 $self->{column}++;
3401 $self->{nc}
3402 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3403 } else {
3404 $self->{set_nc}->($self);
3405 }
3406
3407 redo A;
3408 } elsif ((length $self->{kwd}) == 5 and
3409 ($self->{nc} == 0x004D or # M
3410 $self->{nc} == 0x006D)) { # m
3411 if ($self->{is_xml} and
3412 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3413
3414 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3415 text => 'SYSTEM',
3416 line => $self->{line_prev},
3417 column => $self->{column_prev} - 4);
3418 } else {
3419
3420 }
3421 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3422
3423 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3424 $self->{line_prev} = $self->{line};
3425 $self->{column_prev} = $self->{column};
3426 $self->{column}++;
3427 $self->{nc}
3428 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3429 } else {
3430 $self->{set_nc}->($self);
3431 }
3432
3433 redo A;
3434 } else {
3435 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3436 line => $self->{line_prev},
3437 column => $self->{column_prev} + 1 - length $self->{kwd});
3438 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3439
3440 $self->{ct}->{quirks} = 1;
3441 $self->{state} = BOGUS_DOCTYPE_STATE;
3442 } else {
3443
3444 $self->{state} = BOGUS_MD_STATE;
3445 }
3446 ## Reconsume.
3447 redo A;
3448 }
3449 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3450 if ($is_space->{$self->{nc}}) {
3451
3452 ## Stay in the state
3453
3454 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3455 $self->{line_prev} = $self->{line};
3456 $self->{column_prev} = $self->{column};
3457 $self->{column}++;
3458 $self->{nc}
3459 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3460 } else {
3461 $self->{set_nc}->($self);
3462 }
3463
3464 redo A;
3465 } elsif ($self->{nc} eq 0x0022) { # "
3466
3467 $self->{ct}->{pubid} = ''; # DOCTYPE
3468 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3469
3470 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3471 $self->{line_prev} = $self->{line};
3472 $self->{column_prev} = $self->{column};
3473 $self->{column}++;
3474 $self->{nc}
3475 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3476 } else {
3477 $self->{set_nc}->($self);
3478 }
3479
3480 redo A;
3481 } elsif ($self->{nc} eq 0x0027) { # '
3482
3483 $self->{ct}->{pubid} = ''; # DOCTYPE
3484 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3485
3486 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3487 $self->{line_prev} = $self->{line};
3488 $self->{column_prev} = $self->{column};
3489 $self->{column}++;
3490 $self->{nc}
3491 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3492 } else {
3493 $self->{set_nc}->($self);
3494 }
3495
3496 redo A;
3497 } elsif ($self->{nc} eq 0x003E) { # >
3498 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3499
3500 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3501
3502 $self->{state} = DATA_STATE;
3503 $self->{s_kwd} = '';
3504 $self->{ct}->{quirks} = 1;
3505 } else {
3506
3507 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3508 }
3509
3510
3511 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3512 $self->{line_prev} = $self->{line};
3513 $self->{column_prev} = $self->{column};
3514 $self->{column}++;
3515 $self->{nc}
3516 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3517 } else {
3518 $self->{set_nc}->($self);
3519 }
3520
3521 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3522 redo A;
3523 } elsif ($self->{nc} == -1) {
3524 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3525
3526 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3527 $self->{state} = DATA_STATE;
3528 $self->{s_kwd} = '';
3529 $self->{ct}->{quirks} = 1;
3530 } else {
3531
3532 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3533 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3534 }
3535
3536 ## reconsume
3537 return ($self->{ct}); # DOCTYPE
3538 redo A;
3539 } elsif ($self->{is_xml} and
3540 $self->{ct}->{type} == DOCTYPE_TOKEN and
3541 $self->{nc} == 0x005B) { # [
3542
3543 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3544 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3545 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3546 $self->{in_subset} = 1;
3547
3548 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3549 $self->{line_prev} = $self->{line};
3550 $self->{column_prev} = $self->{column};
3551 $self->{column}++;
3552 $self->{nc}
3553 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3554 } else {
3555 $self->{set_nc}->($self);
3556 }
3557
3558 return ($self->{ct}); # DOCTYPE
3559 redo A;
3560 } else {
3561 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3562
3563 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3564
3565 $self->{ct}->{quirks} = 1;
3566 $self->{state} = BOGUS_DOCTYPE_STATE;
3567 } else {
3568
3569 $self->{state} = BOGUS_MD_STATE;
3570 }
3571
3572
3573 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3574 $self->{line_prev} = $self->{line};
3575 $self->{column_prev} = $self->{column};
3576 $self->{column}++;
3577 $self->{nc}
3578 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3579 } else {
3580 $self->{set_nc}->($self);
3581 }
3582
3583 redo A;
3584 }
3585 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3586 if ($self->{nc} == 0x0022) { # "
3587
3588 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3589
3590 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3591 $self->{line_prev} = $self->{line};
3592 $self->{column_prev} = $self->{column};
3593 $self->{column}++;
3594 $self->{nc}
3595 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3596 } else {
3597 $self->{set_nc}->($self);
3598 }
3599
3600 redo A;
3601 } elsif ($self->{nc} == 0x003E) { # >
3602 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3603
3604 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3605
3606 $self->{state} = DATA_STATE;
3607 $self->{s_kwd} = '';
3608 $self->{ct}->{quirks} = 1;
3609 } else {
3610
3611 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3612 }
3613
3614
3615 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3616 $self->{line_prev} = $self->{line};
3617 $self->{column_prev} = $self->{column};
3618 $self->{column}++;
3619 $self->{nc}
3620 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3621 } else {
3622 $self->{set_nc}->($self);
3623 }
3624
3625 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3626 redo A;
3627 } elsif ($self->{nc} == -1) {
3628 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3629
3630 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3631
3632 $self->{state} = DATA_STATE;
3633 $self->{s_kwd} = '';
3634 $self->{ct}->{quirks} = 1;
3635 } else {
3636
3637 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3638 }
3639
3640 ## Reconsume.
3641 return ($self->{ct}); # DOCTYPE
3642 redo A;
3643 } else {
3644
3645 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3646 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3647 length $self->{ct}->{pubid});
3648
3649 ## Stay in the state
3650
3651 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3652 $self->{line_prev} = $self->{line};
3653 $self->{column_prev} = $self->{column};
3654 $self->{column}++;
3655 $self->{nc}
3656 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3657 } else {
3658 $self->{set_nc}->($self);
3659 }
3660
3661 redo A;
3662 }
3663 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3664 if ($self->{nc} == 0x0027) { # '
3665
3666 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3667
3668 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3669 $self->{line_prev} = $self->{line};
3670 $self->{column_prev} = $self->{column};
3671 $self->{column}++;
3672 $self->{nc}
3673 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3674 } else {
3675 $self->{set_nc}->($self);
3676 }
3677
3678 redo A;
3679 } elsif ($self->{nc} == 0x003E) { # >
3680 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3681
3682 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3683
3684 $self->{state} = DATA_STATE;
3685 $self->{s_kwd} = '';
3686 $self->{ct}->{quirks} = 1;
3687 } else {
3688
3689 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3690 }
3691
3692
3693 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3694 $self->{line_prev} = $self->{line};
3695 $self->{column_prev} = $self->{column};
3696 $self->{column}++;
3697 $self->{nc}
3698 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3699 } else {
3700 $self->{set_nc}->($self);
3701 }
3702
3703 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3704 redo A;
3705 } elsif ($self->{nc} == -1) {
3706 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3707
3708 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3709
3710 $self->{state} = DATA_STATE;
3711 $self->{s_kwd} = '';
3712 $self->{ct}->{quirks} = 1;
3713 } else {
3714
3715 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3716 }
3717
3718 ## reconsume
3719 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3720 redo A;
3721 } else {
3722
3723 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3724 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3725 length $self->{ct}->{pubid});
3726
3727 ## Stay in the state
3728
3729 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3730 $self->{line_prev} = $self->{line};
3731 $self->{column_prev} = $self->{column};
3732 $self->{column}++;
3733 $self->{nc}
3734 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3735 } else {
3736 $self->{set_nc}->($self);
3737 }
3738
3739 redo A;
3740 }
3741 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3742 if ($is_space->{$self->{nc}}) {
3743
3744 ## Stay in the state
3745
3746 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3747 $self->{line_prev} = $self->{line};
3748 $self->{column_prev} = $self->{column};
3749 $self->{column}++;
3750 $self->{nc}
3751 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3752 } else {
3753 $self->{set_nc}->($self);
3754 }
3755
3756 redo A;
3757 } elsif ($self->{nc} == 0x0022) { # "
3758
3759 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3760 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3761
3762 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3763 $self->{line_prev} = $self->{line};
3764 $self->{column_prev} = $self->{column};
3765 $self->{column}++;
3766 $self->{nc}
3767 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3768 } else {
3769 $self->{set_nc}->($self);
3770 }
3771
3772 redo A;
3773 } elsif ($self->{nc} == 0x0027) { # '
3774
3775 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3776 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3777
3778 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3779 $self->{line_prev} = $self->{line};
3780 $self->{column_prev} = $self->{column};
3781 $self->{column}++;
3782 $self->{nc}
3783 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3784 } else {
3785 $self->{set_nc}->($self);
3786 }
3787
3788 redo A;
3789 } elsif ($self->{nc} == 0x003E) { # >
3790 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3791 if ($self->{is_xml}) {
3792
3793 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3794 } else {
3795
3796 }
3797 $self->{state} = DATA_STATE;
3798 $self->{s_kwd} = '';
3799 } else {
3800 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3801
3802 } else {
3803
3804 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3805 }
3806 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3807 }
3808
3809
3810 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3811 $self->{line_prev} = $self->{line};
3812 $self->{column_prev} = $self->{column};
3813 $self->{column}++;
3814 $self->{nc}
3815 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3816 } else {
3817 $self->{set_nc}->($self);
3818 }
3819
3820 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3821 redo A;
3822 } elsif ($self->{nc} == -1) {
3823 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3824
3825 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3826
3827 $self->{state} = DATA_STATE;
3828 $self->{s_kwd} = '';
3829 $self->{ct}->{quirks} = 1;
3830 } else {
3831 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3832 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3833 }
3834
3835 ## reconsume
3836 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3837 redo A;
3838 } elsif ($self->{is_xml} and
3839 $self->{ct}->{type} == DOCTYPE_TOKEN and
3840 $self->{nc} == 0x005B) { # [
3841
3842 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3843 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3844 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3845 $self->{in_subset} = 1;
3846
3847 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3848 $self->{line_prev} = $self->{line};
3849 $self->{column_prev} = $self->{column};
3850 $self->{column}++;
3851 $self->{nc}
3852 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3853 } else {
3854 $self->{set_nc}->($self);
3855 }
3856
3857 return ($self->{ct}); # DOCTYPE
3858 redo A;
3859 } else {
3860 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3861
3862 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3863
3864 $self->{ct}->{quirks} = 1;
3865 $self->{state} = BOGUS_DOCTYPE_STATE;
3866 } else {
3867
3868 $self->{state} = BOGUS_MD_STATE;
3869 }
3870
3871
3872 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3873 $self->{line_prev} = $self->{line};
3874 $self->{column_prev} = $self->{column};
3875 $self->{column}++;
3876 $self->{nc}
3877 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3878 } else {
3879 $self->{set_nc}->($self);
3880 }
3881
3882 redo A;
3883 }
3884 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3885 if ($is_space->{$self->{nc}}) {
3886
3887 ## Stay in the state
3888
3889 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3890 $self->{line_prev} = $self->{line};
3891 $self->{column_prev} = $self->{column};
3892 $self->{column}++;
3893 $self->{nc}
3894 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3895 } else {
3896 $self->{set_nc}->($self);
3897 }
3898
3899 redo A;
3900 } elsif ($self->{nc} == 0x0022) { # "
3901
3902 $self->{ct}->{sysid} = ''; # DOCTYPE
3903 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3904
3905 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3906 $self->{line_prev} = $self->{line};
3907 $self->{column_prev} = $self->{column};
3908 $self->{column}++;
3909 $self->{nc}
3910 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3911 } else {
3912 $self->{set_nc}->($self);
3913 }
3914
3915 redo A;
3916 } elsif ($self->{nc} == 0x0027) { # '
3917
3918 $self->{ct}->{sysid} = ''; # DOCTYPE
3919 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3920
3921 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3922 $self->{line_prev} = $self->{line};
3923 $self->{column_prev} = $self->{column};
3924 $self->{column}++;
3925 $self->{nc}
3926 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3927 } else {
3928 $self->{set_nc}->($self);
3929 }
3930
3931 redo A;
3932 } elsif ($self->{nc} == 0x003E) { # >
3933 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3934
3935 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3936 $self->{line_prev} = $self->{line};
3937 $self->{column_prev} = $self->{column};
3938 $self->{column}++;
3939 $self->{nc}
3940 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3941 } else {
3942 $self->{set_nc}->($self);
3943 }
3944
3945
3946 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3947
3948 $self->{state} = DATA_STATE;
3949 $self->{s_kwd} = '';
3950 $self->{ct}->{quirks} = 1;
3951 } else {
3952
3953 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3954 }
3955
3956 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3957 redo A;
3958 } elsif ($self->{nc} == -1) {
3959 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3960
3961 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3962 $self->{state} = DATA_STATE;
3963 $self->{s_kwd} = '';
3964 $self->{ct}->{quirks} = 1;
3965 } else {
3966
3967 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3968 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3969 }
3970
3971 ## reconsume
3972 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3973 redo A;
3974 } elsif ($self->{is_xml} and
3975 $self->{ct}->{type} == DOCTYPE_TOKEN and
3976 $self->{nc} == 0x005B) { # [
3977
3978 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3979
3980 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3981 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3982 $self->{in_subset} = 1;
3983
3984 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3985 $self->{line_prev} = $self->{line};
3986 $self->{column_prev} = $self->{column};
3987 $self->{column}++;
3988 $self->{nc}
3989 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3990 } else {
3991 $self->{set_nc}->($self);
3992 }
3993
3994 return ($self->{ct}); # DOCTYPE
3995 redo A;
3996 } else {
3997 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3998
3999 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4000
4001 $self->{ct}->{quirks} = 1;
4002 $self->{state} = BOGUS_DOCTYPE_STATE;
4003 } else {
4004
4005 $self->{state} = BOGUS_MD_STATE;
4006 }
4007
4008
4009 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4010 $self->{line_prev} = $self->{line};
4011 $self->{column_prev} = $self->{column};
4012 $self->{column}++;
4013 $self->{nc}
4014 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4015 } else {
4016 $self->{set_nc}->($self);
4017 }
4018
4019 redo A;
4020 }
4021 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4022 if ($self->{nc} == 0x0022) { # "
4023
4024 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4025
4026 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4027 $self->{line_prev} = $self->{line};
4028 $self->{column_prev} = $self->{column};
4029 $self->{column}++;
4030 $self->{nc}
4031 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4032 } else {
4033 $self->{set_nc}->($self);
4034 }
4035
4036 redo A;
4037 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4038 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4039
4040 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4041
4042 $self->{state} = DATA_STATE;
4043 $self->{s_kwd} = '';
4044 $self->{ct}->{quirks} = 1;
4045 } else {
4046
4047 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4048 }
4049
4050
4051 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4052 $self->{line_prev} = $self->{line};
4053 $self->{column_prev} = $self->{column};
4054 $self->{column}++;
4055 $self->{nc}
4056 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4057 } else {
4058 $self->{set_nc}->($self);
4059 }
4060
4061 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4062 redo A;
4063 } elsif ($self->{nc} == -1) {
4064 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4065
4066 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4067
4068 $self->{state} = DATA_STATE;
4069 $self->{s_kwd} = '';
4070 $self->{ct}->{quirks} = 1;
4071 } else {
4072
4073 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4074 }
4075
4076 ## reconsume
4077 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4078 redo A;
4079 } else {
4080
4081 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4082 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4083 length $self->{ct}->{sysid});
4084
4085 ## Stay in the state
4086
4087 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4088 $self->{line_prev} = $self->{line};
4089 $self->{column_prev} = $self->{column};
4090 $self->{column}++;
4091 $self->{nc}
4092 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4093 } else {
4094 $self->{set_nc}->($self);
4095 }
4096
4097 redo A;
4098 }
4099 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4100 if ($self->{nc} == 0x0027) { # '
4101
4102 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4103
4104 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4105 $self->{line_prev} = $self->{line};
4106 $self->{column_prev} = $self->{column};
4107 $self->{column}++;
4108 $self->{nc}
4109 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4110 } else {
4111 $self->{set_nc}->($self);
4112 }
4113
4114 redo A;
4115 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4116
4117 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4118
4119 $self->{state} = DATA_STATE;
4120 $self->{s_kwd} = '';
4121
4122 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4123 $self->{line_prev} = $self->{line};
4124 $self->{column_prev} = $self->{column};
4125 $self->{column}++;
4126 $self->{nc}
4127 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4128 } else {
4129 $self->{set_nc}->($self);
4130 }
4131
4132
4133 $self->{ct}->{quirks} = 1;
4134 return ($self->{ct}); # DOCTYPE
4135
4136 redo A;
4137 } elsif ($self->{nc} == -1) {
4138 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4139
4140 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4141
4142 $self->{state} = DATA_STATE;
4143 $self->{s_kwd} = '';
4144 $self->{ct}->{quirks} = 1;
4145 } else {
4146
4147 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4148 }
4149
4150 ## reconsume
4151 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4152 redo A;
4153 } else {
4154
4155 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4156 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4157 length $self->{ct}->{sysid});
4158
4159 ## Stay in the state
4160
4161 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4162 $self->{line_prev} = $self->{line};
4163 $self->{column_prev} = $self->{column};
4164 $self->{column}++;
4165 $self->{nc}
4166 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4167 } else {
4168 $self->{set_nc}->($self);
4169 }
4170
4171 redo A;
4172 }
4173 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4174 if ($is_space->{$self->{nc}}) {
4175
4176 ## Stay in the state
4177
4178 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4179 $self->{line_prev} = $self->{line};
4180 $self->{column_prev} = $self->{column};
4181 $self->{column}++;
4182 $self->{nc}
4183 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4184 } else {
4185 $self->{set_nc}->($self);
4186 }
4187
4188 redo A;
4189 } elsif ($self->{nc} == 0x003E) { # >
4190 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4191
4192 $self->{state} = DATA_STATE;
4193 $self->{s_kwd} = '';
4194 } else {
4195
4196 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4197 }
4198
4199
4200 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4201 $self->{line_prev} = $self->{line};
4202 $self->{column_prev} = $self->{column};
4203 $self->{column}++;
4204 $self->{nc}
4205 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4206 } else {
4207 $self->{set_nc}->($self);
4208 }
4209
4210 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4211 redo A;
4212 ## TODO: "NDATA"
4213 } elsif ($self->{nc} == -1) {
4214 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4215
4216 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4217 $self->{state} = DATA_STATE;
4218 $self->{s_kwd} = '';
4219 $self->{ct}->{quirks} = 1;
4220 } else {
4221
4222 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4223 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4224 }
4225
4226 ## reconsume
4227 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4228 redo A;
4229 } elsif ($self->{is_xml} and
4230 $self->{ct}->{type} == DOCTYPE_TOKEN and
4231 $self->{nc} == 0x005B) { # [
4232
4233 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4234 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4235 $self->{in_subset} = 1;
4236
4237 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4238 $self->{line_prev} = $self->{line};
4239 $self->{column_prev} = $self->{column};
4240 $self->{column}++;
4241 $self->{nc}
4242 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4243 } else {
4244 $self->{set_nc}->($self);
4245 }
4246
4247 return ($self->{ct}); # DOCTYPE
4248 redo A;
4249 } else {
4250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4251
4252 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4253
4254 #$self->{ct}->{quirks} = 1;
4255 $self->{state} = BOGUS_DOCTYPE_STATE;
4256 } else {
4257
4258 $self->{state} = BOGUS_MD_STATE;
4259 }
4260
4261
4262 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4263 $self->{line_prev} = $self->{line};
4264 $self->{column_prev} = $self->{column};
4265 $self->{column}++;
4266 $self->{nc}
4267 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4268 } else {
4269 $self->{set_nc}->($self);
4270 }
4271
4272 redo A;
4273 }
4274 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4275 if ($self->{nc} == 0x003E) { # >
4276
4277 $self->{state} = DATA_STATE;
4278 $self->{s_kwd} = '';
4279
4280 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4281 $self->{line_prev} = $self->{line};
4282 $self->{column_prev} = $self->{column};
4283 $self->{column}++;
4284 $self->{nc}
4285 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4286 } else {
4287 $self->{set_nc}->($self);
4288 }
4289
4290
4291 return ($self->{ct}); # DOCTYPE
4292
4293 redo A;
4294 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4295
4296 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4297 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4298 $self->{in_subset} = 1;
4299
4300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4301 $self->{line_prev} = $self->{line};
4302 $self->{column_prev} = $self->{column};
4303 $self->{column}++;
4304 $self->{nc}
4305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4306 } else {
4307 $self->{set_nc}->($self);
4308 }
4309
4310 return ($self->{ct}); # DOCTYPE
4311 redo A;
4312 } elsif ($self->{nc} == -1) {
4313
4314 $self->{state} = DATA_STATE;
4315 $self->{s_kwd} = '';
4316 ## reconsume
4317
4318 return ($self->{ct}); # DOCTYPE
4319
4320 redo A;
4321 } else {
4322
4323 my $s = '';
4324 $self->{read_until}->($s, q{>[}, 0);
4325
4326 ## Stay in the state
4327
4328 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4329 $self->{line_prev} = $self->{line};
4330 $self->{column_prev} = $self->{column};
4331 $self->{column}++;
4332 $self->{nc}
4333 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4334 } else {
4335 $self->{set_nc}->($self);
4336 }
4337
4338 redo A;
4339 }
4340 } elsif ($self->{state} == CDATA_SECTION_STATE) {
4341 ## NOTE: "CDATA section state" in the spec is jointly implemented
4342 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4343 ## and |CDATA_SECTION_MSE2_STATE|.
4344
4345 ## XML5: "CDATA state".
4346
4347 if ($self->{nc} == 0x005D) { # ]
4348
4349 $self->{state} = CDATA_SECTION_MSE1_STATE;
4350
4351 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4352 $self->{line_prev} = $self->{line};
4353 $self->{column_prev} = $self->{column};
4354 $self->{column}++;
4355 $self->{nc}
4356 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4357 } else {
4358 $self->{set_nc}->($self);
4359 }
4360
4361 redo A;
4362 } elsif ($self->{nc} == -1) {
4363 if ($self->{is_xml}) {
4364
4365 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4366 } else {
4367
4368 }
4369
4370 $self->{state} = DATA_STATE;
4371 $self->{s_kwd} = '';
4372 ## Reconsume.
4373 if (length $self->{ct}->{data}) { # character
4374
4375 return ($self->{ct}); # character
4376 } else {
4377
4378 ## No token to emit. $self->{ct} is discarded.
4379 }
4380 redo A;
4381 } else {
4382
4383 $self->{ct}->{data} .= chr $self->{nc};
4384 $self->{read_until}->($self->{ct}->{data},
4385 q<]>,
4386 length $self->{ct}->{data});
4387
4388 ## Stay in the state.
4389
4390 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4391 $self->{line_prev} = $self->{line};
4392 $self->{column_prev} = $self->{column};
4393 $self->{column}++;
4394 $self->{nc}
4395 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4396 } else {
4397 $self->{set_nc}->($self);
4398 }
4399
4400 redo A;
4401 }
4402
4403 ## ISSUE: "text tokens" in spec.
4404 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4405 ## XML5: "CDATA bracket state".
4406
4407 if ($self->{nc} == 0x005D) { # ]
4408
4409 $self->{state} = CDATA_SECTION_MSE2_STATE;
4410
4411 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4412 $self->{line_prev} = $self->{line};
4413 $self->{column_prev} = $self->{column};
4414 $self->{column}++;
4415 $self->{nc}
4416 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4417 } else {
4418 $self->{set_nc}->($self);
4419 }
4420
4421 redo A;
4422 } else {
4423
4424 ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
4425 $self->{ct}->{data} .= ']';
4426 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4427 ## Reconsume.
4428 redo A;
4429 }
4430 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4431 ## XML5: "CDATA end state".
4432
4433 if ($self->{nc} == 0x003E) { # >
4434 $self->{state} = DATA_STATE;
4435 $self->{s_kwd} = '';
4436
4437 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4438 $self->{line_prev} = $self->{line};
4439 $self->{column_prev} = $self->{column};
4440 $self->{column}++;
4441 $self->{nc}
4442 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4443 } else {
4444 $self->{set_nc}->($self);
4445 }
4446
4447 if (length $self->{ct}->{data}) { # character
4448
4449 return ($self->{ct}); # character
4450 } else {
4451
4452 ## No token to emit. $self->{ct} is discarded.
4453 }
4454 redo A;
4455 } elsif ($self->{nc} == 0x005D) { # ]
4456 # character
4457 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4458 ## Stay in the state.
4459
4460 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4461 $self->{line_prev} = $self->{line};
4462 $self->{column_prev} = $self->{column};
4463 $self->{column}++;
4464 $self->{nc}
4465 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4466 } else {
4467 $self->{set_nc}->($self);
4468 }
4469
4470 redo A;
4471 } else {
4472
4473 $self->{ct}->{data} .= ']]'; # character
4474 $self->{state} = CDATA_SECTION_STATE;
4475 ## Reconsume. ## XML5: Emit.
4476 redo A;
4477 }
4478 } elsif ($self->{state} == ENTITY_STATE) {
4479 if ($is_space->{$self->{nc}} or
4480 {
4481 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4482 $self->{entity_add} => 1,
4483 }->{$self->{nc}}) {
4484
4485 ## Don't consume
4486 ## No error
4487 ## Return nothing.
4488 #
4489 } elsif ($self->{nc} == 0x0023) { # #
4490
4491 $self->{state} = ENTITY_HASH_STATE;
4492 $self->{kwd} = '#';
4493
4494 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4495 $self->{line_prev} = $self->{line};
4496 $self->{column_prev} = $self->{column};
4497 $self->{column}++;
4498 $self->{nc}
4499 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4500 } else {
4501 $self->{set_nc}->($self);
4502 }
4503
4504 redo A;
4505 } elsif ((0x0041 <= $self->{nc} and
4506 $self->{nc} <= 0x005A) or # A..Z
4507 (0x0061 <= $self->{nc} and
4508 $self->{nc} <= 0x007A)) { # a..z
4509
4510 require Whatpm::_NamedEntityList;
4511 $self->{state} = ENTITY_NAME_STATE;
4512 $self->{kwd} = chr $self->{nc};
4513 $self->{entity__value} = $self->{kwd};
4514 $self->{entity__match} = 0;
4515
4516 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4517 $self->{line_prev} = $self->{line};
4518 $self->{column_prev} = $self->{column};
4519 $self->{column}++;
4520 $self->{nc}
4521 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4522 } else {
4523 $self->{set_nc}->($self);
4524 }
4525
4526 redo A;
4527 } else {
4528
4529 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4530 ## Return nothing.
4531 #
4532 }
4533
4534 ## NOTE: No character is consumed by the "consume a character
4535 ## reference" algorithm. In other words, there is an "&" character
4536 ## that does not introduce a character reference, which would be
4537 ## appended to the parent element or the attribute value in later
4538 ## processing by the tokenizer.
4539
4540 if ($self->{prev_state} == DATA_STATE) {
4541
4542 $self->{state} = $self->{prev_state};
4543 $self->{s_kwd} = '';
4544 ## Reconsume.
4545 return ({type => CHARACTER_TOKEN, data => '&',
4546 line => $self->{line_prev},
4547 column => $self->{column_prev},
4548 });
4549 redo A;
4550 } else {
4551
4552 $self->{ca}->{value} .= '&';
4553 $self->{state} = $self->{prev_state};
4554 $self->{s_kwd} = '';
4555 ## Reconsume.
4556 redo A;
4557 }
4558 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4559 if ($self->{nc} == 0x0078 or # x
4560 $self->{nc} == 0x0058) { # X
4561
4562 $self->{state} = HEXREF_X_STATE;
4563 $self->{kwd} .= chr $self->{nc};
4564
4565 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4566 $self->{line_prev} = $self->{line};
4567 $self->{column_prev} = $self->{column};
4568 $self->{column}++;
4569 $self->{nc}
4570 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4571 } else {
4572 $self->{set_nc}->($self);
4573 }
4574
4575 redo A;
4576 } elsif (0x0030 <= $self->{nc} and
4577 $self->{nc} <= 0x0039) { # 0..9
4578
4579 $self->{state} = NCR_NUM_STATE;
4580 $self->{kwd} = $self->{nc} - 0x0030;
4581
4582 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4583 $self->{line_prev} = $self->{line};
4584 $self->{column_prev} = $self->{column};
4585 $self->{column}++;
4586 $self->{nc}
4587 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4588 } else {
4589 $self->{set_nc}->($self);
4590 }
4591
4592 redo A;
4593 } else {
4594 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4595 line => $self->{line_prev},
4596 column => $self->{column_prev} - 1);
4597
4598 ## NOTE: According to the spec algorithm, nothing is returned,
4599 ## and then "&#" is appended to the parent element or the attribute
4600 ## value in the later processing.
4601
4602 if ($self->{prev_state} == DATA_STATE) {
4603
4604 $self->{state} = $self->{prev_state};
4605 $self->{s_kwd} = '';
4606 ## Reconsume.
4607 return ({type => CHARACTER_TOKEN,
4608 data => '&#',
4609 line => $self->{line_prev},
4610 column => $self->{column_prev} - 1,
4611 });
4612 redo A;
4613 } else {
4614
4615 $self->{ca}->{value} .= '&#';
4616 $self->{state} = $self->{prev_state};
4617 $self->{s_kwd} = '';
4618 ## Reconsume.
4619 redo A;
4620 }
4621 }
4622 } elsif ($self->{state} == NCR_NUM_STATE) {
4623 if (0x0030 <= $self->{nc} and
4624 $self->{nc} <= 0x0039) { # 0..9
4625
4626 $self->{kwd} *= 10;
4627 $self->{kwd} += $self->{nc} - 0x0030;
4628
4629 ## Stay in the state.
4630
4631 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4632 $self->{line_prev} = $self->{line};
4633 $self->{column_prev} = $self->{column};
4634 $self->{column}++;
4635 $self->{nc}
4636 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4637 } else {
4638 $self->{set_nc}->($self);
4639 }
4640
4641 redo A;
4642 } elsif ($self->{nc} == 0x003B) { # ;
4643
4644
4645 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4646 $self->{line_prev} = $self->{line};
4647 $self->{column_prev} = $self->{column};
4648 $self->{column}++;
4649 $self->{nc}
4650 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4651 } else {
4652 $self->{set_nc}->($self);
4653 }
4654
4655 #
4656 } else {
4657
4658 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4659 ## Reconsume.
4660 #
4661 }
4662
4663 my $code = $self->{kwd};
4664 my $l = $self->{line_prev};
4665 my $c = $self->{column_prev};
4666 if ($charref_map->{$code}) {
4667
4668 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4669 text => (sprintf 'U+%04X', $code),
4670 line => $l, column => $c);
4671 $code = $charref_map->{$code};
4672 } elsif ($code > 0x10FFFF) {
4673
4674 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4675 text => (sprintf 'U-%08X', $code),
4676 line => $l, column => $c);
4677 $code = 0xFFFD;
4678 }
4679
4680 if ($self->{prev_state} == DATA_STATE) {
4681
4682 $self->{state} = $self->{prev_state};
4683 $self->{s_kwd} = '';
4684 ## Reconsume.
4685 return ({type => CHARACTER_TOKEN, data => chr $code,
4686 has_reference => 1,
4687 line => $l, column => $c,
4688 });
4689 redo A;
4690 } else {
4691
4692 $self->{ca}->{value} .= chr $code;
4693 $self->{ca}->{has_reference} = 1;
4694 $self->{state} = $self->{prev_state};
4695 $self->{s_kwd} = '';
4696 ## Reconsume.
4697 redo A;
4698 }
4699 } elsif ($self->{state} == HEXREF_X_STATE) {
4700 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4701 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4702 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4703 # 0..9, A..F, a..f
4704
4705 $self->{state} = HEXREF_HEX_STATE;
4706 $self->{kwd} = 0;
4707 ## Reconsume.
4708 redo A;
4709 } else {
4710 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4711 line => $self->{line_prev},
4712 column => $self->{column_prev} - 2);
4713
4714 ## NOTE: According to the spec algorithm, nothing is returned,
4715 ## and then "&#" followed by "X" or "x" is appended to the parent
4716 ## element or the attribute value in the later processing.
4717
4718 if ($self->{prev_state} == DATA_STATE) {
4719
4720 $self->{state} = $self->{prev_state};
4721 $self->{s_kwd} = '';
4722 ## Reconsume.
4723 return ({type => CHARACTER_TOKEN,
4724 data => '&' . $self->{kwd},
4725 line => $self->{line_prev},
4726 column => $self->{column_prev} - length $self->{kwd},
4727 });
4728 redo A;
4729 } else {
4730
4731 $self->{ca}->{value} .= '&' . $self->{kwd};
4732 $self->{state} = $self->{prev_state};
4733 $self->{s_kwd} = '';
4734 ## Reconsume.
4735 redo A;
4736 }
4737 }
4738 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4739 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4740 # 0..9
4741
4742 $self->{kwd} *= 0x10;
4743 $self->{kwd} += $self->{nc} - 0x0030;
4744 ## Stay in the state.
4745
4746 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4747 $self->{line_prev} = $self->{line};
4748 $self->{column_prev} = $self->{column};
4749 $self->{column}++;
4750 $self->{nc}
4751 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4752 } else {
4753 $self->{set_nc}->($self);
4754 }
4755
4756 redo A;
4757 } elsif (0x0061 <= $self->{nc} and
4758 $self->{nc} <= 0x0066) { # a..f
4759
4760 $self->{kwd} *= 0x10;
4761 $self->{kwd} += $self->{nc} - 0x0060 + 9;
4762 ## Stay in the state.
4763
4764 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4765 $self->{line_prev} = $self->{line};
4766 $self->{column_prev} = $self->{column};
4767 $self->{column}++;
4768 $self->{nc}
4769 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4770 } else {
4771 $self->{set_nc}->($self);
4772 }
4773
4774 redo A;
4775 } elsif (0x0041 <= $self->{nc} and
4776 $self->{nc} <= 0x0046) { # A..F
4777
4778 $self->{kwd} *= 0x10;
4779 $self->{kwd} += $self->{nc} - 0x0040 + 9;
4780 ## Stay in the state.
4781
4782 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4783 $self->{line_prev} = $self->{line};
4784 $self->{column_prev} = $self->{column};
4785 $self->{column}++;
4786 $self->{nc}
4787 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4788 } else {
4789 $self->{set_nc}->($self);
4790 }
4791
4792 redo A;
4793 } elsif ($self->{nc} == 0x003B) { # ;
4794
4795
4796 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4797 $self->{line_prev} = $self->{line};
4798 $self->{column_prev} = $self->{column};
4799 $self->{column}++;
4800 $self->{nc}
4801 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4802 } else {
4803 $self->{set_nc}->($self);
4804 }
4805
4806 #
4807 } else {
4808
4809 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4810 line => $self->{line},
4811 column => $self->{column});
4812 ## Reconsume.
4813 #
4814 }
4815
4816 my $code = $self->{kwd};
4817 my $l = $self->{line_prev};
4818 my $c = $self->{column_prev};
4819 if ($charref_map->{$code}) {
4820
4821 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4822 text => (sprintf 'U+%04X', $code),
4823 line => $l, column => $c);
4824 $code = $charref_map->{$code};
4825 } elsif ($code > 0x10FFFF) {
4826
4827 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4828 text => (sprintf 'U-%08X', $code),
4829 line => $l, column => $c);
4830 $code = 0xFFFD;
4831 }
4832
4833 if ($self->{prev_state} == DATA_STATE) {
4834
4835 $self->{state} = $self->{prev_state};
4836 $self->{s_kwd} = '';
4837 ## Reconsume.
4838 return ({type => CHARACTER_TOKEN, data => chr $code,
4839 has_reference => 1,
4840 line => $l, column => $c,
4841 });
4842 redo A;
4843 } else {
4844
4845 $self->{ca}->{value} .= chr $code;
4846 $self->{ca}->{has_reference} = 1;
4847 $self->{state} = $self->{prev_state};
4848 $self->{s_kwd} = '';
4849 ## Reconsume.
4850 redo A;
4851 }
4852 } elsif ($self->{state} == ENTITY_NAME_STATE) {
4853 if (length $self->{kwd} < 30 and
4854 ## NOTE: Some number greater than the maximum length of entity name
4855 ((0x0041 <= $self->{nc} and # A
4856 $self->{nc} <= 0x005A) or # Z
4857 (0x0061 <= $self->{nc} and # a
4858 $self->{nc} <= 0x007A) or # z
4859 (0x0030 <= $self->{nc} and # 0
4860 $self->{nc} <= 0x0039) or # 9
4861 $self->{nc} == 0x003B)) { # ;
4862 our $EntityChar;
4863 $self->{kwd} .= chr $self->{nc};
4864 if (defined $EntityChar->{$self->{kwd}}) {
4865 if ($self->{nc} == 0x003B) { # ;
4866
4867 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4868 $self->{entity__match} = 1;
4869
4870 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4871 $self->{line_prev} = $self->{line};
4872 $self->{column_prev} = $self->{column};
4873 $self->{column}++;
4874 $self->{nc}
4875 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4876 } else {
4877 $self->{set_nc}->($self);
4878 }
4879
4880 #
4881 } else {
4882
4883 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4884 $self->{entity__match} = -1;
4885 ## Stay in the state.
4886
4887 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4888 $self->{line_prev} = $self->{line};
4889 $self->{column_prev} = $self->{column};
4890 $self->{column}++;
4891 $self->{nc}
4892 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4893 } else {
4894 $self->{set_nc}->($self);
4895 }
4896
4897 redo A;
4898 }
4899 } else {
4900
4901 $self->{entity__value} .= chr $self->{nc};
4902 $self->{entity__match} *= 2;
4903 ## Stay in the state.
4904
4905 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4906 $self->{line_prev} = $self->{line};
4907 $self->{column_prev} = $self->{column};
4908 $self->{column}++;
4909 $self->{nc}
4910 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4911 } else {
4912 $self->{set_nc}->($self);
4913 }
4914
4915 redo A;
4916 }
4917 }
4918
4919 my $data;
4920 my $has_ref;
4921 if ($self->{entity__match} > 0) {
4922
4923 $data = $self->{entity__value};
4924 $has_ref = 1;
4925 #
4926 } elsif ($self->{entity__match} < 0) {
4927 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4928 if ($self->{prev_state} != DATA_STATE and # in attribute
4929 $self->{entity__match} < -1) {
4930
4931 $data = '&' . $self->{kwd};
4932 #
4933 } else {
4934
4935 $data = $self->{entity__value};
4936 $has_ref = 1;
4937 #
4938 }
4939 } else {
4940
4941 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4942 line => $self->{line_prev},
4943 column => $self->{column_prev} - length $self->{kwd});
4944 $data = '&' . $self->{kwd};
4945 #
4946 }
4947
4948 ## NOTE: In these cases, when a character reference is found,
4949 ## it is consumed and a character token is returned, or, otherwise,
4950 ## nothing is consumed and returned, according to the spec algorithm.
4951 ## In this implementation, anything that has been examined by the
4952 ## tokenizer is appended to the parent element or the attribute value
4953 ## as a string (the literal string when there is no character
4954 ## reference, or the entity-replaced string otherwise) at this stage,
4955 ## since any characters that would not be consumed are appended in the
4956 ## data state or in an appropriate attribute value state anyway.
4957
4958 if ($self->{prev_state} == DATA_STATE) {
4959
4960 $self->{state} = $self->{prev_state};
4961 $self->{s_kwd} = '';
4962 ## Reconsume.
4963 return ({type => CHARACTER_TOKEN,
4964 data => $data,
4965 has_reference => $has_ref,
4966 line => $self->{line_prev},
4967 column => $self->{column_prev} + 1 - length $self->{kwd},
4968 });
4969 redo A;
4970 } else {
4971
4972 $self->{ca}->{value} .= $data;
4973 $self->{ca}->{has_reference} = 1 if $has_ref;
4974 $self->{state} = $self->{prev_state};
4975 $self->{s_kwd} = '';
4976 ## Reconsume.
4977 redo A;
4978 }
4979
4980 ## XML-only states
4981
4982 } elsif ($self->{state} == PI_STATE) {
4983 ## XML5: "Pi state" and "DOCTYPE pi state".
4984
4985 if ($is_space->{$self->{nc}} or
4986 $self->{nc} == 0x003F or # ?
4987 $self->{nc} == -1) {
4988 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
4989 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
4990 ## "DOCTYPE pi state": Parse error, switch to the "data
4991 ## state".
4992 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4993 line => $self->{line_prev},
4994 column => $self->{column_prev}
4995 - 1 * ($self->{nc} != -1));
4996 $self->{state} = BOGUS_COMMENT_STATE;
4997 ## Reconsume.
4998 $self->{ct} = {type => COMMENT_TOKEN,
4999 data => '?',
5000 line => $self->{line_prev},
5001 column => $self->{column_prev}
5002 - 1 * ($self->{nc} != -1),
5003 };
5004 redo A;
5005 } else {
5006 ## XML5: "DOCTYPE pi state": Stay in the state.
5007 $self->{ct} = {type => PI_TOKEN,
5008 target => chr $self->{nc},
5009 data => '',
5010 line => $self->{line_prev},
5011 column => $self->{column_prev} - 1,
5012 };
5013 $self->{state} = PI_TARGET_STATE;
5014
5015 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5016 $self->{line_prev} = $self->{line};
5017 $self->{column_prev} = $self->{column};
5018 $self->{column}++;
5019 $self->{nc}
5020 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5021 } else {
5022 $self->{set_nc}->($self);
5023 }
5024
5025 redo A;
5026 }
5027 } elsif ($self->{state} == PI_TARGET_STATE) {
5028 if ($is_space->{$self->{nc}}) {
5029 $self->{state} = PI_TARGET_AFTER_STATE;
5030
5031 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5032 $self->{line_prev} = $self->{line};
5033 $self->{column_prev} = $self->{column};
5034 $self->{column}++;
5035 $self->{nc}
5036 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5037 } else {
5038 $self->{set_nc}->($self);
5039 }
5040
5041 redo A;
5042 } elsif ($self->{nc} == -1) {
5043 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5044 if ($self->{in_subset}) {
5045 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5046 } else {
5047 $self->{state} = DATA_STATE;
5048 $self->{s_kwd} = '';
5049 }
5050 ## Reconsume.
5051 return ($self->{ct}); # pi
5052 redo A;
5053 } elsif ($self->{nc} == 0x003F) { # ?
5054 $self->{state} = PI_AFTER_STATE;
5055
5056 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5057 $self->{line_prev} = $self->{line};
5058 $self->{column_prev} = $self->{column};
5059 $self->{column}++;
5060 $self->{nc}
5061 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5062 } else {
5063 $self->{set_nc}->($self);
5064 }
5065
5066 redo A;
5067 } else {
5068 ## XML5: typo ("tag name" -> "target")
5069 $self->{ct}->{target} .= chr $self->{nc}; # pi
5070
5071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5072 $self->{line_prev} = $self->{line};
5073 $self->{column_prev} = $self->{column};
5074 $self->{column}++;
5075 $self->{nc}
5076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5077 } else {
5078 $self->{set_nc}->($self);
5079 }
5080
5081 redo A;
5082 }
5083 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5084 if ($is_space->{$self->{nc}}) {
5085 ## Stay in the state.
5086
5087 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5088 $self->{line_prev} = $self->{line};
5089 $self->{column_prev} = $self->{column};
5090 $self->{column}++;
5091 $self->{nc}
5092 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5093 } else {
5094 $self->{set_nc}->($self);
5095 }
5096
5097 redo A;
5098 } else {
5099 $self->{state} = PI_DATA_STATE;
5100 ## Reprocess.
5101 redo A;
5102 }
5103 } elsif ($self->{state} == PI_DATA_STATE) {
5104 if ($self->{nc} == 0x003F) { # ?
5105 $self->{state} = PI_DATA_AFTER_STATE;
5106
5107 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5108 $self->{line_prev} = $self->{line};
5109 $self->{column_prev} = $self->{column};
5110 $self->{column}++;
5111 $self->{nc}
5112 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5113 } else {
5114 $self->{set_nc}->($self);
5115 }
5116
5117 redo A;
5118 } elsif ($self->{nc} == -1) {
5119 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5120 if ($self->{in_subset}) {
5121 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5122 } else {
5123 $self->{state} = DATA_STATE;
5124 $self->{s_kwd} = '';
5125 }
5126 ## Reprocess.
5127 return ($self->{ct}); # pi
5128 redo A;
5129 } else {
5130 $self->{ct}->{data} .= chr $self->{nc}; # pi
5131 $self->{read_until}->($self->{ct}->{data}, q[?],
5132 length $self->{ct}->{data});
5133 ## Stay in the state.
5134
5135 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5136 $self->{line_prev} = $self->{line};
5137 $self->{column_prev} = $self->{column};
5138 $self->{column}++;
5139 $self->{nc}
5140 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5141 } else {
5142 $self->{set_nc}->($self);
5143 }
5144
5145 ## Reprocess.
5146 redo A;
5147 }
5148 } elsif ($self->{state} == PI_AFTER_STATE) {
5149 ## XML5: Part of "Pi after state".
5150
5151 if ($self->{nc} == 0x003E) { # >
5152 if ($self->{in_subset}) {
5153 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5154 } else {
5155 $self->{state} = DATA_STATE;
5156 $self->{s_kwd} = '';
5157 }
5158
5159 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5160 $self->{line_prev} = $self->{line};
5161 $self->{column_prev} = $self->{column};
5162 $self->{column}++;
5163 $self->{nc}
5164 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5165 } else {
5166 $self->{set_nc}->($self);
5167 }
5168
5169 return ($self->{ct}); # pi
5170 redo A;
5171 } elsif ($self->{nc} == 0x003F) { # ?
5172 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5173 line => $self->{line_prev},
5174 column => $self->{column_prev}); ## XML5: no error
5175 $self->{ct}->{data} .= '?';
5176 $self->{state} = PI_DATA_AFTER_STATE;
5177
5178 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5179 $self->{line_prev} = $self->{line};
5180 $self->{column_prev} = $self->{column};
5181 $self->{column}++;
5182 $self->{nc}
5183 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5184 } else {
5185 $self->{set_nc}->($self);
5186 }
5187
5188 redo A;
5189 } else {
5190 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5191 line => $self->{line_prev},
5192 column => $self->{column_prev}
5193 + 1 * ($self->{nc} == -1)); ## XML5: no error
5194 $self->{ct}->{data} .= '?'; ## XML5: not appended
5195 $self->{state} = PI_DATA_STATE;
5196 ## Reprocess.
5197 redo A;
5198 }
5199 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5200 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5201
5202 if ($self->{nc} == 0x003E) { # >
5203 if ($self->{in_subset}) {
5204 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5205 } else {
5206 $self->{state} = DATA_STATE;
5207 $self->{s_kwd} = '';
5208 }
5209
5210 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5211 $self->{line_prev} = $self->{line};
5212 $self->{column_prev} = $self->{column};
5213 $self->{column}++;
5214 $self->{nc}
5215 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5216 } else {
5217 $self->{set_nc}->($self);
5218 }
5219
5220 return ($self->{ct}); # pi
5221 redo A;
5222 } elsif ($self->{nc} == 0x003F) { # ?
5223 $self->{ct}->{data} .= '?';
5224 ## Stay in the state.
5225
5226 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5227 $self->{line_prev} = $self->{line};
5228 $self->{column_prev} = $self->{column};
5229 $self->{column}++;
5230 $self->{nc}
5231 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5232 } else {
5233 $self->{set_nc}->($self);
5234 }
5235
5236 redo A;
5237 } else {
5238 $self->{ct}->{data} .= '?'; ## XML5: not appended
5239 $self->{state} = PI_DATA_STATE;
5240 ## Reprocess.
5241 redo A;
5242 }
5243
5244 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5245 if ($self->{nc} == 0x003C) { # <
5246 $self->{state} = DOCTYPE_TAG_STATE;
5247
5248 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5249 $self->{line_prev} = $self->{line};
5250 $self->{column_prev} = $self->{column};
5251 $self->{column}++;
5252 $self->{nc}
5253 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5254 } else {
5255 $self->{set_nc}->($self);
5256 }
5257
5258 redo A;
5259 } elsif ($self->{nc} == 0x0025) { # %
5260 ## XML5: Not defined yet.
5261
5262 ## TODO:
5263
5264 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5265 $self->{line_prev} = $self->{line};
5266 $self->{column_prev} = $self->{column};
5267 $self->{column}++;
5268 $self->{nc}
5269 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5270 } else {
5271 $self->{set_nc}->($self);
5272 }
5273
5274 redo A;
5275 } elsif ($self->{nc} == 0x005D) { # ]
5276 delete $self->{in_subset};
5277 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5278
5279 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5280 $self->{line_prev} = $self->{line};
5281 $self->{column_prev} = $self->{column};
5282 $self->{column}++;
5283 $self->{nc}
5284 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5285 } else {
5286 $self->{set_nc}->($self);
5287 }
5288
5289 redo A;
5290 } elsif ($is_space->{$self->{nc}}) {
5291 ## Stay in the state.
5292
5293 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5294 $self->{line_prev} = $self->{line};
5295 $self->{column_prev} = $self->{column};
5296 $self->{column}++;
5297 $self->{nc}
5298 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5299 } else {
5300 $self->{set_nc}->($self);
5301 }
5302
5303 redo A;
5304 } elsif ($self->{nc} == -1) {
5305 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5306 delete $self->{in_subset};
5307 $self->{state} = DATA_STATE;
5308 $self->{s_kwd} = '';
5309 ## Reconsume.
5310 return ({type => END_OF_DOCTYPE_TOKEN});
5311 redo A;
5312 } else {
5313 unless ($self->{internal_subset_tainted}) {
5314 ## XML5: No parse error.
5315 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5316 $self->{internal_subset_tainted} = 1;
5317 }
5318 ## Stay in the state.
5319
5320 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5321 $self->{line_prev} = $self->{line};
5322 $self->{column_prev} = $self->{column};
5323 $self->{column}++;
5324 $self->{nc}
5325 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5326 } else {
5327 $self->{set_nc}->($self);
5328 }
5329
5330 redo A;
5331 }
5332 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5333 if ($self->{nc} == 0x003E) { # >
5334 $self->{state} = DATA_STATE;
5335 $self->{s_kwd} = '';
5336
5337 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5338 $self->{line_prev} = $self->{line};
5339 $self->{column_prev} = $self->{column};
5340 $self->{column}++;
5341 $self->{nc}
5342 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5343 } else {
5344 $self->{set_nc}->($self);
5345 }
5346
5347 return ({type => END_OF_DOCTYPE_TOKEN});
5348 redo A;
5349 } elsif ($self->{nc} == -1) {
5350 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5351 $self->{state} = DATA_STATE;
5352 $self->{s_kwd} = '';
5353 ## Reconsume.
5354 return ({type => END_OF_DOCTYPE_TOKEN});
5355 redo A;
5356 } else {
5357 ## XML5: No parse error and stay in the state.
5358 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5359
5360 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5361
5362 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5363 $self->{line_prev} = $self->{line};
5364 $self->{column_prev} = $self->{column};
5365 $self->{column}++;
5366 $self->{nc}
5367 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5368 } else {
5369 $self->{set_nc}->($self);
5370 }
5371
5372 redo A;
5373 }
5374 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5375 if ($self->{nc} == 0x003E) { # >
5376 $self->{state} = DATA_STATE;
5377 $self->{s_kwd} = '';
5378
5379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5380 $self->{line_prev} = $self->{line};
5381 $self->{column_prev} = $self->{column};
5382 $self->{column}++;
5383 $self->{nc}
5384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5385 } else {
5386 $self->{set_nc}->($self);
5387 }
5388
5389 return ({type => END_OF_DOCTYPE_TOKEN});
5390 redo A;
5391 } elsif ($self->{nc} == -1) {
5392 $self->{state} = DATA_STATE;
5393 $self->{s_kwd} = '';
5394 ## Reconsume.
5395 return ({type => END_OF_DOCTYPE_TOKEN});
5396 redo A;
5397 } else {
5398 ## Stay in the state.
5399
5400 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5401 $self->{line_prev} = $self->{line};
5402 $self->{column_prev} = $self->{column};
5403 $self->{column}++;
5404 $self->{nc}
5405 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5406 } else {
5407 $self->{set_nc}->($self);
5408 }
5409
5410 redo A;
5411 }
5412 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5413 if ($self->{nc} == 0x0021) { # !
5414 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5415
5416 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5417 $self->{line_prev} = $self->{line};
5418 $self->{column_prev} = $self->{column};
5419 $self->{column}++;
5420 $self->{nc}
5421 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5422 } else {
5423 $self->{set_nc}->($self);
5424 }
5425
5426 redo A;
5427 } elsif ($self->{nc} == 0x003F) { # ?
5428 $self->{state} = PI_STATE;
5429
5430 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5431 $self->{line_prev} = $self->{line};
5432 $self->{column_prev} = $self->{column};
5433 $self->{column}++;
5434 $self->{nc}
5435 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5436 } else {
5437 $self->{set_nc}->($self);
5438 }
5439
5440 redo A;
5441 } elsif ($self->{nc} == -1) {
5442 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5443 $self->{state} = DATA_STATE;
5444 $self->{s_kwd} = '';
5445 ## Reconsume.
5446 redo A;
5447 } else {
5448 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5449 line => $self->{line_prev},
5450 column => $self->{column_prev});
5451 $self->{state} = BOGUS_COMMENT_STATE;
5452 $self->{ct} = {type => COMMENT_TOKEN,
5453 data => '',
5454 }; ## NOTE: Will be discarded.
5455
5456 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5457 $self->{line_prev} = $self->{line};
5458 $self->{column_prev} = $self->{column};
5459 $self->{column}++;
5460 $self->{nc}
5461 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5462 } else {
5463 $self->{set_nc}->($self);
5464 }
5465
5466 redo A;
5467 }
5468 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5469 ## XML5: "DOCTYPE markup declaration state".
5470
5471 if ($self->{nc} == 0x002D) { # -
5472 $self->{state} = MD_HYPHEN_STATE;
5473
5474 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5475 $self->{line_prev} = $self->{line};
5476 $self->{column_prev} = $self->{column};
5477 $self->{column}++;
5478 $self->{nc}
5479 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5480 } else {
5481 $self->{set_nc}->($self);
5482 }
5483
5484 redo A;
5485 } elsif ($self->{nc} == 0x0045 or # E
5486 $self->{nc} == 0x0065) { # e
5487 $self->{state} = MD_E_STATE;
5488 $self->{kwd} = chr $self->{nc};
5489
5490 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5491 $self->{line_prev} = $self->{line};
5492 $self->{column_prev} = $self->{column};
5493 $self->{column}++;
5494 $self->{nc}
5495 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5496 } else {
5497 $self->{set_nc}->($self);
5498 }
5499
5500 redo A;
5501 } elsif ($self->{nc} == 0x0041 or # A
5502 $self->{nc} == 0x0061) { # a
5503 $self->{state} = MD_ATTLIST_STATE;
5504 $self->{kwd} = chr $self->{nc};
5505
5506 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5507 $self->{line_prev} = $self->{line};
5508 $self->{column_prev} = $self->{column};
5509 $self->{column}++;
5510 $self->{nc}
5511 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5512 } else {
5513 $self->{set_nc}->($self);
5514 }
5515
5516 redo A;
5517 } elsif ($self->{nc} == 0x004E or # N
5518 $self->{nc} == 0x006E) { # n
5519 $self->{state} = MD_NOTATION_STATE;
5520 $self->{kwd} = chr $self->{nc};
5521
5522 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5523 $self->{line_prev} = $self->{line};
5524 $self->{column_prev} = $self->{column};
5525 $self->{column}++;
5526 $self->{nc}
5527 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5528 } else {
5529 $self->{set_nc}->($self);
5530 }
5531
5532 redo A;
5533 } else {
5534 #
5535 }
5536
5537 ## XML5: No parse error.
5538 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5539 line => $self->{line_prev},
5540 column => $self->{column_prev} - 1);
5541 ## Reconsume.
5542 $self->{state} = BOGUS_COMMENT_STATE;
5543 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5544 redo A;
5545 } elsif ($self->{state} == MD_E_STATE) {
5546 if ($self->{nc} == 0x004E or # N
5547 $self->{nc} == 0x006E) { # n
5548 $self->{state} = MD_ENTITY_STATE;
5549 $self->{kwd} .= chr $self->{nc};
5550
5551 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5552 $self->{line_prev} = $self->{line};
5553 $self->{column_prev} = $self->{column};
5554 $self->{column}++;
5555 $self->{nc}
5556 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5557 } else {
5558 $self->{set_nc}->($self);
5559 }
5560
5561 redo A;
5562 } elsif ($self->{nc} == 0x004C or # L
5563 $self->{nc} == 0x006C) { # l
5564 ## XML5: <!ELEMENT> not supported.
5565 $self->{state} = MD_ELEMENT_STATE;
5566 $self->{kwd} .= chr $self->{nc};
5567
5568 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5569 $self->{line_prev} = $self->{line};
5570 $self->{column_prev} = $self->{column};
5571 $self->{column}++;
5572 $self->{nc}
5573 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5574 } else {
5575 $self->{set_nc}->($self);
5576 }
5577
5578 redo A;
5579 } else {
5580 ## XML5: No parse error.
5581 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5582 line => $self->{line_prev},
5583 column => $self->{column_prev} - 2
5584 + 1 * ($self->{nc} == -1));
5585 ## Reconsume.
5586 $self->{state} = BOGUS_COMMENT_STATE;
5587 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5588 redo A;
5589 }
5590 } elsif ($self->{state} == MD_ENTITY_STATE) {
5591 if ($self->{nc} == [
5592 undef,
5593 undef,
5594 0x0054, # T
5595 0x0049, # I
5596 0x0054, # T
5597 ]->[length $self->{kwd}] or
5598 $self->{nc} == [
5599 undef,
5600 undef,
5601 0x0074, # t
5602 0x0069, # i
5603 0x0074, # t
5604 ]->[length $self->{kwd}]) {
5605 ## Stay in the state.
5606 $self->{kwd} .= chr $self->{nc};
5607
5608 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5609 $self->{line_prev} = $self->{line};
5610 $self->{column_prev} = $self->{column};
5611 $self->{column}++;
5612 $self->{nc}
5613 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5614 } else {
5615 $self->{set_nc}->($self);
5616 }
5617
5618 redo A;
5619 } elsif ((length $self->{kwd}) == 5 and
5620 ($self->{nc} == 0x0059 or # Y
5621 $self->{nc} == 0x0079)) { # y
5622 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5623 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5624 text => 'ENTITY',
5625 line => $self->{line_prev},
5626 column => $self->{column_prev} - 4);
5627 }
5628 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5629 line => $self->{line_prev},
5630 column => $self->{column_prev} - 6};
5631 $self->{state} = DOCTYPE_MD_STATE;
5632
5633 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5634 $self->{line_prev} = $self->{line};
5635 $self->{column_prev} = $self->{column};
5636 $self->{column}++;
5637 $self->{nc}
5638 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5639 } else {
5640 $self->{set_nc}->($self);
5641 }
5642
5643 redo A;
5644 } else {
5645 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5646 line => $self->{line_prev},
5647 column => $self->{column_prev} - 1
5648 - (length $self->{kwd})
5649 + 1 * ($self->{nc} == -1));
5650 $self->{state} = BOGUS_COMMENT_STATE;
5651 ## Reconsume.
5652 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5653 redo A;
5654 }
5655 } elsif ($self->{state} == MD_ELEMENT_STATE) {
5656 if ($self->{nc} == [
5657 undef,
5658 undef,
5659 0x0045, # E
5660 0x004D, # M
5661 0x0045, # E
5662 0x004E, # N
5663 ]->[length $self->{kwd}] or
5664 $self->{nc} == [
5665 undef,
5666 undef,
5667 0x0065, # e
5668 0x006D, # m
5669 0x0065, # e
5670 0x006E, # n
5671 ]->[length $self->{kwd}]) {
5672 ## Stay in the state.
5673 $self->{kwd} .= chr $self->{nc};
5674
5675 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5676 $self->{line_prev} = $self->{line};
5677 $self->{column_prev} = $self->{column};
5678 $self->{column}++;
5679 $self->{nc}
5680 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5681 } else {
5682 $self->{set_nc}->($self);
5683 }
5684
5685 redo A;
5686 } elsif ((length $self->{kwd}) == 6 and
5687 ($self->{nc} == 0x0054 or # T
5688 $self->{nc} == 0x0074)) { # t
5689 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5690 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5691 text => 'ELEMENT',
5692 line => $self->{line_prev},
5693 column => $self->{column_prev} - 5);
5694 }
5695 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5696 line => $self->{line_prev},
5697 column => $self->{column_prev} - 6};
5698 $self->{state} = DOCTYPE_MD_STATE;
5699
5700 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5701 $self->{line_prev} = $self->{line};
5702 $self->{column_prev} = $self->{column};
5703 $self->{column}++;
5704 $self->{nc}
5705 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5706 } else {
5707 $self->{set_nc}->($self);
5708 }
5709
5710 redo A;
5711 } else {
5712 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5713 line => $self->{line_prev},
5714 column => $self->{column_prev} - 1
5715 - (length $self->{kwd})
5716 + 1 * ($self->{nc} == -1));
5717 $self->{state} = BOGUS_COMMENT_STATE;
5718 ## Reconsume.
5719 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5720 redo A;
5721 }
5722 } elsif ($self->{state} == MD_ATTLIST_STATE) {
5723 if ($self->{nc} == [
5724 undef,
5725 0x0054, # T
5726 0x0054, # T
5727 0x004C, # L
5728 0x0049, # I
5729 0x0053, # S
5730 ]->[length $self->{kwd}] or
5731 $self->{nc} == [
5732 undef,
5733 0x0074, # t
5734 0x0074, # t
5735 0x006C, # l
5736 0x0069, # i
5737 0x0073, # s
5738 ]->[length $self->{kwd}]) {
5739 ## Stay in the state.
5740 $self->{kwd} .= chr $self->{nc};
5741
5742 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5743 $self->{line_prev} = $self->{line};
5744 $self->{column_prev} = $self->{column};
5745 $self->{column}++;
5746 $self->{nc}
5747 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5748 } else {
5749 $self->{set_nc}->($self);
5750 }
5751
5752 redo A;
5753 } elsif ((length $self->{kwd}) == 6 and
5754 ($self->{nc} == 0x0054 or # T
5755 $self->{nc} == 0x0074)) { # t
5756 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5757 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5758 text => 'ATTLIST',
5759 line => $self->{line_prev},
5760 column => $self->{column_prev} - 5);
5761 }
5762 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5763 attrdefs => [],
5764 line => $self->{line_prev},
5765 column => $self->{column_prev} - 6};
5766 $self->{state} = DOCTYPE_MD_STATE;
5767
5768 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5769 $self->{line_prev} = $self->{line};
5770 $self->{column_prev} = $self->{column};
5771 $self->{column}++;
5772 $self->{nc}
5773 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5774 } else {
5775 $self->{set_nc}->($self);
5776 }
5777
5778 redo A;
5779 } else {
5780 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5781 line => $self->{line_prev},
5782 column => $self->{column_prev} - 1
5783 - (length $self->{kwd})
5784 + 1 * ($self->{nc} == -1));
5785 $self->{state} = BOGUS_COMMENT_STATE;
5786 ## Reconsume.
5787 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5788 redo A;
5789 }
5790 } elsif ($self->{state} == MD_NOTATION_STATE) {
5791 if ($self->{nc} == [
5792 undef,
5793 0x004F, # O
5794 0x0054, # T
5795 0x0041, # A
5796 0x0054, # T
5797 0x0049, # I
5798 0x004F, # O
5799 ]->[length $self->{kwd}] or
5800 $self->{nc} == [
5801 undef,
5802 0x006F, # o
5803 0x0074, # t
5804 0x0061, # a
5805 0x0074, # t
5806 0x0069, # i
5807 0x006F, # o
5808 ]->[length $self->{kwd}]) {
5809 ## Stay in the state.
5810 $self->{kwd} .= chr $self->{nc};
5811
5812 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5813 $self->{line_prev} = $self->{line};
5814 $self->{column_prev} = $self->{column};
5815 $self->{column}++;
5816 $self->{nc}
5817 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5818 } else {
5819 $self->{set_nc}->($self);
5820 }
5821
5822 redo A;
5823 } elsif ((length $self->{kwd}) == 7 and
5824 ($self->{nc} == 0x004E or # N
5825 $self->{nc} == 0x006E)) { # n
5826 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
5827 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5828 text => 'NOTATION',
5829 line => $self->{line_prev},
5830 column => $self->{column_prev} - 6);
5831 }
5832 $self->{ct} = {type => NOTATION_TOKEN, name => '',
5833 line => $self->{line_prev},
5834 column => $self->{column_prev} - 6};
5835 $self->{state} = DOCTYPE_MD_STATE;
5836
5837 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5838 $self->{line_prev} = $self->{line};
5839 $self->{column_prev} = $self->{column};
5840 $self->{column}++;
5841 $self->{nc}
5842 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5843 } else {
5844 $self->{set_nc}->($self);
5845 }
5846
5847 redo A;
5848 } else {
5849 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5850 line => $self->{line_prev},
5851 column => $self->{column_prev} - 1
5852 - (length $self->{kwd})
5853 + 1 * ($self->{nc} == -1));
5854 $self->{state} = BOGUS_COMMENT_STATE;
5855 ## Reconsume.
5856 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5857 redo A;
5858 }
5859 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
5860 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
5861 ## "DOCTYPE NOTATION state".
5862
5863 if ($is_space->{$self->{nc}}) {
5864 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
5865 $self->{state} = BEFORE_MD_NAME_STATE;
5866
5867 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5868 $self->{line_prev} = $self->{line};
5869 $self->{column_prev} = $self->{column};
5870 $self->{column}++;
5871 $self->{nc}
5872 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5873 } else {
5874 $self->{set_nc}->($self);
5875 }
5876
5877 redo A;
5878 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5879 $self->{nc} == 0x0025) { # %
5880 ## XML5: Switch to the "DOCTYPE bogus comment state".
5881 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5882 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5883
5884 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5885 $self->{line_prev} = $self->{line};
5886 $self->{column_prev} = $self->{column};
5887 $self->{column}++;
5888 $self->{nc}
5889 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5890 } else {
5891 $self->{set_nc}->($self);
5892 }
5893
5894 redo A;
5895 } elsif ($self->{nc} == -1) {
5896 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5897 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5898 ## Reconsume.
5899 redo A;
5900 } elsif ($self->{nc} == 0x003E) { # >
5901 ## XML5: Switch to the "DOCTYPE bogus comment state".
5902 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5903 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5904
5905 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5906 $self->{line_prev} = $self->{line};
5907 $self->{column_prev} = $self->{column};
5908 $self->{column}++;
5909 $self->{nc}
5910 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5911 } else {
5912 $self->{set_nc}->($self);
5913 }
5914
5915 redo A;
5916 } else {
5917 ## XML5: Switch to the "DOCTYPE bogus comment state".
5918 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5919 $self->{state} = BEFORE_MD_NAME_STATE;
5920 redo A;
5921 }
5922 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
5923 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
5924 ## before state", "DOCTYPE ATTLIST name before state".
5925
5926 if ($is_space->{$self->{nc}}) {
5927 ## Stay in the state.
5928
5929 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5930 $self->{line_prev} = $self->{line};
5931 $self->{column_prev} = $self->{column};
5932 $self->{column}++;
5933 $self->{nc}
5934 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5935 } else {
5936 $self->{set_nc}->($self);
5937 }
5938
5939 redo A;
5940 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5941 $self->{nc} == 0x0025) { # %
5942 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5943
5944 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5945 $self->{line_prev} = $self->{line};
5946 $self->{column_prev} = $self->{column};
5947 $self->{column}++;
5948 $self->{nc}
5949 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5950 } else {
5951 $self->{set_nc}->($self);
5952 }
5953
5954 redo A;
5955 } elsif ($self->{nc} == 0x003E) { # >
5956 ## XML5: Same as "Anything else".
5957 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5958 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5959
5960 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5961 $self->{line_prev} = $self->{line};
5962 $self->{column_prev} = $self->{column};
5963 $self->{column}++;
5964 $self->{nc}
5965 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5966 } else {
5967 $self->{set_nc}->($self);
5968 }
5969
5970 redo A;
5971 } elsif ($self->{nc} == -1) {
5972 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5973 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5974 ## Reconsume.
5975 redo A;
5976 } else {
5977 ## XML5: [ATTLIST] Not defined yet.
5978 $self->{ct}->{name} .= chr $self->{nc};
5979 $self->{state} = MD_NAME_STATE;
5980
5981 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5982 $self->{line_prev} = $self->{line};
5983 $self->{column_prev} = $self->{column};
5984 $self->{column}++;
5985 $self->{nc}
5986 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5987 } else {
5988 $self->{set_nc}->($self);
5989 }
5990
5991 redo A;
5992 }
5993 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
5994 if ($is_space->{$self->{nc}}) {
5995 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
5996 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
5997 $self->{state} = BEFORE_MD_NAME_STATE;
5998
5999 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6000 $self->{line_prev} = $self->{line};
6001 $self->{column_prev} = $self->{column};
6002 $self->{column}++;
6003 $self->{nc}
6004 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6005 } else {
6006 $self->{set_nc}->($self);
6007 }
6008
6009 redo A;
6010 } elsif ($self->{nc} == 0x003E) { # >
6011 ## XML5: Same as "Anything else".
6012 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6013 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6014
6015 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6016 $self->{line_prev} = $self->{line};
6017 $self->{column_prev} = $self->{column};
6018 $self->{column}++;
6019 $self->{nc}
6020 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6021 } else {
6022 $self->{set_nc}->($self);
6023 }
6024
6025 redo A;
6026 } elsif ($self->{nc} == -1) {
6027 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6028 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6029 ## Reconsume.
6030 redo A;
6031 } else {
6032 ## XML5: No parse error.
6033 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6034 $self->{state} = BOGUS_COMMENT_STATE;
6035 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6036 ## Reconsume.
6037 redo A;
6038 }
6039 } elsif ($self->{state} == MD_NAME_STATE) {
6040 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6041
6042 if ($is_space->{$self->{nc}}) {
6043 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6044 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6045 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6046 ## TODO: ...
6047 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6048 } else { # ENTITY/NOTATION
6049 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6050 }
6051
6052 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6053 $self->{line_prev} = $self->{line};
6054 $self->{column_prev} = $self->{column};
6055 $self->{column}++;
6056 $self->{nc}
6057 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6058 } else {
6059 $self->{set_nc}->($self);
6060 }
6061
6062 redo A;
6063 } elsif ($self->{nc} == 0x003E) { # >
6064 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6065 #
6066 } else {
6067 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6068 }
6069 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6070
6071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6072 $self->{line_prev} = $self->{line};
6073 $self->{column_prev} = $self->{column};
6074 $self->{column}++;
6075 $self->{nc}
6076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6077 } else {
6078 $self->{set_nc}->($self);
6079 }
6080
6081 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6082 redo A;
6083 } elsif ($self->{nc} == -1) {
6084 ## XML5: [ATTLIST] No parse error.
6085 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6086 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6087 ## Reconsume.
6088 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6089 redo A;
6090 } else {
6091 ## XML5: [ATTLIST] Not defined yet.
6092 $self->{ct}->{name} .= chr $self->{nc};
6093 ## Stay in the state.
6094
6095 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6096 $self->{line_prev} = $self->{line};
6097 $self->{column_prev} = $self->{column};
6098 $self->{column}++;
6099 $self->{nc}
6100 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6101 } else {
6102 $self->{set_nc}->($self);
6103 }
6104
6105 redo A;
6106 }
6107 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6108 if ($is_space->{$self->{nc}}) {
6109 ## Stay in the state.
6110
6111 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6112 $self->{line_prev} = $self->{line};
6113 $self->{column_prev} = $self->{column};
6114 $self->{column}++;
6115 $self->{nc}
6116 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6117 } else {
6118 $self->{set_nc}->($self);
6119 }
6120
6121 redo A;
6122 } elsif ($self->{nc} == 0x003E) { # >
6123 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6124
6125 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6126 $self->{line_prev} = $self->{line};
6127 $self->{column_prev} = $self->{column};
6128 $self->{column}++;
6129 $self->{nc}
6130 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6131 } else {
6132 $self->{set_nc}->($self);
6133 }
6134
6135 return ($self->{ct}); # ATTLIST
6136 redo A;
6137 } elsif ($self->{nc} == -1) {
6138 ## XML5: No parse error.
6139 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6140 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6141 return ($self->{ct});
6142 redo A;
6143 } else {
6144 ## XML5: Not defined yet.
6145 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6146 tokens => [],
6147 line => $self->{line}, column => $self->{column}};
6148 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6149
6150 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6151 $self->{line_prev} = $self->{line};
6152 $self->{column_prev} = $self->{column};
6153 $self->{column}++;
6154 $self->{nc}
6155 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6156 } else {
6157 $self->{set_nc}->($self);
6158 }
6159
6160 redo A;
6161 }
6162 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6163 if ($is_space->{$self->{nc}}) {
6164 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6165
6166 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6167 $self->{line_prev} = $self->{line};
6168 $self->{column_prev} = $self->{column};
6169 $self->{column}++;
6170 $self->{nc}
6171 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6172 } else {
6173 $self->{set_nc}->($self);
6174 }
6175
6176 redo A;
6177 } elsif ($self->{nc} == 0x003E) { # >
6178 ## XML5: Same as "anything else".
6179 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6180 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6181
6182 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6183 $self->{line_prev} = $self->{line};
6184 $self->{column_prev} = $self->{column};
6185 $self->{column}++;
6186 $self->{nc}
6187 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6188 } else {
6189 $self->{set_nc}->($self);
6190 }
6191
6192 return ($self->{ct}); # ATTLIST
6193 redo A;
6194 } elsif ($self->{nc} == 0x0028) { # (
6195 ## XML5: Same as "anything else".
6196 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6197 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6198
6199 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6200 $self->{line_prev} = $self->{line};
6201 $self->{column_prev} = $self->{column};
6202 $self->{column}++;
6203 $self->{nc}
6204 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6205 } else {
6206 $self->{set_nc}->($self);
6207 }
6208
6209 redo A;
6210 } elsif ($self->{nc} == -1) {
6211 ## XML5: No parse error.
6212 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6213 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6214
6215 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6216 $self->{line_prev} = $self->{line};
6217 $self->{column_prev} = $self->{column};
6218 $self->{column}++;
6219 $self->{nc}
6220 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6221 } else {
6222 $self->{set_nc}->($self);
6223 }
6224
6225 return ($self->{ct}); # ATTLIST
6226 redo A;
6227 } else {
6228 ## XML5: Not defined yet.
6229 $self->{ca}->{name} .= chr $self->{nc};
6230 ## Stay in the state.
6231
6232 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6233 $self->{line_prev} = $self->{line};
6234 $self->{column_prev} = $self->{column};
6235 $self->{column}++;
6236 $self->{nc}
6237 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6238 } else {
6239 $self->{set_nc}->($self);
6240 }
6241
6242 redo A;
6243 }
6244 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6245 if ($is_space->{$self->{nc}}) {
6246 ## Stay in the state.
6247
6248 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6249 $self->{line_prev} = $self->{line};
6250 $self->{column_prev} = $self->{column};
6251 $self->{column}++;
6252 $self->{nc}
6253 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6254 } else {
6255 $self->{set_nc}->($self);
6256 }
6257
6258 redo A;
6259 } elsif ($self->{nc} == 0x003E) { # >
6260 ## XML5: Same as "anything else".
6261 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6262 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6263
6264 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6265 $self->{line_prev} = $self->{line};
6266 $self->{column_prev} = $self->{column};
6267 $self->{column}++;
6268 $self->{nc}
6269 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6270 } else {
6271 $self->{set_nc}->($self);
6272 }
6273
6274 return ($self->{ct}); # ATTLIST
6275 redo A;
6276 } elsif ($self->{nc} == 0x0028) { # (
6277 ## XML5: Same as "anything else".
6278 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6279
6280 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6281 $self->{line_prev} = $self->{line};
6282 $self->{column_prev} = $self->{column};
6283 $self->{column}++;
6284 $self->{nc}
6285 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6286 } else {
6287 $self->{set_nc}->($self);
6288 }
6289
6290 redo A;
6291 } elsif ($self->{nc} == -1) {
6292 ## XML5: No parse error.
6293 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6294 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6295
6296 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6297 $self->{line_prev} = $self->{line};
6298 $self->{column_prev} = $self->{column};
6299 $self->{column}++;
6300 $self->{nc}
6301 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6302 } else {
6303 $self->{set_nc}->($self);
6304 }
6305
6306 return ($self->{ct});
6307 redo A;
6308 } else {
6309 ## XML5: Not defined yet.
6310 $self->{ca}->{type} = chr $self->{nc};
6311 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6312
6313 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6314 $self->{line_prev} = $self->{line};
6315 $self->{column_prev} = $self->{column};
6316 $self->{column}++;
6317 $self->{nc}
6318 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6319 } else {
6320 $self->{set_nc}->($self);
6321 }
6322
6323 redo A;
6324 }
6325 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6326 if ($is_space->{$self->{nc}}) {
6327 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6328
6329 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6330 $self->{line_prev} = $self->{line};
6331 $self->{column_prev} = $self->{column};
6332 $self->{column}++;
6333 $self->{nc}
6334 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6335 } else {
6336 $self->{set_nc}->($self);
6337 }
6338
6339 redo A;
6340 } elsif ($self->{nc} == 0x0023) { # #
6341 ## XML5: Same as "anything else".
6342 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6343 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6344
6345 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6346 $self->{line_prev} = $self->{line};
6347 $self->{column_prev} = $self->{column};
6348 $self->{column}++;
6349 $self->{nc}
6350 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6351 } else {
6352 $self->{set_nc}->($self);
6353 }
6354
6355 redo A;
6356 } elsif ($self->{nc} == 0x0022) { # "
6357 ## XML5: Same as "anything else".
6358 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6359 $self->{ca}->{value} = '';
6360 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6361
6362 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6363 $self->{line_prev} = $self->{line};
6364 $self->{column_prev} = $self->{column};
6365 $self->{column}++;
6366 $self->{nc}
6367 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6368 } else {
6369 $self->{set_nc}->($self);
6370 }
6371
6372 redo A;
6373 } elsif ($self->{nc} == 0x0027) { # '
6374 ## XML5: Same as "anything else".
6375 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6376 $self->{ca}->{value} = '';
6377 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6378
6379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6380 $self->{line_prev} = $self->{line};
6381 $self->{column_prev} = $self->{column};
6382 $self->{column}++;
6383 $self->{nc}
6384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6385 } else {
6386 $self->{set_nc}->($self);
6387 }
6388
6389 redo A;
6390 } elsif ($self->{nc} == 0x003E) { # >
6391 ## XML5: Same as "anything else".
6392 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6393 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6394
6395 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6396 $self->{line_prev} = $self->{line};
6397 $self->{column_prev} = $self->{column};
6398 $self->{column}++;
6399 $self->{nc}
6400 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6401 } else {
6402 $self->{set_nc}->($self);
6403 }
6404
6405 return ($self->{ct}); # ATTLIST
6406 redo A;
6407 } elsif ($self->{nc} == 0x0028) { # (
6408 ## XML5: Same as "anything else".
6409 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6410 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6411
6412 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6413 $self->{line_prev} = $self->{line};
6414 $self->{column_prev} = $self->{column};
6415 $self->{column}++;
6416 $self->{nc}
6417 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6418 } else {
6419 $self->{set_nc}->($self);
6420 }
6421
6422 redo A;
6423 } elsif ($self->{nc} == -1) {
6424 ## XML5: No parse error.
6425 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6426 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6427
6428 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6429 $self->{line_prev} = $self->{line};
6430 $self->{column_prev} = $self->{column};
6431 $self->{column}++;
6432 $self->{nc}
6433 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6434 } else {
6435 $self->{set_nc}->($self);
6436 }
6437
6438 return ($self->{ct});
6439 redo A;
6440 } else {
6441 ## XML5: Not defined yet.
6442 $self->{ca}->{type} .= chr $self->{nc};
6443 ## Stay in the state.
6444
6445 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6446 $self->{line_prev} = $self->{line};
6447 $self->{column_prev} = $self->{column};
6448 $self->{column}++;
6449 $self->{nc}
6450 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6451 } else {
6452 $self->{set_nc}->($self);
6453 }
6454
6455 redo A;
6456 }
6457 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6458 if ($is_space->{$self->{nc}}) {
6459 ## Stay in the state.
6460
6461 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6462 $self->{line_prev} = $self->{line};
6463 $self->{column_prev} = $self->{column};
6464 $self->{column}++;
6465 $self->{nc}
6466 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6467 } else {
6468 $self->{set_nc}->($self);
6469 }
6470
6471 redo A;
6472 } elsif ($self->{nc} == 0x0028) { # (
6473 ## XML5: Same as "anything else".
6474 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6475
6476 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6477 $self->{line_prev} = $self->{line};
6478 $self->{column_prev} = $self->{column};
6479 $self->{column}++;
6480 $self->{nc}
6481 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6482 } else {
6483 $self->{set_nc}->($self);
6484 }
6485
6486 redo A;
6487 } elsif ($self->{nc} == 0x0023) { # #
6488 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6489
6490 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6491 $self->{line_prev} = $self->{line};
6492 $self->{column_prev} = $self->{column};
6493 $self->{column}++;
6494 $self->{nc}
6495 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6496 } else {
6497 $self->{set_nc}->($self);
6498 }
6499
6500 redo A;
6501 } elsif ($self->{nc} == 0x0022) { # "
6502 ## XML5: Same as "anything else".
6503 $self->{ca}->{value} = '';
6504 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6505
6506 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6507 $self->{line_prev} = $self->{line};
6508 $self->{column_prev} = $self->{column};
6509 $self->{column}++;
6510 $self->{nc}
6511 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6512 } else {
6513 $self->{set_nc}->($self);
6514 }
6515
6516 redo A;
6517 } elsif ($self->{nc} == 0x0027) { # '
6518 ## XML5: Same as "anything else".
6519 $self->{ca}->{value} = '';
6520 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6521
6522 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6523 $self->{line_prev} = $self->{line};
6524 $self->{column_prev} = $self->{column};
6525 $self->{column}++;
6526 $self->{nc}
6527 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6528 } else {
6529 $self->{set_nc}->($self);
6530 }
6531
6532 redo A;
6533 } elsif ($self->{nc} == 0x003E) { # >
6534 ## XML5: Same as "anything else".
6535 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6536 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6537
6538 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6539 $self->{line_prev} = $self->{line};
6540 $self->{column_prev} = $self->{column};
6541 $self->{column}++;
6542 $self->{nc}
6543 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6544 } else {
6545 $self->{set_nc}->($self);
6546 }
6547
6548 return ($self->{ct}); # ATTLIST
6549 redo A;
6550 } elsif ($self->{nc} == -1) {
6551 ## XML5: No parse error.
6552 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6553 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6554
6555 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6556 $self->{line_prev} = $self->{line};
6557 $self->{column_prev} = $self->{column};
6558 $self->{column}++;
6559 $self->{nc}
6560 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6561 } else {
6562 $self->{set_nc}->($self);
6563 }
6564
6565 return ($self->{ct});
6566 redo A;
6567 } else {
6568 ## XML5: Switch to the "DOCTYPE bogus comment state".
6569 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6570 $self->{ca}->{value} = '';
6571 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6572 ## Reconsume.
6573 redo A;
6574 }
6575 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6576 if ($is_space->{$self->{nc}}) {
6577 ## Stay in the state.
6578
6579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6580 $self->{line_prev} = $self->{line};
6581 $self->{column_prev} = $self->{column};
6582 $self->{column}++;
6583 $self->{nc}
6584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6585 } else {
6586 $self->{set_nc}->($self);
6587 }
6588
6589 redo A;
6590 } elsif ($self->{nc} == 0x007C) { # |
6591 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6592 ## Stay in the state.
6593
6594 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6595 $self->{line_prev} = $self->{line};
6596 $self->{column_prev} = $self->{column};
6597 $self->{column}++;
6598 $self->{nc}
6599 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6600 } else {
6601 $self->{set_nc}->($self);
6602 }
6603
6604 redo A;
6605 } elsif ($self->{nc} == 0x0029) { # )
6606 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6607 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6608
6609 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6610 $self->{line_prev} = $self->{line};
6611 $self->{column_prev} = $self->{column};
6612 $self->{column}++;
6613 $self->{nc}
6614 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6615 } else {
6616 $self->{set_nc}->($self);
6617 }
6618
6619 redo A;
6620 } elsif ($self->{nc} == 0x003E) { # >
6621 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6622 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6623
6624 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6625 $self->{line_prev} = $self->{line};
6626 $self->{column_prev} = $self->{column};
6627 $self->{column}++;
6628 $self->{nc}
6629 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6630 } else {
6631 $self->{set_nc}->($self);
6632 }
6633
6634 return ($self->{ct}); # ATTLIST
6635 redo A;
6636 } elsif ($self->{nc} == -1) {
6637 ## XML5: No parse error.
6638 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6639 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6640
6641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6642 $self->{line_prev} = $self->{line};
6643 $self->{column_prev} = $self->{column};
6644 $self->{column}++;
6645 $self->{nc}
6646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6647 } else {
6648 $self->{set_nc}->($self);
6649 }
6650
6651 return ($self->{ct});
6652 redo A;
6653 } else {
6654 push @{$self->{ca}->{tokens}}, chr $self->{nc};
6655 $self->{state} = ALLOWED_TOKEN_STATE;
6656
6657 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6658 $self->{line_prev} = $self->{line};
6659 $self->{column_prev} = $self->{column};
6660 $self->{column}++;
6661 $self->{nc}
6662 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6663 } else {
6664 $self->{set_nc}->($self);
6665 }
6666
6667 redo A;
6668 }
6669 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6670 if ($is_space->{$self->{nc}}) {
6671 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6672
6673 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6674 $self->{line_prev} = $self->{line};
6675 $self->{column_prev} = $self->{column};
6676 $self->{column}++;
6677 $self->{nc}
6678 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6679 } else {
6680 $self->{set_nc}->($self);
6681 }
6682
6683 redo A;
6684 } elsif ($self->{nc} == 0x007C) { # |
6685 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6686
6687 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6688 $self->{line_prev} = $self->{line};
6689 $self->{column_prev} = $self->{column};
6690 $self->{column}++;
6691 $self->{nc}
6692 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6693 } else {
6694 $self->{set_nc}->($self);
6695 }
6696
6697 redo A;
6698 } elsif ($self->{nc} == 0x0029) { # )
6699 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6700
6701 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6702 $self->{line_prev} = $self->{line};
6703 $self->{column_prev} = $self->{column};
6704 $self->{column}++;
6705 $self->{nc}
6706 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6707 } else {
6708 $self->{set_nc}->($self);
6709 }
6710
6711 redo A;
6712 } elsif ($self->{nc} == 0x003E) { # >
6713 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6714 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6715
6716 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6717 $self->{line_prev} = $self->{line};
6718 $self->{column_prev} = $self->{column};
6719 $self->{column}++;
6720 $self->{nc}
6721 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6722 } else {
6723 $self->{set_nc}->($self);
6724 }
6725
6726 return ($self->{ct}); # ATTLIST
6727 redo A;
6728 } elsif ($self->{nc} == -1) {
6729 ## XML5: No parse error.
6730 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6731 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6732
6733 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6734 $self->{line_prev} = $self->{line};
6735 $self->{column_prev} = $self->{column};
6736 $self->{column}++;
6737 $self->{nc}
6738 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6739 } else {
6740 $self->{set_nc}->($self);
6741 }
6742
6743 return ($self->{ct});
6744 redo A;
6745 } else {
6746 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6747 ## Stay in the state.
6748
6749 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6750 $self->{line_prev} = $self->{line};
6751 $self->{column_prev} = $self->{column};
6752 $self->{column}++;
6753 $self->{nc}
6754 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6755 } else {
6756 $self->{set_nc}->($self);
6757 }
6758
6759 redo A;
6760 }
6761 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6762 if ($is_space->{$self->{nc}}) {
6763 ## Stay in the state.
6764
6765 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6766 $self->{line_prev} = $self->{line};
6767 $self->{column_prev} = $self->{column};
6768 $self->{column}++;
6769 $self->{nc}
6770 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6771 } else {
6772 $self->{set_nc}->($self);
6773 }
6774
6775 redo A;
6776 } elsif ($self->{nc} == 0x007C) { # |
6777 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6778
6779 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6780 $self->{line_prev} = $self->{line};
6781 $self->{column_prev} = $self->{column};
6782 $self->{column}++;
6783 $self->{nc}
6784 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6785 } else {
6786 $self->{set_nc}->($self);
6787 }
6788
6789 redo A;
6790 } elsif ($self->{nc} == 0x0029) { # )
6791 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6792
6793 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6794 $self->{line_prev} = $self->{line};
6795 $self->{column_prev} = $self->{column};
6796 $self->{column}++;
6797 $self->{nc}
6798 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6799 } else {
6800 $self->{set_nc}->($self);
6801 }
6802
6803 redo A;
6804 } elsif ($self->{nc} == 0x003E) { # >
6805 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6806 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6807
6808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6809 $self->{line_prev} = $self->{line};
6810 $self->{column_prev} = $self->{column};
6811 $self->{column}++;
6812 $self->{nc}
6813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6814 } else {
6815 $self->{set_nc}->($self);
6816 }
6817
6818 return ($self->{ct}); # ATTLIST
6819 redo A;
6820 } elsif ($self->{nc} == -1) {
6821 ## XML5: No parse error.
6822 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6823 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6824
6825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6826 $self->{line_prev} = $self->{line};
6827 $self->{column_prev} = $self->{column};
6828 $self->{column}++;
6829 $self->{nc}
6830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6831 } else {
6832 $self->{set_nc}->($self);
6833 }
6834
6835 return ($self->{ct});
6836 redo A;
6837 } else {
6838 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
6839 line => $self->{line_prev},
6840 column => $self->{column_prev});
6841 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
6842 $self->{state} = ALLOWED_TOKEN_STATE;
6843
6844 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6845 $self->{line_prev} = $self->{line};
6846 $self->{column_prev} = $self->{column};
6847 $self->{column}++;
6848 $self->{nc}
6849 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6850 } else {
6851 $self->{set_nc}->($self);
6852 }
6853
6854 redo A;
6855 }
6856 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
6857 if ($is_space->{$self->{nc}}) {
6858 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
6859
6860 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6861 $self->{line_prev} = $self->{line};
6862 $self->{column_prev} = $self->{column};
6863 $self->{column}++;
6864 $self->{nc}
6865 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6866 } else {
6867 $self->{set_nc}->($self);
6868 }
6869
6870 redo A;
6871 } elsif ($self->{nc} == 0x0023) { # #
6872 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6873 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6874
6875 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6876 $self->{line_prev} = $self->{line};
6877 $self->{column_prev} = $self->{column};
6878 $self->{column}++;
6879 $self->{nc}
6880 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6881 } else {
6882 $self->{set_nc}->($self);
6883 }
6884
6885 redo A;
6886 } elsif ($self->{nc} == 0x0022) { # "
6887 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6888 $self->{ca}->{value} = '';
6889 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6890
6891 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6892 $self->{line_prev} = $self->{line};
6893 $self->{column_prev} = $self->{column};
6894 $self->{column}++;
6895 $self->{nc}
6896 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6897 } else {
6898 $self->{set_nc}->($self);
6899 }
6900
6901 redo A;
6902 } elsif ($self->{nc} == 0x0027) { # '
6903 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6904 $self->{ca}->{value} = '';
6905 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6906
6907 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6908 $self->{line_prev} = $self->{line};
6909 $self->{column_prev} = $self->{column};
6910 $self->{column}++;
6911 $self->{nc}
6912 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6913 } else {
6914 $self->{set_nc}->($self);
6915 }
6916
6917 redo A;
6918 } elsif ($self->{nc} == 0x003E) { # >
6919 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6920 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6921
6922 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6923 $self->{line_prev} = $self->{line};
6924 $self->{column_prev} = $self->{column};
6925 $self->{column}++;
6926 $self->{nc}
6927 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6928 } else {
6929 $self->{set_nc}->($self);
6930 }
6931
6932 return ($self->{ct}); # ATTLIST
6933 redo A;
6934 } elsif ($self->{nc} == -1) {
6935 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6936 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6937
6938 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6939 $self->{line_prev} = $self->{line};
6940 $self->{column_prev} = $self->{column};
6941 $self->{column}++;
6942 $self->{nc}
6943 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6944 } else {
6945 $self->{set_nc}->($self);
6946 }
6947
6948 return ($self->{ct});
6949 redo A;
6950 } else {
6951 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6952 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6953 ## Reconsume.
6954 redo A;
6955 }
6956 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
6957 if ($is_space->{$self->{nc}}) {
6958 ## Stay in the state.
6959
6960 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6961 $self->{line_prev} = $self->{line};
6962 $self->{column_prev} = $self->{column};
6963 $self->{column}++;
6964 $self->{nc}
6965 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6966 } else {
6967 $self->{set_nc}->($self);
6968 }
6969
6970 redo A;
6971 } elsif ($self->{nc} == 0x0023) { # #
6972 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6973
6974 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6975 $self->{line_prev} = $self->{line};
6976 $self->{column_prev} = $self->{column};
6977 $self->{column}++;
6978 $self->{nc}
6979 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6980 } else {
6981 $self->{set_nc}->($self);
6982 }
6983
6984 redo A;
6985 } elsif ($self->{nc} == 0x0022) { # "
6986 $self->{ca}->{value} = '';
6987 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6988
6989 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6990 $self->{line_prev} = $self->{line};
6991 $self->{column_prev} = $self->{column};
6992 $self->{column}++;
6993 $self->{nc}
6994 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6995 } else {
6996 $self->{set_nc}->($self);
6997 }
6998
6999 redo A;
7000 } elsif ($self->{nc} == 0x0027) { # '
7001 $self->{ca}->{value} = '';
7002 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7003
7004 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7005 $self->{line_prev} = $self->{line};
7006 $self->{column_prev} = $self->{column};
7007 $self->{column}++;
7008 $self->{nc}
7009 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7010 } else {
7011 $self->{set_nc}->($self);
7012 }
7013
7014 redo A;
7015 } elsif ($self->{nc} == 0x003E) { # >
7016 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7017 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7018
7019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7020 $self->{line_prev} = $self->{line};
7021 $self->{column_prev} = $self->{column};
7022 $self->{column}++;
7023 $self->{nc}
7024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7025 } else {
7026 $self->{set_nc}->($self);
7027 }
7028
7029 return ($self->{ct}); # ATTLIST
7030 redo A;
7031 } elsif ($self->{nc} == -1) {
7032 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7033 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7034
7035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7036 $self->{line_prev} = $self->{line};
7037 $self->{column_prev} = $self->{column};
7038 $self->{column}++;
7039 $self->{nc}
7040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7041 } else {
7042 $self->{set_nc}->($self);
7043 }
7044
7045 return ($self->{ct});
7046 redo A;
7047 } else {
7048 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7049 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7050 ## Reconsume.
7051 redo A;
7052 }
7053 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7054 if ($is_space->{$self->{nc}}) {
7055 ## XML5: No parse error.
7056 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7057 $self->{state} = BOGUS_MD_STATE;
7058 ## Reconsume.
7059 redo A;
7060 } elsif ($self->{nc} == 0x0022) { # "
7061 ## XML5: Same as "anything else".
7062 $self->{ca}->{value} = '';
7063 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7064
7065 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7066 $self->{line_prev} = $self->{line};
7067 $self->{column_prev} = $self->{column};
7068 $self->{column}++;
7069 $self->{nc}
7070 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7071 } else {
7072 $self->{set_nc}->($self);
7073 }
7074
7075 redo A;
7076 } elsif ($self->{nc} == 0x0027) { # '
7077 ## XML5: Same as "anything else".
7078 $self->{ca}->{value} = '';
7079 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7080
7081 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7082 $self->{line_prev} = $self->{line};
7083 $self->{column_prev} = $self->{column};
7084 $self->{column}++;
7085 $self->{nc}
7086 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7087 } else {
7088 $self->{set_nc}->($self);
7089 }
7090
7091 redo A;
7092 } elsif ($self->{nc} == 0x003E) { # >
7093 ## XML5: Same as "anything else".
7094 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7095 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7096
7097 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7098 $self->{line_prev} = $self->{line};
7099 $self->{column_prev} = $self->{column};
7100 $self->{column}++;
7101 $self->{nc}
7102 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7103 } else {
7104 $self->{set_nc}->($self);
7105 }
7106
7107 return ($self->{ct}); # ATTLIST
7108 redo A;
7109 } elsif ($self->{nc} == -1) {
7110 ## XML5: No parse error.
7111 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7112 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7113
7114 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7115 $self->{line_prev} = $self->{line};
7116 $self->{column_prev} = $self->{column};
7117 $self->{column}++;
7118 $self->{nc}
7119 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7120 } else {
7121 $self->{set_nc}->($self);
7122 }
7123
7124 return ($self->{ct});
7125 redo A;
7126 } else {
7127 $self->{ca}->{default} = chr $self->{nc};
7128 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7129
7130 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7131 $self->{line_prev} = $self->{line};
7132 $self->{column_prev} = $self->{column};
7133 $self->{column}++;
7134 $self->{nc}
7135 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7136 } else {
7137 $self->{set_nc}->($self);
7138 }
7139
7140 redo A;
7141 }
7142 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7143 if ($is_space->{$self->{nc}}) {
7144 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7145
7146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7147 $self->{line_prev} = $self->{line};
7148 $self->{column_prev} = $self->{column};
7149 $self->{column}++;
7150 $self->{nc}
7151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7152 } else {
7153 $self->{set_nc}->($self);
7154 }
7155
7156 redo A;
7157 } elsif ($self->{nc} == 0x0022) { # "
7158 ## XML5: Same as "anything else".
7159 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7160 $self->{ca}->{value} = '';
7161 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7162
7163 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7164 $self->{line_prev} = $self->{line};
7165 $self->{column_prev} = $self->{column};
7166 $self->{column}++;
7167 $self->{nc}
7168 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7169 } else {
7170 $self->{set_nc}->($self);
7171 }
7172
7173 redo A;
7174 } elsif ($self->{nc} == 0x0027) { # '
7175 ## XML5: Same as "anything else".
7176 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7177 $self->{ca}->{value} = '';
7178 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7179
7180 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7181 $self->{line_prev} = $self->{line};
7182 $self->{column_prev} = $self->{column};
7183 $self->{column}++;
7184 $self->{nc}
7185 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7186 } else {
7187 $self->{set_nc}->($self);
7188 }
7189
7190 redo A;
7191 } elsif ($self->{nc} == 0x003E) { # >
7192 ## XML5: Same as "anything else".
7193 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7194 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7195
7196 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7197 $self->{line_prev} = $self->{line};
7198 $self->{column_prev} = $self->{column};
7199 $self->{column}++;
7200 $self->{nc}
7201 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7202 } else {
7203 $self->{set_nc}->($self);
7204 }
7205
7206 return ($self->{ct}); # ATTLIST
7207 redo A;
7208 } elsif ($self->{nc} == -1) {
7209 ## XML5: No parse error.
7210 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7211 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7212 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7213
7214 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7215 $self->{line_prev} = $self->{line};
7216 $self->{column_prev} = $self->{column};
7217 $self->{column}++;
7218 $self->{nc}
7219 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7220 } else {
7221 $self->{set_nc}->($self);
7222 }
7223
7224 return ($self->{ct});
7225 redo A;
7226 } else {
7227 $self->{ca}->{default} .= chr $self->{nc};
7228 ## Stay in the state.
7229
7230 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7231 $self->{line_prev} = $self->{line};
7232 $self->{column_prev} = $self->{column};
7233 $self->{column}++;
7234 $self->{nc}
7235 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7236 } else {
7237 $self->{set_nc}->($self);
7238 }
7239
7240 redo A;
7241 }
7242 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7243 if ($is_space->{$self->{nc}}) {
7244 ## Stay in the state.
7245
7246 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7247 $self->{line_prev} = $self->{line};
7248 $self->{column_prev} = $self->{column};
7249 $self->{column}++;
7250 $self->{nc}
7251 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7252 } else {
7253 $self->{set_nc}->($self);
7254 }
7255
7256 redo A;
7257 } elsif ($self->{nc} == 0x0022) { # "
7258 $self->{ca}->{value} = '';
7259 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7260
7261 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7262 $self->{line_prev} = $self->{line};
7263 $self->{column_prev} = $self->{column};
7264 $self->{column}++;
7265 $self->{nc}
7266 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7267 } else {
7268 $self->{set_nc}->($self);
7269 }
7270
7271 redo A;
7272 } elsif ($self->{nc} == 0x0027) { # '
7273 $self->{ca}->{value} = '';
7274 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7275
7276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7277 $self->{line_prev} = $self->{line};
7278 $self->{column_prev} = $self->{column};
7279 $self->{column}++;
7280 $self->{nc}
7281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7282 } else {
7283 $self->{set_nc}->($self);
7284 }
7285
7286 redo A;
7287 } elsif ($self->{nc} == 0x003E) { # >
7288 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7289 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7290
7291 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7292 $self->{line_prev} = $self->{line};
7293 $self->{column_prev} = $self->{column};
7294 $self->{column}++;
7295 $self->{nc}
7296 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7297 } else {
7298 $self->{set_nc}->($self);
7299 }
7300
7301 return ($self->{ct}); # ATTLIST
7302 redo A;
7303 } elsif ($self->{nc} == -1) {
7304 ## XML5: No parse error.
7305 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7306 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7307 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7308
7309 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7310 $self->{line_prev} = $self->{line};
7311 $self->{column_prev} = $self->{column};
7312 $self->{column}++;
7313 $self->{nc}
7314 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7315 } else {
7316 $self->{set_nc}->($self);
7317 }
7318
7319 return ($self->{ct});
7320 redo A;
7321 } else {
7322 ## XML5: Not defined yet.
7323 if ($self->{ca}->{default} eq 'FIXED') {
7324 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7325 } else {
7326 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7327 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7328 }
7329 ## Reconsume.
7330 redo A;
7331 }
7332 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7333 if ($is_space->{$self->{nc}} or
7334 $self->{nc} == -1 or
7335 $self->{nc} == 0x003E) { # >
7336 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7337 ## Reconsume.
7338 redo A;
7339 } else {
7340 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7341 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7342 ## Reconsume.
7343 redo A;
7344 }
7345
7346 } elsif ($self->{state} == BOGUS_MD_STATE) {
7347 if ($self->{nc} == 0x003E) { # >
7348 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7349
7350 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7351 $self->{line_prev} = $self->{line};
7352 $self->{column_prev} = $self->{column};
7353 $self->{column}++;
7354 $self->{nc}
7355 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7356 } else {
7357 $self->{set_nc}->($self);
7358 }
7359
7360 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7361 redo A;
7362 } elsif ($self->{nc} == -1) {
7363 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7364 ## Reconsume.
7365 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7366 redo A;
7367 } else {
7368 ## Stay in the state.
7369
7370 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7371 $self->{line_prev} = $self->{line};
7372 $self->{column_prev} = $self->{column};
7373 $self->{column}++;
7374 $self->{nc}
7375 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7376 } else {
7377 $self->{set_nc}->($self);
7378 }
7379
7380 redo A;
7381 }
7382 } else {
7383 die "$0: $self->{state}: Unknown state";
7384 }
7385 } # A
7386
7387 die "$0: _get_next_token: unexpected case";
7388 } # _get_next_token
7389
7390 1;
7391 ## $Date: 2008/10/18 11:34:49 $
7392

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24