/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.30 - (show annotations) (download)
Sun Aug 16 05:24:47 2009 UTC (16 years, 7 months ago) by wakaba
Branch: MAIN
Changes since 1.29: +13 -6 lines
++ whatpm/t/ChangeLog	16 Aug 2009 05:21:53 -0000
	* tokenizer-test-1.test: "<" in attribute names are now parse
	errors (HTML5 revision 3354).

2009-08-16  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	16 Aug 2009 05:23:17 -0000
	* Tokenizer.pm.src: Any "<" character in attribute names become
	parse error (HTML5 revision 3354).

2009-08-16  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer; # Declares the token-type constants and their Exporter setup.
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.29 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; # "1.29": digit groups of the CVS Revision keyword joined as major.minor
4
5 BEGIN { # Compile-time setup, so the export lists exist before the BEGIN-time import further down this file.
6 require Exporter;
7 push our @ISA, 'Exporter'; # inherit |import| from Exporter
8 ## Every token-type constant can be imported by name on request.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 );
25 ## The ":token" tag exports the same fourteen names as @EXPORT_OK above.
26 our %EXPORT_TAGS = (
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 } # BEGIN
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types: the values stored in the ->{type} field of a token.
49 ## Each is a constant sub with an empty prototype.
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 } # token carries ->{data}
52 sub START_TAG_TOKEN () { 3 } # token carries ->{tag_name} and ->{attributes}
53 sub END_TAG_TOKEN () { 4 } # token carries ->{tag_name} and ->{attributes}
54 sub END_OF_FILE_TOKEN () { 5 } # returned when the next input character is -1
55 sub CHARACTER_TOKEN () { 6 } # token carries ->{data}
56 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} set
71 ## to an empty string.
72
73 package Whatpm::HTML; # The definitions that follow in this listing belong to Whatpm::HTML.
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') } # bring the *_TOKEN constants into this package at compile time
76
77 ## Content model flags: single bits, tested with "&" against $self->{content_model}.
78
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82 ## Content models: combinations of the flag bits above.
83 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # 0b010
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # 0b011
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # 0b101
87
88 ## Tokenizer states: the values held in $self->{state}, compared with == in |_get_next_token|.
89 ## Numbers 1 and 12 are unused because the subs that carried them are commented out.
90 sub DATA_STATE () { 0 }
91 #sub ENTITY_DATA_STATE () { 1 }
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
108 sub COMMENT_END_DASH_STATE () { 18 }
109 sub BOGUS_COMMENT_STATE () { 19 }
110 sub DOCTYPE_STATE () { 20 }
111 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112 sub DOCTYPE_NAME_STATE () { 22 }
113 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122 sub BOGUS_DOCTYPE_STATE () { 32 }
123 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124 sub SELF_CLOSING_START_TAG_STATE () { 34 }
125 sub CDATA_SECTION_STATE () { 35 }
126 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134 ## NOTE: "Entity data state", "entity in attribute value state", and
135 ## "consume a character reference" algorithm are jointly implemented
136 ## using the following six states:
137 sub ENTITY_STATE () { 44 }
138 sub ENTITY_HASH_STATE () { 45 }
139 sub NCR_NUM_STATE () { 46 }
140 sub HEXREF_X_STATE () { 47 }
141 sub HEXREF_HEX_STATE () { 48 }
142 sub ENTITY_NAME_STATE () { 49 }
143 sub PCDATA_STATE () { 50 } # "data state" in the spec
144
145 ## XML-only states
146 sub PI_STATE () { 51 }
147 sub PI_TARGET_STATE () { 52 }
148 sub PI_TARGET_AFTER_STATE () { 53 }
149 sub PI_DATA_STATE () { 54 }
150 sub PI_AFTER_STATE () { 55 }
151 sub PI_DATA_AFTER_STATE () { 56 }
152 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155 sub DOCTYPE_TAG_STATE () { 60 }
156 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157 sub MD_ATTLIST_STATE () { 62 }
158 sub MD_E_STATE () { 63 }
159 sub MD_ELEMENT_STATE () { 64 }
160 sub MD_ENTITY_STATE () { 65 }
161 sub MD_NOTATION_STATE () { 66 }
162 sub DOCTYPE_MD_STATE () { 67 }
163 sub BEFORE_MD_NAME_STATE () { 68 }
164 sub MD_NAME_STATE () { 69 }
165 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172 sub ALLOWED_TOKEN_STATE () { 77 }
173 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 sub BEFORE_NDATA_STATE () { 85 }
181 sub NDATA_STATE () { 86 }
182 sub AFTER_NDATA_STATE () { 87 }
183 sub BEFORE_NOTATION_NAME_STATE () { 88 }
184 sub NOTATION_NAME_STATE () { 89 }
185 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186 sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187 sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188 sub AFTER_ELEMENT_NAME_STATE () { 93 }
189 sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190 sub CONTENT_KEYWORD_STATE () { 95 }
191 sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192 sub CM_ELEMENT_NAME_STATE () { 97 }
193 sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194 sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195 sub AFTER_MD_DEF_STATE () { 100 }
196 sub BOGUS_MD_STATE () { 101 }
197
198 ## Tree constructor state constants (see Whatpm::HTML for the full
199 ## list and descriptions)
200 ## NOTE(review): the two literals below evaluate to the same number; only their spelling differs.
201 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # 2048: a single 1 bit followed by eleven 0 bits
202 sub FOREIGN_EL () { 0b1_00000000000 } # also 2048; the "_" is only a digit separator
203
204 ## Character reference mappings
205 ## $charref_map: code point => replacement code point. Presumably consulted for numeric character references; the lookup is outside this chunk.
206 my $charref_map = {
207 0x0D => 0x000A, # CR becomes LF
208 0x80 => 0x20AC, # 0x80..0x9F follow Windows-1252; positions it leaves undefined become U+FFFD
209 0x81 => 0xFFFD,
210 0x82 => 0x201A,
211 0x83 => 0x0192,
212 0x84 => 0x201E,
213 0x85 => 0x2026,
214 0x86 => 0x2020,
215 0x87 => 0x2021,
216 0x88 => 0x02C6,
217 0x89 => 0x2030,
218 0x8A => 0x0160,
219 0x8B => 0x2039,
220 0x8C => 0x0152,
221 0x8D => 0xFFFD,
222 0x8E => 0x017D,
223 0x8F => 0xFFFD,
224 0x90 => 0xFFFD,
225 0x91 => 0x2018,
226 0x92 => 0x2019,
227 0x93 => 0x201C,
228 0x94 => 0x201D,
229 0x95 => 0x2022,
230 0x96 => 0x2013,
231 0x97 => 0x2014,
232 0x98 => 0x02DC,
233 0x99 => 0x2122,
234 0x9A => 0x0161,
235 0x9B => 0x203A,
236 0x9C => 0x0153,
237 0x9D => 0xFFFD,
238 0x9E => 0x017E,
239 0x9F => 0x0178,
240 }; # $charref_map
241 $charref_map->{$_} = 0xFFFD # every code point in the list below maps to U+FFFD REPLACEMENT CHARACTER
242 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F, # C0 controls other than HT, LF, FF and CR, plus DEL
243 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## surrogates, then noncharacters. ISSUE: Unicode noncharacters run through 0xFDEF, so 0xFDE0..0xFDEF are not covered here.
244 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, # the last two code points of each plane, 0 through 16
245 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249
250 ## Implementations MUST act as if they used the state machine in the spec.
251 ## |_initialize_tokenizer| resets the tokenizer fields on the parser object and fetches the first input character.
252 sub _initialize_tokenizer ($) { # sole argument: the parser object
253 my $self = shift;
254
255 ## NOTE: Fields set by |new| constructor:
256 #$self->{level}
257 #$self->{set_nc}
258 #$self->{parse_error}
259 #$self->{is_xml} (if XML)
260
261 $self->{state} = DATA_STATE; # MUST be the data state initially
262 $self->{s_kwd} = ''; # Data state keyword
263 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 #$self->{entity__value}; # initialized when used
265 #$self->{entity__match}; # initialized when used
266 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST be PCDATA initially
267 undef $self->{ct}; # current token
268 undef $self->{ca}; # current attribute
269 undef $self->{last_stag_name}; # last emitted start tag name
270 #$self->{prev_state}; # initialized when used
271 delete $self->{self_closing};
272 $self->{char_buffer} = ''; # start with an empty look-ahead buffer
273 $self->{char_buffer_pos} = 0; # read position within char_buffer
274 $self->{nc} = -1; # next input character
275 #$self->{next_nc}
276 ## Fetch the first input character (same buffer-or-callback pattern used throughout |_get_next_token|).
277 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { # never true here: the buffer was emptied and the position zeroed just above
278 $self->{line_prev} = $self->{line};
279 $self->{column_prev} = $self->{column};
280 $self->{column}++;
281 $self->{nc}
282 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
283 } else {
284 $self->{set_nc}->($self); # callback supplied via |new|; presumably stores the next character in $self->{nc} -- confirm
285 }
286
287 $self->{token} = []; # pending tokens; |_get_next_token| shifts from this list before tokenizing further
288 # $self->{escape} (not reset here)
289 } # _initialize_tokenizer
290
291 ## A token has:
292 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 ## ->{name} (DOCTYPE_TOKEN)
295 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 ## ->{target} (PI_TOKEN)
297 ## ->{pubid} (DOCTYPE_TOKEN)
298 ## ->{sysid} (DOCTYPE_TOKEN)
299 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301 ## ->{name}
302 ## ->{value}
303 ## ->{has_reference} == 1 or 0
304 ## ->{index}: Index of the attribute in a tag.
305 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309
310 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311 ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
312 ## while the token is pushed back to the stack.
313
314 ## Emitted token MUST immediately be handled by the tree construction stage.
315
316 ## Before each step, UA MAY check to see if either one of the scripts in
317 ## "list of scripts that will execute as soon as possible" or the first
318 ## script in the "list of scripts that will execute asynchronously",
319 ## has completed loading. If one has, then it MUST be executed
320 ## and removed from the list.
321
322 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323 ## (This requirement was dropped from HTML5 spec, unfortunately.)
324
325 my $is_space = { # code point => 1 for the space characters; looked up as $is_space->{$self->{nc}}
326 0x0009 => 1, # CHARACTER TABULATION (HT)
327 0x000A => 1, # LINE FEED (LF)
328 #0x000B => 0, # LINE TABULATION (VT) -- entry disabled; VT is not in the table
329 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
330 #0x000D => 1, # CARRIAGE RETURN (CR) -- entry disabled, so CR is not matched by this table
331 0x0020 => 1, # SPACE (SP)
332 }; # $is_space
333
334 sub _get_next_token ($) {
335 my $self = shift;
336
337 if ($self->{self_closing}) {
338 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339 ## NOTE: The |self_closing| flag is only set by start tag token.
340 ## In addition, when a start tag token is emitted, it is always set to
341 ## |ct|.
342 delete $self->{self_closing};
343 }
344
345 if (@{$self->{token}}) {
346 $self->{self_closing} = $self->{token}->[0]->{self_closing};
347 return shift @{$self->{token}};
348 }
349
350 A: {
351 if ($self->{state} == PCDATA_STATE) {
352 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353
354 if ($self->{nc} == 0x0026) { # &
355
356 ## NOTE: In the spec, the tokenizer is switched to the
357 ## "entity data state". In this implementation, the tokenizer
358 ## is switched to the |ENTITY_STATE|, which is an implementation
359 ## of the "consume a character reference" algorithm.
360 $self->{entity_add} = -1;
361 $self->{prev_state} = DATA_STATE;
362 $self->{state} = ENTITY_STATE;
363
364 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365 $self->{line_prev} = $self->{line};
366 $self->{column_prev} = $self->{column};
367 $self->{column}++;
368 $self->{nc}
369 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370 } else {
371 $self->{set_nc}->($self);
372 }
373
374 redo A;
375 } elsif ($self->{nc} == 0x003C) { # <
376
377 $self->{state} = TAG_OPEN_STATE;
378
379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380 $self->{line_prev} = $self->{line};
381 $self->{column_prev} = $self->{column};
382 $self->{column}++;
383 $self->{nc}
384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385 } else {
386 $self->{set_nc}->($self);
387 }
388
389 redo A;
390 } elsif ($self->{nc} == -1) {
391
392 return ({type => END_OF_FILE_TOKEN,
393 line => $self->{line}, column => $self->{column}});
394 last A; ## TODO: ok?
395 } else {
396
397 #
398 }
399
400 # Anything else
401 my $token = {type => CHARACTER_TOKEN,
402 data => chr $self->{nc},
403 line => $self->{line}, column => $self->{column},
404 };
405 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406
407 ## Stay in the state.
408
409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410 $self->{line_prev} = $self->{line};
411 $self->{column_prev} = $self->{column};
412 $self->{column}++;
413 $self->{nc}
414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415 } else {
416 $self->{set_nc}->($self);
417 }
418
419 return ($token);
420 redo A;
421 } elsif ($self->{state} == DATA_STATE) {
422 $self->{s_kwd} = '' unless defined $self->{s_kwd};
423 if ($self->{nc} == 0x0026) { # &
424 $self->{s_kwd} = '';
425 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426 not $self->{escape}) {
427
428 ## NOTE: In the spec, the tokenizer is switched to the
429 ## "entity data state". In this implementation, the tokenizer
430 ## is switched to the |ENTITY_STATE|, which is an implementation
431 ## of the "consume a character reference" algorithm.
432 $self->{entity_add} = -1;
433 $self->{prev_state} = DATA_STATE;
434 $self->{state} = ENTITY_STATE;
435
436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437 $self->{line_prev} = $self->{line};
438 $self->{column_prev} = $self->{column};
439 $self->{column}++;
440 $self->{nc}
441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442 } else {
443 $self->{set_nc}->($self);
444 }
445
446 redo A;
447 } else {
448
449 #
450 }
451 } elsif ($self->{nc} == 0x002D) { # -
452 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 if ($self->{s_kwd} eq '<!-') {
454
455 $self->{escape} = 1; # unless $self->{escape};
456 $self->{s_kwd} = '--';
457 #
458 } elsif ($self->{s_kwd} eq '-') {
459
460 $self->{s_kwd} = '--';
461 #
462 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463
464 $self->{s_kwd} .= '-';
465 #
466 } else {
467
468 $self->{s_kwd} = '-';
469 #
470 }
471 }
472
473 #
474 } elsif ($self->{nc} == 0x0021) { # !
475 if (length $self->{s_kwd}) {
476
477 $self->{s_kwd} .= '!';
478 #
479 } else {
480
481 #$self->{s_kwd} = '';
482 #
483 }
484 #
485 } elsif ($self->{nc} == 0x003C) { # <
486 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488 not $self->{escape})) {
489
490 $self->{state} = TAG_OPEN_STATE;
491
492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493 $self->{line_prev} = $self->{line};
494 $self->{column_prev} = $self->{column};
495 $self->{column}++;
496 $self->{nc}
497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498 } else {
499 $self->{set_nc}->($self);
500 }
501
502 redo A;
503 } else {
504
505 $self->{s_kwd} = '';
506 #
507 }
508 } elsif ($self->{nc} == 0x003E) { # >
509 if ($self->{escape} and
510 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511 if ($self->{s_kwd} eq '--') {
512
513 delete $self->{escape};
514 #
515 } else {
516
517 #
518 }
519 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520
521 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522 line => $self->{line_prev},
523 column => $self->{column_prev} - 1);
524 #
525 } else {
526
527 #
528 }
529
530 $self->{s_kwd} = '';
531 #
532 } elsif ($self->{nc} == 0x005D) { # ]
533 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534
535 $self->{s_kwd} .= ']';
536 } elsif ($self->{s_kwd} eq ']]') {
537
538 #
539 } else {
540
541 $self->{s_kwd} = '';
542 }
543 #
544 } elsif ($self->{nc} == -1) {
545
546 $self->{s_kwd} = '';
547 return ({type => END_OF_FILE_TOKEN,
548 line => $self->{line}, column => $self->{column}});
549 last A; ## TODO: ok?
550 } else {
551
552 $self->{s_kwd} = '';
553 #
554 }
555
556 # Anything else
557 my $token = {type => CHARACTER_TOKEN,
558 data => chr $self->{nc},
559 line => $self->{line}, column => $self->{column},
560 };
561 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 length $token->{data})) {
563 $self->{s_kwd} = '';
564 }
565
566 ## Stay in the data state.
567 if (not $self->{is_xml} and
568 $self->{content_model} == PCDATA_CONTENT_MODEL) {
569
570 $self->{state} = PCDATA_STATE;
571 } else {
572
573 ## Stay in the state.
574 }
575
576 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577 $self->{line_prev} = $self->{line};
578 $self->{column_prev} = $self->{column};
579 $self->{column}++;
580 $self->{nc}
581 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582 } else {
583 $self->{set_nc}->($self);
584 }
585
586 return ($token);
587 redo A;
588 } elsif ($self->{state} == TAG_OPEN_STATE) {
589 ## XML5: "tag state".
590
591 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592 if ($self->{nc} == 0x002F) { # /
593
594
595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596 $self->{line_prev} = $self->{line};
597 $self->{column_prev} = $self->{column};
598 $self->{column}++;
599 $self->{nc}
600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601 } else {
602 $self->{set_nc}->($self);
603 }
604
605 $self->{state} = CLOSE_TAG_OPEN_STATE;
606 redo A;
607 } elsif ($self->{nc} == 0x0021) { # !
608
609 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 #
611 } else {
612
613 $self->{s_kwd} = '';
614 #
615 }
616
617 ## reconsume
618 $self->{state} = DATA_STATE;
619 return ({type => CHARACTER_TOKEN, data => '<',
620 line => $self->{line_prev},
621 column => $self->{column_prev},
622 });
623 redo A;
624 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625 if ($self->{nc} == 0x0021) { # !
626
627 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628
629 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630 $self->{line_prev} = $self->{line};
631 $self->{column_prev} = $self->{column};
632 $self->{column}++;
633 $self->{nc}
634 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635 } else {
636 $self->{set_nc}->($self);
637 }
638
639 redo A;
640 } elsif ($self->{nc} == 0x002F) { # /
641
642 $self->{state} = CLOSE_TAG_OPEN_STATE;
643
644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645 $self->{line_prev} = $self->{line};
646 $self->{column_prev} = $self->{column};
647 $self->{column}++;
648 $self->{nc}
649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650 } else {
651 $self->{set_nc}->($self);
652 }
653
654 redo A;
655 } elsif (0x0041 <= $self->{nc} and
656 $self->{nc} <= 0x005A) { # A..Z
657
658 $self->{ct}
659 = {type => START_TAG_TOKEN,
660 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 line => $self->{line_prev},
662 column => $self->{column_prev}};
663 $self->{state} = TAG_NAME_STATE;
664
665 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666 $self->{line_prev} = $self->{line};
667 $self->{column_prev} = $self->{column};
668 $self->{column}++;
669 $self->{nc}
670 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671 } else {
672 $self->{set_nc}->($self);
673 }
674
675 redo A;
676 } elsif (0x0061 <= $self->{nc} and
677 $self->{nc} <= 0x007A) { # a..z
678
679 $self->{ct} = {type => START_TAG_TOKEN,
680 tag_name => chr ($self->{nc}),
681 line => $self->{line_prev},
682 column => $self->{column_prev}};
683 $self->{state} = TAG_NAME_STATE;
684
685 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686 $self->{line_prev} = $self->{line};
687 $self->{column_prev} = $self->{column};
688 $self->{column}++;
689 $self->{nc}
690 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691 } else {
692 $self->{set_nc}->($self);
693 }
694
695 redo A;
696 } elsif ($self->{nc} == 0x003E) { # >
697
698 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699 line => $self->{line_prev},
700 column => $self->{column_prev});
701 $self->{state} = DATA_STATE;
702 $self->{s_kwd} = '';
703
704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705 $self->{line_prev} = $self->{line};
706 $self->{column_prev} = $self->{column};
707 $self->{column}++;
708 $self->{nc}
709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710 } else {
711 $self->{set_nc}->($self);
712 }
713
714
715 return ({type => CHARACTER_TOKEN, data => '<>',
716 line => $self->{line_prev},
717 column => $self->{column_prev},
718 });
719
720 redo A;
721 } elsif ($self->{nc} == 0x003F) { # ?
722 if ($self->{is_xml}) {
723
724 $self->{state} = PI_STATE;
725
726 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727 $self->{line_prev} = $self->{line};
728 $self->{column_prev} = $self->{column};
729 $self->{column}++;
730 $self->{nc}
731 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732 } else {
733 $self->{set_nc}->($self);
734 }
735
736 redo A;
737 } else {
738
739 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740 line => $self->{line_prev},
741 column => $self->{column_prev});
742 $self->{state} = BOGUS_COMMENT_STATE;
743 $self->{ct} = {type => COMMENT_TOKEN, data => '',
744 line => $self->{line_prev},
745 column => $self->{column_prev},
746 };
747 ## $self->{nc} is intentionally left as is
748 redo A;
749 }
750 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751
752 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753 line => $self->{line_prev},
754 column => $self->{column_prev});
755 $self->{state} = DATA_STATE;
756 $self->{s_kwd} = '';
757 ## reconsume
758
759 return ({type => CHARACTER_TOKEN, data => '<',
760 line => $self->{line_prev},
761 column => $self->{column_prev},
762 });
763
764 redo A;
765 } else {
766 ## XML5: "<:" is a parse error.
767
768 $self->{ct} = {type => START_TAG_TOKEN,
769 tag_name => chr ($self->{nc}),
770 line => $self->{line_prev},
771 column => $self->{column_prev}};
772 $self->{state} = TAG_NAME_STATE;
773
774 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775 $self->{line_prev} = $self->{line};
776 $self->{column_prev} = $self->{column};
777 $self->{column}++;
778 $self->{nc}
779 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780 } else {
781 $self->{set_nc}->($self);
782 }
783
784 redo A;
785 }
786 } else {
787 die "$0: $self->{content_model} in tag open";
788 }
789 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790 ## NOTE: The "close tag open state" in the spec is implemented as
791 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792
793 ## XML5: "end tag state".
794
795 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797 if (defined $self->{last_stag_name}) {
798 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 $self->{kwd} = '';
800 ## Reconsume.
801 redo A;
802 } else {
803 ## No start tag token has ever been emitted
804 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805
806 $self->{state} = DATA_STATE;
807 $self->{s_kwd} = '';
808 ## Reconsume.
809 return ({type => CHARACTER_TOKEN, data => '</',
810 line => $l, column => $c,
811 });
812 redo A;
813 }
814 }
815
816 if (0x0041 <= $self->{nc} and
817 $self->{nc} <= 0x005A) { # A..Z
818
819 $self->{ct}
820 = {type => END_TAG_TOKEN,
821 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 line => $l, column => $c};
823 $self->{state} = TAG_NAME_STATE;
824
825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826 $self->{line_prev} = $self->{line};
827 $self->{column_prev} = $self->{column};
828 $self->{column}++;
829 $self->{nc}
830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831 } else {
832 $self->{set_nc}->($self);
833 }
834
835 redo A;
836 } elsif (0x0061 <= $self->{nc} and
837 $self->{nc} <= 0x007A) { # a..z
838
839 $self->{ct} = {type => END_TAG_TOKEN,
840 tag_name => chr ($self->{nc}),
841 line => $l, column => $c};
842 $self->{state} = TAG_NAME_STATE;
843
844 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845 $self->{line_prev} = $self->{line};
846 $self->{column_prev} = $self->{column};
847 $self->{column}++;
848 $self->{nc}
849 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850 } else {
851 $self->{set_nc}->($self);
852 }
853
854 redo A;
855 } elsif ($self->{nc} == 0x003E) { # >
856 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857 line => $self->{line_prev}, ## "<" in "</>"
858 column => $self->{column_prev} - 1);
859 $self->{state} = DATA_STATE;
860 $self->{s_kwd} = '';
861 if ($self->{is_xml}) {
862
863 ## XML5: No parse error.
864
865 ## NOTE: This parser raises a parse error, since it supports
866 ## XML1, not XML5.
867
868 ## NOTE: A short end tag token.
869 my $ct = {type => END_TAG_TOKEN,
870 tag_name => '',
871 line => $self->{line_prev},
872 column => $self->{column_prev} - 1,
873 };
874
875 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876 $self->{line_prev} = $self->{line};
877 $self->{column_prev} = $self->{column};
878 $self->{column}++;
879 $self->{nc}
880 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881 } else {
882 $self->{set_nc}->($self);
883 }
884
885 return ($ct);
886 } else {
887
888
889 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890 $self->{line_prev} = $self->{line};
891 $self->{column_prev} = $self->{column};
892 $self->{column}++;
893 $self->{nc}
894 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895 } else {
896 $self->{set_nc}->($self);
897 }
898
899 }
900 redo A;
901 } elsif ($self->{nc} == -1) {
902
903 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 $self->{s_kwd} = '';
905 $self->{state} = DATA_STATE;
906 # reconsume
907
908 return ({type => CHARACTER_TOKEN, data => '</',
909 line => $l, column => $c,
910 });
911
912 redo A;
913 } elsif (not $self->{is_xml} or
914 $is_space->{$self->{nc}}) {
915
916 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917 line => $self->{line_prev}, # "<" of "</"
918 column => $self->{column_prev} - 1);
919 $self->{state} = BOGUS_COMMENT_STATE;
920 $self->{ct} = {type => COMMENT_TOKEN, data => '',
921 line => $self->{line_prev}, # "<" of "</"
922 column => $self->{column_prev} - 1,
923 };
924 ## NOTE: $self->{nc} is intentionally left as is.
925 ## Although the "anything else" case of the spec not explicitly
926 ## states that the next input character is to be reconsumed,
927 ## it will be included to the |data| of the comment token
928 ## generated from the bogus end tag, as defined in the
929 ## "bogus comment state" entry.
930 redo A;
931 } else {
932 ## XML5: "</:" is a parse error.
933
934 $self->{ct} = {type => END_TAG_TOKEN,
935 tag_name => chr ($self->{nc}),
936 line => $l, column => $c};
937 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938
939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940 $self->{line_prev} = $self->{line};
941 $self->{column_prev} = $self->{column};
942 $self->{column}++;
943 $self->{nc}
944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945 } else {
946 $self->{set_nc}->($self);
947 }
948
949 redo A;
950 }
951 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 if (length $ch) {
954 my $CH = $ch;
955 $ch =~ tr/a-z/A-Z/;
956 my $nch = chr $self->{nc};
957 if ($nch eq $ch or $nch eq $CH) {
958
959 ## Stay in the state.
960 $self->{kwd} .= $nch;
961
962 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963 $self->{line_prev} = $self->{line};
964 $self->{column_prev} = $self->{column};
965 $self->{column}++;
966 $self->{nc}
967 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968 } else {
969 $self->{set_nc}->($self);
970 }
971
972 redo A;
973 } else {
974
975 $self->{state} = DATA_STATE;
976 $self->{s_kwd} = '';
977 ## Reconsume.
978 return ({type => CHARACTER_TOKEN,
979 data => '</' . $self->{kwd},
980 line => $self->{line_prev},
981 column => $self->{column_prev} - 1 - length $self->{kwd},
982 });
983 redo A;
984 }
985 } else { # after "<{tag-name}"
986 unless ($is_space->{$self->{nc}} or
987 {
988 0x003E => 1, # >
989 0x002F => 1, # /
990 -1 => 1, # EOF
991 }->{$self->{nc}}) {
992
993 ## Reconsume.
994 $self->{state} = DATA_STATE;
995 $self->{s_kwd} = '';
996 return ({type => CHARACTER_TOKEN,
997 data => '</' . $self->{kwd},
998 line => $self->{line_prev},
999 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 });
1001 redo A;
1002 } else {
1003
1004 $self->{ct}
1005 = {type => END_TAG_TOKEN,
1006 tag_name => $self->{last_stag_name},
1007 line => $self->{line_prev},
1008 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 $self->{state} = TAG_NAME_STATE;
1010 ## Reconsume.
1011 redo A;
1012 }
1013 }
1014 } elsif ($self->{state} == TAG_NAME_STATE) {
1015 if ($is_space->{$self->{nc}}) {
1016
1017 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018
1019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020 $self->{line_prev} = $self->{line};
1021 $self->{column_prev} = $self->{column};
1022 $self->{column}++;
1023 $self->{nc}
1024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025 } else {
1026 $self->{set_nc}->($self);
1027 }
1028
1029 redo A;
1030 } elsif ($self->{nc} == 0x003E) { # >
1031 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032
1033 $self->{last_stag_name} = $self->{ct}->{tag_name};
1034 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036 #if ($self->{ct}->{attributes}) {
1037 # ## NOTE: This should never be reached.
1038 # !!! cp (36);
1039 # !!! parse-error (type => 'end tag attribute');
1040 #} else {
1041
1042 #}
1043 } else {
1044 die "$0: $self->{ct}->{type}: Unknown token type";
1045 }
1046 $self->{state} = DATA_STATE;
1047 $self->{s_kwd} = '';
1048
1049 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050 $self->{line_prev} = $self->{line};
1051 $self->{column_prev} = $self->{column};
1052 $self->{column}++;
1053 $self->{nc}
1054 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055 } else {
1056 $self->{set_nc}->($self);
1057 }
1058
1059
1060 return ($self->{ct}); # start tag or end tag
1061
1062 redo A;
1063 } elsif (0x0041 <= $self->{nc} and
1064 $self->{nc} <= 0x005A) { # A..Z
1065
1066 $self->{ct}->{tag_name}
1067 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 # start tag or end tag
1069 ## Stay in this state
1070
1071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072 $self->{line_prev} = $self->{line};
1073 $self->{column_prev} = $self->{column};
1074 $self->{column}++;
1075 $self->{nc}
1076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077 } else {
1078 $self->{set_nc}->($self);
1079 }
1080
1081 redo A;
1082 } elsif ($self->{nc} == -1) {
1083 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085
1086 $self->{last_stag_name} = $self->{ct}->{tag_name};
1087 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089 #if ($self->{ct}->{attributes}) {
1090 # ## NOTE: This state should never be reached.
1091 # !!! cp (40);
1092 # !!! parse-error (type => 'end tag attribute');
1093 #} else {
1094
1095 #}
1096 } else {
1097 die "$0: $self->{ct}->{type}: Unknown token type";
1098 }
1099 $self->{state} = DATA_STATE;
1100 $self->{s_kwd} = '';
1101 # reconsume
1102
1103 return ($self->{ct}); # start tag or end tag
1104
1105 redo A;
1106 } elsif ($self->{nc} == 0x002F) { # /
1107
1108 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109
1110 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111 $self->{line_prev} = $self->{line};
1112 $self->{column_prev} = $self->{column};
1113 $self->{column}++;
1114 $self->{nc}
1115 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116 } else {
1117 $self->{set_nc}->($self);
1118 }
1119
1120 redo A;
1121 } else {
1122
1123 $self->{ct}->{tag_name} .= chr $self->{nc};
1124 # start tag or end tag
1125 ## Stay in the state
1126
1127 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128 $self->{line_prev} = $self->{line};
1129 $self->{column_prev} = $self->{column};
1130 $self->{column}++;
1131 $self->{nc}
1132 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133 } else {
1134 $self->{set_nc}->($self);
1135 }
1136
1137 redo A;
1138 }
1139 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 ## XML5: "Tag attribute name before state".
1141
1142 if ($is_space->{$self->{nc}}) {
1143
1144 ## Stay in the state
1145
1146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147 $self->{line_prev} = $self->{line};
1148 $self->{column_prev} = $self->{column};
1149 $self->{column}++;
1150 $self->{nc}
1151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152 } else {
1153 $self->{set_nc}->($self);
1154 }
1155
1156 redo A;
1157 } elsif ($self->{nc} == 0x003E) { # >
1158 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159
1160 $self->{last_stag_name} = $self->{ct}->{tag_name};
1161 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163 if ($self->{ct}->{attributes}) {
1164
1165 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166 } else {
1167
1168 }
1169 } else {
1170 die "$0: $self->{ct}->{type}: Unknown token type";
1171 }
1172 $self->{state} = DATA_STATE;
1173 $self->{s_kwd} = '';
1174
1175 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176 $self->{line_prev} = $self->{line};
1177 $self->{column_prev} = $self->{column};
1178 $self->{column}++;
1179 $self->{nc}
1180 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181 } else {
1182 $self->{set_nc}->($self);
1183 }
1184
1185
1186 return ($self->{ct}); # start tag or end tag
1187
1188 redo A;
1189 } elsif (0x0041 <= $self->{nc} and
1190 $self->{nc} <= 0x005A) { # A..Z
1191
1192 $self->{ca}
1193 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 value => '',
1195 line => $self->{line}, column => $self->{column}};
1196 $self->{state} = ATTRIBUTE_NAME_STATE;
1197
1198 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199 $self->{line_prev} = $self->{line};
1200 $self->{column_prev} = $self->{column};
1201 $self->{column}++;
1202 $self->{nc}
1203 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204 } else {
1205 $self->{set_nc}->($self);
1206 }
1207
1208 redo A;
1209 } elsif ($self->{nc} == 0x002F) { # /
1210
1211 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212
1213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214 $self->{line_prev} = $self->{line};
1215 $self->{column_prev} = $self->{column};
1216 $self->{column}++;
1217 $self->{nc}
1218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219 } else {
1220 $self->{set_nc}->($self);
1221 }
1222
1223 redo A;
1224 } elsif ($self->{nc} == -1) {
1225 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227
1228 $self->{last_stag_name} = $self->{ct}->{tag_name};
1229 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231 if ($self->{ct}->{attributes}) {
1232
1233 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234 } else {
1235
1236 }
1237 } else {
1238 die "$0: $self->{ct}->{type}: Unknown token type";
1239 }
1240 $self->{state} = DATA_STATE;
1241 $self->{s_kwd} = '';
1242 # reconsume
1243
1244 return ($self->{ct}); # start tag or end tag
1245
1246 redo A;
1247 } else {
1248 if ({
1249 0x0022 => 1, # "
1250 0x0027 => 1, # '
1251 0x003C => 1, # <
1252 0x003D => 1, # =
1253 }->{$self->{nc}}) {
1254
1255 ## XML5: Not a parse error.
1256 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1257 } else {
1258
1259 ## XML5: ":" raises a parse error and is ignored.
1260 }
1261 $self->{ca}
1262 = {name => chr ($self->{nc}),
1263 value => '',
1264 line => $self->{line}, column => $self->{column}};
1265 $self->{state} = ATTRIBUTE_NAME_STATE;
1266
1267 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1268 $self->{line_prev} = $self->{line};
1269 $self->{column_prev} = $self->{column};
1270 $self->{column}++;
1271 $self->{nc}
1272 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1273 } else {
1274 $self->{set_nc}->($self);
1275 }
1276
1277 redo A;
1278 }
1279 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1280 ## XML5: "Tag attribute name state".
1281
## $before_leave: closure run by the branches of this state that finish
## the current attribute name (white space, "=", ">", "/" and EOF).
## It takes no arguments and uses |$self| from the enclosing scope.
##
## It commits the attribute being built (|$self->{ca}|) to the current
## tag token (|$self->{ct}|):
##   - If the token already has an attribute with the same name, a
##     'duplicate attribute' parse error is reported at the line/column
##     recorded in |$self->{ca}|, and the new attribute is not stored.
##   - Otherwise the attribute is stored in |$self->{ct}->{attributes}|
##     under its name and given the next value of
##     |$self->{ct}->{last_index}| as its |index|.
## Either way, |$self->{ct}->{attributes}| exists once this returns.
1282 my $before_leave = sub {
## Has an attribute with this name already been stored on the token?
1283 if (exists $self->{ct}->{attributes} # start tag or end tag
1284 ->{$self->{ca}->{name}}) { # MUST
1285
## Duplicate: report it; the earlier attribute of that name is kept.
1286 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1287 ## Discard $self->{ca} # MUST
1288 } else {
1289
## First occurrence: register the attribute under its name.
1290 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1291 = $self->{ca};
## |last_index| is incremented before use, so the first stored
## attribute gets index 1 if |last_index| started out unset.
1292 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1293 }
1294 }; # $before_leave
1295
1296 if ($is_space->{$self->{nc}}) {
1297
1298 $before_leave->();
1299 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1300
1301 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1302 $self->{line_prev} = $self->{line};
1303 $self->{column_prev} = $self->{column};
1304 $self->{column}++;
1305 $self->{nc}
1306 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1307 } else {
1308 $self->{set_nc}->($self);
1309 }
1310
1311 redo A;
1312 } elsif ($self->{nc} == 0x003D) { # =
1313
1314 $before_leave->();
1315 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1316
1317 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1318 $self->{line_prev} = $self->{line};
1319 $self->{column_prev} = $self->{column};
1320 $self->{column}++;
1321 $self->{nc}
1322 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1323 } else {
1324 $self->{set_nc}->($self);
1325 }
1326
1327 redo A;
1328 } elsif ($self->{nc} == 0x003E) { # >
1329 if ($self->{is_xml}) {
1330
1331 ## XML5: Not a parse error.
1332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1333 } else {
1334
1335 }
1336
1337 $before_leave->();
1338 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1339
1340 $self->{last_stag_name} = $self->{ct}->{tag_name};
1341 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1342
1343 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1344 if ($self->{ct}->{attributes}) {
1345 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1346 }
1347 } else {
1348 die "$0: $self->{ct}->{type}: Unknown token type";
1349 }
1350 $self->{state} = DATA_STATE;
1351 $self->{s_kwd} = '';
1352
1353 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1354 $self->{line_prev} = $self->{line};
1355 $self->{column_prev} = $self->{column};
1356 $self->{column}++;
1357 $self->{nc}
1358 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1359 } else {
1360 $self->{set_nc}->($self);
1361 }
1362
1363
1364 return ($self->{ct}); # start tag or end tag
1365
1366 redo A;
1367 } elsif (0x0041 <= $self->{nc} and
1368 $self->{nc} <= 0x005A) { # A..Z
1369
1370 $self->{ca}->{name}
1371 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1372 ## Stay in the state
1373
1374 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1375 $self->{line_prev} = $self->{line};
1376 $self->{column_prev} = $self->{column};
1377 $self->{column}++;
1378 $self->{nc}
1379 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1380 } else {
1381 $self->{set_nc}->($self);
1382 }
1383
1384 redo A;
1385 } elsif ($self->{nc} == 0x002F) { # /
1386 if ($self->{is_xml}) {
1387
1388 ## XML5: Not a parse error.
1389 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1390 } else {
1391
1392 }
1393
1394 $before_leave->();
1395 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1396
1397 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1398 $self->{line_prev} = $self->{line};
1399 $self->{column_prev} = $self->{column};
1400 $self->{column}++;
1401 $self->{nc}
1402 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1403 } else {
1404 $self->{set_nc}->($self);
1405 }
1406
1407 redo A;
1408 } elsif ($self->{nc} == -1) {
1409 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1410 $before_leave->();
1411 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1412
1413 $self->{last_stag_name} = $self->{ct}->{tag_name};
1414 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1415 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1416 if ($self->{ct}->{attributes}) {
1417
1418 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1419 } else {
1420 ## NOTE: This state should never be reached.
1421
1422 }
1423 } else {
1424 die "$0: $self->{ct}->{type}: Unknown token type";
1425 }
1426 $self->{state} = DATA_STATE;
1427 $self->{s_kwd} = '';
1428 # reconsume
1429
1430 return ($self->{ct}); # start tag or end tag
1431
1432 redo A;
1433 } else {
1434 if ({
1435 0x0022 => 1, # "
1436 0x0027 => 1, # '
1437 0x003C => 1, # <
1438 }->{$self->{nc}}) {
1439
1440 ## XML5: Not a parse error.
1441 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1442 } else {
1443
1444 }
1445 $self->{ca}->{name} .= chr ($self->{nc});
1446 ## Stay in the state
1447
1448 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1449 $self->{line_prev} = $self->{line};
1450 $self->{column_prev} = $self->{column};
1451 $self->{column}++;
1452 $self->{nc}
1453 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1454 } else {
1455 $self->{set_nc}->($self);
1456 }
1457
1458 redo A;
1459 }
1460 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1461 ## XML5: "Tag attribute name after state".
1462
1463 if ($is_space->{$self->{nc}}) {
1464
1465 ## Stay in the state
1466
1467 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1468 $self->{line_prev} = $self->{line};
1469 $self->{column_prev} = $self->{column};
1470 $self->{column}++;
1471 $self->{nc}
1472 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1473 } else {
1474 $self->{set_nc}->($self);
1475 }
1476
1477 redo A;
1478 } elsif ($self->{nc} == 0x003D) { # =
1479
1480 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1481
1482 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1483 $self->{line_prev} = $self->{line};
1484 $self->{column_prev} = $self->{column};
1485 $self->{column}++;
1486 $self->{nc}
1487 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1488 } else {
1489 $self->{set_nc}->($self);
1490 }
1491
1492 redo A;
1493 } elsif ($self->{nc} == 0x003E) { # >
1494 if ($self->{is_xml}) {
1495
1496 ## XML5: Not a parse error.
1497 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1498 } else {
1499
1500 }
1501
1502 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1503
1504 $self->{last_stag_name} = $self->{ct}->{tag_name};
1505 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1506 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1507 if ($self->{ct}->{attributes}) {
1508
1509 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1510 } else {
1511 ## NOTE: This state should never be reached.
1512
1513 }
1514 } else {
1515 die "$0: $self->{ct}->{type}: Unknown token type";
1516 }
1517 $self->{state} = DATA_STATE;
1518 $self->{s_kwd} = '';
1519
1520 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1521 $self->{line_prev} = $self->{line};
1522 $self->{column_prev} = $self->{column};
1523 $self->{column}++;
1524 $self->{nc}
1525 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1526 } else {
1527 $self->{set_nc}->($self);
1528 }
1529
1530
1531 return ($self->{ct}); # start tag or end tag
1532
1533 redo A;
1534 } elsif (0x0041 <= $self->{nc} and
1535 $self->{nc} <= 0x005A) { # A..Z
1536
1537 $self->{ca}
1538 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1539 value => '',
1540 line => $self->{line}, column => $self->{column}};
1541 $self->{state} = ATTRIBUTE_NAME_STATE;
1542
1543 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1544 $self->{line_prev} = $self->{line};
1545 $self->{column_prev} = $self->{column};
1546 $self->{column}++;
1547 $self->{nc}
1548 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1549 } else {
1550 $self->{set_nc}->($self);
1551 }
1552
1553 redo A;
1554 } elsif ($self->{nc} == 0x002F) { # /
1555 if ($self->{is_xml}) {
1556
1557 ## XML5: Not a parse error.
1558 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1559 } else {
1560
1561 }
1562
1563 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1564
1565 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1566 $self->{line_prev} = $self->{line};
1567 $self->{column_prev} = $self->{column};
1568 $self->{column}++;
1569 $self->{nc}
1570 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1571 } else {
1572 $self->{set_nc}->($self);
1573 }
1574
1575 redo A;
1576 } elsif ($self->{nc} == -1) {
1577 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1578 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1579
1580 $self->{last_stag_name} = $self->{ct}->{tag_name};
1581 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1582 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1583 if ($self->{ct}->{attributes}) {
1584
1585 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1586 } else {
1587 ## NOTE: This state should never be reached.
1588
1589 }
1590 } else {
1591 die "$0: $self->{ct}->{type}: Unknown token type";
1592 }
1593 $self->{s_kwd} = '';
1594 $self->{state} = DATA_STATE;
1595 # reconsume
1596
1597 return ($self->{ct}); # start tag or end tag
1598
1599 redo A;
1600 } else {
1601 if ($self->{is_xml}) {
1602
1603 ## XML5: Not a parse error.
1604 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1605 } else {
1606
1607 }
1608
1609 if ({
1610 0x0022 => 1, # "
1611 0x0027 => 1, # '
1612 0x003C => 1, # <
1613 }->{$self->{nc}}) {
1614
1615 ## XML5: Not a parse error.
1616 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1617 } else {
1618
1619 }
1620 $self->{ca}
1621 = {name => chr ($self->{nc}),
1622 value => '',
1623 line => $self->{line}, column => $self->{column}};
1624 $self->{state} = ATTRIBUTE_NAME_STATE;
1625
1626 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1627 $self->{line_prev} = $self->{line};
1628 $self->{column_prev} = $self->{column};
1629 $self->{column}++;
1630 $self->{nc}
1631 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1632 } else {
1633 $self->{set_nc}->($self);
1634 }
1635
1636 redo A;
1637 }
1638 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1639 ## XML5: "Tag attribute value before state".
1640
1641 if ($is_space->{$self->{nc}}) {
1642
1643 ## Stay in the state
1644
1645 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1646 $self->{line_prev} = $self->{line};
1647 $self->{column_prev} = $self->{column};
1648 $self->{column}++;
1649 $self->{nc}
1650 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1651 } else {
1652 $self->{set_nc}->($self);
1653 }
1654
1655 redo A;
1656 } elsif ($self->{nc} == 0x0022) { # "
1657
1658 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1659
1660 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1661 $self->{line_prev} = $self->{line};
1662 $self->{column_prev} = $self->{column};
1663 $self->{column}++;
1664 $self->{nc}
1665 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1666 } else {
1667 $self->{set_nc}->($self);
1668 }
1669
1670 redo A;
1671 } elsif ($self->{nc} == 0x0026) { # &
1672
1673 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1674 ## reconsume
1675 redo A;
1676 } elsif ($self->{nc} == 0x0027) { # '
1677
1678 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1679
1680 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1681 $self->{line_prev} = $self->{line};
1682 $self->{column_prev} = $self->{column};
1683 $self->{column}++;
1684 $self->{nc}
1685 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1686 } else {
1687 $self->{set_nc}->($self);
1688 }
1689
1690 redo A;
1691 } elsif ($self->{nc} == 0x003E) { # >
1692 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1693 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1694
1695 $self->{last_stag_name} = $self->{ct}->{tag_name};
1696 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1697 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1698 if ($self->{ct}->{attributes}) {
1699
1700 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1701 } else {
1702 ## NOTE: This state should never be reached.
1703
1704 }
1705 } else {
1706 die "$0: $self->{ct}->{type}: Unknown token type";
1707 }
1708 $self->{state} = DATA_STATE;
1709 $self->{s_kwd} = '';
1710
1711 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1712 $self->{line_prev} = $self->{line};
1713 $self->{column_prev} = $self->{column};
1714 $self->{column}++;
1715 $self->{nc}
1716 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1717 } else {
1718 $self->{set_nc}->($self);
1719 }
1720
1721
1722 return ($self->{ct}); # start tag or end tag
1723
1724 redo A;
1725 } elsif ($self->{nc} == -1) {
1726 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1727 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1728
1729 $self->{last_stag_name} = $self->{ct}->{tag_name};
1730 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1731 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1732 if ($self->{ct}->{attributes}) {
1733
1734 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1735 } else {
1736 ## NOTE: This state should never be reached.
1737
1738 }
1739 } else {
1740 die "$0: $self->{ct}->{type}: Unknown token type";
1741 }
1742 $self->{state} = DATA_STATE;
1743 $self->{s_kwd} = '';
1744 ## reconsume
1745
1746 return ($self->{ct}); # start tag or end tag
1747
1748 redo A;
1749 } else {
1750 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1751
1752 ## XML5: Not a parse error.
1753 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1754 } elsif ($self->{is_xml}) {
1755
1756 ## XML5: No parse error.
1757 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1758 } else {
1759
1760 }
1761 $self->{ca}->{value} .= chr ($self->{nc});
1762 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1763
1764 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1765 $self->{line_prev} = $self->{line};
1766 $self->{column_prev} = $self->{column};
1767 $self->{column}++;
1768 $self->{nc}
1769 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1770 } else {
1771 $self->{set_nc}->($self);
1772 }
1773
1774 redo A;
1775 }
1776 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1777 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1778 ## ATTLIST attribute value double quoted state".
1779
1780 if ($self->{nc} == 0x0022) { # "
1781 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1782
1783 ## XML5: "DOCTYPE ATTLIST name after state".
1784 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1785 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1786 } else {
1787
1788 ## XML5: "Tag attribute name before state".
1789 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1790 }
1791
1792 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1793 $self->{line_prev} = $self->{line};
1794 $self->{column_prev} = $self->{column};
1795 $self->{column}++;
1796 $self->{nc}
1797 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1798 } else {
1799 $self->{set_nc}->($self);
1800 }
1801
1802 redo A;
1803 } elsif ($self->{nc} == 0x0026) { # &
1804
1805 ## XML5: Not defined yet.
1806
1807 ## NOTE: In the spec, the tokenizer is switched to the
1808 ## "entity in attribute value state". In this implementation, the
1809 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1810 ## implementation of the "consume a character reference" algorithm.
1811 $self->{prev_state} = $self->{state};
1812 $self->{entity_add} = 0x0022; # "
1813 $self->{state} = ENTITY_STATE;
1814
1815 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1816 $self->{line_prev} = $self->{line};
1817 $self->{column_prev} = $self->{column};
1818 $self->{column}++;
1819 $self->{nc}
1820 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1821 } else {
1822 $self->{set_nc}->($self);
1823 }
1824
1825 redo A;
1826 } elsif ($self->{is_xml} and
1827 $is_space->{$self->{nc}}) {
1828
1829 $self->{ca}->{value} .= ' ';
1830 ## Stay in the state.
1831
1832 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1833 $self->{line_prev} = $self->{line};
1834 $self->{column_prev} = $self->{column};
1835 $self->{column}++;
1836 $self->{nc}
1837 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1838 } else {
1839 $self->{set_nc}->($self);
1840 }
1841
1842 redo A;
1843 } elsif ($self->{nc} == -1) {
1844 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1845 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1846
1847 $self->{last_stag_name} = $self->{ct}->{tag_name};
1848
1849 $self->{state} = DATA_STATE;
1850 $self->{s_kwd} = '';
1851 ## reconsume
1852 return ($self->{ct}); # start tag
1853 redo A;
1854 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1855 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1856 if ($self->{ct}->{attributes}) {
1857
1858 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1859 } else {
1860 ## NOTE: This state should never be reached.
1861
1862 }
1863
1864 $self->{state} = DATA_STATE;
1865 $self->{s_kwd} = '';
1866 ## reconsume
1867 return ($self->{ct}); # end tag
1868 redo A;
1869 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1870 ## XML5: No parse error above; not defined yet.
1871 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1872 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1873 ## Reconsume.
1874 return ($self->{ct}); # ATTLIST
1875 redo A;
1876 } else {
1877 die "$0: $self->{ct}->{type}: Unknown token type";
1878 }
1879 } else {
1880 ## XML5 [ATTLIST]: Not defined yet.
1881 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1882
1883 ## XML5: Not a parse error.
1884 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1885 } else {
1886
1887 }
1888 $self->{ca}->{value} .= chr ($self->{nc});
1889 $self->{read_until}->($self->{ca}->{value},
1890 qq["&<\x09\x0C\x20],
1891 length $self->{ca}->{value});
1892
1893 ## Stay in the state
1894
1895 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1896 $self->{line_prev} = $self->{line};
1897 $self->{column_prev} = $self->{column};
1898 $self->{column}++;
1899 $self->{nc}
1900 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1901 } else {
1902 $self->{set_nc}->($self);
1903 }
1904
1905 redo A;
1906 }
1907 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1908 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1909 ## ATTLIST attribute value single quoted state".
1910
1911 if ($self->{nc} == 0x0027) { # '
1912 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1913
1914 ## XML5: "DOCTYPE ATTLIST name after state".
1915 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1916 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1917 } else {
1918
1919 ## XML5: "Before attribute name state" (sic).
1920 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1921 }
1922
1923 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1924 $self->{line_prev} = $self->{line};
1925 $self->{column_prev} = $self->{column};
1926 $self->{column}++;
1927 $self->{nc}
1928 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1929 } else {
1930 $self->{set_nc}->($self);
1931 }
1932
1933 redo A;
1934 } elsif ($self->{nc} == 0x0026) { # &
1935
1936 ## XML5: Not defined yet.
1937
1938 ## NOTE: In the spec, the tokenizer is switched to the
1939 ## "entity in attribute value state". In this implementation, the
1940 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1941 ## implementation of the "consume a character reference" algorithm.
1942 $self->{entity_add} = 0x0027; # '
1943 $self->{prev_state} = $self->{state};
1944 $self->{state} = ENTITY_STATE;
1945
1946 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1947 $self->{line_prev} = $self->{line};
1948 $self->{column_prev} = $self->{column};
1949 $self->{column}++;
1950 $self->{nc}
1951 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1952 } else {
1953 $self->{set_nc}->($self);
1954 }
1955
1956 redo A;
1957 } elsif ($self->{is_xml} and
1958 $is_space->{$self->{nc}}) {
1959
1960 $self->{ca}->{value} .= ' ';
1961 ## Stay in the state.
1962
1963 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1964 $self->{line_prev} = $self->{line};
1965 $self->{column_prev} = $self->{column};
1966 $self->{column}++;
1967 $self->{nc}
1968 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1969 } else {
1970 $self->{set_nc}->($self);
1971 }
1972
1973 redo A;
1974 } elsif ($self->{nc} == -1) {
1975 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1976 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1977
1978 $self->{last_stag_name} = $self->{ct}->{tag_name};
1979
1980 $self->{state} = DATA_STATE;
1981 $self->{s_kwd} = '';
1982 ## reconsume
1983 return ($self->{ct}); # start tag
1984 redo A;
1985 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1986 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1987 if ($self->{ct}->{attributes}) {
1988
1989 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1990 } else {
1991 ## NOTE: This state should never be reached.
1992
1993 }
1994
1995 $self->{state} = DATA_STATE;
1996 $self->{s_kwd} = '';
1997 ## reconsume
1998 return ($self->{ct}); # end tag
1999 redo A;
2000 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2001 ## XML5: No parse error above; not defined yet.
2002 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2003 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2004 ## Reconsume.
2005 return ($self->{ct}); # ATTLIST
2006 redo A;
2007 } else {
2008 die "$0: $self->{ct}->{type}: Unknown token type";
2009 }
2010 } else {
2011 ## XML5 [ATTLIST]: Not defined yet.
2012 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2013
2014 ## XML5: Not a parse error.
2015 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2016 } else {
2017
2018 }
2019 $self->{ca}->{value} .= chr ($self->{nc});
2020 $self->{read_until}->($self->{ca}->{value},
2021 qq['&<\x09\x0C\x20],
2022 length $self->{ca}->{value});
2023
2024 ## Stay in the state
2025
2026 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2027 $self->{line_prev} = $self->{line};
2028 $self->{column_prev} = $self->{column};
2029 $self->{column}++;
2030 $self->{nc}
2031 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2032 } else {
2033 $self->{set_nc}->($self);
2034 }
2035
2036 redo A;
2037 }
2038 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2039 ## XML5: "Tag attribute value unquoted state".
2040
2041 if ($is_space->{$self->{nc}}) {
2042 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2043
2044 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2045 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2046 } else {
2047
2048 ## XML5: "Tag attribute name before state".
2049 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2050 }
2051
2052 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2053 $self->{line_prev} = $self->{line};
2054 $self->{column_prev} = $self->{column};
2055 $self->{column}++;
2056 $self->{nc}
2057 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2058 } else {
2059 $self->{set_nc}->($self);
2060 }
2061
2062 redo A;
2063 } elsif ($self->{nc} == 0x0026) { # &
2064
2065
2066 ## XML5: Not defined yet.
2067
2068 ## NOTE: In the spec, the tokenizer is switched to the
2069 ## "entity in attribute value state". In this implementation, the
2070 ## tokenizer is switched to the |ENTITY_STATE|, which is an
2071 ## implementation of the "consume a character reference" algorithm.
2072 $self->{entity_add} = -1;
2073 $self->{prev_state} = $self->{state};
2074 $self->{state} = ENTITY_STATE;
2075
2076 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2077 $self->{line_prev} = $self->{line};
2078 $self->{column_prev} = $self->{column};
2079 $self->{column}++;
2080 $self->{nc}
2081 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2082 } else {
2083 $self->{set_nc}->($self);
2084 }
2085
2086 redo A;
2087 } elsif ($self->{nc} == 0x003E) { # >
2088 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2089
2090 $self->{last_stag_name} = $self->{ct}->{tag_name};
2091
2092 $self->{state} = DATA_STATE;
2093 $self->{s_kwd} = '';
2094
2095 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2096 $self->{line_prev} = $self->{line};
2097 $self->{column_prev} = $self->{column};
2098 $self->{column}++;
2099 $self->{nc}
2100 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2101 } else {
2102 $self->{set_nc}->($self);
2103 }
2104
2105 return ($self->{ct}); # start tag
2106 redo A;
2107 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2108 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2109 if ($self->{ct}->{attributes}) {
2110
2111 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2112 } else {
2113 ## NOTE: This state should never be reached.
2114
2115 }
2116
2117 $self->{state} = DATA_STATE;
2118 $self->{s_kwd} = '';
2119
2120 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2121 $self->{line_prev} = $self->{line};
2122 $self->{column_prev} = $self->{column};
2123 $self->{column}++;
2124 $self->{nc}
2125 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2126 } else {
2127 $self->{set_nc}->($self);
2128 }
2129
2130 return ($self->{ct}); # end tag
2131 redo A;
2132 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2133 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2134 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2135
2136 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2137 $self->{line_prev} = $self->{line};
2138 $self->{column_prev} = $self->{column};
2139 $self->{column}++;
2140 $self->{nc}
2141 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2142 } else {
2143 $self->{set_nc}->($self);
2144 }
2145
2146 return ($self->{ct}); # ATTLIST
2147 redo A;
2148 } else {
2149 die "$0: $self->{ct}->{type}: Unknown token type";
2150 }
2151 } elsif ($self->{nc} == -1) {
2152 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2153
2154 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2155 $self->{last_stag_name} = $self->{ct}->{tag_name};
2156
2157 $self->{state} = DATA_STATE;
2158 $self->{s_kwd} = '';
2159 ## reconsume
2160 return ($self->{ct}); # start tag
2161 redo A;
2162 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2163 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2164 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2165 if ($self->{ct}->{attributes}) {
2166
2167 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2168 } else {
2169 ## NOTE: This state should never be reached.
2170
2171 }
2172
2173 $self->{state} = DATA_STATE;
2174 $self->{s_kwd} = '';
2175 ## reconsume
2176 return ($self->{ct}); # end tag
2177 redo A;
2178 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2179 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2180 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2181 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2182 ## Reconsume.
2183 return ($self->{ct}); # ATTLIST
2184 redo A;
2185 } else {
2186 die "$0: $self->{ct}->{type}: Unknown token type";
2187 }
2188 } else {
2189 if ({
2190 0x0022 => 1, # "
2191 0x0027 => 1, # '
2192 0x003D => 1, # =
2193 0x003C => 1, # <
2194 }->{$self->{nc}}) {
2195
2196 ## XML5: Not a parse error.
2197 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2198 } else {
2199
2200 }
2201 $self->{ca}->{value} .= chr ($self->{nc});
2202 $self->{read_until}->($self->{ca}->{value},
2203 qq["'=& \x09\x0C>],
2204 length $self->{ca}->{value});
2205
2206 ## Stay in the state
2207
2208 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2209 $self->{line_prev} = $self->{line};
2210 $self->{column_prev} = $self->{column};
2211 $self->{column}++;
2212 $self->{nc}
2213 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2214 } else {
2215 $self->{set_nc}->($self);
2216 }
2217
2218 redo A;
2219 }
2220 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2221 if ($is_space->{$self->{nc}}) {
2222
2223 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2224
2225 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2226 $self->{line_prev} = $self->{line};
2227 $self->{column_prev} = $self->{column};
2228 $self->{column}++;
2229 $self->{nc}
2230 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2231 } else {
2232 $self->{set_nc}->($self);
2233 }
2234
2235 redo A;
2236 } elsif ($self->{nc} == 0x003E) { # >
2237 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2238
2239 $self->{last_stag_name} = $self->{ct}->{tag_name};
2240 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2241 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2242 if ($self->{ct}->{attributes}) {
2243
2244 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2245 } else {
2246 ## NOTE: This state should never be reached.
2247
2248 }
2249 } else {
2250 die "$0: $self->{ct}->{type}: Unknown token type";
2251 }
2252 $self->{state} = DATA_STATE;
2253 $self->{s_kwd} = '';
2254
2255 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2256 $self->{line_prev} = $self->{line};
2257 $self->{column_prev} = $self->{column};
2258 $self->{column}++;
2259 $self->{nc}
2260 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2261 } else {
2262 $self->{set_nc}->($self);
2263 }
2264
2265
2266 return ($self->{ct}); # start tag or end tag
2267
2268 redo A;
2269 } elsif ($self->{nc} == 0x002F) { # /
2270
2271 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2272
2273 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2274 $self->{line_prev} = $self->{line};
2275 $self->{column_prev} = $self->{column};
2276 $self->{column}++;
2277 $self->{nc}
2278 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2279 } else {
2280 $self->{set_nc}->($self);
2281 }
2282
2283 redo A;
2284 } elsif ($self->{nc} == -1) {
2285 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2286 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2287
2288 $self->{last_stag_name} = $self->{ct}->{tag_name};
2289 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2290 if ($self->{ct}->{attributes}) {
2291
2292 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2293 } else {
2294 ## NOTE: This state should never be reached.
2295
2296 }
2297 } else {
2298 die "$0: $self->{ct}->{type}: Unknown token type";
2299 }
2300 $self->{state} = DATA_STATE;
2301 $self->{s_kwd} = '';
2302 ## Reconsume.
2303 return ($self->{ct}); # start tag or end tag
2304 redo A;
2305 } else {
2306
2307 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2308 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2309 ## reconsume
2310 redo A;
2311 }
2312 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2313 ## XML5: "Empty tag state".
2314
2315 if ($self->{nc} == 0x003E) { # >
2316 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2317
2318 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2319 ## TODO: Different type than slash in start tag
2320 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2321 if ($self->{ct}->{attributes}) {
2322
2323 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2324 } else {
2325
2326 }
2327 ## TODO: Test |<title></title/>|
2328 } else {
2329
2330 $self->{self_closing} = 1;
2331 }
2332
2333 $self->{state} = DATA_STATE;
2334 $self->{s_kwd} = '';
2335
2336 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2337 $self->{line_prev} = $self->{line};
2338 $self->{column_prev} = $self->{column};
2339 $self->{column}++;
2340 $self->{nc}
2341 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2342 } else {
2343 $self->{set_nc}->($self);
2344 }
2345
2346
2347 return ($self->{ct}); # start tag or end tag
2348
2349 redo A;
2350 } elsif ($self->{nc} == -1) {
2351 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2352 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2353
2354 $self->{last_stag_name} = $self->{ct}->{tag_name};
2355 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2356 if ($self->{ct}->{attributes}) {
2357
2358 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2359 } else {
2360 ## NOTE: This state should never be reached.
2361
2362 }
2363 } else {
2364 die "$0: $self->{ct}->{type}: Unknown token type";
2365 }
2366 ## XML5: "Tag attribute name before state".
2367 $self->{state} = DATA_STATE;
2368 $self->{s_kwd} = '';
2369 ## Reconsume.
2370 return ($self->{ct}); # start tag or end tag
2371 redo A;
2372 } else {
2373
2374 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2375 ## TODO: This error type is wrong.
2376 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2377 ## Reconsume.
2378 redo A;
2379 }
2380 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2381 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2382
2383 ## NOTE: Unlike spec's "bogus comment state", this implementation
2384 ## consumes characters on a one-by-one basis.
2385
2386 if ($self->{nc} == 0x003E) { # >
2387 if ($self->{in_subset}) {
2388
2389 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2390 } else {
2391
2392 $self->{state} = DATA_STATE;
2393 $self->{s_kwd} = '';
2394 }
2395
2396 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2397 $self->{line_prev} = $self->{line};
2398 $self->{column_prev} = $self->{column};
2399 $self->{column}++;
2400 $self->{nc}
2401 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2402 } else {
2403 $self->{set_nc}->($self);
2404 }
2405
2406
2407 return ($self->{ct}); # comment
2408 redo A;
2409 } elsif ($self->{nc} == -1) {
2410 if ($self->{in_subset}) {
2411
2412 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2413 } else {
2414
2415 $self->{state} = DATA_STATE;
2416 $self->{s_kwd} = '';
2417 }
2418 ## reconsume
2419
2420 return ($self->{ct}); # comment
2421 redo A;
2422 } else {
2423
2424 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2425 $self->{read_until}->($self->{ct}->{data},
2426 q[>],
2427 length $self->{ct}->{data});
2428
2429 ## Stay in the state.
2430
2431 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2432 $self->{line_prev} = $self->{line};
2433 $self->{column_prev} = $self->{column};
2434 $self->{column}++;
2435 $self->{nc}
2436 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2437 } else {
2438 $self->{set_nc}->($self);
2439 }
2440
2441 redo A;
2442 }
2443 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2444 ## XML5: "Markup declaration state".
2445
2446 if ($self->{nc} == 0x002D) { # -
2447
2448 $self->{state} = MD_HYPHEN_STATE;
2449
2450 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2451 $self->{line_prev} = $self->{line};
2452 $self->{column_prev} = $self->{column};
2453 $self->{column}++;
2454 $self->{nc}
2455 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2456 } else {
2457 $self->{set_nc}->($self);
2458 }
2459
2460 redo A;
2461 } elsif ($self->{nc} == 0x0044 or # D
2462 $self->{nc} == 0x0064) { # d
2463 ## ASCII case-insensitive.
2464
2465 $self->{state} = MD_DOCTYPE_STATE;
2466 $self->{kwd} = chr $self->{nc};
2467
2468 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2469 $self->{line_prev} = $self->{line};
2470 $self->{column_prev} = $self->{column};
2471 $self->{column}++;
2472 $self->{nc}
2473 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2474 } else {
2475 $self->{set_nc}->($self);
2476 }
2477
2478 redo A;
2479 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2480 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2481 $self->{is_xml}) and
2482 $self->{nc} == 0x005B) { # [
2483
2484 $self->{state} = MD_CDATA_STATE;
2485 $self->{kwd} = '[';
2486
2487 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2488 $self->{line_prev} = $self->{line};
2489 $self->{column_prev} = $self->{column};
2490 $self->{column}++;
2491 $self->{nc}
2492 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2493 } else {
2494 $self->{set_nc}->($self);
2495 }
2496
2497 redo A;
2498 } else {
2499
2500 }
2501
2502 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2503 line => $self->{line_prev},
2504 column => $self->{column_prev} - 1);
2505 ## Reconsume.
2506 $self->{state} = BOGUS_COMMENT_STATE;
2507 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2508 line => $self->{line_prev},
2509 column => $self->{column_prev} - 1,
2510 };
2511 redo A;
2512 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2513 if ($self->{nc} == 0x002D) { # -
2514
2515 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2516 line => $self->{line_prev},
2517 column => $self->{column_prev} - 2,
2518 };
2519 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2520
2521 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2522 $self->{line_prev} = $self->{line};
2523 $self->{column_prev} = $self->{column};
2524 $self->{column}++;
2525 $self->{nc}
2526 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2527 } else {
2528 $self->{set_nc}->($self);
2529 }
2530
2531 redo A;
2532 } else {
2533
2534 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2535 line => $self->{line_prev},
2536 column => $self->{column_prev} - 2);
2537 $self->{state} = BOGUS_COMMENT_STATE;
2538 ## Reconsume.
2539 $self->{ct} = {type => COMMENT_TOKEN,
2540 data => '-',
2541 line => $self->{line_prev},
2542 column => $self->{column_prev} - 2,
2543 };
2544 redo A;
2545 }
2546 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2547 ## ASCII case-insensitive.
2548 if ($self->{nc} == [
2549 undef,
2550 0x004F, # O
2551 0x0043, # C
2552 0x0054, # T
2553 0x0059, # Y
2554 0x0050, # P
2555 ]->[length $self->{kwd}] or
2556 $self->{nc} == [
2557 undef,
2558 0x006F, # o
2559 0x0063, # c
2560 0x0074, # t
2561 0x0079, # y
2562 0x0070, # p
2563 ]->[length $self->{kwd}]) {
2564
2565 ## Stay in the state.
2566 $self->{kwd} .= chr $self->{nc};
2567
2568 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2569 $self->{line_prev} = $self->{line};
2570 $self->{column_prev} = $self->{column};
2571 $self->{column}++;
2572 $self->{nc}
2573 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2574 } else {
2575 $self->{set_nc}->($self);
2576 }
2577
2578 redo A;
2579 } elsif ((length $self->{kwd}) == 6 and
2580 ($self->{nc} == 0x0045 or # E
2581 $self->{nc} == 0x0065)) { # e
2582 if ($self->{is_xml} and
2583 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2584
2585 ## XML5: case-sensitive.
2586 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2587 text => 'DOCTYPE',
2588 line => $self->{line_prev},
2589 column => $self->{column_prev} - 5);
2590 } else {
2591
2592 }
2593 $self->{state} = DOCTYPE_STATE;
2594 $self->{ct} = {type => DOCTYPE_TOKEN,
2595 quirks => 1,
2596 line => $self->{line_prev},
2597 column => $self->{column_prev} - 7,
2598 };
2599
2600 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2601 $self->{line_prev} = $self->{line};
2602 $self->{column_prev} = $self->{column};
2603 $self->{column}++;
2604 $self->{nc}
2605 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2606 } else {
2607 $self->{set_nc}->($self);
2608 }
2609
2610 redo A;
2611 } else {
2612
2613 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2614 line => $self->{line_prev},
2615 column => $self->{column_prev} - 1 - length $self->{kwd});
2616 $self->{state} = BOGUS_COMMENT_STATE;
2617 ## Reconsume.
2618 $self->{ct} = {type => COMMENT_TOKEN,
2619 data => $self->{kwd},
2620 line => $self->{line_prev},
2621 column => $self->{column_prev} - 1 - length $self->{kwd},
2622 };
2623 redo A;
2624 }
2625 } elsif ($self->{state} == MD_CDATA_STATE) {
2626 if ($self->{nc} == {
2627 '[' => 0x0043, # C
2628 '[C' => 0x0044, # D
2629 '[CD' => 0x0041, # A
2630 '[CDA' => 0x0054, # T
2631 '[CDAT' => 0x0041, # A
2632 }->{$self->{kwd}}) {
2633
2634 ## Stay in the state.
2635 $self->{kwd} .= chr $self->{nc};
2636
2637 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2638 $self->{line_prev} = $self->{line};
2639 $self->{column_prev} = $self->{column};
2640 $self->{column}++;
2641 $self->{nc}
2642 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2643 } else {
2644 $self->{set_nc}->($self);
2645 }
2646
2647 redo A;
2648 } elsif ($self->{kwd} eq '[CDATA' and
2649 $self->{nc} == 0x005B) { # [
2650 if ($self->{is_xml} and
2651 not $self->{tainted} and
2652 @{$self->{open_elements} or []} == 0) {
2653
2654 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2655 line => $self->{line_prev},
2656 column => $self->{column_prev} - 7);
2657 $self->{tainted} = 1;
2658 } else {
2659
2660 }
2661
2662 $self->{ct} = {type => CHARACTER_TOKEN,
2663 data => '',
2664 line => $self->{line_prev},
2665 column => $self->{column_prev} - 7};
2666 $self->{state} = CDATA_SECTION_STATE;
2667
2668 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2669 $self->{line_prev} = $self->{line};
2670 $self->{column_prev} = $self->{column};
2671 $self->{column}++;
2672 $self->{nc}
2673 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2674 } else {
2675 $self->{set_nc}->($self);
2676 }
2677
2678 redo A;
2679 } else {
2680
2681 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2682 line => $self->{line_prev},
2683 column => $self->{column_prev} - 1 - length $self->{kwd});
2684 $self->{state} = BOGUS_COMMENT_STATE;
2685 ## Reconsume.
2686 $self->{ct} = {type => COMMENT_TOKEN,
2687 data => $self->{kwd},
2688 line => $self->{line_prev},
2689 column => $self->{column_prev} - 1 - length $self->{kwd},
2690 };
2691 redo A;
2692 }
2693 } elsif ($self->{state} == COMMENT_START_STATE) {
2694 if ($self->{nc} == 0x002D) { # -
2695
2696 $self->{state} = COMMENT_START_DASH_STATE;
2697
2698 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2699 $self->{line_prev} = $self->{line};
2700 $self->{column_prev} = $self->{column};
2701 $self->{column}++;
2702 $self->{nc}
2703 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2704 } else {
2705 $self->{set_nc}->($self);
2706 }
2707
2708 redo A;
2709 } elsif ($self->{nc} == 0x003E) { # >
2710 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2711 if ($self->{in_subset}) {
2712
2713 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2714 } else {
2715
2716 $self->{state} = DATA_STATE;
2717 $self->{s_kwd} = '';
2718 }
2719
2720 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2721 $self->{line_prev} = $self->{line};
2722 $self->{column_prev} = $self->{column};
2723 $self->{column}++;
2724 $self->{nc}
2725 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2726 } else {
2727 $self->{set_nc}->($self);
2728 }
2729
2730
2731 return ($self->{ct}); # comment
2732
2733 redo A;
2734 } elsif ($self->{nc} == -1) {
2735 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2736 if ($self->{in_subset}) {
2737
2738 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2739 } else {
2740
2741 $self->{state} = DATA_STATE;
2742 $self->{s_kwd} = '';
2743 }
2744 ## reconsume
2745
2746 return ($self->{ct}); # comment
2747
2748 redo A;
2749 } else {
2750
2751 $self->{ct}->{data} # comment
2752 .= chr ($self->{nc});
2753 $self->{state} = COMMENT_STATE;
2754
2755 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2756 $self->{line_prev} = $self->{line};
2757 $self->{column_prev} = $self->{column};
2758 $self->{column}++;
2759 $self->{nc}
2760 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2761 } else {
2762 $self->{set_nc}->($self);
2763 }
2764
2765 redo A;
2766 }
2767 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2768 if ($self->{nc} == 0x002D) { # -
2769
2770 $self->{state} = COMMENT_END_STATE;
2771
2772 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2773 $self->{line_prev} = $self->{line};
2774 $self->{column_prev} = $self->{column};
2775 $self->{column}++;
2776 $self->{nc}
2777 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2778 } else {
2779 $self->{set_nc}->($self);
2780 }
2781
2782 redo A;
2783 } elsif ($self->{nc} == 0x003E) { # >
2784 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2785 if ($self->{in_subset}) {
2786
2787 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2788 } else {
2789
2790 $self->{state} = DATA_STATE;
2791 $self->{s_kwd} = '';
2792 }
2793
2794 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2795 $self->{line_prev} = $self->{line};
2796 $self->{column_prev} = $self->{column};
2797 $self->{column}++;
2798 $self->{nc}
2799 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2800 } else {
2801 $self->{set_nc}->($self);
2802 }
2803
2804
2805 return ($self->{ct}); # comment
2806
2807 redo A;
2808 } elsif ($self->{nc} == -1) {
2809 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2810 if ($self->{in_subset}) {
2811
2812 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2813 } else {
2814
2815 $self->{state} = DATA_STATE;
2816 $self->{s_kwd} = '';
2817 }
2818 ## reconsume
2819
2820 return ($self->{ct}); # comment
2821
2822 redo A;
2823 } else {
2824
2825 $self->{ct}->{data} # comment
2826 .= '-' . chr ($self->{nc});
2827 $self->{state} = COMMENT_STATE;
2828
2829 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2830 $self->{line_prev} = $self->{line};
2831 $self->{column_prev} = $self->{column};
2832 $self->{column}++;
2833 $self->{nc}
2834 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2835 } else {
2836 $self->{set_nc}->($self);
2837 }
2838
2839 redo A;
2840 }
2841 } elsif ($self->{state} == COMMENT_STATE) {
2842 ## XML5: "Comment state" and "DOCTYPE comment state".
2843
2844 if ($self->{nc} == 0x002D) { # -
2845
2846 $self->{state} = COMMENT_END_DASH_STATE;
2847
2848 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2849 $self->{line_prev} = $self->{line};
2850 $self->{column_prev} = $self->{column};
2851 $self->{column}++;
2852 $self->{nc}
2853 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2854 } else {
2855 $self->{set_nc}->($self);
2856 }
2857
2858 redo A;
2859 } elsif ($self->{nc} == -1) {
2860 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2861 if ($self->{in_subset}) {
2862
2863 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2864 } else {
2865
2866 $self->{state} = DATA_STATE;
2867 $self->{s_kwd} = '';
2868 }
2869 ## reconsume
2870
2871 return ($self->{ct}); # comment
2872
2873 redo A;
2874 } else {
2875
2876 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2877 $self->{read_until}->($self->{ct}->{data},
2878 q[-],
2879 length $self->{ct}->{data});
2880
2881 ## Stay in the state
2882
2883 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2884 $self->{line_prev} = $self->{line};
2885 $self->{column_prev} = $self->{column};
2886 $self->{column}++;
2887 $self->{nc}
2888 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2889 } else {
2890 $self->{set_nc}->($self);
2891 }
2892
2893 redo A;
2894 }
2895 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2896 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2897
2898 if ($self->{nc} == 0x002D) { # -
2899
2900 $self->{state} = COMMENT_END_STATE;
2901
2902 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2903 $self->{line_prev} = $self->{line};
2904 $self->{column_prev} = $self->{column};
2905 $self->{column}++;
2906 $self->{nc}
2907 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2908 } else {
2909 $self->{set_nc}->($self);
2910 }
2911
2912 redo A;
2913 } elsif ($self->{nc} == -1) {
2914 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2915 if ($self->{in_subset}) {
2916
2917 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2918 } else {
2919
2920 $self->{state} = DATA_STATE;
2921 $self->{s_kwd} = '';
2922 }
2923 ## reconsume
2924
2925 return ($self->{ct}); # comment
2926
2927 redo A;
2928 } else {
2929
2930 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2931 $self->{state} = COMMENT_STATE;
2932
2933 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2934 $self->{line_prev} = $self->{line};
2935 $self->{column_prev} = $self->{column};
2936 $self->{column}++;
2937 $self->{nc}
2938 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2939 } else {
2940 $self->{set_nc}->($self);
2941 }
2942
2943 redo A;
2944 }
2945 } elsif ($self->{state} == COMMENT_END_STATE) {
2946 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2947
2948 if ($self->{nc} == 0x003E) { # >
2949 if ($self->{in_subset}) {
2950
2951 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2952 } else {
2953
2954 $self->{state} = DATA_STATE;
2955 $self->{s_kwd} = '';
2956 }
2957
2958 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2959 $self->{line_prev} = $self->{line};
2960 $self->{column_prev} = $self->{column};
2961 $self->{column}++;
2962 $self->{nc}
2963 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2964 } else {
2965 $self->{set_nc}->($self);
2966 }
2967
2968
2969 return ($self->{ct}); # comment
2970
2971 redo A;
2972 } elsif ($self->{nc} == 0x002D) { # -
2973
2974 ## XML5: Not a parse error.
2975 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2976 line => $self->{line_prev},
2977 column => $self->{column_prev});
2978 $self->{ct}->{data} .= '-'; # comment
2979 ## Stay in the state
2980
2981 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2982 $self->{line_prev} = $self->{line};
2983 $self->{column_prev} = $self->{column};
2984 $self->{column}++;
2985 $self->{nc}
2986 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2987 } else {
2988 $self->{set_nc}->($self);
2989 }
2990
2991 redo A;
2992 } elsif ($self->{nc} == -1) {
2993 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2994 if ($self->{in_subset}) {
2995
2996 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2997 } else {
2998
2999 $self->{state} = DATA_STATE;
3000 $self->{s_kwd} = '';
3001 }
3002 ## reconsume
3003
3004 return ($self->{ct}); # comment
3005
3006 redo A;
3007 } else {
3008
3009 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3010 $self->{state} = COMMENT_STATE;
3011
3012 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3013 $self->{line_prev} = $self->{line};
3014 $self->{column_prev} = $self->{column};
3015 $self->{column}++;
3016 $self->{nc}
3017 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3018 } else {
3019 $self->{set_nc}->($self);
3020 }
3021
3022 redo A;
3023 }
3024 } elsif ($self->{state} == DOCTYPE_STATE) {
3025 if ($is_space->{$self->{nc}}) {
3026
3027 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3028
3029 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3030 $self->{line_prev} = $self->{line};
3031 $self->{column_prev} = $self->{column};
3032 $self->{column}++;
3033 $self->{nc}
3034 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3035 } else {
3036 $self->{set_nc}->($self);
3037 }
3038
3039 redo A;
3040 } elsif ($self->{nc} == -1) {
3041
3042 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3043 $self->{ct}->{quirks} = 1;
3044
3045 $self->{state} = DATA_STATE;
3046 ## Reconsume.
3047 return ($self->{ct}); # DOCTYPE (quirks)
3048
3049 redo A;
3050 } else {
3051
3052 ## XML5: Switch to the bogus comment state.
3053 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3054 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3055 ## reconsume
3056 redo A;
3057 }
3058 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3059 ## XML5: "DOCTYPE root name before state".
3060
3061 if ($is_space->{$self->{nc}}) {
3062
3063 ## Stay in the state
3064
3065 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3066 $self->{line_prev} = $self->{line};
3067 $self->{column_prev} = $self->{column};
3068 $self->{column}++;
3069 $self->{nc}
3070 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3071 } else {
3072 $self->{set_nc}->($self);
3073 }
3074
3075 redo A;
3076 } elsif ($self->{nc} == 0x003E) { # >
3077
3078 ## XML5: No parse error.
3079 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3080 $self->{state} = DATA_STATE;
3081 $self->{s_kwd} = '';
3082
3083 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3084 $self->{line_prev} = $self->{line};
3085 $self->{column_prev} = $self->{column};
3086 $self->{column}++;
3087 $self->{nc}
3088 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3089 } else {
3090 $self->{set_nc}->($self);
3091 }
3092
3093
3094 return ($self->{ct}); # DOCTYPE (quirks)
3095
3096 redo A;
3097 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3098
3099 $self->{ct}->{name} # DOCTYPE
3100 = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3101 delete $self->{ct}->{quirks};
3102 $self->{state} = DOCTYPE_NAME_STATE;
3103
3104 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3105 $self->{line_prev} = $self->{line};
3106 $self->{column_prev} = $self->{column};
3107 $self->{column}++;
3108 $self->{nc}
3109 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3110 } else {
3111 $self->{set_nc}->($self);
3112 }
3113
3114 redo A;
3115 } elsif ($self->{nc} == -1) {
3116
3117 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3118 $self->{state} = DATA_STATE;
3119 $self->{s_kwd} = '';
3120 ## reconsume
3121
3122 return ($self->{ct}); # DOCTYPE (quirks)
3123
3124 redo A;
3125 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3126
3127 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3128 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3129 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3130 $self->{in_subset} = 1;
3131
3132 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3133 $self->{line_prev} = $self->{line};
3134 $self->{column_prev} = $self->{column};
3135 $self->{column}++;
3136 $self->{nc}
3137 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3138 } else {
3139 $self->{set_nc}->($self);
3140 }
3141
3142 return ($self->{ct}); # DOCTYPE
3143 redo A;
3144 } else {
3145
3146 $self->{ct}->{name} = chr $self->{nc};
3147 delete $self->{ct}->{quirks};
3148 $self->{state} = DOCTYPE_NAME_STATE;
3149
3150 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3151 $self->{line_prev} = $self->{line};
3152 $self->{column_prev} = $self->{column};
3153 $self->{column}++;
3154 $self->{nc}
3155 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3156 } else {
3157 $self->{set_nc}->($self);
3158 }
3159
3160 redo A;
3161 }
3162 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3163 ## XML5: "DOCTYPE root name state".
3164
3165 ## ISSUE: Redundant "First," in the spec.
3166
3167 if ($is_space->{$self->{nc}}) {
3168
3169 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3170
3171 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3172 $self->{line_prev} = $self->{line};
3173 $self->{column_prev} = $self->{column};
3174 $self->{column}++;
3175 $self->{nc}
3176 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3177 } else {
3178 $self->{set_nc}->($self);
3179 }
3180
3181 redo A;
3182 } elsif ($self->{nc} == 0x003E) { # >
3183
3184 $self->{state} = DATA_STATE;
3185 $self->{s_kwd} = '';
3186
3187 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3188 $self->{line_prev} = $self->{line};
3189 $self->{column_prev} = $self->{column};
3190 $self->{column}++;
3191 $self->{nc}
3192 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3193 } else {
3194 $self->{set_nc}->($self);
3195 }
3196
3197
3198 return ($self->{ct}); # DOCTYPE
3199
3200 redo A;
3201 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3202
3203 $self->{ct}->{name} # DOCTYPE
3204 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3205 delete $self->{ct}->{quirks};
3206 ## Stay in the state.
3207
3208 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3209 $self->{line_prev} = $self->{line};
3210 $self->{column_prev} = $self->{column};
3211 $self->{column}++;
3212 $self->{nc}
3213 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3214 } else {
3215 $self->{set_nc}->($self);
3216 }
3217
3218 redo A;
3219 } elsif ($self->{nc} == -1) {
3220
3221 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3222 $self->{state} = DATA_STATE;
3223 $self->{s_kwd} = '';
3224 ## reconsume
3225
3226 $self->{ct}->{quirks} = 1;
3227 return ($self->{ct}); # DOCTYPE
3228
3229 redo A;
3230 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3231
3232 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3233 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3234 $self->{in_subset} = 1;
3235
3236 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3237 $self->{line_prev} = $self->{line};
3238 $self->{column_prev} = $self->{column};
3239 $self->{column}++;
3240 $self->{nc}
3241 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3242 } else {
3243 $self->{set_nc}->($self);
3244 }
3245
3246 return ($self->{ct}); # DOCTYPE
3247 redo A;
3248 } else {
3249
3250 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3251 ## Stay in the state.
3252
3253 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3254 $self->{line_prev} = $self->{line};
3255 $self->{column_prev} = $self->{column};
3256 $self->{column}++;
3257 $self->{nc}
3258 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3259 } else {
3260 $self->{set_nc}->($self);
3261 }
3262
3263 redo A;
3264 }
3265 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3266 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3267 ## state", but implemented differently.
3268
3269 if ($is_space->{$self->{nc}}) {
3270
3271 ## Stay in the state
3272
3273 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3274 $self->{line_prev} = $self->{line};
3275 $self->{column_prev} = $self->{column};
3276 $self->{column}++;
3277 $self->{nc}
3278 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3279 } else {
3280 $self->{set_nc}->($self);
3281 }
3282
3283 redo A;
3284 } elsif ($self->{nc} == 0x003E) { # >
3285 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3286
3287 $self->{state} = DATA_STATE;
3288 $self->{s_kwd} = '';
3289 } else {
3290
3291 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3292 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3293 }
3294
3295
3296 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3297 $self->{line_prev} = $self->{line};
3298 $self->{column_prev} = $self->{column};
3299 $self->{column}++;
3300 $self->{nc}
3301 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3302 } else {
3303 $self->{set_nc}->($self);
3304 }
3305
3306 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3307 redo A;
3308 } elsif ($self->{nc} == -1) {
3309 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3310
3311 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3312 $self->{state} = DATA_STATE;
3313 $self->{s_kwd} = '';
3314 $self->{ct}->{quirks} = 1;
3315 } else {
3316
3317 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3318 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3319 }
3320
3321 ## Reconsume.
3322 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3323 redo A;
3324 } elsif ($self->{nc} == 0x0050 or # P
3325 $self->{nc} == 0x0070) { # p
3326
3327 $self->{state} = PUBLIC_STATE;
3328 $self->{kwd} = chr $self->{nc};
3329
3330 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3331 $self->{line_prev} = $self->{line};
3332 $self->{column_prev} = $self->{column};
3333 $self->{column}++;
3334 $self->{nc}
3335 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3336 } else {
3337 $self->{set_nc}->($self);
3338 }
3339
3340 redo A;
3341 } elsif ($self->{nc} == 0x0053 or # S
3342 $self->{nc} == 0x0073) { # s
3343
3344 $self->{state} = SYSTEM_STATE;
3345 $self->{kwd} = chr $self->{nc};
3346
3347 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3348 $self->{line_prev} = $self->{line};
3349 $self->{column_prev} = $self->{column};
3350 $self->{column}++;
3351 $self->{nc}
3352 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3353 } else {
3354 $self->{set_nc}->($self);
3355 }
3356
3357 redo A;
3358 } elsif ($self->{nc} == 0x0022 and # "
3359 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3360 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3361
3362 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3363 $self->{ct}->{value} = ''; # ENTITY
3364
3365 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3366 $self->{line_prev} = $self->{line};
3367 $self->{column_prev} = $self->{column};
3368 $self->{column}++;
3369 $self->{nc}
3370 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3371 } else {
3372 $self->{set_nc}->($self);
3373 }
3374
3375 redo A;
3376 } elsif ($self->{nc} == 0x0027 and # '
3377 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3378 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3379
3380 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3381 $self->{ct}->{value} = ''; # ENTITY
3382
3383 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3384 $self->{line_prev} = $self->{line};
3385 $self->{column_prev} = $self->{column};
3386 $self->{column}++;
3387 $self->{nc}
3388 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3389 } else {
3390 $self->{set_nc}->($self);
3391 }
3392
3393 redo A;
3394 } elsif ($self->{is_xml} and
3395 $self->{ct}->{type} == DOCTYPE_TOKEN and
3396 $self->{nc} == 0x005B) { # [
3397
3398 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3399 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3400 $self->{in_subset} = 1;
3401
3402 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403 $self->{line_prev} = $self->{line};
3404 $self->{column_prev} = $self->{column};
3405 $self->{column}++;
3406 $self->{nc}
3407 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408 } else {
3409 $self->{set_nc}->($self);
3410 }
3411
3412 return ($self->{ct}); # DOCTYPE
3413 redo A;
3414 } else {
3415 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3416
3417 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3418
3419 $self->{ct}->{quirks} = 1;
3420 $self->{state} = BOGUS_DOCTYPE_STATE;
3421 } else {
3422
3423 $self->{state} = BOGUS_MD_STATE;
3424 }
3425
3426
3427 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3428 $self->{line_prev} = $self->{line};
3429 $self->{column_prev} = $self->{column};
3430 $self->{column}++;
3431 $self->{nc}
3432 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3433 } else {
3434 $self->{set_nc}->($self);
3435 }
3436
3437 redo A;
3438 }
3439 } elsif ($self->{state} == PUBLIC_STATE) {
3440 ## ASCII case-insensitive
3441 if ($self->{nc} == [
3442 undef,
3443 0x0055, # U
3444 0x0042, # B
3445 0x004C, # L
3446 0x0049, # I
3447 ]->[length $self->{kwd}] or
3448 $self->{nc} == [
3449 undef,
3450 0x0075, # u
3451 0x0062, # b
3452 0x006C, # l
3453 0x0069, # i
3454 ]->[length $self->{kwd}]) {
3455
3456 ## Stay in the state.
3457 $self->{kwd} .= chr $self->{nc};
3458
3459 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3460 $self->{line_prev} = $self->{line};
3461 $self->{column_prev} = $self->{column};
3462 $self->{column}++;
3463 $self->{nc}
3464 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3465 } else {
3466 $self->{set_nc}->($self);
3467 }
3468
3469 redo A;
3470 } elsif ((length $self->{kwd}) == 5 and
3471 ($self->{nc} == 0x0043 or # C
3472 $self->{nc} == 0x0063)) { # c
3473 if ($self->{is_xml} and
3474 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3475
3476 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3477 text => 'PUBLIC',
3478 line => $self->{line_prev},
3479 column => $self->{column_prev} - 4);
3480 } else {
3481
3482 }
3483 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3484
3485 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3486 $self->{line_prev} = $self->{line};
3487 $self->{column_prev} = $self->{column};
3488 $self->{column}++;
3489 $self->{nc}
3490 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3491 } else {
3492 $self->{set_nc}->($self);
3493 }
3494
3495 redo A;
3496 } else {
3497 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3498 line => $self->{line_prev},
3499 column => $self->{column_prev} + 1 - length $self->{kwd});
3500 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3501
3502 $self->{ct}->{quirks} = 1;
3503 $self->{state} = BOGUS_DOCTYPE_STATE;
3504 } else {
3505
3506 $self->{state} = BOGUS_MD_STATE;
3507 }
3508 ## Reconsume.
3509 redo A;
3510 }
3511 } elsif ($self->{state} == SYSTEM_STATE) {
3512 ## ASCII case-insensitive
3513 if ($self->{nc} == [
3514 undef,
3515 0x0059, # Y
3516 0x0053, # S
3517 0x0054, # T
3518 0x0045, # E
3519 ]->[length $self->{kwd}] or
3520 $self->{nc} == [
3521 undef,
3522 0x0079, # y
3523 0x0073, # s
3524 0x0074, # t
3525 0x0065, # e
3526 ]->[length $self->{kwd}]) {
3527
3528 ## Stay in the state.
3529 $self->{kwd} .= chr $self->{nc};
3530
3531 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3532 $self->{line_prev} = $self->{line};
3533 $self->{column_prev} = $self->{column};
3534 $self->{column}++;
3535 $self->{nc}
3536 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3537 } else {
3538 $self->{set_nc}->($self);
3539 }
3540
3541 redo A;
3542 } elsif ((length $self->{kwd}) == 5 and
3543 ($self->{nc} == 0x004D or # M
3544 $self->{nc} == 0x006D)) { # m
3545 if ($self->{is_xml} and
3546 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3547
3548 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3549 text => 'SYSTEM',
3550 line => $self->{line_prev},
3551 column => $self->{column_prev} - 4);
3552 } else {
3553
3554 }
3555 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3556
3557 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3558 $self->{line_prev} = $self->{line};
3559 $self->{column_prev} = $self->{column};
3560 $self->{column}++;
3561 $self->{nc}
3562 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3563 } else {
3564 $self->{set_nc}->($self);
3565 }
3566
3567 redo A;
3568 } else {
3569 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3570 line => $self->{line_prev},
3571 column => $self->{column_prev} + 1 - length $self->{kwd});
3572 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3573
3574 $self->{ct}->{quirks} = 1;
3575 $self->{state} = BOGUS_DOCTYPE_STATE;
3576 } else {
3577
3578 $self->{state} = BOGUS_MD_STATE;
3579 }
3580 ## Reconsume.
3581 redo A;
3582 }
3583 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3584 if ($is_space->{$self->{nc}}) {
3585
3586 ## Stay in the state
3587
3588 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3589 $self->{line_prev} = $self->{line};
3590 $self->{column_prev} = $self->{column};
3591 $self->{column}++;
3592 $self->{nc}
3593 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3594 } else {
3595 $self->{set_nc}->($self);
3596 }
3597
3598 redo A;
3599 } elsif ($self->{nc} eq 0x0022) { # "
3600
3601 $self->{ct}->{pubid} = ''; # DOCTYPE
3602 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3603
3604 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3605 $self->{line_prev} = $self->{line};
3606 $self->{column_prev} = $self->{column};
3607 $self->{column}++;
3608 $self->{nc}
3609 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3610 } else {
3611 $self->{set_nc}->($self);
3612 }
3613
3614 redo A;
3615 } elsif ($self->{nc} eq 0x0027) { # '
3616
3617 $self->{ct}->{pubid} = ''; # DOCTYPE
3618 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3619
3620 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3621 $self->{line_prev} = $self->{line};
3622 $self->{column_prev} = $self->{column};
3623 $self->{column}++;
3624 $self->{nc}
3625 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3626 } else {
3627 $self->{set_nc}->($self);
3628 }
3629
3630 redo A;
3631 } elsif ($self->{nc} eq 0x003E) { # >
3632 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3633
3634 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3635
3636 $self->{state} = DATA_STATE;
3637 $self->{s_kwd} = '';
3638 $self->{ct}->{quirks} = 1;
3639 } else {
3640
3641 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3642 }
3643
3644
3645 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3646 $self->{line_prev} = $self->{line};
3647 $self->{column_prev} = $self->{column};
3648 $self->{column}++;
3649 $self->{nc}
3650 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3651 } else {
3652 $self->{set_nc}->($self);
3653 }
3654
3655 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3656 redo A;
3657 } elsif ($self->{nc} == -1) {
3658 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3659
3660 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3661 $self->{state} = DATA_STATE;
3662 $self->{s_kwd} = '';
3663 $self->{ct}->{quirks} = 1;
3664 } else {
3665
3666 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3667 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3668 }
3669
3670 ## reconsume
3671 return ($self->{ct}); # DOCTYPE
3672 redo A;
3673 } elsif ($self->{is_xml} and
3674 $self->{ct}->{type} == DOCTYPE_TOKEN and
3675 $self->{nc} == 0x005B) { # [
3676
3677 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3678 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3679 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3680 $self->{in_subset} = 1;
3681
3682 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3683 $self->{line_prev} = $self->{line};
3684 $self->{column_prev} = $self->{column};
3685 $self->{column}++;
3686 $self->{nc}
3687 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3688 } else {
3689 $self->{set_nc}->($self);
3690 }
3691
3692 return ($self->{ct}); # DOCTYPE
3693 redo A;
3694 } else {
3695 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3696
3697 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3698
3699 $self->{ct}->{quirks} = 1;
3700 $self->{state} = BOGUS_DOCTYPE_STATE;
3701 } else {
3702
3703 $self->{state} = BOGUS_MD_STATE;
3704 }
3705
3706
3707 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3708 $self->{line_prev} = $self->{line};
3709 $self->{column_prev} = $self->{column};
3710 $self->{column}++;
3711 $self->{nc}
3712 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3713 } else {
3714 $self->{set_nc}->($self);
3715 }
3716
3717 redo A;
3718 }
3719 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3720 if ($self->{nc} == 0x0022) { # "
3721
3722 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3723
3724 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3725 $self->{line_prev} = $self->{line};
3726 $self->{column_prev} = $self->{column};
3727 $self->{column}++;
3728 $self->{nc}
3729 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3730 } else {
3731 $self->{set_nc}->($self);
3732 }
3733
3734 redo A;
3735 } elsif ($self->{nc} == 0x003E) { # >
3736 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3737
3738 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3739
3740 $self->{state} = DATA_STATE;
3741 $self->{s_kwd} = '';
3742 $self->{ct}->{quirks} = 1;
3743 } else {
3744
3745 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3746 }
3747
3748
3749 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3750 $self->{line_prev} = $self->{line};
3751 $self->{column_prev} = $self->{column};
3752 $self->{column}++;
3753 $self->{nc}
3754 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3755 } else {
3756 $self->{set_nc}->($self);
3757 }
3758
3759 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3760 redo A;
3761 } elsif ($self->{nc} == -1) {
3762 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3763
3764 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3765
3766 $self->{state} = DATA_STATE;
3767 $self->{s_kwd} = '';
3768 $self->{ct}->{quirks} = 1;
3769 } else {
3770
3771 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3772 }
3773
3774 ## Reconsume.
3775 return ($self->{ct}); # DOCTYPE
3776 redo A;
3777 } else {
3778
3779 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3780 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3781 length $self->{ct}->{pubid});
3782
3783 ## Stay in the state
3784
3785 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3786 $self->{line_prev} = $self->{line};
3787 $self->{column_prev} = $self->{column};
3788 $self->{column}++;
3789 $self->{nc}
3790 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3791 } else {
3792 $self->{set_nc}->($self);
3793 }
3794
3795 redo A;
3796 }
3797 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3798 if ($self->{nc} == 0x0027) { # '
3799
3800 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3801
3802 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3803 $self->{line_prev} = $self->{line};
3804 $self->{column_prev} = $self->{column};
3805 $self->{column}++;
3806 $self->{nc}
3807 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3808 } else {
3809 $self->{set_nc}->($self);
3810 }
3811
3812 redo A;
3813 } elsif ($self->{nc} == 0x003E) { # >
3814 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3815
3816 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3817
3818 $self->{state} = DATA_STATE;
3819 $self->{s_kwd} = '';
3820 $self->{ct}->{quirks} = 1;
3821 } else {
3822
3823 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3824 }
3825
3826
3827 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3828 $self->{line_prev} = $self->{line};
3829 $self->{column_prev} = $self->{column};
3830 $self->{column}++;
3831 $self->{nc}
3832 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3833 } else {
3834 $self->{set_nc}->($self);
3835 }
3836
3837 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3838 redo A;
3839 } elsif ($self->{nc} == -1) {
3840 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3841
3842 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3843
3844 $self->{state} = DATA_STATE;
3845 $self->{s_kwd} = '';
3846 $self->{ct}->{quirks} = 1;
3847 } else {
3848
3849 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3850 }
3851
3852 ## reconsume
3853 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3854 redo A;
3855 } else {
3856
3857 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3858 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3859 length $self->{ct}->{pubid});
3860
3861 ## Stay in the state
3862
3863 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3864 $self->{line_prev} = $self->{line};
3865 $self->{column_prev} = $self->{column};
3866 $self->{column}++;
3867 $self->{nc}
3868 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3869 } else {
3870 $self->{set_nc}->($self);
3871 }
3872
3873 redo A;
3874 }
3875 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3876 if ($is_space->{$self->{nc}}) {
3877
3878 ## Stay in the state
3879
3880 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3881 $self->{line_prev} = $self->{line};
3882 $self->{column_prev} = $self->{column};
3883 $self->{column}++;
3884 $self->{nc}
3885 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3886 } else {
3887 $self->{set_nc}->($self);
3888 }
3889
3890 redo A;
3891 } elsif ($self->{nc} == 0x0022) { # "
3892
3893 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3894 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3895
3896 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3897 $self->{line_prev} = $self->{line};
3898 $self->{column_prev} = $self->{column};
3899 $self->{column}++;
3900 $self->{nc}
3901 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3902 } else {
3903 $self->{set_nc}->($self);
3904 }
3905
3906 redo A;
3907 } elsif ($self->{nc} == 0x0027) { # '
3908
3909 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3910 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3911
3912 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3913 $self->{line_prev} = $self->{line};
3914 $self->{column_prev} = $self->{column};
3915 $self->{column}++;
3916 $self->{nc}
3917 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3918 } else {
3919 $self->{set_nc}->($self);
3920 }
3921
3922 redo A;
3923 } elsif ($self->{nc} == 0x003E) { # >
3924 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3925 if ($self->{is_xml}) {
3926
3927 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3928 } else {
3929
3930 }
3931 $self->{state} = DATA_STATE;
3932 $self->{s_kwd} = '';
3933 } else {
3934 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3935
3936 } else {
3937
3938 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3939 }
3940 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3941 }
3942
3943
3944 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3945 $self->{line_prev} = $self->{line};
3946 $self->{column_prev} = $self->{column};
3947 $self->{column}++;
3948 $self->{nc}
3949 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3950 } else {
3951 $self->{set_nc}->($self);
3952 }
3953
3954 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3955 redo A;
3956 } elsif ($self->{nc} == -1) {
3957 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3958
3959 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3960
3961 $self->{state} = DATA_STATE;
3962 $self->{s_kwd} = '';
3963 $self->{ct}->{quirks} = 1;
3964 } else {
3965 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3966 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3967 }
3968
3969 ## reconsume
3970 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3971 redo A;
3972 } elsif ($self->{is_xml} and
3973 $self->{ct}->{type} == DOCTYPE_TOKEN and
3974 $self->{nc} == 0x005B) { # [
3975
3976 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3977 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3978 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3979 $self->{in_subset} = 1;
3980
3981 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3982 $self->{line_prev} = $self->{line};
3983 $self->{column_prev} = $self->{column};
3984 $self->{column}++;
3985 $self->{nc}
3986 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3987 } else {
3988 $self->{set_nc}->($self);
3989 }
3990
3991 return ($self->{ct}); # DOCTYPE
3992 redo A;
3993 } else {
3994 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3995
3996 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3997
3998 $self->{ct}->{quirks} = 1;
3999 $self->{state} = BOGUS_DOCTYPE_STATE;
4000 } else {
4001
4002 $self->{state} = BOGUS_MD_STATE;
4003 }
4004
4005
4006 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4007 $self->{line_prev} = $self->{line};
4008 $self->{column_prev} = $self->{column};
4009 $self->{column}++;
4010 $self->{nc}
4011 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4012 } else {
4013 $self->{set_nc}->($self);
4014 }
4015
4016 redo A;
4017 }
4018 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4019 if ($is_space->{$self->{nc}}) {
4020
4021 ## Stay in the state
4022
4023 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4024 $self->{line_prev} = $self->{line};
4025 $self->{column_prev} = $self->{column};
4026 $self->{column}++;
4027 $self->{nc}
4028 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4029 } else {
4030 $self->{set_nc}->($self);
4031 }
4032
4033 redo A;
4034 } elsif ($self->{nc} == 0x0022) { # "
4035
4036 $self->{ct}->{sysid} = ''; # DOCTYPE
4037 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4038
4039 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4040 $self->{line_prev} = $self->{line};
4041 $self->{column_prev} = $self->{column};
4042 $self->{column}++;
4043 $self->{nc}
4044 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4045 } else {
4046 $self->{set_nc}->($self);
4047 }
4048
4049 redo A;
4050 } elsif ($self->{nc} == 0x0027) { # '
4051
4052 $self->{ct}->{sysid} = ''; # DOCTYPE
4053 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4054
4055 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4056 $self->{line_prev} = $self->{line};
4057 $self->{column_prev} = $self->{column};
4058 $self->{column}++;
4059 $self->{nc}
4060 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4061 } else {
4062 $self->{set_nc}->($self);
4063 }
4064
4065 redo A;
4066 } elsif ($self->{nc} == 0x003E) { # >
4067 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4068
4069 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4070 $self->{line_prev} = $self->{line};
4071 $self->{column_prev} = $self->{column};
4072 $self->{column}++;
4073 $self->{nc}
4074 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4075 } else {
4076 $self->{set_nc}->($self);
4077 }
4078
4079
4080 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4081
4082 $self->{state} = DATA_STATE;
4083 $self->{s_kwd} = '';
4084 $self->{ct}->{quirks} = 1;
4085 } else {
4086
4087 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4088 }
4089
4090 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4091 redo A;
4092 } elsif ($self->{nc} == -1) {
4093 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4094
4095 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4096 $self->{state} = DATA_STATE;
4097 $self->{s_kwd} = '';
4098 $self->{ct}->{quirks} = 1;
4099 } else {
4100
4101 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4102 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4103 }
4104
4105 ## reconsume
4106 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4107 redo A;
4108 } elsif ($self->{is_xml} and
4109 $self->{ct}->{type} == DOCTYPE_TOKEN and
4110 $self->{nc} == 0x005B) { # [
4111
4112 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4113
4114 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4115 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4116 $self->{in_subset} = 1;
4117
4118 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4119 $self->{line_prev} = $self->{line};
4120 $self->{column_prev} = $self->{column};
4121 $self->{column}++;
4122 $self->{nc}
4123 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4124 } else {
4125 $self->{set_nc}->($self);
4126 }
4127
4128 return ($self->{ct}); # DOCTYPE
4129 redo A;
4130 } else {
4131 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4132
4133 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4134
4135 $self->{ct}->{quirks} = 1;
4136 $self->{state} = BOGUS_DOCTYPE_STATE;
4137 } else {
4138
4139 $self->{state} = BOGUS_MD_STATE;
4140 }
4141
4142
4143 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4144 $self->{line_prev} = $self->{line};
4145 $self->{column_prev} = $self->{column};
4146 $self->{column}++;
4147 $self->{nc}
4148 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4149 } else {
4150 $self->{set_nc}->($self);
4151 }
4152
4153 redo A;
4154 }
4155 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4156 if ($self->{nc} == 0x0022) { # "
4157
4158 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4159
4160 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4161 $self->{line_prev} = $self->{line};
4162 $self->{column_prev} = $self->{column};
4163 $self->{column}++;
4164 $self->{nc}
4165 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4166 } else {
4167 $self->{set_nc}->($self);
4168 }
4169
4170 redo A;
4171 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4172 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4173
4174 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4175
4176 $self->{state} = DATA_STATE;
4177 $self->{s_kwd} = '';
4178 $self->{ct}->{quirks} = 1;
4179 } else {
4180
4181 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4182 }
4183
4184
4185 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4186 $self->{line_prev} = $self->{line};
4187 $self->{column_prev} = $self->{column};
4188 $self->{column}++;
4189 $self->{nc}
4190 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4191 } else {
4192 $self->{set_nc}->($self);
4193 }
4194
4195 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4196 redo A;
4197 } elsif ($self->{nc} == -1) {
4198 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4199
4200 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4201
4202 $self->{state} = DATA_STATE;
4203 $self->{s_kwd} = '';
4204 $self->{ct}->{quirks} = 1;
4205 } else {
4206
4207 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4208 }
4209
4210 ## reconsume
4211 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4212 redo A;
4213 } else {
4214
4215 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4216 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4217 length $self->{ct}->{sysid});
4218
4219 ## Stay in the state
4220
4221 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4222 $self->{line_prev} = $self->{line};
4223 $self->{column_prev} = $self->{column};
4224 $self->{column}++;
4225 $self->{nc}
4226 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4227 } else {
4228 $self->{set_nc}->($self);
4229 }
4230
4231 redo A;
4232 }
4233 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4234 if ($self->{nc} == 0x0027) { # '
4235
4236 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4237
4238 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4239 $self->{line_prev} = $self->{line};
4240 $self->{column_prev} = $self->{column};
4241 $self->{column}++;
4242 $self->{nc}
4243 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4244 } else {
4245 $self->{set_nc}->($self);
4246 }
4247
4248 redo A;
4249 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4250
4251 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4252
4253 $self->{state} = DATA_STATE;
4254 $self->{s_kwd} = '';
4255
4256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257 $self->{line_prev} = $self->{line};
4258 $self->{column_prev} = $self->{column};
4259 $self->{column}++;
4260 $self->{nc}
4261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262 } else {
4263 $self->{set_nc}->($self);
4264 }
4265
4266
4267 $self->{ct}->{quirks} = 1;
4268 return ($self->{ct}); # DOCTYPE
4269
4270 redo A;
4271 } elsif ($self->{nc} == -1) {
4272 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4273
4274 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4275
4276 $self->{state} = DATA_STATE;
4277 $self->{s_kwd} = '';
4278 $self->{ct}->{quirks} = 1;
4279 } else {
4280
4281 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4282 }
4283
4284 ## reconsume
4285 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4286 redo A;
4287 } else {
4288
4289 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4290 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4291 length $self->{ct}->{sysid});
4292
4293 ## Stay in the state
4294
4295 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4296 $self->{line_prev} = $self->{line};
4297 $self->{column_prev} = $self->{column};
4298 $self->{column}++;
4299 $self->{nc}
4300 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4301 } else {
4302 $self->{set_nc}->($self);
4303 }
4304
4305 redo A;
4306 }
4307 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4308 if ($is_space->{$self->{nc}}) {
4309 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4310
4311 $self->{state} = BEFORE_NDATA_STATE;
4312 } else {
4313
4314 ## Stay in the state
4315 }
4316
4317 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4318 $self->{line_prev} = $self->{line};
4319 $self->{column_prev} = $self->{column};
4320 $self->{column}++;
4321 $self->{nc}
4322 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4323 } else {
4324 $self->{set_nc}->($self);
4325 }
4326
4327 redo A;
4328 } elsif ($self->{nc} == 0x003E) { # >
4329 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4330
4331 $self->{state} = DATA_STATE;
4332 $self->{s_kwd} = '';
4333 } else {
4334
4335 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4336 }
4337
4338
4339 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4340 $self->{line_prev} = $self->{line};
4341 $self->{column_prev} = $self->{column};
4342 $self->{column}++;
4343 $self->{nc}
4344 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4345 } else {
4346 $self->{set_nc}->($self);
4347 }
4348
4349 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4350 redo A;
4351 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4352 ($self->{nc} == 0x004E or # N
4353 $self->{nc} == 0x006E)) { # n
4354
4355 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4356 $self->{state} = NDATA_STATE;
4357 $self->{kwd} = chr $self->{nc};
4358
4359 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4360 $self->{line_prev} = $self->{line};
4361 $self->{column_prev} = $self->{column};
4362 $self->{column}++;
4363 $self->{nc}
4364 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4365 } else {
4366 $self->{set_nc}->($self);
4367 }
4368
4369 redo A;
4370 } elsif ($self->{nc} == -1) {
4371 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4372
4373 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4374 $self->{state} = DATA_STATE;
4375 $self->{s_kwd} = '';
4376 $self->{ct}->{quirks} = 1;
4377 } else {
4378
4379 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4380 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4381 }
4382
4383 ## reconsume
4384 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4385 redo A;
4386 } elsif ($self->{is_xml} and
4387 $self->{ct}->{type} == DOCTYPE_TOKEN and
4388 $self->{nc} == 0x005B) { # [
4389
4390 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4391 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4392 $self->{in_subset} = 1;
4393
4394 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4395 $self->{line_prev} = $self->{line};
4396 $self->{column_prev} = $self->{column};
4397 $self->{column}++;
4398 $self->{nc}
4399 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4400 } else {
4401 $self->{set_nc}->($self);
4402 }
4403
4404 return ($self->{ct}); # DOCTYPE
4405 redo A;
4406 } else {
4407 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4408
4409 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4410
4411 #$self->{ct}->{quirks} = 1;
4412 $self->{state} = BOGUS_DOCTYPE_STATE;
4413 } else {
4414
4415 $self->{state} = BOGUS_MD_STATE;
4416 }
4417
4418
4419 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4420 $self->{line_prev} = $self->{line};
4421 $self->{column_prev} = $self->{column};
4422 $self->{column}++;
4423 $self->{nc}
4424 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4425 } else {
4426 $self->{set_nc}->($self);
4427 }
4428
4429 redo A;
4430 }
4431 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4432 if ($is_space->{$self->{nc}}) {
4433
4434 ## Stay in the state.
4435
4436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4437 $self->{line_prev} = $self->{line};
4438 $self->{column_prev} = $self->{column};
4439 $self->{column}++;
4440 $self->{nc}
4441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4442 } else {
4443 $self->{set_nc}->($self);
4444 }
4445
4446 redo A;
4447 } elsif ($self->{nc} == 0x003E) { # >
4448
4449 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4450
4451 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4452 $self->{line_prev} = $self->{line};
4453 $self->{column_prev} = $self->{column};
4454 $self->{column}++;
4455 $self->{nc}
4456 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4457 } else {
4458 $self->{set_nc}->($self);
4459 }
4460
4461 return ($self->{ct}); # ENTITY
4462 redo A;
4463 } elsif ($self->{nc} == 0x004E or # N
4464 $self->{nc} == 0x006E) { # n
4465
4466 $self->{state} = NDATA_STATE;
4467 $self->{kwd} = chr $self->{nc};
4468
4469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4470 $self->{line_prev} = $self->{line};
4471 $self->{column_prev} = $self->{column};
4472 $self->{column}++;
4473 $self->{nc}
4474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4475 } else {
4476 $self->{set_nc}->($self);
4477 }
4478
4479 redo A;
4480 } elsif ($self->{nc} == -1) {
4481
4482 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4483 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4484 ## reconsume
4485 return ($self->{ct}); # ENTITY
4486 redo A;
4487 } else {
4488
4489 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4490 $self->{state} = BOGUS_MD_STATE;
4491
4492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4493 $self->{line_prev} = $self->{line};
4494 $self->{column_prev} = $self->{column};
4495 $self->{column}++;
4496 $self->{nc}
4497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4498 } else {
4499 $self->{set_nc}->($self);
4500 }
4501
4502 redo A;
4503 }
4504 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4505 if ($self->{nc} == 0x003E) { # >
4506
4507 $self->{state} = DATA_STATE;
4508 $self->{s_kwd} = '';
4509
4510 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4511 $self->{line_prev} = $self->{line};
4512 $self->{column_prev} = $self->{column};
4513 $self->{column}++;
4514 $self->{nc}
4515 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4516 } else {
4517 $self->{set_nc}->($self);
4518 }
4519
4520
4521 return ($self->{ct}); # DOCTYPE
4522
4523 redo A;
4524 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4525
4526 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4527 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4528 $self->{in_subset} = 1;
4529
4530 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4531 $self->{line_prev} = $self->{line};
4532 $self->{column_prev} = $self->{column};
4533 $self->{column}++;
4534 $self->{nc}
4535 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4536 } else {
4537 $self->{set_nc}->($self);
4538 }
4539
4540 return ($self->{ct}); # DOCTYPE
4541 redo A;
4542 } elsif ($self->{nc} == -1) {
4543
4544 $self->{state} = DATA_STATE;
4545 $self->{s_kwd} = '';
4546 ## reconsume
4547
4548 return ($self->{ct}); # DOCTYPE
4549
4550 redo A;
4551 } else {
4552
4553 my $s = '';
4554 $self->{read_until}->($s, q{>[}, 0);
4555
4556 ## Stay in the state
4557
4558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4559 $self->{line_prev} = $self->{line};
4560 $self->{column_prev} = $self->{column};
4561 $self->{column}++;
4562 $self->{nc}
4563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4564 } else {
4565 $self->{set_nc}->($self);
4566 }
4567
4568 redo A;
4569 }
4570 } elsif ($self->{state} == CDATA_SECTION_STATE) {
4571 ## NOTE: "CDATA section state" in the spec is jointly implemented
4572 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4573 ## and |CDATA_SECTION_MSE2_STATE|.
4574
4575 ## XML5: "CDATA state".
4576
4577 if ($self->{nc} == 0x005D) { # ]
4578
4579 $self->{state} = CDATA_SECTION_MSE1_STATE;
4580
4581 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4582 $self->{line_prev} = $self->{line};
4583 $self->{column_prev} = $self->{column};
4584 $self->{column}++;
4585 $self->{nc}
4586 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4587 } else {
4588 $self->{set_nc}->($self);
4589 }
4590
4591 redo A;
4592 } elsif ($self->{nc} == -1) {
4593 if ($self->{is_xml}) {
4594
4595 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4596 } else {
4597
4598 }
4599
4600 $self->{state} = DATA_STATE;
4601 $self->{s_kwd} = '';
4602 ## Reconsume.
4603 if (length $self->{ct}->{data}) { # character
4604
4605 return ($self->{ct}); # character
4606 } else {
4607
4608 ## No token to emit. $self->{ct} is discarded.
4609 }
4610 redo A;
4611 } else {
4612
4613 $self->{ct}->{data} .= chr $self->{nc};
4614 $self->{read_until}->($self->{ct}->{data},
4615 q<]>,
4616 length $self->{ct}->{data});
4617
4618 ## Stay in the state.
4619
4620 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4621 $self->{line_prev} = $self->{line};
4622 $self->{column_prev} = $self->{column};
4623 $self->{column}++;
4624 $self->{nc}
4625 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4626 } else {
4627 $self->{set_nc}->($self);
4628 }
4629
4630 redo A;
4631 }
4632
4633 ## ISSUE: "text tokens" in spec.
4634 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4635 ## XML5: "CDATA bracket state".
4636
4637 if ($self->{nc} == 0x005D) { # ]
4638
4639 $self->{state} = CDATA_SECTION_MSE2_STATE;
4640
4641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4642 $self->{line_prev} = $self->{line};
4643 $self->{column_prev} = $self->{column};
4644 $self->{column}++;
4645 $self->{nc}
4646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4647 } else {
4648 $self->{set_nc}->($self);
4649 }
4650
4651 redo A;
4652 } else {
4653
4654 ## XML5: If EOF, "]" is not appended and changed to the data state.
4655 $self->{ct}->{data} .= ']';
4656 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4657 ## Reconsume.
4658 redo A;
4659 }
4660 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4661 ## XML5: "CDATA end state".
4662
4663 if ($self->{nc} == 0x003E) { # >
4664 $self->{state} = DATA_STATE;
4665 $self->{s_kwd} = '';
4666
4667 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4668 $self->{line_prev} = $self->{line};
4669 $self->{column_prev} = $self->{column};
4670 $self->{column}++;
4671 $self->{nc}
4672 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4673 } else {
4674 $self->{set_nc}->($self);
4675 }
4676
4677 if (length $self->{ct}->{data}) { # character
4678
4679 return ($self->{ct}); # character
4680 } else {
4681
4682 ## No token to emit. $self->{ct} is discarded.
4683 }
4684 redo A;
4685 } elsif ($self->{nc} == 0x005D) { # ]
4686 # character
4687 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4688 ## Stay in the state.
4689
4690 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4691 $self->{line_prev} = $self->{line};
4692 $self->{column_prev} = $self->{column};
4693 $self->{column}++;
4694 $self->{nc}
4695 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4696 } else {
4697 $self->{set_nc}->($self);
4698 }
4699
4700 redo A;
4701 } else {
4702
4703 $self->{ct}->{data} .= ']]'; # character
4704 $self->{state} = CDATA_SECTION_STATE;
4705 ## Reconsume. ## XML5: Emit.
4706 redo A;
4707 }
4708 } elsif ($self->{state} == ENTITY_STATE) {
4709 if ($is_space->{$self->{nc}} or
4710 {
4711 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4712 $self->{entity_add} => 1,
4713 }->{$self->{nc}}) {
4714 if ($self->{is_xml}) {
4715
4716 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4717 line => $self->{line_prev},
4718 column => $self->{column_prev}
4719 + ($self->{nc} == -1 ? 1 : 0));
4720 } else {
4721
4722 ## No error
4723 }
4724 ## Don't consume
4725 ## Return nothing.
4726 #
4727 } elsif ($self->{nc} == 0x0023) { # #
4728
4729 $self->{state} = ENTITY_HASH_STATE;
4730 $self->{kwd} = '#';
4731
4732 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4733 $self->{line_prev} = $self->{line};
4734 $self->{column_prev} = $self->{column};
4735 $self->{column}++;
4736 $self->{nc}
4737 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4738 } else {
4739 $self->{set_nc}->($self);
4740 }
4741
4742 redo A;
4743 } elsif ($self->{is_xml} or
4744 (0x0041 <= $self->{nc} and
4745 $self->{nc} <= 0x005A) or # A..Z
4746 (0x0061 <= $self->{nc} and
4747 $self->{nc} <= 0x007A)) { # a..z
4748
4749 require Whatpm::_NamedEntityList;
4750 $self->{state} = ENTITY_NAME_STATE;
4751 $self->{kwd} = chr $self->{nc};
4752 $self->{entity__value} = $self->{kwd};
4753 $self->{entity__match} = 0;
4754
4755 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4756 $self->{line_prev} = $self->{line};
4757 $self->{column_prev} = $self->{column};
4758 $self->{column}++;
4759 $self->{nc}
4760 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4761 } else {
4762 $self->{set_nc}->($self);
4763 }
4764
4765 redo A;
4766 } else {
4767
4768 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4769 ## Return nothing.
4770 #
4771 }
4772
4773 ## NOTE: No character is consumed by the "consume a character
4774 ## reference" algorithm. In other words, there is an "&" character
4775 ## that does not introduce a character reference, which would be
4776 ## appended to the parent element or the attribute value in later
4777 ## processing by the tokenizer.
4778
4779 if ($self->{prev_state} == DATA_STATE) {
4780
4781 $self->{state} = $self->{prev_state};
4782 $self->{s_kwd} = '';
4783 ## Reconsume.
4784 return ({type => CHARACTER_TOKEN, data => '&',
4785 line => $self->{line_prev},
4786 column => $self->{column_prev},
4787 });
4788 redo A;
4789 } else {
4790
4791 $self->{ca}->{value} .= '&';
4792 $self->{state} = $self->{prev_state};
4793 $self->{s_kwd} = '';
4794 ## Reconsume.
4795 redo A;
4796 }
4797 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4798 if ($self->{nc} == 0x0078) { # x
4799
4800 $self->{state} = HEXREF_X_STATE;
4801 $self->{kwd} .= chr $self->{nc};
4802
4803 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4804 $self->{line_prev} = $self->{line};
4805 $self->{column_prev} = $self->{column};
4806 $self->{column}++;
4807 $self->{nc}
4808 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4809 } else {
4810 $self->{set_nc}->($self);
4811 }
4812
4813 redo A;
4814 } elsif ($self->{nc} == 0x0058) { # X
4815
4816 if ($self->{is_xml}) {
4817 $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4818 }
4819 $self->{state} = HEXREF_X_STATE;
4820 $self->{kwd} .= chr $self->{nc};
4821
4822 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4823 $self->{line_prev} = $self->{line};
4824 $self->{column_prev} = $self->{column};
4825 $self->{column}++;
4826 $self->{nc}
4827 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4828 } else {
4829 $self->{set_nc}->($self);
4830 }
4831
4832 redo A;
4833 } elsif (0x0030 <= $self->{nc} and
4834 $self->{nc} <= 0x0039) { # 0..9
4835
4836 $self->{state} = NCR_NUM_STATE;
4837 $self->{kwd} = $self->{nc} - 0x0030;
4838
4839 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4840 $self->{line_prev} = $self->{line};
4841 $self->{column_prev} = $self->{column};
4842 $self->{column}++;
4843 $self->{nc}
4844 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4845 } else {
4846 $self->{set_nc}->($self);
4847 }
4848
4849 redo A;
4850 } else {
4851 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4852 line => $self->{line_prev},
4853 column => $self->{column_prev} - 1);
4854
4855 ## NOTE: According to the spec algorithm, nothing is returned,
4856 ## and then "&#" is appended to the parent element or the attribute
4857 ## value in the later processing.
4858
4859 if ($self->{prev_state} == DATA_STATE) {
4860
4861 $self->{state} = $self->{prev_state};
4862 $self->{s_kwd} = '';
4863 ## Reconsume.
4864 return ({type => CHARACTER_TOKEN,
4865 data => '&#',
4866 line => $self->{line_prev},
4867 column => $self->{column_prev} - 1,
4868 });
4869 redo A;
4870 } else {
4871
4872 $self->{ca}->{value} .= '&#';
4873 $self->{state} = $self->{prev_state};
4874 $self->{s_kwd} = '';
4875 ## Reconsume.
4876 redo A;
4877 }
4878 }
4879 } elsif ($self->{state} == NCR_NUM_STATE) {
4880 if (0x0030 <= $self->{nc} and
4881 $self->{nc} <= 0x0039) { # 0..9
4882
4883 $self->{kwd} *= 10;
4884 $self->{kwd} += $self->{nc} - 0x0030;
4885
4886 ## Stay in the state.
4887
4888 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4889 $self->{line_prev} = $self->{line};
4890 $self->{column_prev} = $self->{column};
4891 $self->{column}++;
4892 $self->{nc}
4893 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4894 } else {
4895 $self->{set_nc}->($self);
4896 }
4897
4898 redo A;
4899 } elsif ($self->{nc} == 0x003B) { # ;
4900
4901
4902 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4903 $self->{line_prev} = $self->{line};
4904 $self->{column_prev} = $self->{column};
4905 $self->{column}++;
4906 $self->{nc}
4907 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4908 } else {
4909 $self->{set_nc}->($self);
4910 }
4911
4912 #
4913 } else {
4914
4915 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4916 ## Reconsume.
4917 #
4918 }
4919
4920 my $code = $self->{kwd};
4921 my $l = $self->{line_prev};
4922 my $c = $self->{column_prev};
4923 if ((not $self->{is_xml} and $charref_map->{$code}) or
4924 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4925 ($self->{is_xml} and $code == 0x0000)) {
4926
4927 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4928 text => (sprintf 'U+%04X', $code),
4929 line => $l, column => $c);
4930 $code = $charref_map->{$code};
4931 } elsif ($code > 0x10FFFF) {
4932
4933 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4934 text => (sprintf 'U-%08X', $code),
4935 line => $l, column => $c);
4936 $code = 0xFFFD;
4937 }
4938
4939 if ($self->{prev_state} == DATA_STATE) {
4940
4941 $self->{state} = $self->{prev_state};
4942 $self->{s_kwd} = '';
4943 ## Reconsume.
4944 return ({type => CHARACTER_TOKEN, data => chr $code,
4945 has_reference => 1,
4946 line => $l, column => $c,
4947 });
4948 redo A;
4949 } else {
4950
4951 $self->{ca}->{value} .= chr $code;
4952 $self->{ca}->{has_reference} = 1;
4953 $self->{state} = $self->{prev_state};
4954 $self->{s_kwd} = '';
4955 ## Reconsume.
4956 redo A;
4957 }
4958 } elsif ($self->{state} == HEXREF_X_STATE) {
4959 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4960 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4961 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4962 # 0..9, A..F, a..f
4963
4964 $self->{state} = HEXREF_HEX_STATE;
4965 $self->{kwd} = 0;
4966 ## Reconsume.
4967 redo A;
4968 } else {
4969 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4970 line => $self->{line_prev},
4971 column => $self->{column_prev} - 2);
4972
4973 ## NOTE: According to the spec algorithm, nothing is returned,
4974 ## and then "&#" followed by "X" or "x" is appended to the parent
4975 ## element or the attribute value in the later processing.
4976
4977 if ($self->{prev_state} == DATA_STATE) {
4978
4979 $self->{state} = $self->{prev_state};
4980 $self->{s_kwd} = '';
4981 ## Reconsume.
4982 return ({type => CHARACTER_TOKEN,
4983 data => '&' . $self->{kwd},
4984 line => $self->{line_prev},
4985 column => $self->{column_prev} - length $self->{kwd},
4986 });
4987 redo A;
4988 } else {
4989
4990 $self->{ca}->{value} .= '&' . $self->{kwd};
4991 $self->{state} = $self->{prev_state};
4992 $self->{s_kwd} = '';
4993 ## Reconsume.
4994 redo A;
4995 }
4996 }
4997 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4998 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4999 # 0..9
5000
5001 $self->{kwd} *= 0x10;
5002 $self->{kwd} += $self->{nc} - 0x0030;
5003 ## Stay in the state.
5004
5005 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5006 $self->{line_prev} = $self->{line};
5007 $self->{column_prev} = $self->{column};
5008 $self->{column}++;
5009 $self->{nc}
5010 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5011 } else {
5012 $self->{set_nc}->($self);
5013 }
5014
5015 redo A;
5016 } elsif (0x0061 <= $self->{nc} and
5017 $self->{nc} <= 0x0066) { # a..f
5018
5019 $self->{kwd} *= 0x10;
5020 $self->{kwd} += $self->{nc} - 0x0060 + 9;
5021 ## Stay in the state.
5022
5023 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5024 $self->{line_prev} = $self->{line};
5025 $self->{column_prev} = $self->{column};
5026 $self->{column}++;
5027 $self->{nc}
5028 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5029 } else {
5030 $self->{set_nc}->($self);
5031 }
5032
5033 redo A;
5034 } elsif (0x0041 <= $self->{nc} and
5035 $self->{nc} <= 0x0046) { # A..F
5036
5037 $self->{kwd} *= 0x10;
5038 $self->{kwd} += $self->{nc} - 0x0040 + 9;
5039 ## Stay in the state.
5040
5041 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5042 $self->{line_prev} = $self->{line};
5043 $self->{column_prev} = $self->{column};
5044 $self->{column}++;
5045 $self->{nc}
5046 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5047 } else {
5048 $self->{set_nc}->($self);
5049 }
5050
5051 redo A;
5052 } elsif ($self->{nc} == 0x003B) { # ;
5053
5054
5055 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5056 $self->{line_prev} = $self->{line};
5057 $self->{column_prev} = $self->{column};
5058 $self->{column}++;
5059 $self->{nc}
5060 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5061 } else {
5062 $self->{set_nc}->($self);
5063 }
5064
5065 #
5066 } else {
5067
5068 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5069 line => $self->{line},
5070 column => $self->{column});
5071 ## Reconsume.
5072 #
5073 }
5074
5075 my $code = $self->{kwd};
5076 my $l = $self->{line_prev};
5077 my $c = $self->{column_prev};
5078 if ((not $self->{is_xml} and $charref_map->{$code}) or
5079 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5080 ($self->{is_xml} and $code == 0x0000)) {
5081
5082 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5083 text => (sprintf 'U+%04X', $code),
5084 line => $l, column => $c);
5085 $code = $charref_map->{$code};
5086 } elsif ($code > 0x10FFFF) {
5087
5088 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5089 text => (sprintf 'U-%08X', $code),
5090 line => $l, column => $c);
5091 $code = 0xFFFD;
5092 }
5093
5094 if ($self->{prev_state} == DATA_STATE) {
5095
5096 $self->{state} = $self->{prev_state};
5097 $self->{s_kwd} = '';
5098 ## Reconsume.
5099 return ({type => CHARACTER_TOKEN, data => chr $code,
5100 has_reference => 1,
5101 line => $l, column => $c,
5102 });
5103 redo A;
5104 } else {
5105
5106 $self->{ca}->{value} .= chr $code;
5107 $self->{ca}->{has_reference} = 1;
5108 $self->{state} = $self->{prev_state};
5109 $self->{s_kwd} = '';
5110 ## Reconsume.
5111 redo A;
5112 }
5113 } elsif ($self->{state} == ENTITY_NAME_STATE) {
5114 if ((0x0041 <= $self->{nc} and # A
5115 $self->{nc} <= 0x005A) or # Z
5116 (0x0061 <= $self->{nc} and # a
5117 $self->{nc} <= 0x007A) or # z
5118 (0x0030 <= $self->{nc} and # 0
5119 $self->{nc} <= 0x0039) or # 9
5120 $self->{nc} == 0x003B or # ;
5121 ($self->{is_xml} and
5122 not ($is_space->{$self->{nc}} or
5123 {
5124 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5125 $self->{entity_add} => 1,
5126 }->{$self->{nc}}))) {
5127 our $EntityChar;
5128 $self->{kwd} .= chr $self->{nc};
5129 if (defined $EntityChar->{$self->{kwd}} or
5130 $self->{ge}->{$self->{kwd}}) {
5131 if ($self->{nc} == 0x003B) { # ;
5132 if (defined $self->{ge}->{$self->{kwd}}) {
5133 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5134
5135 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5136 } else {
5137 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5138
5139 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5140 value => $self->{kwd});
5141 } else {
5142
5143 }
5144 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5145 }
5146 } else {
5147 if ($self->{is_xml}) {
5148
5149 $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5150 value => $self->{kwd},
5151 level => {
5152 'amp;' => $self->{level}->{warn},
5153 'quot;' => $self->{level}->{warn},
5154 'lt;' => $self->{level}->{warn},
5155 'gt;' => $self->{level}->{warn},
5156 'apos;' => $self->{level}->{warn},
5157 }->{$self->{kwd}} ||
5158 $self->{level}->{must});
5159 } else {
5160
5161 }
5162 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5163 }
5164 $self->{entity__match} = 1;
5165
5166 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5167 $self->{line_prev} = $self->{line};
5168 $self->{column_prev} = $self->{column};
5169 $self->{column}++;
5170 $self->{nc}
5171 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5172 } else {
5173 $self->{set_nc}->($self);
5174 }
5175
5176 #
5177 } else {
5178
5179 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5180 $self->{entity__match} = -1;
5181 ## Stay in the state.
5182
5183 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5184 $self->{line_prev} = $self->{line};
5185 $self->{column_prev} = $self->{column};
5186 $self->{column}++;
5187 $self->{nc}
5188 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5189 } else {
5190 $self->{set_nc}->($self);
5191 }
5192
5193 redo A;
5194 }
5195 } else {
5196
5197 $self->{entity__value} .= chr $self->{nc};
5198 $self->{entity__match} *= 2;
5199 ## Stay in the state.
5200
5201 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5202 $self->{line_prev} = $self->{line};
5203 $self->{column_prev} = $self->{column};
5204 $self->{column}++;
5205 $self->{nc}
5206 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5207 } else {
5208 $self->{set_nc}->($self);
5209 }
5210
5211 redo A;
5212 }
5213 }
5214
5215 my $data;
5216 my $has_ref;
5217 if ($self->{entity__match} > 0) {
5218
5219 $data = $self->{entity__value};
5220 $has_ref = 1;
5221 #
5222 } elsif ($self->{entity__match} < 0) {
5223 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5224 if ($self->{prev_state} != DATA_STATE and # in attribute
5225 $self->{entity__match} < -1) {
5226
5227 $data = '&' . $self->{kwd};
5228 #
5229 } else {
5230
5231 $data = $self->{entity__value};
5232 $has_ref = 1;
5233 #
5234 }
5235 } else {
5236
5237 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5238 line => $self->{line_prev},
5239 column => $self->{column_prev} - length $self->{kwd});
5240 $data = '&' . $self->{kwd};
5241 #
5242 }
5243
5244 ## NOTE: In these cases, when a character reference is found,
5245 ## it is consumed and a character token is returned, or, otherwise,
5246 ## nothing is consumed and returned, according to the spec algorithm.
5247 ## In this implementation, anything that has been examined by the
5248 ## tokenizer is appended to the parent element or the attribute value
5249 ## as string, either literal string when no character reference or
5250 ## entity-replaced string otherwise, in this stage, since any characters
5251 ## that would not be consumed are appended in the data state or in an
5252 ## appropriate attribute value state anyway.
5253
5254 if ($self->{prev_state} == DATA_STATE) {
5255
5256 $self->{state} = $self->{prev_state};
5257 $self->{s_kwd} = '';
5258 ## Reconsume.
5259 return ({type => CHARACTER_TOKEN,
5260 data => $data,
5261 has_reference => $has_ref,
5262 line => $self->{line_prev},
5263 column => $self->{column_prev} + 1 - length $self->{kwd},
5264 });
5265 redo A;
5266 } else {
5267
5268 $self->{ca}->{value} .= $data;
5269 $self->{ca}->{has_reference} = 1 if $has_ref;
5270 $self->{state} = $self->{prev_state};
5271 $self->{s_kwd} = '';
5272 ## Reconsume.
5273 redo A;
5274 }
5275
5276 ## XML-only states
5277
5278 } elsif ($self->{state} == PI_STATE) {
5279 ## XML5: "Pi state" and "DOCTYPE pi state".
5280
5281 if ($is_space->{$self->{nc}} or
5282 $self->{nc} == 0x003F or # ?
5283 $self->{nc} == -1) {
5284 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5285 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5286 ## "DOCTYPE pi state": Parse error, switch to the "data
5287 ## state".
5288 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5289 line => $self->{line_prev},
5290 column => $self->{column_prev}
5291 - 1 * ($self->{nc} != -1));
5292 $self->{state} = BOGUS_COMMENT_STATE;
5293 ## Reconsume.
5294 $self->{ct} = {type => COMMENT_TOKEN,
5295 data => '?',
5296 line => $self->{line_prev},
5297 column => $self->{column_prev}
5298 - 1 * ($self->{nc} != -1),
5299 };
5300 redo A;
5301 } else {
5302 ## XML5: "DOCTYPE pi state": Stay in the state.
5303 $self->{ct} = {type => PI_TOKEN,
5304 target => chr $self->{nc},
5305 data => '',
5306 line => $self->{line_prev},
5307 column => $self->{column_prev} - 1,
5308 };
5309 $self->{state} = PI_TARGET_STATE;
5310
5311 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5312 $self->{line_prev} = $self->{line};
5313 $self->{column_prev} = $self->{column};
5314 $self->{column}++;
5315 $self->{nc}
5316 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5317 } else {
5318 $self->{set_nc}->($self);
5319 }
5320
5321 redo A;
5322 }
5323 } elsif ($self->{state} == PI_TARGET_STATE) {
5324 if ($is_space->{$self->{nc}}) {
5325 $self->{state} = PI_TARGET_AFTER_STATE;
5326
5327 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5328 $self->{line_prev} = $self->{line};
5329 $self->{column_prev} = $self->{column};
5330 $self->{column}++;
5331 $self->{nc}
5332 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5333 } else {
5334 $self->{set_nc}->($self);
5335 }
5336
5337 redo A;
5338 } elsif ($self->{nc} == -1) {
5339 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5340 if ($self->{in_subset}) {
5341 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5342 } else {
5343 $self->{state} = DATA_STATE;
5344 $self->{s_kwd} = '';
5345 }
5346 ## Reconsume.
5347 return ($self->{ct}); # pi
5348 redo A;
5349 } elsif ($self->{nc} == 0x003F) { # ?
5350 $self->{state} = PI_AFTER_STATE;
5351
5352 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5353 $self->{line_prev} = $self->{line};
5354 $self->{column_prev} = $self->{column};
5355 $self->{column}++;
5356 $self->{nc}
5357 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5358 } else {
5359 $self->{set_nc}->($self);
5360 }
5361
5362 redo A;
5363 } else {
5364 ## XML5: typo ("tag name" -> "target")
5365 $self->{ct}->{target} .= chr $self->{nc}; # pi
5366
5367 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5368 $self->{line_prev} = $self->{line};
5369 $self->{column_prev} = $self->{column};
5370 $self->{column}++;
5371 $self->{nc}
5372 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5373 } else {
5374 $self->{set_nc}->($self);
5375 }
5376
5377 redo A;
5378 }
5379 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5380 if ($is_space->{$self->{nc}}) {
5381 ## Stay in the state.
5382
5383 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5384 $self->{line_prev} = $self->{line};
5385 $self->{column_prev} = $self->{column};
5386 $self->{column}++;
5387 $self->{nc}
5388 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5389 } else {
5390 $self->{set_nc}->($self);
5391 }
5392
5393 redo A;
5394 } else {
5395 $self->{state} = PI_DATA_STATE;
5396 ## Reprocess.
5397 redo A;
5398 }
5399 } elsif ($self->{state} == PI_DATA_STATE) {
5400 if ($self->{nc} == 0x003F) { # ?
5401 $self->{state} = PI_DATA_AFTER_STATE;
5402
5403 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5404 $self->{line_prev} = $self->{line};
5405 $self->{column_prev} = $self->{column};
5406 $self->{column}++;
5407 $self->{nc}
5408 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5409 } else {
5410 $self->{set_nc}->($self);
5411 }
5412
5413 redo A;
5414 } elsif ($self->{nc} == -1) {
5415 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5416 if ($self->{in_subset}) {
5417 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5418 } else {
5419 $self->{state} = DATA_STATE;
5420 $self->{s_kwd} = '';
5421 }
5422 ## Reprocess.
5423 return ($self->{ct}); # pi
5424 redo A;
5425 } else {
5426 $self->{ct}->{data} .= chr $self->{nc}; # pi
5427 $self->{read_until}->($self->{ct}->{data}, q[?],
5428 length $self->{ct}->{data});
5429 ## Stay in the state.
5430
5431 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5432 $self->{line_prev} = $self->{line};
5433 $self->{column_prev} = $self->{column};
5434 $self->{column}++;
5435 $self->{nc}
5436 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5437 } else {
5438 $self->{set_nc}->($self);
5439 }
5440
5441 ## Continue in this state with the next input character (already consumed above).
5442 redo A;
5443 }
5444 } elsif ($self->{state} == PI_AFTER_STATE) {
5445 ## XML5: Part of "Pi after state".
5446
5447 if ($self->{nc} == 0x003E) { # >
5448 if ($self->{in_subset}) {
5449 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5450 } else {
5451 $self->{state} = DATA_STATE;
5452 $self->{s_kwd} = '';
5453 }
5454
5455 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5456 $self->{line_prev} = $self->{line};
5457 $self->{column_prev} = $self->{column};
5458 $self->{column}++;
5459 $self->{nc}
5460 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5461 } else {
5462 $self->{set_nc}->($self);
5463 }
5464
5465 return ($self->{ct}); # pi
5466 redo A;
5467 } elsif ($self->{nc} == 0x003F) { # ?
5468 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5469 line => $self->{line_prev},
5470 column => $self->{column_prev}); ## XML5: no error
5471 $self->{ct}->{data} .= '?';
5472 $self->{state} = PI_DATA_AFTER_STATE;
5473
5474 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5475 $self->{line_prev} = $self->{line};
5476 $self->{column_prev} = $self->{column};
5477 $self->{column}++;
5478 $self->{nc}
5479 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5480 } else {
5481 $self->{set_nc}->($self);
5482 }
5483
5484 redo A;
5485 } else {
5486 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5487 line => $self->{line_prev},
5488 column => $self->{column_prev}
5489 + 1 * ($self->{nc} == -1)); ## XML5: no error
5490 $self->{ct}->{data} .= '?'; ## XML5: not appended
5491 $self->{state} = PI_DATA_STATE;
5492 ## Reprocess.
5493 redo A;
5494 }
5495 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5496 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5497
5498 if ($self->{nc} == 0x003E) { # >
5499 if ($self->{in_subset}) {
5500 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5501 } else {
5502 $self->{state} = DATA_STATE;
5503 $self->{s_kwd} = '';
5504 }
5505
5506 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5507 $self->{line_prev} = $self->{line};
5508 $self->{column_prev} = $self->{column};
5509 $self->{column}++;
5510 $self->{nc}
5511 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5512 } else {
5513 $self->{set_nc}->($self);
5514 }
5515
5516 return ($self->{ct}); # pi
5517 redo A;
5518 } elsif ($self->{nc} == 0x003F) { # ?
5519 $self->{ct}->{data} .= '?';
5520 ## Stay in the state.
5521
5522 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5523 $self->{line_prev} = $self->{line};
5524 $self->{column_prev} = $self->{column};
5525 $self->{column}++;
5526 $self->{nc}
5527 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5528 } else {
5529 $self->{set_nc}->($self);
5530 }
5531
5532 redo A;
5533 } else {
5534 $self->{ct}->{data} .= '?'; ## XML5: not appended
5535 $self->{state} = PI_DATA_STATE;
5536 ## Reprocess.
5537 redo A;
5538 }
5539
5540 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5541 if ($self->{nc} == 0x003C) { # <
5542 $self->{state} = DOCTYPE_TAG_STATE;
5543
5544 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5545 $self->{line_prev} = $self->{line};
5546 $self->{column_prev} = $self->{column};
5547 $self->{column}++;
5548 $self->{nc}
5549 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5550 } else {
5551 $self->{set_nc}->($self);
5552 }
5553
5554 redo A;
5555 } elsif ($self->{nc} == 0x0025) { # %
5556 ## XML5: Not defined yet.
5557
5558 ## TODO:
5559
5560 if (not $self->{stop_processing} and
5561 not $self->{document}->xml_standalone) {
5562 $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5563 level => $self->{level}->{info});
5564 $self->{stop_processing} = 1;
5565 }
5566
5567
5568 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5569 $self->{line_prev} = $self->{line};
5570 $self->{column_prev} = $self->{column};
5571 $self->{column}++;
5572 $self->{nc}
5573 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5574 } else {
5575 $self->{set_nc}->($self);
5576 }
5577
5578 redo A;
5579 } elsif ($self->{nc} == 0x005D) { # ]
5580 delete $self->{in_subset};
5581 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5582
5583 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5584 $self->{line_prev} = $self->{line};
5585 $self->{column_prev} = $self->{column};
5586 $self->{column}++;
5587 $self->{nc}
5588 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5589 } else {
5590 $self->{set_nc}->($self);
5591 }
5592
5593 redo A;
5594 } elsif ($is_space->{$self->{nc}}) {
5595 ## Stay in the state.
5596
5597 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5598 $self->{line_prev} = $self->{line};
5599 $self->{column_prev} = $self->{column};
5600 $self->{column}++;
5601 $self->{nc}
5602 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5603 } else {
5604 $self->{set_nc}->($self);
5605 }
5606
5607 redo A;
5608 } elsif ($self->{nc} == -1) {
5609 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5610 delete $self->{in_subset};
5611 $self->{state} = DATA_STATE;
5612 $self->{s_kwd} = '';
5613 ## Reconsume.
5614 return ({type => END_OF_DOCTYPE_TOKEN});
5615 redo A;
5616 } else {
5617 unless ($self->{internal_subset_tainted}) {
5618 ## XML5: No parse error.
5619 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5620 $self->{internal_subset_tainted} = 1;
5621 }
5622 ## Stay in the state.
5623
5624 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5625 $self->{line_prev} = $self->{line};
5626 $self->{column_prev} = $self->{column};
5627 $self->{column}++;
5628 $self->{nc}
5629 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5630 } else {
5631 $self->{set_nc}->($self);
5632 }
5633
5634 redo A;
5635 }
5636 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5637 if ($self->{nc} == 0x003E) { # >
5638 $self->{state} = DATA_STATE;
5639 $self->{s_kwd} = '';
5640
5641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5642 $self->{line_prev} = $self->{line};
5643 $self->{column_prev} = $self->{column};
5644 $self->{column}++;
5645 $self->{nc}
5646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5647 } else {
5648 $self->{set_nc}->($self);
5649 }
5650
5651 return ({type => END_OF_DOCTYPE_TOKEN});
5652 redo A;
5653 } elsif ($self->{nc} == -1) {
5654 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5655 $self->{state} = DATA_STATE;
5656 $self->{s_kwd} = '';
5657 ## Reconsume.
5658 return ({type => END_OF_DOCTYPE_TOKEN});
5659 redo A;
5660 } else {
5661 ## XML5: No parse error and stay in the state.
5662 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5663
5664 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5665
5666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5667 $self->{line_prev} = $self->{line};
5668 $self->{column_prev} = $self->{column};
5669 $self->{column}++;
5670 $self->{nc}
5671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5672 } else {
5673 $self->{set_nc}->($self);
5674 }
5675
5676 redo A;
5677 }
5678 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5679 if ($self->{nc} == 0x003E) { # >
5680 $self->{state} = DATA_STATE;
5681 $self->{s_kwd} = '';
5682
5683 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5684 $self->{line_prev} = $self->{line};
5685 $self->{column_prev} = $self->{column};
5686 $self->{column}++;
5687 $self->{nc}
5688 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5689 } else {
5690 $self->{set_nc}->($self);
5691 }
5692
5693 return ({type => END_OF_DOCTYPE_TOKEN});
5694 redo A;
5695 } elsif ($self->{nc} == -1) {
5696 $self->{state} = DATA_STATE;
5697 $self->{s_kwd} = '';
5698 ## Reconsume.
5699 return ({type => END_OF_DOCTYPE_TOKEN});
5700 redo A;
5701 } else {
5702 ## Stay in the state.
5703
5704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5705 $self->{line_prev} = $self->{line};
5706 $self->{column_prev} = $self->{column};
5707 $self->{column}++;
5708 $self->{nc}
5709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5710 } else {
5711 $self->{set_nc}->($self);
5712 }
5713
5714 redo A;
5715 }
5716 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5717 if ($self->{nc} == 0x0021) { # !
5718 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5719
5720 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5721 $self->{line_prev} = $self->{line};
5722 $self->{column_prev} = $self->{column};
5723 $self->{column}++;
5724 $self->{nc}
5725 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5726 } else {
5727 $self->{set_nc}->($self);
5728 }
5729
5730 redo A;
5731 } elsif ($self->{nc} == 0x003F) { # ?
5732 $self->{state} = PI_STATE;
5733
5734 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5735 $self->{line_prev} = $self->{line};
5736 $self->{column_prev} = $self->{column};
5737 $self->{column}++;
5738 $self->{nc}
5739 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5740 } else {
5741 $self->{set_nc}->($self);
5742 }
5743
5744 redo A;
5745 } elsif ($self->{nc} == -1) {
5746 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5747 $self->{state} = DATA_STATE;
5748 $self->{s_kwd} = '';
5749 ## Reconsume.
5750 redo A;
5751 } else {
5752 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5753 line => $self->{line_prev},
5754 column => $self->{column_prev});
5755 $self->{state} = BOGUS_COMMENT_STATE;
5756 $self->{ct} = {type => COMMENT_TOKEN,
5757 data => '',
5758 }; ## NOTE: Will be discarded.
5759
5760 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5761 $self->{line_prev} = $self->{line};
5762 $self->{column_prev} = $self->{column};
5763 $self->{column}++;
5764 $self->{nc}
5765 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5766 } else {
5767 $self->{set_nc}->($self);
5768 }
5769
5770 redo A;
5771 }
5772 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5773 ## XML5: "DOCTYPE markup declaration state".
5774
5775 if ($self->{nc} == 0x002D) { # -
5776 $self->{state} = MD_HYPHEN_STATE;
5777
5778 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5779 $self->{line_prev} = $self->{line};
5780 $self->{column_prev} = $self->{column};
5781 $self->{column}++;
5782 $self->{nc}
5783 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5784 } else {
5785 $self->{set_nc}->($self);
5786 }
5787
5788 redo A;
5789 } elsif ($self->{nc} == 0x0045 or # E
5790 $self->{nc} == 0x0065) { # e
5791 $self->{state} = MD_E_STATE;
5792 $self->{kwd} = chr $self->{nc};
5793
5794 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5795 $self->{line_prev} = $self->{line};
5796 $self->{column_prev} = $self->{column};
5797 $self->{column}++;
5798 $self->{nc}
5799 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5800 } else {
5801 $self->{set_nc}->($self);
5802 }
5803
5804 redo A;
5805 } elsif ($self->{nc} == 0x0041 or # A
5806 $self->{nc} == 0x0061) { # a
5807 $self->{state} = MD_ATTLIST_STATE;
5808 $self->{kwd} = chr $self->{nc};
5809
5810 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5811 $self->{line_prev} = $self->{line};
5812 $self->{column_prev} = $self->{column};
5813 $self->{column}++;
5814 $self->{nc}
5815 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5816 } else {
5817 $self->{set_nc}->($self);
5818 }
5819
5820 redo A;
5821 } elsif ($self->{nc} == 0x004E or # N
5822 $self->{nc} == 0x006E) { # n
5823 $self->{state} = MD_NOTATION_STATE;
5824 $self->{kwd} = chr $self->{nc};
5825
5826 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5827 $self->{line_prev} = $self->{line};
5828 $self->{column_prev} = $self->{column};
5829 $self->{column}++;
5830 $self->{nc}
5831 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5832 } else {
5833 $self->{set_nc}->($self);
5834 }
5835
5836 redo A;
5837 } else {
5838 #
5839 }
5840
5841 ## XML5: No parse error.
5842 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5843 line => $self->{line_prev},
5844 column => $self->{column_prev} - 1);
5845 ## Reconsume.
5846 $self->{state} = BOGUS_COMMENT_STATE;
5847 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5848 redo A;
5849 } elsif ($self->{state} == MD_E_STATE) {
5850 if ($self->{nc} == 0x004E or # N
5851 $self->{nc} == 0x006E) { # n
5852 $self->{state} = MD_ENTITY_STATE;
5853 $self->{kwd} .= chr $self->{nc};
5854
5855 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5856 $self->{line_prev} = $self->{line};
5857 $self->{column_prev} = $self->{column};
5858 $self->{column}++;
5859 $self->{nc}
5860 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5861 } else {
5862 $self->{set_nc}->($self);
5863 }
5864
5865 redo A;
5866 } elsif ($self->{nc} == 0x004C or # L
5867 $self->{nc} == 0x006C) { # l
5868 ## XML5: <!ELEMENT> not supported.
5869 $self->{state} = MD_ELEMENT_STATE;
5870 $self->{kwd} .= chr $self->{nc};
5871
5872 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5873 $self->{line_prev} = $self->{line};
5874 $self->{column_prev} = $self->{column};
5875 $self->{column}++;
5876 $self->{nc}
5877 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5878 } else {
5879 $self->{set_nc}->($self);
5880 }
5881
5882 redo A;
5883 } else {
5884 ## XML5: No parse error.
5885 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5886 line => $self->{line_prev},
5887 column => $self->{column_prev} - 2
5888 + 1 * ($self->{nc} == -1));
5889 ## Reconsume.
5890 $self->{state} = BOGUS_COMMENT_STATE;
5891 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5892 redo A;
5893 }
5894 } elsif ($self->{state} == MD_ENTITY_STATE) {
5895 if ($self->{nc} == [
5896 undef,
5897 undef,
5898 0x0054, # T
5899 0x0049, # I
5900 0x0054, # T
5901 ]->[length $self->{kwd}] or
5902 $self->{nc} == [
5903 undef,
5904 undef,
5905 0x0074, # t
5906 0x0069, # i
5907 0x0074, # t
5908 ]->[length $self->{kwd}]) {
5909 ## Stay in the state.
5910 $self->{kwd} .= chr $self->{nc};
5911
5912 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5913 $self->{line_prev} = $self->{line};
5914 $self->{column_prev} = $self->{column};
5915 $self->{column}++;
5916 $self->{nc}
5917 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5918 } else {
5919 $self->{set_nc}->($self);
5920 }
5921
5922 redo A;
5923 } elsif ((length $self->{kwd}) == 5 and
5924 ($self->{nc} == 0x0059 or # Y
5925 $self->{nc} == 0x0079)) { # y
5926 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5927 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5928 text => 'ENTITY',
5929 line => $self->{line_prev},
5930 column => $self->{column_prev} - 4);
5931 }
5932 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5933 line => $self->{line_prev},
5934 column => $self->{column_prev} - 6};
5935 $self->{state} = DOCTYPE_MD_STATE;
5936
5937 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5938 $self->{line_prev} = $self->{line};
5939 $self->{column_prev} = $self->{column};
5940 $self->{column}++;
5941 $self->{nc}
5942 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5943 } else {
5944 $self->{set_nc}->($self);
5945 }
5946
5947 redo A;
5948 } else {
5949 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5950 line => $self->{line_prev},
5951 column => $self->{column_prev} - 1
5952 - (length $self->{kwd})
5953 + 1 * ($self->{nc} == -1));
5954 $self->{state} = BOGUS_COMMENT_STATE;
5955 ## Reconsume.
5956 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5957 redo A;
5958 }
5959 } elsif ($self->{state} == MD_ELEMENT_STATE) {
5960 if ($self->{nc} == [
5961 undef,
5962 undef,
5963 0x0045, # E
5964 0x004D, # M
5965 0x0045, # E
5966 0x004E, # N
5967 ]->[length $self->{kwd}] or
5968 $self->{nc} == [
5969 undef,
5970 undef,
5971 0x0065, # e
5972 0x006D, # m
5973 0x0065, # e
5974 0x006E, # n
5975 ]->[length $self->{kwd}]) {
5976 ## Stay in the state.
5977 $self->{kwd} .= chr $self->{nc};
5978
5979 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5980 $self->{line_prev} = $self->{line};
5981 $self->{column_prev} = $self->{column};
5982 $self->{column}++;
5983 $self->{nc}
5984 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5985 } else {
5986 $self->{set_nc}->($self);
5987 }
5988
5989 redo A;
5990 } elsif ((length $self->{kwd}) == 6 and
5991 ($self->{nc} == 0x0054 or # T
5992 $self->{nc} == 0x0074)) { # t
5993 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5994 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5995 text => 'ELEMENT',
5996 line => $self->{line_prev},
5997 column => $self->{column_prev} - 5);
5998 }
5999 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6000 line => $self->{line_prev},
6001 column => $self->{column_prev} - 7};
6002 $self->{state} = DOCTYPE_MD_STATE;
6003
6004 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6005 $self->{line_prev} = $self->{line};
6006 $self->{column_prev} = $self->{column};
6007 $self->{column}++;
6008 $self->{nc}
6009 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6010 } else {
6011 $self->{set_nc}->($self);
6012 }
6013
6014 redo A;
6015 } else {
6016 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6017 line => $self->{line_prev},
6018 column => $self->{column_prev} - 1
6019 - (length $self->{kwd})
6020 + 1 * ($self->{nc} == -1));
6021 $self->{state} = BOGUS_COMMENT_STATE;
6022 ## Reconsume.
6023 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6024 redo A;
6025 }
6026 } elsif ($self->{state} == MD_ATTLIST_STATE) {
6027 if ($self->{nc} == [
6028 undef,
6029 0x0054, # T
6030 0x0054, # T
6031 0x004C, # L
6032 0x0049, # I
6033 0x0053, # S
6034 ]->[length $self->{kwd}] or
6035 $self->{nc} == [
6036 undef,
6037 0x0074, # t
6038 0x0074, # t
6039 0x006C, # l
6040 0x0069, # i
6041 0x0073, # s
6042 ]->[length $self->{kwd}]) {
6043 ## Stay in the state.
6044 $self->{kwd} .= chr $self->{nc};
6045
6046 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6047 $self->{line_prev} = $self->{line};
6048 $self->{column_prev} = $self->{column};
6049 $self->{column}++;
6050 $self->{nc}
6051 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6052 } else {
6053 $self->{set_nc}->($self);
6054 }
6055
6056 redo A;
6057 } elsif ((length $self->{kwd}) == 6 and
6058 ($self->{nc} == 0x0054 or # T
6059 $self->{nc} == 0x0074)) { # t
6060 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6061 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6062 text => 'ATTLIST',
6063 line => $self->{line_prev},
6064 column => $self->{column_prev} - 5);
6065 }
6066 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6067 attrdefs => [],
6068 line => $self->{line_prev},
6069 column => $self->{column_prev} - 7};
6070 $self->{state} = DOCTYPE_MD_STATE;
6071
6072 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6073 $self->{line_prev} = $self->{line};
6074 $self->{column_prev} = $self->{column};
6075 $self->{column}++;
6076 $self->{nc}
6077 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6078 } else {
6079 $self->{set_nc}->($self);
6080 }
6081
6082 redo A;
6083 } else {
6084 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6085 line => $self->{line_prev},
6086 column => $self->{column_prev} - 1
6087 - (length $self->{kwd})
6088 + 1 * ($self->{nc} == -1));
6089 $self->{state} = BOGUS_COMMENT_STATE;
6090 ## Reconsume.
6091 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6092 redo A;
6093 }
6094 } elsif ($self->{state} == MD_NOTATION_STATE) {
6095 if ($self->{nc} == [
6096 undef,
6097 0x004F, # O
6098 0x0054, # T
6099 0x0041, # A
6100 0x0054, # T
6101 0x0049, # I
6102 0x004F, # O
6103 ]->[length $self->{kwd}] or
6104 $self->{nc} == [
6105 undef,
6106 0x006F, # o
6107 0x0074, # t
6108 0x0061, # a
6109 0x0074, # t
6110 0x0069, # i
6111 0x006F, # o
6112 ]->[length $self->{kwd}]) {
6113 ## Stay in the state.
6114 $self->{kwd} .= chr $self->{nc};
6115
6116 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6117 $self->{line_prev} = $self->{line};
6118 $self->{column_prev} = $self->{column};
6119 $self->{column}++;
6120 $self->{nc}
6121 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6122 } else {
6123 $self->{set_nc}->($self);
6124 }
6125
6126 redo A;
6127 } elsif ((length $self->{kwd}) == 7 and
6128 ($self->{nc} == 0x004E or # N
6129 $self->{nc} == 0x006E)) { # n
6130 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6131 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6132 text => 'NOTATION',
6133 line => $self->{line_prev},
6134 column => $self->{column_prev} - 6);
6135 }
6136 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6137 line => $self->{line_prev},
6138 column => $self->{column_prev} - 8};
6139 $self->{state} = DOCTYPE_MD_STATE;
6140
6141 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6142 $self->{line_prev} = $self->{line};
6143 $self->{column_prev} = $self->{column};
6144 $self->{column}++;
6145 $self->{nc}
6146 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6147 } else {
6148 $self->{set_nc}->($self);
6149 }
6150
6151 redo A;
6152 } else {
6153 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6154 line => $self->{line_prev},
6155 column => $self->{column_prev} - 1
6156 - (length $self->{kwd})
6157 + 1 * ($self->{nc} == -1));
6158 $self->{state} = BOGUS_COMMENT_STATE;
6159 ## Reconsume.
6160 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6161 redo A;
6162 }
6163 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6164 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6165 ## "DOCTYPE NOTATION state".
6166
6167 if ($is_space->{$self->{nc}}) {
6168 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6169 $self->{state} = BEFORE_MD_NAME_STATE;
6170
6171 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6172 $self->{line_prev} = $self->{line};
6173 $self->{column_prev} = $self->{column};
6174 $self->{column}++;
6175 $self->{nc}
6176 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6177 } else {
6178 $self->{set_nc}->($self);
6179 }
6180
6181 redo A;
6182 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6183 $self->{nc} == 0x0025) { # %
6184 ## XML5: Switch to the "DOCTYPE bogus comment state".
6185 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6186 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6187
6188 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6189 $self->{line_prev} = $self->{line};
6190 $self->{column_prev} = $self->{column};
6191 $self->{column}++;
6192 $self->{nc}
6193 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6194 } else {
6195 $self->{set_nc}->($self);
6196 }
6197
6198 redo A;
6199 } elsif ($self->{nc} == -1) {
6200 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6201 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6202 ## Reconsume.
6203 redo A;
6204 } elsif ($self->{nc} == 0x003E) { # >
6205 ## XML5: Switch to the "DOCTYPE bogus comment state".
6206 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6207 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6208
6209 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6210 $self->{line_prev} = $self->{line};
6211 $self->{column_prev} = $self->{column};
6212 $self->{column}++;
6213 $self->{nc}
6214 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6215 } else {
6216 $self->{set_nc}->($self);
6217 }
6218
6219 redo A;
6220 } else {
6221 ## XML5: Switch to the "DOCTYPE bogus comment state".
6222 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6223 $self->{state} = BEFORE_MD_NAME_STATE;
6224 redo A;
6225 }
6226 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6227 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6228 ## before state", "DOCTYPE ATTLIST name before state".
6229
6230 if ($is_space->{$self->{nc}}) {
6231 ## Stay in the state.
6232
6233 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6234 $self->{line_prev} = $self->{line};
6235 $self->{column_prev} = $self->{column};
6236 $self->{column}++;
6237 $self->{nc}
6238 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6239 } else {
6240 $self->{set_nc}->($self);
6241 }
6242
6243 redo A;
6244 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6245 $self->{nc} == 0x0025) { # %
6246 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6247
6248 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6249 $self->{line_prev} = $self->{line};
6250 $self->{column_prev} = $self->{column};
6251 $self->{column}++;
6252 $self->{nc}
6253 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6254 } else {
6255 $self->{set_nc}->($self);
6256 }
6257
6258 redo A;
6259 } elsif ($self->{nc} == 0x003E) { # >
6260 ## XML5: Same as "Anything else".
6261 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6262 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6263
6264 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6265 $self->{line_prev} = $self->{line};
6266 $self->{column_prev} = $self->{column};
6267 $self->{column}++;
6268 $self->{nc}
6269 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6270 } else {
6271 $self->{set_nc}->($self);
6272 }
6273
6274 redo A;
6275 } elsif ($self->{nc} == -1) {
6276 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6277 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6278 ## Reconsume.
6279 redo A;
6280 } else {
6281 ## XML5: [ATTLIST] Not defined yet.
6282 $self->{ct}->{name} .= chr $self->{nc};
6283 $self->{state} = MD_NAME_STATE;
6284
6285 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6286 $self->{line_prev} = $self->{line};
6287 $self->{column_prev} = $self->{column};
6288 $self->{column}++;
6289 $self->{nc}
6290 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6291 } else {
6292 $self->{set_nc}->($self);
6293 }
6294
6295 redo A;
6296 }
6297 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
## State handler: looks at the character in {nc} and decides whether the
## current markup declaration is a parameter entity.  Judging from the
## error type 'no space after ENTITY percent' this is presumably entered
## right after the "%" of "<!ENTITY %" -- confirm against the code that
## selects this state.
6298 if ($is_space->{$self->{nc}}) {
6299 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
## White space: mark the current token as a parameter entity and move on
## to the state that precedes the declaration name.
6300 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6301 $self->{state} = BEFORE_MD_NAME_STATE;
6302
## Consume the next input character: while unread characters remain in
## {char_buffer}, save the current line/column as the previous position,
## advance the column and load the next code point into {nc}; otherwise
## call the |set_nc| callback.
6303 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6304 $self->{line_prev} = $self->{line};
6305 $self->{column_prev} = $self->{column};
6306 $self->{column}++;
6307 $self->{nc}
6308 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6309 } else {
6310 $self->{set_nc}->($self);
6311 }
6312
## Re-run the block labelled A (defined outside this excerpt) in the new
## state.
6313 redo A;
6314 } elsif ($self->{nc} == 0x003E) { # >
6315 ## XML5: Same as "Anything else".
## ">" before any name: report 'no md name' and go back to the internal
## subset state.  No token is returned from this branch.
6316 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6317 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6318
6319 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6320 $self->{line_prev} = $self->{line};
6321 $self->{column_prev} = $self->{column};
6322 $self->{column}++;
6323 $self->{nc}
6324 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6325 } else {
6326 $self->{set_nc}->($self);
6327 }
6328
6329 redo A;
6330 } elsif ($self->{nc} == -1) {
## {nc} == -1 (presumably the end-of-input marker -- confirm): report
## 'unclosed md' and let the internal subset state see the same value
## again; nothing is consumed here.
6331 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6332 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6333 ## Reconsume.
6334 redo A;
6335 } else {
6336 ## XML5: No parse error.
## Any other character: report the missing space and hand the current
## character to the bogus comment state.  The current token is replaced
## by an empty comment token.
6337 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6338 $self->{state} = BOGUS_COMMENT_STATE;
6339 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6340 ## Reconsume.
6341 redo A;
6342 }
6343 } elsif ($self->{state} == MD_NAME_STATE) {
6344 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
## State handler: collects the name of the markup declaration held in
## {ct}, one character per pass, until white space, ">" or {nc} == -1
## is seen.
6345
6346 if ($is_space->{$self->{nc}}) {
## White space ends the name.  The follow-up state is chosen from the
## token type stored in {ct}.
6347 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6348 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6349 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6350 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6351 } else { # ENTITY/NOTATION
6352 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6353 }
6354
## Consume the next input character: while unread characters remain in
## {char_buffer}, save the current line/column as the previous position,
## advance the column and load the next code point into {nc}; otherwise
## call the |set_nc| callback.
6355 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6356 $self->{line_prev} = $self->{line};
6357 $self->{column_prev} = $self->{column};
6358 $self->{column}++;
6359 $self->{nc}
6360 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6361 } else {
6362 $self->{set_nc}->($self);
6363 }
6364
6365 redo A;
6366 } elsif ($self->{nc} == 0x003E) { # >
## ">" directly after the name.  No error is raised when the token is an
## ATTLIST; every other token type is reported as 'no md def'.
6367 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6368 #
6369 } else {
6370 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6371 }
6372 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6373
6374 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6375 $self->{line_prev} = $self->{line};
6376 $self->{column_prev} = $self->{column};
6377 $self->{column}++;
6378 $self->{nc}
6379 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6380 } else {
6381 $self->{set_nc}->($self);
6382 }
6383
## Emit the token after the next character has been consumed.
6384 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
## NOTE(review): never reached; the |return| above always exits first.
6385 redo A;
6386 } elsif ($self->{nc} == -1) {
6387 ## XML5: [ATTLIST] No parse error.
## {nc} == -1 (presumably the end-of-input marker -- confirm): report
## 'unclosed md', fall back to the internal subset state and emit what
## has been collected so far.  {nc} is left untouched.
6388 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6389 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6390 ## Reconsume.
6391 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
## NOTE(review): never reached; the |return| above always exits first.
6392 redo A;
6393 } else {
6394 ## XML5: [ATTLIST] Not defined yet.
## Any other character is appended to the declaration name.
6395 $self->{ct}->{name} .= chr $self->{nc};
6396 ## Stay in the state.
6397
6398 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6399 $self->{line_prev} = $self->{line};
6400 $self->{column_prev} = $self->{column};
6401 $self->{column}++;
6402 $self->{nc}
6403 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6404 } else {
6405 $self->{set_nc}->($self);
6406 }
6407
6408 redo A;
6409 }
6410 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
## State handler: runs after the name of an ATTLIST declaration (this
## state is selected by MD_NAME_STATE for ATTLIST tokens).  It skips
## white space and starts a new attribute definition on the first
## other character.
6411 if ($is_space->{$self->{nc}}) {
6412 ## Stay in the state.
6413
## Consume the next input character: while unread characters remain in
## {char_buffer}, save the current line/column as the previous position,
## advance the column and load the next code point into {nc}; otherwise
## call the |set_nc| callback.
6414 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6415 $self->{line_prev} = $self->{line};
6416 $self->{column_prev} = $self->{column};
6417 $self->{column}++;
6418 $self->{nc}
6419 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6420 } else {
6421 $self->{set_nc}->($self);
6422 }
6423
6424 redo A;
6425 } elsif ($self->{nc} == 0x003E) { # >
## ">" closes the declaration without any error: return to the internal
## subset state, consume the next character and emit the token.
6426 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6427
6428 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6429 $self->{line_prev} = $self->{line};
6430 $self->{column_prev} = $self->{column};
6431 $self->{column}++;
6432 $self->{nc}
6433 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6434 } else {
6435 $self->{set_nc}->($self);
6436 }
6437
6438 return ($self->{ct}); # ATTLIST
## NOTE(review): never reached; the |return| above always exits first.
6439 redo A;
6440 } elsif ($self->{nc} == -1) {
6441 ## XML5: No parse error.
## {nc} == -1 (presumably the end-of-input marker -- confirm): report
## 'unclosed md' and emit the token; {nc} is left untouched.
6442 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6443 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6444 return ($self->{ct});
## NOTE(review): never reached; the |return| above always exits first.
6445 redo A;
6446 } else {
6447 ## XML5: Not defined yet.
## First character of an attribute name: create a fresh attribute
## definition in {ca}, seeded with this character, an empty list of
## allowed tokens and the current line/column.
6448 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6449 tokens => [],
6450 line => $self->{line}, column => $self->{column}};
6451 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6452
6453 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6454 $self->{line_prev} = $self->{line};
6455 $self->{column_prev} = $self->{column};
6456 $self->{column}++;
6457 $self->{nc}
6458 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6459 } else {
6460 $self->{set_nc}->($self);
6461 }
6462
6463 redo A;
6464 }
6465 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
## State handler: extends the attribute name stored in {ca}->{name}
## until white space, ">", "(" or {nc} == -1 is seen.
6466 if ($is_space->{$self->{nc}}) {
## White space ends the attribute name.
6467 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6468
## Consume the next input character: while unread characters remain in
## {char_buffer}, save the current line/column as the previous position,
## advance the column and load the next code point into {nc}; otherwise
## call the |set_nc| callback.
6469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6470 $self->{line_prev} = $self->{line};
6471 $self->{column_prev} = $self->{column};
6472 $self->{column}++;
6473 $self->{nc}
6474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6475 } else {
6476 $self->{set_nc}->($self);
6477 }
6478
6479 redo A;
6480 } elsif ($self->{nc} == 0x003E) { # >
6481 ## XML5: Same as "anything else".
## ">" right after the attribute name: report 'no attr type', return to
## the internal subset state and emit the token.  This branch does not
## store {ca} anywhere before returning.
6482 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6483 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6484
6485 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6486 $self->{line_prev} = $self->{line};
6487 $self->{column_prev} = $self->{column};
6488 $self->{column}++;
6489 $self->{nc}
6490 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6491 } else {
6492 $self->{set_nc}->($self);
6493 }
6494
6495 return ($self->{ct}); # ATTLIST
## NOTE(review): never reached; the |return| above always exits first.
6496 redo A;
6497 } elsif ($self->{nc} == 0x0028) { # (
6498 ## XML5: Same as "anything else".
## "(" glued to the attribute name: report the missing space, then start
## reading the list of allowed tokens.
6499 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6500 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6501
6502 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6503 $self->{line_prev} = $self->{line};
6504 $self->{column_prev} = $self->{column};
6505 $self->{column}++;
6506 $self->{nc}
6507 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6508 } else {
6509 $self->{set_nc}->($self);
6510 }
6511
6512 redo A;
6513 } elsif ($self->{nc} == -1) {
6514 ## XML5: No parse error.
## {nc} == -1 (presumably the end-of-input marker -- confirm): report
## 'unclosed md' and emit the token.
## NOTE(review): the consume step below still runs although {nc} is
## already -1 -- confirm that this is intended.
6515 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6516 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6517
6518 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6519 $self->{line_prev} = $self->{line};
6520 $self->{column_prev} = $self->{column};
6521 $self->{column}++;
6522 $self->{nc}
6523 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6524 } else {
6525 $self->{set_nc}->($self);
6526 }
6527
6528 return ($self->{ct}); # ATTLIST
## NOTE(review): never reached; the |return| above always exits first.
6529 redo A;
6530 } else {
6531 ## XML5: Not defined yet.
## Any other character is appended to the attribute name.
6532 $self->{ca}->{name} .= chr $self->{nc};
6533 ## Stay in the state.
6534
6535 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6536 $self->{line_prev} = $self->{line};
6537 $self->{column_prev} = $self->{column};
6538 $self->{column}++;
6539 $self->{nc}
6540 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6541 } else {
6542 $self->{set_nc}->($self);
6543 }
6544
6545 redo A;
6546 }
6547 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
## State handler: runs after an attribute name.  It skips white space;
## "(" opens the list of allowed tokens and any character not handled
## by an earlier branch becomes the first character of the attribute
## type in {ca}->{type}.
6548 if ($is_space->{$self->{nc}}) {
6549 ## Stay in the state.
6550
## Consume the next input character: while unread characters remain in
## {char_buffer}, save the current line/column as the previous position,
## advance the column and load the next code point into {nc}; otherwise
## call the |set_nc| callback.
6551 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6552 $self->{line_prev} = $self->{line};
6553 $self->{column_prev} = $self->{column};
6554 $self->{column}++;
6555 $self->{nc}
6556 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6557 } else {
6558 $self->{set_nc}->($self);
6559 }
6560
6561 redo A;
6562 } elsif ($self->{nc} == 0x003E) { # >
6563 ## XML5: Same as "anything else".
## ">" before any type: report 'no attr type', return to the internal
## subset state and emit the token.
6564 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6565 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6566
6567 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6568 $self->{line_prev} = $self->{line};
6569 $self->{column_prev} = $self->{column};
6570 $self->{column}++;
6571 $self->{nc}
6572 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6573 } else {
6574 $self->{set_nc}->($self);
6575 }
6576
6577 return ($self->{ct}); # ATTLIST
## NOTE(review): never reached; the |return| above always exits first.
6578 redo A;
6579 } elsif ($self->{nc} == 0x0028) { # (
6580 ## XML5: Same as "anything else".
## "(" after white space: start reading allowed tokens; no error here.
6581 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6582
6583 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6584 $self->{line_prev} = $self->{line};
6585 $self->{column_prev} = $self->{column};
6586 $self->{column}++;
6587 $self->{nc}
6588 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6589 } else {
6590 $self->{set_nc}->($self);
6591 }
6592
6593 redo A;
6594 } elsif ($self->{nc} == -1) {
6595 ## XML5: No parse error.
## {nc} == -1 (presumably the end-of-input marker -- confirm): report
## 'unclosed md' and emit the token.
## NOTE(review): the consume step below still runs although {nc} is
## already -1 -- confirm that this is intended.
6596 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6597 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6598
6599 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6600 $self->{line_prev} = $self->{line};
6601 $self->{column_prev} = $self->{column};
6602 $self->{column}++;
6603 $self->{nc}
6604 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6605 } else {
6606 $self->{set_nc}->($self);
6607 }
6608
6609 return ($self->{ct});
## NOTE(review): never reached; the |return| above always exits first.
6610 redo A;
6611 } else {
6612 ## XML5: Not defined yet.
## First character of the attribute type: it overwrites {ca}->{type}.
6613 $self->{ca}->{type} = chr $self->{nc};
6614 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6615
6616 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6617 $self->{line_prev} = $self->{line};
6618 $self->{column_prev} = $self->{column};
6619 $self->{column}++;
6620 $self->{nc}
6621 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6622 } else {
6623 $self->{set_nc}->($self);
6624 }
6625
6626 redo A;
6627 }
6628 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
## State handler: extends the attribute type stored in {ca}->{type}.
## White space ends the type normally.  "#", a quote, "(" or ">" met
## without preceding white space end it too, each with a parse error.
6629 if ($is_space->{$self->{nc}}) {
6630 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6631
## Consume the next input character: while unread characters remain in
## {char_buffer}, save the current line/column as the previous position,
## advance the column and load the next code point into {nc}; otherwise
## call the |set_nc| callback.
6632 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6633 $self->{line_prev} = $self->{line};
6634 $self->{column_prev} = $self->{column};
6635 $self->{column}++;
6636 $self->{nc}
6637 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6638 } else {
6639 $self->{set_nc}->($self);
6640 }
6641
6642 redo A;
6643 } elsif ($self->{nc} == 0x0023) { # #
6644 ## XML5: Same as "anything else".
## "#" glued to the type: report the missing space and continue in the
## state that follows a "#".
6645 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6646 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6647
6648 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6649 $self->{line_prev} = $self->{line};
6650 $self->{column_prev} = $self->{column};
6651 $self->{column}++;
6652 $self->{nc}
6653 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6654 } else {
6655 $self->{set_nc}->($self);
6656 }
6657
6658 redo A;
6659 } elsif ($self->{nc} == 0x0022) { # "
6660 ## XML5: Same as "anything else".
## Opening double quote glued to the type: report the missing space,
## start an empty default value and read it as a double-quoted value.
6661 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6662 $self->{ca}->{value} = '';
6663 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6664
6665 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6666 $self->{line_prev} = $self->{line};
6667 $self->{column_prev} = $self->{column};
6668 $self->{column}++;
6669 $self->{nc}
6670 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6671 } else {
6672 $self->{set_nc}->($self);
6673 }
6674
6675 redo A;
6676 } elsif ($self->{nc} == 0x0027) { # '
6677 ## XML5: Same as "anything else".
## Same as the double-quote branch, for a single-quoted default value.
6678 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6679 $self->{ca}->{value} = '';
6680 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6681
6682 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6683 $self->{line_prev} = $self->{line};
6684 $self->{column_prev} = $self->{column};
6685 $self->{column}++;
6686 $self->{nc}
6687 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6688 } else {
6689 $self->{set_nc}->($self);
6690 }
6691
6692 redo A;
6693 } elsif ($self->{nc} == 0x003E) { # >
6694 ## XML5: Same as "anything else".
## ">" right after the type: report 'no attr default', return to the
## internal subset state and emit the token.
6695 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6696 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6697
6698 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6699 $self->{line_prev} = $self->{line};
6700 $self->{column_prev} = $self->{column};
6701 $self->{column}++;
6702 $self->{nc}
6703 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6704 } else {
6705 $self->{set_nc}->($self);
6706 }
6707
6708 return ($self->{ct}); # ATTLIST
## NOTE(review): never reached; the |return| above always exits first.
6709 redo A;
6710 } elsif ($self->{nc} == 0x0028) { # (
6711 ## XML5: Same as "anything else".
## "(" glued to the type: report the missing space, then start reading
## the list of allowed tokens.
6712 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6713 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6714
6715 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6716 $self->{line_prev} = $self->{line};
6717 $self->{column_prev} = $self->{column};
6718 $self->{column}++;
6719 $self->{nc}
6720 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6721 } else {
6722 $self->{set_nc}->($self);
6723 }
6724
6725 redo A;
6726 } elsif ($self->{nc} == -1) {
6727 ## XML5: No parse error.
## {nc} == -1 (presumably the end-of-input marker -- confirm): report
## 'unclosed md' and emit the token.
## NOTE(review): the consume step below still runs although {nc} is
## already -1 -- confirm that this is intended.
6728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6729 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6730
6731 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6732 $self->{line_prev} = $self->{line};
6733 $self->{column_prev} = $self->{column};
6734 $self->{column}++;
6735 $self->{nc}
6736 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6737 } else {
6738 $self->{set_nc}->($self);
6739 }
6740
6741 return ($self->{ct});
## NOTE(review): never reached; the |return| above always exits first.
6742 redo A;
6743 } else {
6744 ## XML5: Not defined yet.
## Any other character is appended to the attribute type.
6745 $self->{ca}->{type} .= chr $self->{nc};
6746 ## Stay in the state.
6747
6748 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6749 $self->{line_prev} = $self->{line};
6750 $self->{column_prev} = $self->{column};
6751 $self->{column}++;
6752 $self->{nc}
6753 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6754 } else {
6755 $self->{set_nc}->($self);
6756 }
6757
6758 redo A;
6759 }
6760 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
## State handler: runs after the attribute type and the white space that
## ended it.  It dispatches on the first non-space character: "(" opens
## the allowed-token list, "#" and quotes start the default, and ">" or
## {nc} == -1 end the declaration with an error.
6761 if ($is_space->{$self->{nc}}) {
6762 ## Stay in the state.
6763
## Consume the next input character: while unread characters remain in
## {char_buffer}, save the current line/column as the previous position,
## advance the column and load the next code point into {nc}; otherwise
## call the |set_nc| callback.
6764 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6765 $self->{line_prev} = $self->{line};
6766 $self->{column_prev} = $self->{column};
6767 $self->{column}++;
6768 $self->{nc}
6769 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6770 } else {
6771 $self->{set_nc}->($self);
6772 }
6773
6774 redo A;
6775 } elsif ($self->{nc} == 0x0028) { # (
6776 ## XML5: Same as "anything else".
6777 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6778
6779 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6780 $self->{line_prev} = $self->{line};
6781 $self->{column_prev} = $self->{column};
6782 $self->{column}++;
6783 $self->{nc}
6784 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6785 } else {
6786 $self->{set_nc}->($self);
6787 }
6788
6789 redo A;
6790 } elsif ($self->{nc} == 0x0023) { # #
6791 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6792
6793 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6794 $self->{line_prev} = $self->{line};
6795 $self->{column_prev} = $self->{column};
6796 $self->{column}++;
6797 $self->{nc}
6798 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6799 } else {
6800 $self->{set_nc}->($self);
6801 }
6802
6803 redo A;
6804 } elsif ($self->{nc} == 0x0022) { # "
6805 ## XML5: Same as "anything else".
## Opening double quote: start an empty default value in {ca}->{value}
## and read it as a double-quoted attribute value.
6806 $self->{ca}->{value} = '';
6807 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6808
6809 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6810 $self->{line_prev} = $self->{line};
6811 $self->{column_prev} = $self->{column};
6812 $self->{column}++;
6813 $self->{nc}
6814 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6815 } else {
6816 $self->{set_nc}->($self);
6817 }
6818
6819 redo A;
6820 } elsif ($self->{nc} == 0x0027) { # '
6821 ## XML5: Same as "anything else".
## Opening single quote: same as above for a single-quoted value.
6822 $self->{ca}->{value} = '';
6823 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6824
6825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6826 $self->{line_prev} = $self->{line};
6827 $self->{column_prev} = $self->{column};
6828 $self->{column}++;
6829 $self->{nc}
6830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6831 } else {
6832 $self->{set_nc}->($self);
6833 }
6834
6835 redo A;
6836 } elsif ($self->{nc} == 0x003E) { # >
6837 ## XML5: Same as "anything else".
## ">" before any default: report 'no attr default', return to the
## internal subset state and emit the token.
6838 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6839 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6840
6841 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6842 $self->{line_prev} = $self->{line};
6843 $self->{column_prev} = $self->{column};
6844 $self->{column}++;
6845 $self->{nc}
6846 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6847 } else {
6848 $self->{set_nc}->($self);
6849 }
6850
6851 return ($self->{ct}); # ATTLIST
## NOTE(review): never reached; the |return| above always exits first.
6852 redo A;
6853 } elsif ($self->{nc} == -1) {
6854 ## XML5: No parse error.
## {nc} == -1 (presumably the end-of-input marker -- confirm): report
## 'unclosed md' and emit the token.
## NOTE(review): the consume step below still runs although {nc} is
## already -1 -- confirm that this is intended.
6855 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6856 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6857
6858 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6859 $self->{line_prev} = $self->{line};
6860 $self->{column_prev} = $self->{column};
6861 $self->{column}++;
6862 $self->{nc}
6863 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6864 } else {
6865 $self->{set_nc}->($self);
6866 }
6867
6868 return ($self->{ct});
## NOTE(review): never reached; the |return| above always exits first.
6869 redo A;
6870 } else {
6871 ## XML5: Switch to the "DOCTYPE bogus comment state".
## Any other character: report 'unquoted attr value', start an empty
## default value and let the unquoted-value state process this same
## character.
6872 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6873 $self->{ca}->{value} = '';
6874 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6875 ## Reconsume.
6876 redo A;
6877 }
6878 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
## State handler: expects the first character of an allowed token (this
## state is entered after "(" or "|").  A "|" or ")" seen here means the
## token is empty and is reported as such.
6879 if ($is_space->{$self->{nc}}) {
6880 ## Stay in the state.
6881
## Consume the next input character: while unread characters remain in
## {char_buffer}, save the current line/column as the previous position,
## advance the column and load the next code point into {nc}; otherwise
## call the |set_nc| callback.
6882 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6883 $self->{line_prev} = $self->{line};
6884 $self->{column_prev} = $self->{column};
6885 $self->{column}++;
6886 $self->{nc}
6887 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6888 } else {
6889 $self->{set_nc}->($self);
6890 }
6891
6892 redo A;
6893 } elsif ($self->{nc} == 0x007C) { # |
## Separator with nothing before it: report and keep waiting for a token.
6894 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6895 ## Stay in the state.
6896
6897 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6898 $self->{line_prev} = $self->{line};
6899 $self->{column_prev} = $self->{column};
6900 $self->{column}++;
6901 $self->{nc}
6902 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6903 } else {
6904 $self->{set_nc}->($self);
6905 }
6906
6907 redo A;
6908 } elsif ($self->{nc} == 0x0029) { # )
## List closed while a token was expected: report, then treat the list
## as finished.
6909 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6910 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6911
6912 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6913 $self->{line_prev} = $self->{line};
6914 $self->{column_prev} = $self->{column};
6915 $self->{column}++;
6916 $self->{nc}
6917 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6918 } else {
6919 $self->{set_nc}->($self);
6920 }
6921
6922 redo A;
6923 } elsif ($self->{nc} == 0x003E) { # >
## ">" inside the parentheses: report 'unclosed allowed tokens', return
## to the internal subset state and emit the token.
6924 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6925 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6926
6927 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6928 $self->{line_prev} = $self->{line};
6929 $self->{column_prev} = $self->{column};
6930 $self->{column}++;
6931 $self->{nc}
6932 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6933 } else {
6934 $self->{set_nc}->($self);
6935 }
6936
6937 return ($self->{ct}); # ATTLIST
## NOTE(review): never reached; the |return| above always exits first.
6938 redo A;
6939 } elsif ($self->{nc} == -1) {
6940 ## XML5: No parse error.
## {nc} == -1 (presumably the end-of-input marker -- confirm): report
## 'unclosed md' and emit the token.
## NOTE(review): the consume step below still runs although {nc} is
## already -1 -- confirm that this is intended.
6941 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6942 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6943
6944 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6945 $self->{line_prev} = $self->{line};
6946 $self->{column_prev} = $self->{column};
6947 $self->{column}++;
6948 $self->{nc}
6949 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6950 } else {
6951 $self->{set_nc}->($self);
6952 }
6953
6954 return ($self->{ct});
## NOTE(review): never reached; the |return| above always exits first.
6955 redo A;
6956 } else {
## First character of a token: add it as a new entry at the end of
## {ca}->{tokens}.
6957 push @{$self->{ca}->{tokens}}, chr $self->{nc};
6958 $self->{state} = ALLOWED_TOKEN_STATE;
6959
6960 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6961 $self->{line_prev} = $self->{line};
6962 $self->{column_prev} = $self->{column};
6963 $self->{column}++;
6964 $self->{nc}
6965 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6966 } else {
6967 $self->{set_nc}->($self);
6968 }
6969
6970 redo A;
6971 }
6972 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
## State handler: extends the last entry of {ca}->{tokens} until white
## space, "|", ")", ">" or {nc} == -1 is seen.
6973 if ($is_space->{$self->{nc}}) {
## White space after a token; what follows decides whether the token
## really ended.
6974 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6975
## Consume the next input character: while unread characters remain in
## {char_buffer}, save the current line/column as the previous position,
## advance the column and load the next code point into {nc}; otherwise
## call the |set_nc| callback.
6976 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6977 $self->{line_prev} = $self->{line};
6978 $self->{column_prev} = $self->{column};
6979 $self->{column}++;
6980 $self->{nc}
6981 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6982 } else {
6983 $self->{set_nc}->($self);
6984 }
6985
6986 redo A;
6987 } elsif ($self->{nc} == 0x007C) { # |
## Separator: expect another token.
6988 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6989
6990 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6991 $self->{line_prev} = $self->{line};
6992 $self->{column_prev} = $self->{column};
6993 $self->{column}++;
6994 $self->{nc}
6995 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6996 } else {
6997 $self->{set_nc}->($self);
6998 }
6999
7000 redo A;
7001 } elsif ($self->{nc} == 0x0029) { # )
## End of the allowed-token list.
7002 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7003
7004 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7005 $self->{line_prev} = $self->{line};
7006 $self->{column_prev} = $self->{column};
7007 $self->{column}++;
7008 $self->{nc}
7009 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7010 } else {
7011 $self->{set_nc}->($self);
7012 }
7013
7014 redo A;
7015 } elsif ($self->{nc} == 0x003E) { # >
## ">" inside the parentheses: report 'unclosed allowed tokens', return
## to the internal subset state and emit the token.
7016 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7017 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7018
7019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7020 $self->{line_prev} = $self->{line};
7021 $self->{column_prev} = $self->{column};
7022 $self->{column}++;
7023 $self->{nc}
7024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7025 } else {
7026 $self->{set_nc}->($self);
7027 }
7028
7029 return ($self->{ct}); # ATTLIST
## NOTE(review): never reached; the |return| above always exits first.
7030 redo A;
7031 } elsif ($self->{nc} == -1) {
7032 ## XML5: No parse error.
## {nc} == -1 (presumably the end-of-input marker -- confirm): report
## 'unclosed md' and emit the token.
## NOTE(review): the consume step below still runs although {nc} is
## already -1 -- confirm that this is intended.
7033 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7034 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7035
7036 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7037 $self->{line_prev} = $self->{line};
7038 $self->{column_prev} = $self->{column};
7039 $self->{column}++;
7040 $self->{nc}
7041 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7042 } else {
7043 $self->{set_nc}->($self);
7044 }
7045
7046 return ($self->{ct});
## NOTE(review): never reached; the |return| above always exits first.
7047 redo A;
7048 } else {
## Any other character is appended to the token currently being read
## (the last element of {ca}->{tokens}).
7049 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7050 ## Stay in the state.
7051
7052 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7053 $self->{line_prev} = $self->{line};
7054 $self->{column_prev} = $self->{column};
7055 $self->{column}++;
7056 $self->{nc}
7057 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7058 } else {
7059 $self->{set_nc}->($self);
7060 }
7061
7062 redo A;
7063 }
7064 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
## State handler: runs after white space that followed an allowed token.
## "|" and ")" confirm that the token ended; any other ordinary
## character is treated as a continuation of the same token and is
## reported as an error.
7065 if ($is_space->{$self->{nc}}) {
7066 ## Stay in the state.
7067
## Consume the next input character: while unread characters remain in
## {char_buffer}, save the current line/column as the previous position,
## advance the column and load the next code point into {nc}; otherwise
## call the |set_nc| callback.
7068 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069 $self->{line_prev} = $self->{line};
7070 $self->{column_prev} = $self->{column};
7071 $self->{column}++;
7072 $self->{nc}
7073 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074 } else {
7075 $self->{set_nc}->($self);
7076 }
7077
7078 redo A;
7079 } elsif ($self->{nc} == 0x007C) { # |
7080 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7081
7082 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7083 $self->{line_prev} = $self->{line};
7084 $self->{column_prev} = $self->{column};
7085 $self->{column}++;
7086 $self->{nc}
7087 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7088 } else {
7089 $self->{set_nc}->($self);
7090 }
7091
7092 redo A;
7093 } elsif ($self->{nc} == 0x0029) { # )
7094 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7095
7096 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7097 $self->{line_prev} = $self->{line};
7098 $self->{column_prev} = $self->{column};
7099 $self->{column}++;
7100 $self->{nc}
7101 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7102 } else {
7103 $self->{set_nc}->($self);
7104 }
7105
7106 redo A;
7107 } elsif ($self->{nc} == 0x003E) { # >
## ">" inside the parentheses: report 'unclosed allowed tokens', return
## to the internal subset state and emit the token.
7108 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7109 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7110
7111 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7112 $self->{line_prev} = $self->{line};
7113 $self->{column_prev} = $self->{column};
7114 $self->{column}++;
7115 $self->{nc}
7116 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7117 } else {
7118 $self->{set_nc}->($self);
7119 }
7120
7121 return ($self->{ct}); # ATTLIST
## NOTE(review): never reached; the |return| above always exits first.
7122 redo A;
7123 } elsif ($self->{nc} == -1) {
7124 ## XML5: No parse error.
## {nc} == -1 (presumably the end-of-input marker -- confirm): report
## 'unclosed md' and emit the token.
## NOTE(review): the consume step below still runs although {nc} is
## already -1 -- confirm that this is intended.
7125 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7126 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7127
7128 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7129 $self->{line_prev} = $self->{line};
7130 $self->{column_prev} = $self->{column};
7131 $self->{column}++;
7132 $self->{nc}
7133 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7134 } else {
7135 $self->{set_nc}->($self);
7136 }
7137
7138 return ($self->{ct});
## NOTE(review): never reached; the |return| above always exits first.
7139 redo A;
7140 } else {
## A further token character after white space.  The error is reported
## at the previous position ({line_prev}/{column_prev}) rather than the
## current one.  The last token is then extended with a single space
## plus this character, however much white space was actually skipped.
7141 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7142 line => $self->{line_prev},
7143 column => $self->{column_prev});
7144 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7145 $self->{state} = ALLOWED_TOKEN_STATE;
7146
7147 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7148 $self->{line_prev} = $self->{line};
7149 $self->{column_prev} = $self->{column};
7150 $self->{column}++;
7151 $self->{nc}
7152 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7153 } else {
7154 $self->{set_nc}->($self);
7155 }
7156
7157 redo A;
7158 }
7159 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
## State handler: runs right after the ")" that closes the allowed-token
## list.  White space is the only continuation accepted without a parse
## error; a default that starts immediately is accepted but reported.
7160 if ($is_space->{$self->{nc}}) {
7161 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7162
## Consume the next input character: while unread characters remain in
## {char_buffer}, save the current line/column as the previous position,
## advance the column and load the next code point into {nc}; otherwise
## call the |set_nc| callback.
7163 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7164 $self->{line_prev} = $self->{line};
7165 $self->{column_prev} = $self->{column};
7166 $self->{column}++;
7167 $self->{nc}
7168 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7169 } else {
7170 $self->{set_nc}->($self);
7171 }
7172
7173 redo A;
7174 } elsif ($self->{nc} == 0x0023) { # #
## "#" glued to ")": report the missing space and continue in the state
## that follows a "#".
7175 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7176 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7177
7178 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7179 $self->{line_prev} = $self->{line};
7180 $self->{column_prev} = $self->{column};
7181 $self->{column}++;
7182 $self->{nc}
7183 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7184 } else {
7185 $self->{set_nc}->($self);
7186 }
7187
7188 redo A;
7189 } elsif ($self->{nc} == 0x0022) { # "
## Opening double quote glued to ")": report the missing space, start an
## empty default value and read it as a double-quoted value.
7190 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7191 $self->{ca}->{value} = '';
7192 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7193
7194 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7195 $self->{line_prev} = $self->{line};
7196 $self->{column_prev} = $self->{column};
7197 $self->{column}++;
7198 $self->{nc}
7199 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7200 } else {
7201 $self->{set_nc}->($self);
7202 }
7203
7204 redo A;
7205 } elsif ($self->{nc} == 0x0027) { # '
## Same as the double-quote branch, for a single-quoted default value.
7206 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7207 $self->{ca}->{value} = '';
7208 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7209
7210 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7211 $self->{line_prev} = $self->{line};
7212 $self->{column_prev} = $self->{column};
7213 $self->{column}++;
7214 $self->{nc}
7215 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7216 } else {
7217 $self->{set_nc}->($self);
7218 }
7219
7220 redo A;
7221 } elsif ($self->{nc} == 0x003E) { # >
## ">" before any default: report 'no attr default', return to the
## internal subset state and emit the token.
7222 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7223 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7224
7225 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7226 $self->{line_prev} = $self->{line};
7227 $self->{column_prev} = $self->{column};
7228 $self->{column}++;
7229 $self->{nc}
7230 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7231 } else {
7232 $self->{set_nc}->($self);
7233 }
7234
7235 return ($self->{ct}); # ATTLIST
## NOTE(review): never reached; the |return| above always exits first.
7236 redo A;
7237 } elsif ($self->{nc} == -1) {
## {nc} == -1 (presumably the end-of-input marker -- confirm): report
## 'unclosed md' and emit the token.
## NOTE(review): the consume step below still runs although {nc} is
## already -1 -- confirm that this is intended.
7238 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7239 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7240
7241 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7242 $self->{line_prev} = $self->{line};
7243 $self->{column_prev} = $self->{column};
7244 $self->{column}++;
7245 $self->{nc}
7246 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7247 } else {
7248 $self->{set_nc}->($self);
7249 }
7250
7251 return ($self->{ct});
## NOTE(review): never reached; the |return| above always exits first.
7252 redo A;
7253 } else {
## Any other character: report 'unquoted attr value' and let the
## unquoted-value state process this same character.
## NOTE(review): unlike the matching fallback branch of
## DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE, {ca}->{value} is not
## reset to '' here -- confirm whether that difference is intended.
7254 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7255 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7256 ## Reconsume.
7257 redo A;
7258 }
7259 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
## BEFORE_ATTR_DEFAULT_STATE: dispatch on the current character
## |$self->{nc}|.  White space is skipped, "#" and the two quote
## characters select the next state, ">" and -1 report an error and
## return the current token |$self->{ct}|, and any other character
## is reconsumed in ATTRIBUTE_VALUE_UNQUOTED_STATE.
##
## The repeated |if ($self->{char_buffer_pos} < ...) ... else ...|
## block moves |nc| to the next character: it reads from
## |char_buffer| while unread characters remain (saving the old
## line/column in |line_prev|/|column_prev| first) and otherwise
## calls the |set_nc| callback.
##
## NOTE: |nc| == -1 is apparently the end-of-input marker (confirm
## against |set_nc|).  A |redo A| that directly follows a |return|
## is never reached.
7260 if ($is_space->{$self->{nc}}) {
7261 ## Stay in the state.
7262
7263 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7264 $self->{line_prev} = $self->{line};
7265 $self->{column_prev} = $self->{column};
7266 $self->{column}++;
7267 $self->{nc}
7268 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7269 } else {
7270 $self->{set_nc}->($self);
7271 }
7272
7273 redo A;
7274 } elsif ($self->{nc} == 0x0023) { # #
7275 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7276
7277 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7278 $self->{line_prev} = $self->{line};
7279 $self->{column_prev} = $self->{column};
7280 $self->{column}++;
7281 $self->{nc}
7282 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7283 } else {
7284 $self->{set_nc}->($self);
7285 }
7286
7287 redo A;
7288 } elsif ($self->{nc} == 0x0022) { # "
## Start an empty value on the current attribute |$self->{ca}|.
7289 $self->{ca}->{value} = '';
7290 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7291
7292 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7293 $self->{line_prev} = $self->{line};
7294 $self->{column_prev} = $self->{column};
7295 $self->{column}++;
7296 $self->{nc}
7297 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7298 } else {
7299 $self->{set_nc}->($self);
7300 }
7301
7302 redo A;
7303 } elsif ($self->{nc} == 0x0027) { # '
7304 $self->{ca}->{value} = '';
7305 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7306
7307 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7308 $self->{line_prev} = $self->{line};
7309 $self->{column_prev} = $self->{column};
7310 $self->{column}++;
7311 $self->{nc}
7312 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7313 } else {
7314 $self->{set_nc}->($self);
7315 }
7316
7317 redo A;
7318 } elsif ($self->{nc} == 0x003E) { # >
## The declaration ends before any default was given.  Note that
## |$self->{ca}| is not pushed onto |attrdefs| in this branch.
7319 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7320 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7321
7322 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7323 $self->{line_prev} = $self->{line};
7324 $self->{column_prev} = $self->{column};
7325 $self->{column}++;
7326 $self->{nc}
7327 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7328 } else {
7329 $self->{set_nc}->($self);
7330 }
7331
7332 return ($self->{ct}); # ATTLIST
7333 redo A;
7334 } elsif ($self->{nc} == -1) {
7335 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7336 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7337
7338 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7339 $self->{line_prev} = $self->{line};
7340 $self->{column_prev} = $self->{column};
7341 $self->{column}++;
7342 $self->{nc}
7343 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7344 } else {
7345 $self->{set_nc}->($self);
7346 }
7347
7348 return ($self->{ct});
7349 redo A;
7350 } else {
## Any other character: report it and let the unquoted-value state
## process this same character (|nc| is not advanced).
7351 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7352 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7353 ## Reconsume.
7354 redo A;
7355 }
7356 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
## DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE: handles the
## character after a "#" (BEFORE_ATTR_DEFAULT_STATE switches here on
## "#").  White space is an error and falls to BOGUS_MD_STATE; a
## quote starts a value; ">" and -1 report an error and return the
## current token |$self->{ct}|; any other character becomes the
## first character of |$self->{ca}->{default}|.
##
## The repeated |if ($self->{char_buffer_pos} < ...) ... else ...|
## block moves |nc| to the next character, from |char_buffer| when
## it has unread characters and via the |set_nc| callback otherwise.
##
## NOTE: |nc| == -1 is apparently the end-of-input marker (confirm
## against |set_nc|).  A |redo A| that directly follows a |return|
## is never reached.
7357 if ($is_space->{$self->{nc}}) {
7358 ## XML5: No parse error.
7359 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7360 $self->{state} = BOGUS_MD_STATE;
7361 ## Reconsume.
7362 redo A;
7363 } elsif ($self->{nc} == 0x0022) { # "
7364 ## XML5: Same as "anything else".
7365 $self->{ca}->{value} = '';
7366 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7367
7368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7369 $self->{line_prev} = $self->{line};
7370 $self->{column_prev} = $self->{column};
7371 $self->{column}++;
7372 $self->{nc}
7373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7374 } else {
7375 $self->{set_nc}->($self);
7376 }
7377
7378 redo A;
7379 } elsif ($self->{nc} == 0x0027) { # '
7380 ## XML5: Same as "anything else".
7381 $self->{ca}->{value} = '';
7382 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7383
7384 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7385 $self->{line_prev} = $self->{line};
7386 $self->{column_prev} = $self->{column};
7387 $self->{column}++;
7388 $self->{nc}
7389 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7390 } else {
7391 $self->{set_nc}->($self);
7392 }
7393
7394 redo A;
7395 } elsif ($self->{nc} == 0x003E) { # >
7396 ## XML5: Same as "anything else".
## Note that |$self->{ca}| is not pushed onto |attrdefs| here.
7397 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7398 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7399
7400 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7401 $self->{line_prev} = $self->{line};
7402 $self->{column_prev} = $self->{column};
7403 $self->{column}++;
7404 $self->{nc}
7405 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7406 } else {
7407 $self->{set_nc}->($self);
7408 }
7409
7410 return ($self->{ct}); # ATTLIST
7411 redo A;
7412 } elsif ($self->{nc} == -1) {
7413 ## XML5: No parse error.
7414 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7415 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7416
7417 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7418 $self->{line_prev} = $self->{line};
7419 $self->{column_prev} = $self->{column};
7420 $self->{column}++;
7421 $self->{nc}
7422 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7423 } else {
7424 $self->{set_nc}->($self);
7425 }
7426
7427 return ($self->{ct});
7428 redo A;
7429 } else {
## Start collecting the default keyword (assignment, not append).
7430 $self->{ca}->{default} = chr $self->{nc};
7431 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7432
7433 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7434 $self->{line_prev} = $self->{line};
7435 $self->{column_prev} = $self->{column};
7436 $self->{column}++;
7437 $self->{nc}
7438 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7439 } else {
7440 $self->{set_nc}->($self);
7441 }
7442
7443 redo A;
7444 }
7445 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
## DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE: accumulates
## characters into |$self->{ca}->{default}|.  White space ends the
## keyword; a quote directly after it is an error but still starts a
## value; ">" and -1 push |$self->{ca}| onto
## |$self->{ct}->{attrdefs}| and return the token |$self->{ct}|.
##
## The repeated |if ($self->{char_buffer_pos} < ...) ... else ...|
## block moves |nc| to the next character, from |char_buffer| when
## it has unread characters and via the |set_nc| callback otherwise.
##
## NOTE: |nc| == -1 is apparently the end-of-input marker (confirm
## against |set_nc|).  A |redo A| that directly follows a |return|
## is never reached.
7446 if ($is_space->{$self->{nc}}) {
7447 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7448
7449 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7450 $self->{line_prev} = $self->{line};
7451 $self->{column_prev} = $self->{column};
7452 $self->{column}++;
7453 $self->{nc}
7454 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7455 } else {
7456 $self->{set_nc}->($self);
7457 }
7458
7459 redo A;
7460 } elsif ($self->{nc} == 0x0022) { # "
7461 ## XML5: Same as "anything else".
7462 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7463 $self->{ca}->{value} = '';
7464 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7465
7466 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7467 $self->{line_prev} = $self->{line};
7468 $self->{column_prev} = $self->{column};
7469 $self->{column}++;
7470 $self->{nc}
7471 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7472 } else {
7473 $self->{set_nc}->($self);
7474 }
7475
7476 redo A;
7477 } elsif ($self->{nc} == 0x0027) { # '
7478 ## XML5: Same as "anything else".
7479 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7480 $self->{ca}->{value} = '';
7481 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7482
7483 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7484 $self->{line_prev} = $self->{line};
7485 $self->{column_prev} = $self->{column};
7486 $self->{column}++;
7487 $self->{nc}
7488 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7489 } else {
7490 $self->{set_nc}->($self);
7491 }
7492
7493 redo A;
7494 } elsif ($self->{nc} == 0x003E) { # >
7495 ## XML5: Same as "anything else".
## Record the finished attribute definition; no parse error here.
7496 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7497 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7498
7499 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7500 $self->{line_prev} = $self->{line};
7501 $self->{column_prev} = $self->{column};
7502 $self->{column}++;
7503 $self->{nc}
7504 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7505 } else {
7506 $self->{set_nc}->($self);
7507 }
7508
7509 return ($self->{ct}); # ATTLIST
7510 redo A;
7511 } elsif ($self->{nc} == -1) {
7512 ## XML5: No parse error.
7513 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7514 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7515 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7516
7517 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7518 $self->{line_prev} = $self->{line};
7519 $self->{column_prev} = $self->{column};
7520 $self->{column}++;
7521 $self->{nc}
7522 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7523 } else {
7524 $self->{set_nc}->($self);
7525 }
7526
7527 return ($self->{ct});
7528 redo A;
7529 } else {
7530 $self->{ca}->{default} .= chr $self->{nc};
7531 ## Stay in the state.
7532
7533 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7534 $self->{line_prev} = $self->{line};
7535 $self->{column_prev} = $self->{column};
7536 $self->{column}++;
7537 $self->{nc}
7538 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7539 } else {
7540 $self->{set_nc}->($self);
7541 }
7542
7543 redo A;
7544 }
7545 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
## DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE: entered from
## DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE on white space.
## Further white space is skipped; a quote starts a value; ">" and
## -1 push |$self->{ca}| onto |$self->{ct}->{attrdefs}| and return
## the token |$self->{ct}|; any other character is reconsumed in a
## state chosen by whether the collected keyword is 'FIXED'.
##
## The repeated |if ($self->{char_buffer_pos} < ...) ... else ...|
## block moves |nc| to the next character, from |char_buffer| when
## it has unread characters and via the |set_nc| callback otherwise.
##
## NOTE: |nc| == -1 is apparently the end-of-input marker (confirm
## against |set_nc|).  A |redo A| that directly follows a |return|
## is never reached.
7546 if ($is_space->{$self->{nc}}) {
7547 ## Stay in the state.
7548
7549 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7550 $self->{line_prev} = $self->{line};
7551 $self->{column_prev} = $self->{column};
7552 $self->{column}++;
7553 $self->{nc}
7554 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7555 } else {
7556 $self->{set_nc}->($self);
7557 }
7558
7559 redo A;
7560 } elsif ($self->{nc} == 0x0022) { # "
7561 $self->{ca}->{value} = '';
7562 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7563
7564 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7565 $self->{line_prev} = $self->{line};
7566 $self->{column_prev} = $self->{column};
7567 $self->{column}++;
7568 $self->{nc}
7569 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7570 } else {
7571 $self->{set_nc}->($self);
7572 }
7573
7574 redo A;
7575 } elsif ($self->{nc} == 0x0027) { # '
7576 $self->{ca}->{value} = '';
7577 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7578
7579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7580 $self->{line_prev} = $self->{line};
7581 $self->{column_prev} = $self->{column};
7582 $self->{column}++;
7583 $self->{nc}
7584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7585 } else {
7586 $self->{set_nc}->($self);
7587 }
7588
7589 redo A;
7590 } elsif ($self->{nc} == 0x003E) { # >
7591 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7592 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7593
7594 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7595 $self->{line_prev} = $self->{line};
7596 $self->{column_prev} = $self->{column};
7597 $self->{column}++;
7598 $self->{nc}
7599 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7600 } else {
7601 $self->{set_nc}->($self);
7602 }
7603
7604 return ($self->{ct}); # ATTLIST
7605 redo A;
7606 } elsif ($self->{nc} == -1) {
7607 ## XML5: No parse error.
7608 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7609 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7610 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7611
7612 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7613 $self->{line_prev} = $self->{line};
7614 $self->{column_prev} = $self->{column};
7615 $self->{column}++;
7616 $self->{nc}
7617 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7618 } else {
7619 $self->{set_nc}->($self);
7620 }
7621
7622 return ($self->{ct});
7623 redo A;
7624 } else {
7625 ## XML5: Not defined yet.
## After 'FIXED' (exact, case-sensitive match) the character is
## reconsumed as the start of an unquoted value; no parse error is
## reported in this branch.  Otherwise the attribute definition is
## complete, so it is recorded and the character is reconsumed in
## DOCTYPE_ATTLIST_NAME_AFTER_STATE.
7626 if ($self->{ca}->{default} eq 'FIXED') {
7627 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7628 } else {
7629 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7630 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7631 }
7632 ## Reconsume.
7633 redo A;
7634 }
7635 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
## AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE: always hands the current
## character (unconsumed) to DOCTYPE_ATTLIST_NAME_AFTER_STATE.  The
## two branches differ only in that a character other than white
## space, -1 or ">" first reports 'no space before attr name'.
7636 if ($is_space->{$self->{nc}} or
7637 $self->{nc} == -1 or
7638 $self->{nc} == 0x003E) { # >
7639 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7640 ## Reconsume.
7641 redo A;
7642 } else {
7643 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7644 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7645 ## Reconsume.
7646 redo A;
7647 }
7648 } elsif ($self->{state} == NDATA_STATE) {
## NDATA_STATE: matches the rest of the keyword "NDATA" one
## character per pass, collecting matched characters in
## |$self->{kwd}|.
##
## The two anonymous arrays are indexed by the current length of
## |kwd|: index 1 expects D/d, index 2 A/a, index 3 T/t.  |kwd| is
## therefore presumably one character long on entry (the "N"; set
## by the state that switches here -- confirm).
##
## NOTE(review): for a |kwd| length outside 1..3 the lookup yields
## undef, which |==| compares as 0; confirm that |nc| can never be 0
## in this state.
7649 ## ASCII case-insensitive
7650 if ($self->{nc} == [
7651 undef,
7652 0x0044, # D
7653 0x0041, # A
7654 0x0054, # T
7655 ]->[length $self->{kwd}] or
7656 $self->{nc} == [
7657 undef,
7658 0x0064, # d
7659 0x0061, # a
7660 0x0074, # t
7661 ]->[length $self->{kwd}]) {
7662
7663 ## Stay in the state.
7664 $self->{kwd} .= chr $self->{nc};
7665
## Advance |nc|: from |char_buffer| when it has unread characters
## (saving the old line/column first), else via |set_nc|.
7666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7667 $self->{line_prev} = $self->{line};
7668 $self->{column_prev} = $self->{column};
7669 $self->{column}++;
7670 $self->{nc}
7671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7672 } else {
7673 $self->{set_nc}->($self);
7674 }
7675
7676 redo A;
7677 } elsif ((length $self->{kwd}) == 4 and
7678 ($self->{nc} == 0x0041 or # A
7679 $self->{nc} == 0x0061)) { # a
## Final "A"/"a".  Anything other than the exact uppercase spelling
## (the collected part differs from 'NDAT', or this character is
## "a") is accepted but reported as 'lowercase keyword'.  The
## reported column is |column_prev| - 4.
7680 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7681
7682 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7683 text => 'NDATA',
7684 line => $self->{line_prev},
7685 column => $self->{column_prev} - 4);
7686 } else {
7687
7688 }
7689 $self->{state} = AFTER_NDATA_STATE;
7690
7691 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7692 $self->{line_prev} = $self->{line};
7693 $self->{column_prev} = $self->{column};
7694 $self->{column}++;
7695 $self->{nc}
7696 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7697 } else {
7698 $self->{set_nc}->($self);
7699 }
7700
7701 redo A;
7702 } else {
## Not the keyword: report the error at a column computed back from
## |column_prev| by the number of characters collected in |kwd|,
## then let BOGUS_MD_STATE process the current character.
7703 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7704 line => $self->{line_prev},
7705 column => $self->{column_prev} + 1
7706 - length $self->{kwd});
7707
7708 $self->{state} = BOGUS_MD_STATE;
7709 ## Reconsume.
7710 redo A;
7711 }
7712 } elsif ($self->{state} == AFTER_NDATA_STATE) {
## AFTER_NDATA_STATE: entered from NDATA_STATE once the keyword is
## complete.  White space leads to BEFORE_NOTATION_NAME_STATE; ">"
## and -1 report an error and return the current token
## |$self->{ct}|; any other character is an error and is reconsumed
## in BOGUS_MD_STATE.
##
## The repeated |if ($self->{char_buffer_pos} < ...) ... else ...|
## block moves |nc| to the next character, from |char_buffer| when
## it has unread characters and via the |set_nc| callback otherwise.
##
## NOTE: |nc| == -1 is apparently the end-of-input marker (confirm
## against |set_nc|).  A |redo A| that directly follows a |return|
## is never reached.
7713 if ($is_space->{$self->{nc}}) {
7714 $self->{state} = BEFORE_NOTATION_NAME_STATE;
7715
7716 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7717 $self->{line_prev} = $self->{line};
7718 $self->{column_prev} = $self->{column};
7719 $self->{column}++;
7720 $self->{nc}
7721 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7722 } else {
7723 $self->{set_nc}->($self);
7724 }
7725
7726 redo A;
7727 } elsif ($self->{nc} == 0x003E) { # >
7728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7729 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7730
7731 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7732 $self->{line_prev} = $self->{line};
7733 $self->{column_prev} = $self->{column};
7734 $self->{column}++;
7735 $self->{nc}
7736 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7737 } else {
7738 $self->{set_nc}->($self);
7739 }
7740
7741 return ($self->{ct}); # ENTITY
7742 redo A;
7743 } elsif ($self->{nc} == -1) {
7744 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7745 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7746
7747 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7748 $self->{line_prev} = $self->{line};
7749 $self->{column_prev} = $self->{column};
7750 $self->{column}++;
7751 $self->{nc}
7752 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7753 } else {
7754 $self->{set_nc}->($self);
7755 }
7756
7757 return ($self->{ct}); # ENTITY
7758 redo A;
7759 } else {
## NOTE(review): the column is derived from |length $self->{kwd}|.
## NDATA_STATE does not append the final "A" to |kwd| before
## switching here, so |kwd| is one character shorter than the
## keyword; confirm that the reported column is the intended one.
7760 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7761 line => $self->{line_prev},
7762 column => $self->{column_prev} + 1
7763 - length $self->{kwd});
7764 $self->{state} = BOGUS_MD_STATE;
7765 ## Reconsume.
7766 redo A;
7767 }
7768 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
## BEFORE_NOTATION_NAME_STATE: skips white space before a notation
## name.  ">" and -1 report an error and return the current token
## |$self->{ct}|; any other character becomes the first character
## of |$self->{ct}->{notation}|.
##
## The repeated |if ($self->{char_buffer_pos} < ...) ... else ...|
## block moves |nc| to the next character, from |char_buffer| when
## it has unread characters and via the |set_nc| callback otherwise.
##
## NOTE: |nc| == -1 is apparently the end-of-input marker (confirm
## against |set_nc|).  A |redo A| that directly follows a |return|
## is never reached.
7769 if ($is_space->{$self->{nc}}) {
7770 ## Stay in the state.
7771
7772 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7773 $self->{line_prev} = $self->{line};
7774 $self->{column_prev} = $self->{column};
7775 $self->{column}++;
7776 $self->{nc}
7777 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7778 } else {
7779 $self->{set_nc}->($self);
7780 }
7781
7782 redo A;
7783 } elsif ($self->{nc} == 0x003E) { # >
7784 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7785 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7786
7787 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7788 $self->{line_prev} = $self->{line};
7789 $self->{column_prev} = $self->{column};
7790 $self->{column}++;
7791 $self->{nc}
7792 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7793 } else {
7794 $self->{set_nc}->($self);
7795 }
7796
7797 return ($self->{ct}); # ENTITY
7798 redo A;
7799 } elsif ($self->{nc} == -1) {
7800 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7801 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7802
7803 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7804 $self->{line_prev} = $self->{line};
7805 $self->{column_prev} = $self->{column};
7806 $self->{column}++;
7807 $self->{nc}
7808 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7809 } else {
7810 $self->{set_nc}->($self);
7811 }
7812
7813 return ($self->{ct}); # ENTITY
7814 redo A;
7815 } else {
## Start the notation name (assignment, not append).
7816 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7817 $self->{state} = NOTATION_NAME_STATE;
7818
7819 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7820 $self->{line_prev} = $self->{line};
7821 $self->{column_prev} = $self->{column};
7822 $self->{column}++;
7823 $self->{nc}
7824 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7825 } else {
7826 $self->{set_nc}->($self);
7827 }
7828
7829 redo A;
7830 }
7831 } elsif ($self->{state} == NOTATION_NAME_STATE) {
## NOTATION_NAME_STATE: appends characters to
## |$self->{ct}->{notation}|.  White space ends the name and leads
## to AFTER_MD_DEF_STATE; ">" returns the current token
## |$self->{ct}| without an error; -1 reports 'unclosed md' and
## returns the token.
##
## The repeated |if ($self->{char_buffer_pos} < ...) ... else ...|
## block moves |nc| to the next character, from |char_buffer| when
## it has unread characters and via the |set_nc| callback otherwise.
##
## NOTE: |nc| == -1 is apparently the end-of-input marker (confirm
## against |set_nc|).  A |redo A| that directly follows a |return|
## is never reached.
7832 if ($is_space->{$self->{nc}}) {
7833 $self->{state} = AFTER_MD_DEF_STATE;
7834
7835 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7836 $self->{line_prev} = $self->{line};
7837 $self->{column_prev} = $self->{column};
7838 $self->{column}++;
7839 $self->{nc}
7840 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7841 } else {
7842 $self->{set_nc}->($self);
7843 }
7844
7845 redo A;
7846 } elsif ($self->{nc} == 0x003E) { # >
7847 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7848
7849 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7850 $self->{line_prev} = $self->{line};
7851 $self->{column_prev} = $self->{column};
7852 $self->{column}++;
7853 $self->{nc}
7854 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7855 } else {
7856 $self->{set_nc}->($self);
7857 }
7858
7859 return ($self->{ct}); # ENTITY
7860 redo A;
7861 } elsif ($self->{nc} == -1) {
7862 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7863 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7864
7865 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7866 $self->{line_prev} = $self->{line};
7867 $self->{column_prev} = $self->{column};
7868 $self->{column}++;
7869 $self->{nc}
7870 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7871 } else {
7872 $self->{set_nc}->($self);
7873 }
7874
7875 return ($self->{ct}); # ENTITY
7876 redo A;
7877 } else {
7878 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7879 ## Stay in the state.
7880
7881 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7882 $self->{line_prev} = $self->{line};
7883 $self->{column_prev} = $self->{column};
7884 $self->{column}++;
7885 $self->{nc}
7886 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7887 } else {
7888 $self->{set_nc}->($self);
7889 }
7890
7891 redo A;
7892 }
7893 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
## DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE: appends characters to
## |$self->{ct}->{value}| until the closing '"'.  "&" detours
## through ENTITY_VALUE_ENTITY_STATE; -1 reports an error and
## returns the current token |$self->{ct}|.
##
## The repeated |if ($self->{char_buffer_pos} < ...) ... else ...|
## block moves |nc| to the next character, from |char_buffer| when
## it has unread characters and via the |set_nc| callback otherwise.
##
## NOTE: |nc| == -1 is apparently the end-of-input marker (confirm
## against |set_nc|).  The |redo A| after the |return| is never
## reached.
7894 if ($self->{nc} == 0x0022) { # "
7895 $self->{state} = AFTER_MD_DEF_STATE;
7896
7897 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7898 $self->{line_prev} = $self->{line};
7899 $self->{column_prev} = $self->{column};
7900 $self->{column}++;
7901 $self->{nc}
7902 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7903 } else {
7904 $self->{set_nc}->($self);
7905 }
7906
7907 redo A;
7908 } elsif ($self->{nc} == 0x0026) { # &
## Save this state in |prev_state| (ENTITY_VALUE_ENTITY_STATE
## restores |state| from it) and record the closing quote in
## |entity_add|, which that state treats as an additional
## character that ends a bare "&".
7909 $self->{prev_state} = $self->{state};
7910 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7911 $self->{entity_add} = 0x0022; # "
7912
7913 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7914 $self->{line_prev} = $self->{line};
7915 $self->{column_prev} = $self->{column};
7916 $self->{column}++;
7917 $self->{nc}
7918 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7919 } else {
7920 $self->{set_nc}->($self);
7921 }
7922
7923 redo A;
7924 ## TODO: %
7925 } elsif ($self->{nc} == -1) {
## Unlike the other branches, |nc| is not advanced before returning.
7926 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7927 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7928 ## Reconsume.
7929 return ($self->{ct}); # ENTITY
7930 redo A;
7931 } else {
## Ordinary character: append it; |state| is left unchanged.
7932 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7933
7934 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7935 $self->{line_prev} = $self->{line};
7936 $self->{column_prev} = $self->{column};
7937 $self->{column}++;
7938 $self->{nc}
7939 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7940 } else {
7941 $self->{set_nc}->($self);
7942 }
7943
7944 redo A;
7945 }
7946 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
## DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE: appends characters to
## |$self->{ct}->{value}| until the closing "'".  "&" detours
## through ENTITY_VALUE_ENTITY_STATE; -1 reports an error and
## returns the current token |$self->{ct}|.
##
## The repeated |if ($self->{char_buffer_pos} < ...) ... else ...|
## block moves |nc| to the next character, from |char_buffer| when
## it has unread characters and via the |set_nc| callback otherwise.
##
## NOTE: |nc| == -1 is apparently the end-of-input marker (confirm
## against |set_nc|).  The |redo A| after the |return| is never
## reached.
7947 if ($self->{nc} == 0x0027) { # '
7948 $self->{state} = AFTER_MD_DEF_STATE;
7949
7950 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7951 $self->{line_prev} = $self->{line};
7952 $self->{column_prev} = $self->{column};
7953 $self->{column}++;
7954 $self->{nc}
7955 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7956 } else {
7957 $self->{set_nc}->($self);
7958 }
7959
7960 redo A;
7961 } elsif ($self->{nc} == 0x0026) { # &
## Save this state in |prev_state| (ENTITY_VALUE_ENTITY_STATE
## restores |state| from it) and record the closing quote in
## |entity_add|, which that state treats as an additional
## character that ends a bare "&".
7962 $self->{prev_state} = $self->{state};
7963 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7964 $self->{entity_add} = 0x0027; # '
7965
7966 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7967 $self->{line_prev} = $self->{line};
7968 $self->{column_prev} = $self->{column};
7969 $self->{column}++;
7970 $self->{nc}
7971 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7972 } else {
7973 $self->{set_nc}->($self);
7974 }
7975
7976 redo A;
7977 ## TODO: %
7978 } elsif ($self->{nc} == -1) {
## Unlike the other branches, |nc| is not advanced before returning.
7979 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7980 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7981 ## Reconsume.
7982 return ($self->{ct}); # ENTITY
7983 redo A;
7984 } else {
## Ordinary character: append it; |state| is left unchanged.
7985 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7986
7987 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7988 $self->{line_prev} = $self->{line};
7989 $self->{column_prev} = $self->{column};
7990 $self->{column}++;
7991 $self->{nc}
7992 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7993 } else {
7994 $self->{set_nc}->($self);
7995 }
7996
7997 redo A;
7998 }
7999 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
## ENTITY_VALUE_ENTITY_STATE: handles the character after an "&"
## inside a quoted entity value (the two
## DOCTYPE_ENTITY_VALUE_*_QUOTED_STATE branches switch here).
##
## Only "#" leaves through its own |redo A|.  The first branch and
## the final |else| both fall through to the code after the
## |if| chain, which appends a literal "&" to
## |$self->{ct}->{value}|, restores |state| from |prev_state| and
## reprocesses the current character there.
##
## The anonymous hash is a one-off lookup set: "<", "&", -1 and the
## closing quote stored in |entity_add|.
8000 if ($is_space->{$self->{nc}} or
8001 {
8002 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8003 $self->{entity_add} => 1,
8004 }->{$self->{nc}}) {
## The error is located at the previous character (presumably the
## "&" itself); one column is added when |nc| is -1.
8005 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8006 line => $self->{line_prev},
8007 column => $self->{column_prev}
8008 + ($self->{nc} == -1 ? 1 : 0));
8009 ## Don't consume
8010 ## Return nothing.
8011 #
8012 } elsif ($self->{nc} == 0x0023) { # #
## "&#": make the current token |ct| also the current attribute
## |ca|, seed |kwd| with "#" and continue in ENTITY_HASH_STATE.
8013 $self->{ca} = $self->{ct};
8014 $self->{state} = ENTITY_HASH_STATE;
8015 $self->{kwd} = '#';
8016
## Advance |nc|: from |char_buffer| when it has unread characters
## (saving the old line/column first), else via |set_nc|.
8017 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8018 $self->{line_prev} = $self->{line};
8019 $self->{column_prev} = $self->{column};
8020 $self->{column}++;
8021 $self->{nc}
8022 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8023 } else {
8024 $self->{set_nc}->($self);
8025 }
8026
8027 redo A;
8028 } else {
8029 #
8030 }
8031
8032 $self->{ct}->{value} .= '&';
8033 $self->{state} = $self->{prev_state};
8034 ## Reconsume.
8035 redo A;
8036 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
## AFTER_ELEMENT_NAME_STATE: decides how the content part of the
## current token |$self->{ct}| begins.  White space leads to
## BEFORE_ELEMENT_CONTENT_STATE; "(" opens a group; ">" and -1
## report an error and return the token; any other character starts
## a keyword in |$self->{ct}->{content}|.
##
## The repeated |if ($self->{char_buffer_pos} < ...) ... else ...|
## block moves |nc| to the next character, from |char_buffer| when
## it has unread characters and via the |set_nc| callback otherwise.
##
## NOTE: |nc| == -1 is apparently the end-of-input marker (confirm
## against |set_nc|).  A |redo A| that directly follows a |return|
## is never reached.
8037 if ($is_space->{$self->{nc}}) {
8038 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8039
8040 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8041 $self->{line_prev} = $self->{line};
8042 $self->{column_prev} = $self->{column};
8043 $self->{column}++;
8044 $self->{nc}
8045 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8046 } else {
8047 $self->{set_nc}->($self);
8048 }
8049
8050 redo A;
8051 } elsif ($self->{nc} == 0x0028) { # (
## |content| becomes a new array whose first item is "(", and the
## nesting counter |group_depth| starts at 1.
8052 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8053 $self->{ct}->{content} = ['('];
8054 $self->{group_depth} = 1;
8055
8056 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8057 $self->{line_prev} = $self->{line};
8058 $self->{column_prev} = $self->{column};
8059 $self->{column}++;
8060 $self->{nc}
8061 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8062 } else {
8063 $self->{set_nc}->($self);
8064 }
8065
8066 redo A;
8067 } elsif ($self->{nc} == 0x003E) { # >
8068 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8069 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8070
8071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8072 $self->{line_prev} = $self->{line};
8073 $self->{column_prev} = $self->{column};
8074 $self->{column}++;
8075 $self->{nc}
8076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8077 } else {
8078 $self->{set_nc}->($self);
8079 }
8080
8081 return ($self->{ct}); # ELEMENT
8082 redo A;
8083 } elsif ($self->{nc} == -1) {
8084 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8085 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8086
8087 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8088 $self->{line_prev} = $self->{line};
8089 $self->{column_prev} = $self->{column};
8090 $self->{column}++;
8091 $self->{nc}
8092 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8093 } else {
8094 $self->{set_nc}->($self);
8095 }
8096
8097 return ($self->{ct}); # ELEMENT
8098 redo A;
8099 } else {
## |content| becomes a one-item array holding this character;
## CONTENT_KEYWORD_STATE appends to that item.
8100 $self->{ct}->{content} = [chr $self->{nc}];
8101 $self->{state} = CONTENT_KEYWORD_STATE;
8102
8103 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8104 $self->{line_prev} = $self->{line};
8105 $self->{column_prev} = $self->{column};
8106 $self->{column}++;
8107 $self->{nc}
8108 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8109 } else {
8110 $self->{set_nc}->($self);
8111 }
8112
8113 redo A;
8114 }
8115 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
## CONTENT_KEYWORD_STATE: appends characters to the last item of
## |$self->{ct}->{content}| (AFTER_ELEMENT_NAME_STATE creates that
## item).  White space ends the keyword and leads to
## AFTER_MD_DEF_STATE; ">" returns the current token |$self->{ct}|
## without an error; -1 reports 'unclosed md' and returns the token.
##
## The repeated |if ($self->{char_buffer_pos} < ...) ... else ...|
## block moves |nc| to the next character, from |char_buffer| when
## it has unread characters and via the |set_nc| callback otherwise.
##
## NOTE: |nc| == -1 is apparently the end-of-input marker (confirm
## against |set_nc|).  A |redo A| that directly follows a |return|
## is never reached.
8116 if ($is_space->{$self->{nc}}) {
8117 $self->{state} = AFTER_MD_DEF_STATE;
8118
8119 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8120 $self->{line_prev} = $self->{line};
8121 $self->{column_prev} = $self->{column};
8122 $self->{column}++;
8123 $self->{nc}
8124 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8125 } else {
8126 $self->{set_nc}->($self);
8127 }
8128
8129 redo A;
8130 } elsif ($self->{nc} == 0x003E) { # >
8131 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8132
8133 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8134 $self->{line_prev} = $self->{line};
8135 $self->{column_prev} = $self->{column};
8136 $self->{column}++;
8137 $self->{nc}
8138 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8139 } else {
8140 $self->{set_nc}->($self);
8141 }
8142
8143 return ($self->{ct}); # ELEMENT
8144 redo A;
8145 } elsif ($self->{nc} == -1) {
8146 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8147 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8148
8149 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8150 $self->{line_prev} = $self->{line};
8151 $self->{column_prev} = $self->{column};
8152 $self->{column}++;
8153 $self->{nc}
8154 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8155 } else {
8156 $self->{set_nc}->($self);
8157 }
8158
8159 return ($self->{ct}); # ELEMENT
8160 redo A;
8161 } else {
8162 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8163 ## Stay in the state.
8164
8165 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8166 $self->{line_prev} = $self->{line};
8167 $self->{column_prev} = $self->{column};
8168 $self->{column}++;
8169 $self->{nc}
8170 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8171 } else {
8172 $self->{set_nc}->($self);
8173 }
8174
8175 redo A;
8176 }
8177 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8178 if ($is_space->{$self->{nc}}) {
8179 ## Stay in the state.
8180
8181 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8182 $self->{line_prev} = $self->{line};
8183 $self->{column_prev} = $self->{column};
8184 $self->{column}++;
8185 $self->{nc}
8186 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8187 } else {
8188 $self->{set_nc}->($self);
8189 }
8190
8191 redo A;
8192 } elsif ($self->{nc} == 0x0028) { # (
8193 $self->{group_depth}++;
8194 push @{$self->{ct}->{content}}, chr $self->{nc};
8195 ## Stay in the state.
8196
8197 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8198 $self->{line_prev} = $self->{line};
8199 $self->{column_prev} = $self->{column};
8200 $self->{column}++;
8201 $self->{nc}
8202 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8203 } else {
8204 $self->{set_nc}->($self);
8205 }
8206
8207 redo A;
8208 } elsif ($self->{nc} == 0x007C or # |
8209 $self->{nc} == 0x002C) { # ,
8210 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8211 ## Stay in the state.
8212
8213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8214 $self->{line_prev} = $self->{line};
8215 $self->{column_prev} = $self->{column};
8216 $self->{column}++;
8217 $self->{nc}
8218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8219 } else {
8220 $self->{set_nc}->($self);
8221 }
8222
8223 redo A;
8224 } elsif ($self->{nc} == 0x0029) { # )
8225 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8226 push @{$self->{ct}->{content}}, chr $self->{nc};
8227 $self->{group_depth}--;
8228 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8229
8230 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8231 $self->{line_prev} = $self->{line};
8232 $self->{column_prev} = $self->{column};
8233 $self->{column}++;
8234 $self->{nc}
8235 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8236 } else {
8237 $self->{set_nc}->($self);
8238 }
8239
8240 redo A;
8241 } elsif ($self->{nc} == 0x003E) { # >
8242 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8243 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8244 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8245
8246 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8247 $self->{line_prev} = $self->{line};
8248 $self->{column_prev} = $self->{column};
8249 $self->{column}++;
8250 $self->{nc}
8251 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8252 } else {
8253 $self->{set_nc}->($self);
8254 }
8255
8256 return ($self->{ct}); # ELEMENT
8257 redo A;
8258 } elsif ($self->{nc} == -1) {
8259 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8260 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8261 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8262
8263 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8264 $self->{line_prev} = $self->{line};
8265 $self->{column_prev} = $self->{column};
8266 $self->{column}++;
8267 $self->{nc}
8268 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8269 } else {
8270 $self->{set_nc}->($self);
8271 }
8272
8273 return ($self->{ct}); # ELEMENT
8274 redo A;
8275 } else {
8276 push @{$self->{ct}->{content}}, chr $self->{nc};
8277 $self->{state} = CM_ELEMENT_NAME_STATE;
8278
8279 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8280 $self->{line_prev} = $self->{line};
8281 $self->{column_prev} = $self->{column};
8282 $self->{column}++;
8283 $self->{nc}
8284 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8285 } else {
8286 $self->{set_nc}->($self);
8287 }
8288
8289 redo A;
8290 }
8291 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8292 if ($is_space->{$self->{nc}}) {
8293 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8294
8295 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8296 $self->{line_prev} = $self->{line};
8297 $self->{column_prev} = $self->{column};
8298 $self->{column}++;
8299 $self->{nc}
8300 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8301 } else {
8302 $self->{set_nc}->($self);
8303 }
8304
8305 redo A;
8306 } elsif ($self->{nc} == 0x002A or # *
8307 $self->{nc} == 0x002B or # +
8308 $self->{nc} == 0x003F) { # ?
8309 push @{$self->{ct}->{content}}, chr $self->{nc};
8310 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8311
8312 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8313 $self->{line_prev} = $self->{line};
8314 $self->{column_prev} = $self->{column};
8315 $self->{column}++;
8316 $self->{nc}
8317 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8318 } else {
8319 $self->{set_nc}->($self);
8320 }
8321
8322 redo A;
8323 } elsif ($self->{nc} == 0x007C or # |
8324 $self->{nc} == 0x002C) { # ,
8325 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8326 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8327
8328 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8329 $self->{line_prev} = $self->{line};
8330 $self->{column_prev} = $self->{column};
8331 $self->{column}++;
8332 $self->{nc}
8333 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8334 } else {
8335 $self->{set_nc}->($self);
8336 }
8337
8338 redo A;
8339 } elsif ($self->{nc} == 0x0029) { # )
8340 $self->{group_depth}--;
8341 push @{$self->{ct}->{content}}, chr $self->{nc};
8342 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8343
8344 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8345 $self->{line_prev} = $self->{line};
8346 $self->{column_prev} = $self->{column};
8347 $self->{column}++;
8348 $self->{nc}
8349 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8350 } else {
8351 $self->{set_nc}->($self);
8352 }
8353
8354 redo A;
8355 } elsif ($self->{nc} == 0x003E) { # >
8356 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8357 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8358 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8359
8360 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8361 $self->{line_prev} = $self->{line};
8362 $self->{column_prev} = $self->{column};
8363 $self->{column}++;
8364 $self->{nc}
8365 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8366 } else {
8367 $self->{set_nc}->($self);
8368 }
8369
8370 return ($self->{ct}); # ELEMENT
8371 redo A;
8372 } elsif ($self->{nc} == -1) {
8373 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8374 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8375 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8376
8377 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8378 $self->{line_prev} = $self->{line};
8379 $self->{column_prev} = $self->{column};
8380 $self->{column}++;
8381 $self->{nc}
8382 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8383 } else {
8384 $self->{set_nc}->($self);
8385 }
8386
8387 return ($self->{ct}); # ELEMENT
8388 redo A;
8389 } else {
8390 $self->{ct}->{content}->[-1] .= chr $self->{nc};
8391 ## Stay in the state.
8392
8393 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8394 $self->{line_prev} = $self->{line};
8395 $self->{column_prev} = $self->{column};
8396 $self->{column}++;
8397 $self->{nc}
8398 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8399 } else {
8400 $self->{set_nc}->($self);
8401 }
8402
8403 redo A;
8404 }
8405 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8406 if ($is_space->{$self->{nc}}) {
8407 ## Stay in the state.
8408
8409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8410 $self->{line_prev} = $self->{line};
8411 $self->{column_prev} = $self->{column};
8412 $self->{column}++;
8413 $self->{nc}
8414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8415 } else {
8416 $self->{set_nc}->($self);
8417 }
8418
8419 redo A;
8420 } elsif ($self->{nc} == 0x007C or # |
8421 $self->{nc} == 0x002C) { # ,
8422 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8423 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8424
8425 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8426 $self->{line_prev} = $self->{line};
8427 $self->{column_prev} = $self->{column};
8428 $self->{column}++;
8429 $self->{nc}
8430 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8431 } else {
8432 $self->{set_nc}->($self);
8433 }
8434
8435 redo A;
8436 } elsif ($self->{nc} == 0x0029) { # )
8437 $self->{group_depth}--;
8438 push @{$self->{ct}->{content}}, chr $self->{nc};
8439 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8440
8441 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8442 $self->{line_prev} = $self->{line};
8443 $self->{column_prev} = $self->{column};
8444 $self->{column}++;
8445 $self->{nc}
8446 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8447 } else {
8448 $self->{set_nc}->($self);
8449 }
8450
8451 redo A;
8452 } elsif ($self->{nc} == 0x003E) { # >
8453 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8454 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8455 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8456
8457 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8458 $self->{line_prev} = $self->{line};
8459 $self->{column_prev} = $self->{column};
8460 $self->{column}++;
8461 $self->{nc}
8462 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8463 } else {
8464 $self->{set_nc}->($self);
8465 }
8466
8467 return ($self->{ct}); # ELEMENT
8468 redo A;
8469 } elsif ($self->{nc} == -1) {
8470 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8471 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8472 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8473
8474 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8475 $self->{line_prev} = $self->{line};
8476 $self->{column_prev} = $self->{column};
8477 $self->{column}++;
8478 $self->{nc}
8479 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8480 } else {
8481 $self->{set_nc}->($self);
8482 }
8483
8484 return ($self->{ct}); # ELEMENT
8485 redo A;
8486 } else {
8487 $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8488 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8489 $self->{state} = BOGUS_MD_STATE;
8490
8491 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8492 $self->{line_prev} = $self->{line};
8493 $self->{column_prev} = $self->{column};
8494 $self->{column}++;
8495 $self->{nc}
8496 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8497 } else {
8498 $self->{set_nc}->($self);
8499 }
8500
8501 redo A;
8502 }
8503 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8504 if ($is_space->{$self->{nc}}) {
8505 if ($self->{group_depth}) {
8506 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8507 } else {
8508 $self->{state} = AFTER_MD_DEF_STATE;
8509 }
8510
8511 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8512 $self->{line_prev} = $self->{line};
8513 $self->{column_prev} = $self->{column};
8514 $self->{column}++;
8515 $self->{nc}
8516 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8517 } else {
8518 $self->{set_nc}->($self);
8519 }
8520
8521 redo A;
8522 } elsif ($self->{nc} == 0x002A or # *
8523 $self->{nc} == 0x002B or # +
8524 $self->{nc} == 0x003F) { # ?
8525 push @{$self->{ct}->{content}}, chr $self->{nc};
8526 if ($self->{group_depth}) {
8527 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8528 } else {
8529 $self->{state} = AFTER_MD_DEF_STATE;
8530 }
8531
8532 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8533 $self->{line_prev} = $self->{line};
8534 $self->{column_prev} = $self->{column};
8535 $self->{column}++;
8536 $self->{nc}
8537 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8538 } else {
8539 $self->{set_nc}->($self);
8540 }
8541
8542 redo A;
8543 } elsif ($self->{nc} == 0x0029) { # )
8544 if ($self->{group_depth}) {
8545 $self->{group_depth}--;
8546 push @{$self->{ct}->{content}}, chr $self->{nc};
8547 ## Stay in the state.
8548
8549 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8550 $self->{line_prev} = $self->{line};
8551 $self->{column_prev} = $self->{column};
8552 $self->{column}++;
8553 $self->{nc}
8554 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8555 } else {
8556 $self->{set_nc}->($self);
8557 }
8558
8559 redo A;
8560 } else {
8561 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8562 $self->{state} = BOGUS_MD_STATE;
8563 ## Reconsume.
8564 redo A;
8565 }
8566 } elsif ($self->{nc} == 0x003E) { # >
8567 if ($self->{group_depth}) {
8568 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8569 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8570 }
8571 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8572
8573 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8574 $self->{line_prev} = $self->{line};
8575 $self->{column_prev} = $self->{column};
8576 $self->{column}++;
8577 $self->{nc}
8578 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8579 } else {
8580 $self->{set_nc}->($self);
8581 }
8582
8583 return ($self->{ct}); # ELEMENT
8584 redo A;
8585 } elsif ($self->{nc} == -1) {
8586 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8587 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8588 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8589
8590 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8591 $self->{line_prev} = $self->{line};
8592 $self->{column_prev} = $self->{column};
8593 $self->{column}++;
8594 $self->{nc}
8595 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8596 } else {
8597 $self->{set_nc}->($self);
8598 }
8599
8600 return ($self->{ct}); # ELEMENT
8601 redo A;
8602 } else {
8603 if ($self->{group_depth}) {
8604 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8605 } else {
8606 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8607 $self->{state} = BOGUS_MD_STATE;
8608 }
8609 ## Reconsume.
8610 redo A;
8611 }
8612 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8613 if ($is_space->{$self->{nc}}) {
8614 ## Stay in the state.
8615
8616 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8617 $self->{line_prev} = $self->{line};
8618 $self->{column_prev} = $self->{column};
8619 $self->{column}++;
8620 $self->{nc}
8621 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8622 } else {
8623 $self->{set_nc}->($self);
8624 }
8625
8626 redo A;
8627 } elsif ($self->{nc} == 0x003E) { # >
8628 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8629
8630 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8631 $self->{line_prev} = $self->{line};
8632 $self->{column_prev} = $self->{column};
8633 $self->{column}++;
8634 $self->{nc}
8635 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8636 } else {
8637 $self->{set_nc}->($self);
8638 }
8639
8640 return ($self->{ct}); # ENTITY/ELEMENT
8641 redo A;
8642 } elsif ($self->{nc} == -1) {
8643 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8644 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8645
8646 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8647 $self->{line_prev} = $self->{line};
8648 $self->{column_prev} = $self->{column};
8649 $self->{column}++;
8650 $self->{nc}
8651 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8652 } else {
8653 $self->{set_nc}->($self);
8654 }
8655
8656 return ($self->{ct}); # ENTITY/ELEMENT
8657 redo A;
8658 } else {
8659 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8660 $self->{state} = BOGUS_MD_STATE;
8661 ## Reconsume.
8662 redo A;
8663 }
8664 } elsif ($self->{state} == BOGUS_MD_STATE) {
8665 if ($self->{nc} == 0x003E) { # >
8666 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8667
8668 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8669 $self->{line_prev} = $self->{line};
8670 $self->{column_prev} = $self->{column};
8671 $self->{column}++;
8672 $self->{nc}
8673 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8674 } else {
8675 $self->{set_nc}->($self);
8676 }
8677
8678 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8679 redo A;
8680 } elsif ($self->{nc} == -1) {
8681 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8682 ## Reconsume.
8683 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8684 redo A;
8685 } else {
8686 ## Stay in the state.
8687
8688 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8689 $self->{line_prev} = $self->{line};
8690 $self->{column_prev} = $self->{column};
8691 $self->{column}++;
8692 $self->{nc}
8693 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8694 } else {
8695 $self->{set_nc}->($self);
8696 }
8697
8698 redo A;
8699 }
8700 } else {
8701 die "$0: $self->{state}: Unknown state";
8702 }
8703 } # A
8704
8705 die "$0: _get_next_token: unexpected case";
8706 } # _get_next_token
8707
8708 1;
8709 ## $Date: 2009/08/16 04:06:34 $
8710

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24