/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.32 - (show annotations) (download)
Sat Sep 5 09:57:55 2009 UTC (16 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.31: +100 -5 lines
++ whatpm/t/ChangeLog	5 Sep 2009 09:57:06 -0000
	* tokenizer-test-1.test: Added test cases for "comment end space
	state" (HTML5 revision 3195).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 09:57:45 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src (_get_next_token): Implemented the "comment end
	space state" (HTML5 revision 3195).

package Whatpm::HTML::Tokenizer;
use strict;
our $VERSION=do{my @r=(q$Revision: 1.31 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

## Export setup.
##
## This is done inside a |BEGIN| block so that the export lists are
## already populated at compile time, when the |Whatpm::HTML| package
## later in this file calls |import (':token')| from its own |BEGIN|
## block.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## Names of the token type constants.  The list is written only once
  ## and shared by |@EXPORT_OK| and the |:token| tag, so that the two
  ## cannot drift apart.
  my @token_types = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  ## Every token type constant can be imported individually ...
  our @EXPORT_OK = @token_types;

  ## ... or all at once by |use Whatpm::HTML::Tokenizer qw(:token)|.
  our %EXPORT_TAGS = (
    token => [@token_types],
  );
}
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
## Token types
##
## Each constant is a sub with an empty prototype returning a small
## integer, which is stored in the |type| field of a token.  The names
## are the ones listed in the |:token| export tag.

sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
sub COMMENT_TOKEN () { 2 }
sub START_TAG_TOKEN () { 3 }
sub END_TAG_TOKEN () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN () { 6 }
sub PI_TOKEN () { 7 } ## NOTE: XML only.
sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token whose |$token->{tag_name}| is
71 ## set to an empty string.
72
package Whatpm::HTML;

## Import the token type constants into |Whatpm::HTML| at compile
## time, so that they can be used as barewords in the code below.
BEGIN { Whatpm::HTML::Tokenizer->import (':token') }

## Content model flags
##
## The three |CM_*| constants are independent bits; a content model is
## a bitwise combination of them.

sub CM_ENTITY () { 0b001 } # & markup in data
sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)

## Content models, expressed in terms of the flags above.
sub PLAINTEXT_CONTENT_MODEL () { 0 } # no markup at all
sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87
## Tokenizer states
##
## Each constant identifies one state of the tokenizer state machine
## and is compared against |$self->{state}|.  The values are arbitrary
## but distinct.  They are mostly sequential; the two states that were
## added last (102 and 103) are listed next to the related comment
## states rather than at the end, and the values 1 and 12 are kept
## reserved for the two commented-out states.

sub DATA_STATE () { 0 }
#sub ENTITY_DATA_STATE () { 1 }
sub TAG_OPEN_STATE () { 2 }
sub CLOSE_TAG_OPEN_STATE () { 3 }
sub TAG_NAME_STATE () { 4 }
sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
sub ATTRIBUTE_NAME_STATE () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub COMMENT_START_STATE () { 14 }
sub COMMENT_START_DASH_STATE () { 15 }
sub COMMENT_STATE () { 16 }
sub COMMENT_END_STATE () { 17 }
sub COMMENT_END_BANG_STATE () { 102 }
sub COMMENT_END_SPACE_STATE () { 103 } ## LAST (highest value in use)
sub COMMENT_END_DASH_STATE () { 18 }
sub BOGUS_COMMENT_STATE () { 19 }
sub DOCTYPE_STATE () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
sub DOCTYPE_NAME_STATE () { 22 }
sub AFTER_DOCTYPE_NAME_STATE () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
sub BOGUS_DOCTYPE_STATE () { 32 }
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
sub SELF_CLOSING_START_TAG_STATE () { 34 }
sub CDATA_SECTION_STATE () { 35 }
sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
## NOTE: "Entity data state", "entity in attribute value state", and
## "consume a character reference" algorithm are jointly implemented
## using the following six states:
sub ENTITY_STATE () { 44 }
sub ENTITY_HASH_STATE () { 45 }
sub NCR_NUM_STATE () { 46 }
sub HEXREF_X_STATE () { 47 }
sub HEXREF_HEX_STATE () { 48 }
sub ENTITY_NAME_STATE () { 49 }
sub PCDATA_STATE () { 50 } # "data state" in the spec

## XML-only states
sub PI_STATE () { 51 }
sub PI_TARGET_STATE () { 52 }
sub PI_TARGET_AFTER_STATE () { 53 }
sub PI_DATA_STATE () { 54 }
sub PI_AFTER_STATE () { 55 }
sub PI_DATA_AFTER_STATE () { 56 }
sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
sub DOCTYPE_TAG_STATE () { 60 }
sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
sub MD_ATTLIST_STATE () { 62 }
sub MD_E_STATE () { 63 }
sub MD_ELEMENT_STATE () { 64 }
sub MD_ENTITY_STATE () { 65 }
sub MD_NOTATION_STATE () { 66 }
sub DOCTYPE_MD_STATE () { 67 }
sub BEFORE_MD_NAME_STATE () { 68 }
sub MD_NAME_STATE () { 69 }
sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
sub ALLOWED_TOKEN_STATE () { 77 }
sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
sub BEFORE_NDATA_STATE () { 85 }
sub NDATA_STATE () { 86 }
sub AFTER_NDATA_STATE () { 87 }
sub BEFORE_NOTATION_NAME_STATE () { 88 }
sub NOTATION_NAME_STATE () { 89 }
sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
sub ENTITY_VALUE_ENTITY_STATE () { 92 }
sub AFTER_ELEMENT_NAME_STATE () { 93 }
sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
sub CONTENT_KEYWORD_STATE () { 95 }
sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
sub CM_ELEMENT_NAME_STATE () { 97 }
sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
sub AFTER_MD_DEF_STATE () { 100 }
sub BOGUS_MD_STATE () { 101 }
199
## Tree constructor state constants (see Whatpm::HTML for the full
## list and descriptions)
##
## NOTE: Both literals below denote the same bit (1 << 11 == 2048);
## they are merely written with different digit grouping.

sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
sub FOREIGN_EL () { 0b1_00000000000 }
205
## Character reference mappings
##
## |$charref_map| maps a code point to the code point that replaces
## it.  (Presumably it is consulted when a numeric character reference
## is resolved; the consumer is not part of this section.)
##
## The explicit entries below map U+000D to U+000A and the code points
## 0x80..0x9F to the characters assigned to those positions in
## Windows-1252, with U+FFFD for the unassigned positions.

my $charref_map = {
  0x0D => 0x000A,
  0x80 => 0x20AC,
  0x81 => 0xFFFD,
  0x82 => 0x201A,
  0x83 => 0x0192,
  0x84 => 0x201E,
  0x85 => 0x2026,
  0x86 => 0x2020,
  0x87 => 0x2021,
  0x88 => 0x02C6,
  0x89 => 0x2030,
  0x8A => 0x0160,
  0x8B => 0x2039,
  0x8C => 0x0152,
  0x8D => 0xFFFD,
  0x8E => 0x017D,
  0x8F => 0xFFFD,
  0x90 => 0xFFFD,
  0x91 => 0x2018,
  0x92 => 0x2019,
  0x93 => 0x201C,
  0x94 => 0x201D,
  0x95 => 0x2022,
  0x96 => 0x2013,
  0x97 => 0x2014,
  0x98 => 0x02DC,
  0x99 => 0x2122,
  0x9A => 0x0161,
  0x9B => 0x203A,
  0x9C => 0x0153,
  0x9D => 0xFFFD,
  0x9E => 0x017E,
  0x9F => 0x0178,
}; # $charref_map

## Everything listed here is replaced by U+FFFD: C0 controls other
## than HT, LF, FF, and CR, then DEL, the surrogate range, and the
## last two code points of every plane.
##
## ISSUE: The Unicode noncharacter block runs up to U+FDEF, but only
## U+FDD0..U+FDDF are listed here; this is intentionally left as is.
$charref_map->{$_} = 0xFFFD
    for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
        0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
        0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
        0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
        0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
251
252 ## Implementations MUST act as if they used the state machine in the spec.
253
## Resets the tokenizer-related fields of |$self| to their initial
## values and then reads the first input character through the
## |set_nc| callback.
##
## Fields that have to be set beforehand by the |new| constructor:
##   $self->{level}
##   $self->{set_nc}
##   $self->{parse_error}
##   $self->{is_xml} (if XML)
##
## Returns the value of the last assignment, i.e. the new (empty)
## |$self->{token}| array reference.
sub _initialize_tokenizer ($) {
  my $self = shift;

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## Read the first input character.  The character buffer has just
  ## been emptied above, so there is nothing to take from it and the
  ## character can only be obtained through the |set_nc| callback.
  $self->{set_nc}->($self);

  $self->{token} = []; # queue of tokens waiting to be returned
  # $self->{escape}
} # _initialize_tokenizer
292
293 ## A token has:
294 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
295 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
296 ## ->{name} (DOCTYPE_TOKEN)
297 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
298 ## ->{target} (PI_TOKEN)
299 ## ->{pubid} (DOCTYPE_TOKEN)
300 ## ->{sysid} (DOCTYPE_TOKEN)
301 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
302 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
303 ## ->{name}
304 ## ->{value}
305 ## ->{has_reference} == 1 or 0
306 ## ->{index}: Index of the attribute in a tag.
307 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
308 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
309 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
310 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
311
312 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
313 ## |$token->{self_closing}| is used to save the value of
314 ## |$self->{self_closing}| while the token is pushed back to the stack.
315
316 ## Emitted token MUST immediately be handled by the tree construction state.
317
318 ## Before each step, UA MAY check to see if either one of the scripts in
319 ## "list of scripts that will execute as soon as possible" or the first
320 ## script in the "list of scripts that will execute asynchronously",
321 ## has completed loading. If one has, then it MUST be executed
322 ## and removed from the list.
323
324 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
325 ## (This requirement was dropped from HTML5 spec, unfortunately.)
326
## Set of the code points treated as space characters by the
## tokenizer: |$is_space->{$code}| is true for a space character.
## LINE TABULATION and CARRIAGE RETURN are deliberately left out (the
## corresponding entries are kept commented out).
my $is_space = {
  0x0009 => 1, # CHARACTER TABULATION (HT)
  0x000A => 1, # LINE FEED (LF)
  #0x000B => 0, # LINE TABULATION (VT)
  0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
  #0x000D => 1, # CARRIAGE RETURN (CR)
  0x0020 => 1, # SPACE (SP)
};
335
336 sub _get_next_token ($) {
337 my $self = shift;
338
339 if ($self->{self_closing}) {
340 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
341 ## NOTE: The |self_closing| flag is only set by start tag token.
342 ## In addition, when a start tag token is emitted, it is always set to
343 ## |ct|.
344 delete $self->{self_closing};
345 }
346
347 if (@{$self->{token}}) {
348 $self->{self_closing} = $self->{token}->[0]->{self_closing};
349 return shift @{$self->{token}};
350 }
351
352 A: {
353 if ($self->{state} == PCDATA_STATE) {
354 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
355
356 if ($self->{nc} == 0x0026) { # &
357
358 ## NOTE: In the spec, the tokenizer is switched to the
359 ## "entity data state". In this implementation, the tokenizer
360 ## is switched to the |ENTITY_STATE|, which is an implementation
361 ## of the "consume a character reference" algorithm.
362 $self->{entity_add} = -1;
363 $self->{prev_state} = DATA_STATE;
364 $self->{state} = ENTITY_STATE;
365
366 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
367 $self->{line_prev} = $self->{line};
368 $self->{column_prev} = $self->{column};
369 $self->{column}++;
370 $self->{nc}
371 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
372 } else {
373 $self->{set_nc}->($self);
374 }
375
376 redo A;
377 } elsif ($self->{nc} == 0x003C) { # <
378
379 $self->{state} = TAG_OPEN_STATE;
380
381 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
382 $self->{line_prev} = $self->{line};
383 $self->{column_prev} = $self->{column};
384 $self->{column}++;
385 $self->{nc}
386 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
387 } else {
388 $self->{set_nc}->($self);
389 }
390
391 redo A;
392 } elsif ($self->{nc} == -1) {
393
394 return ({type => END_OF_FILE_TOKEN,
395 line => $self->{line}, column => $self->{column}});
396 last A; ## TODO: ok?
397 } else {
398
399 #
400 }
401
402 # Anything else
403 my $token = {type => CHARACTER_TOKEN,
404 data => chr $self->{nc},
405 line => $self->{line}, column => $self->{column},
406 };
407 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
408
409 ## Stay in the state.
410
411 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
412 $self->{line_prev} = $self->{line};
413 $self->{column_prev} = $self->{column};
414 $self->{column}++;
415 $self->{nc}
416 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
417 } else {
418 $self->{set_nc}->($self);
419 }
420
421 return ($token);
422 redo A;
423 } elsif ($self->{state} == DATA_STATE) {
424 $self->{s_kwd} = '' unless defined $self->{s_kwd};
425 if ($self->{nc} == 0x0026) { # &
426 $self->{s_kwd} = '';
427 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
428 not $self->{escape}) {
429
430 ## NOTE: In the spec, the tokenizer is switched to the
431 ## "entity data state". In this implementation, the tokenizer
432 ## is switched to the |ENTITY_STATE|, which is an implementation
433 ## of the "consume a character reference" algorithm.
434 $self->{entity_add} = -1;
435 $self->{prev_state} = DATA_STATE;
436 $self->{state} = ENTITY_STATE;
437
438 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
439 $self->{line_prev} = $self->{line};
440 $self->{column_prev} = $self->{column};
441 $self->{column}++;
442 $self->{nc}
443 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
444 } else {
445 $self->{set_nc}->($self);
446 }
447
448 redo A;
449 } else {
450
451 #
452 }
453 } elsif ($self->{nc} == 0x002D) { # -
454 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
455 if ($self->{s_kwd} eq '<!-') {
456
457 $self->{escape} = 1; # unless $self->{escape};
458 $self->{s_kwd} = '--';
459 #
460 } elsif ($self->{s_kwd} eq '-') {
461
462 $self->{s_kwd} = '--';
463 #
464 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
465
466 $self->{s_kwd} .= '-';
467 #
468 } else {
469
470 $self->{s_kwd} = '-';
471 #
472 }
473 }
474
475 #
476 } elsif ($self->{nc} == 0x0021) { # !
477 if (length $self->{s_kwd}) {
478
479 $self->{s_kwd} .= '!';
480 #
481 } else {
482
483 #$self->{s_kwd} = '';
484 #
485 }
486 #
487 } elsif ($self->{nc} == 0x003C) { # <
488 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
489 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
490 not $self->{escape})) {
491
492 $self->{state} = TAG_OPEN_STATE;
493
494 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
495 $self->{line_prev} = $self->{line};
496 $self->{column_prev} = $self->{column};
497 $self->{column}++;
498 $self->{nc}
499 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
500 } else {
501 $self->{set_nc}->($self);
502 }
503
504 redo A;
505 } else {
506
507 $self->{s_kwd} = '';
508 #
509 }
510 } elsif ($self->{nc} == 0x003E) { # >
511 if ($self->{escape} and
512 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
513 if ($self->{s_kwd} eq '--') {
514
515 delete $self->{escape};
516 #
517 } else {
518
519 #
520 }
521 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
522
523 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
524 line => $self->{line_prev},
525 column => $self->{column_prev} - 1);
526 #
527 } else {
528
529 #
530 }
531
532 $self->{s_kwd} = '';
533 #
534 } elsif ($self->{nc} == 0x005D) { # ]
535 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
536
537 $self->{s_kwd} .= ']';
538 } elsif ($self->{s_kwd} eq ']]') {
539
540 #
541 } else {
542
543 $self->{s_kwd} = '';
544 }
545 #
546 } elsif ($self->{nc} == -1) {
547
548 $self->{s_kwd} = '';
549 return ({type => END_OF_FILE_TOKEN,
550 line => $self->{line}, column => $self->{column}});
551 last A; ## TODO: ok?
552 } else {
553
554 $self->{s_kwd} = '';
555 #
556 }
557
558 # Anything else
559 my $token = {type => CHARACTER_TOKEN,
560 data => chr $self->{nc},
561 line => $self->{line}, column => $self->{column},
562 };
563 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
564 length $token->{data})) {
565 $self->{s_kwd} = '';
566 }
567
568 ## Stay in the data state.
569 if (not $self->{is_xml} and
570 $self->{content_model} == PCDATA_CONTENT_MODEL) {
571
572 $self->{state} = PCDATA_STATE;
573 } else {
574
575 ## Stay in the state.
576 }
577
578 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
579 $self->{line_prev} = $self->{line};
580 $self->{column_prev} = $self->{column};
581 $self->{column}++;
582 $self->{nc}
583 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
584 } else {
585 $self->{set_nc}->($self);
586 }
587
588 return ($token);
589 redo A;
590 } elsif ($self->{state} == TAG_OPEN_STATE) {
591 ## XML5: "tag state".
592
593 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
594 if ($self->{nc} == 0x002F) { # /
595
596
597 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
598 $self->{line_prev} = $self->{line};
599 $self->{column_prev} = $self->{column};
600 $self->{column}++;
601 $self->{nc}
602 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
603 } else {
604 $self->{set_nc}->($self);
605 }
606
607 $self->{state} = CLOSE_TAG_OPEN_STATE;
608 redo A;
609 } elsif ($self->{nc} == 0x0021) { # !
610
611 $self->{s_kwd} = $self->{escaped} ? '' : '<';
612 #
613 } else {
614
615 $self->{s_kwd} = '';
616 #
617 }
618
619 ## reconsume
620 $self->{state} = DATA_STATE;
621 return ({type => CHARACTER_TOKEN, data => '<',
622 line => $self->{line_prev},
623 column => $self->{column_prev},
624 });
625 redo A;
626 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
627 if ($self->{nc} == 0x0021) { # !
628
629 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
630
631 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
632 $self->{line_prev} = $self->{line};
633 $self->{column_prev} = $self->{column};
634 $self->{column}++;
635 $self->{nc}
636 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
637 } else {
638 $self->{set_nc}->($self);
639 }
640
641 redo A;
642 } elsif ($self->{nc} == 0x002F) { # /
643
644 $self->{state} = CLOSE_TAG_OPEN_STATE;
645
646 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
647 $self->{line_prev} = $self->{line};
648 $self->{column_prev} = $self->{column};
649 $self->{column}++;
650 $self->{nc}
651 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
652 } else {
653 $self->{set_nc}->($self);
654 }
655
656 redo A;
657 } elsif (0x0041 <= $self->{nc} and
658 $self->{nc} <= 0x005A) { # A..Z
659
660 $self->{ct}
661 = {type => START_TAG_TOKEN,
662 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
663 line => $self->{line_prev},
664 column => $self->{column_prev}};
665 $self->{state} = TAG_NAME_STATE;
666
667 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
668 $self->{line_prev} = $self->{line};
669 $self->{column_prev} = $self->{column};
670 $self->{column}++;
671 $self->{nc}
672 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
673 } else {
674 $self->{set_nc}->($self);
675 }
676
677 redo A;
678 } elsif (0x0061 <= $self->{nc} and
679 $self->{nc} <= 0x007A) { # a..z
680
681 $self->{ct} = {type => START_TAG_TOKEN,
682 tag_name => chr ($self->{nc}),
683 line => $self->{line_prev},
684 column => $self->{column_prev}};
685 $self->{state} = TAG_NAME_STATE;
686
687 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
688 $self->{line_prev} = $self->{line};
689 $self->{column_prev} = $self->{column};
690 $self->{column}++;
691 $self->{nc}
692 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
693 } else {
694 $self->{set_nc}->($self);
695 }
696
697 redo A;
698 } elsif ($self->{nc} == 0x003E) { # >
699
700 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
701 line => $self->{line_prev},
702 column => $self->{column_prev});
703 $self->{state} = DATA_STATE;
704 $self->{s_kwd} = '';
705
706 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
707 $self->{line_prev} = $self->{line};
708 $self->{column_prev} = $self->{column};
709 $self->{column}++;
710 $self->{nc}
711 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
712 } else {
713 $self->{set_nc}->($self);
714 }
715
716
717 return ({type => CHARACTER_TOKEN, data => '<>',
718 line => $self->{line_prev},
719 column => $self->{column_prev},
720 });
721
722 redo A;
723 } elsif ($self->{nc} == 0x003F) { # ?
724 if ($self->{is_xml}) {
725
726 $self->{state} = PI_STATE;
727
728 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
729 $self->{line_prev} = $self->{line};
730 $self->{column_prev} = $self->{column};
731 $self->{column}++;
732 $self->{nc}
733 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
734 } else {
735 $self->{set_nc}->($self);
736 }
737
738 redo A;
739 } else {
740
741 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
742 line => $self->{line_prev},
743 column => $self->{column_prev});
744 $self->{state} = BOGUS_COMMENT_STATE;
745 $self->{ct} = {type => COMMENT_TOKEN, data => '',
746 line => $self->{line_prev},
747 column => $self->{column_prev},
748 };
749 ## $self->{nc} is intentionally left as is
750 redo A;
751 }
752 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
753
754 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
755 line => $self->{line_prev},
756 column => $self->{column_prev});
757 $self->{state} = DATA_STATE;
758 $self->{s_kwd} = '';
759 ## reconsume
760
761 return ({type => CHARACTER_TOKEN, data => '<',
762 line => $self->{line_prev},
763 column => $self->{column_prev},
764 });
765
766 redo A;
767 } else {
768 ## XML5: "<:" is a parse error.
769
770 $self->{ct} = {type => START_TAG_TOKEN,
771 tag_name => chr ($self->{nc}),
772 line => $self->{line_prev},
773 column => $self->{column_prev}};
774 $self->{state} = TAG_NAME_STATE;
775
776 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
777 $self->{line_prev} = $self->{line};
778 $self->{column_prev} = $self->{column};
779 $self->{column}++;
780 $self->{nc}
781 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
782 } else {
783 $self->{set_nc}->($self);
784 }
785
786 redo A;
787 }
788 } else {
789 die "$0: $self->{content_model} in tag open";
790 }
791 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
792 ## NOTE: The "close tag open state" in the spec is implemented as
793 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
794
795 ## XML5: "end tag state".
796
797 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
798 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
799 if (defined $self->{last_stag_name}) {
800 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
801 $self->{kwd} = '';
802 ## Reconsume.
803 redo A;
804 } else {
805 ## No start tag token has ever been emitted
806 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
807
808 $self->{state} = DATA_STATE;
809 $self->{s_kwd} = '';
810 ## Reconsume.
811 return ({type => CHARACTER_TOKEN, data => '</',
812 line => $l, column => $c,
813 });
814 redo A;
815 }
816 }
817
818 if (0x0041 <= $self->{nc} and
819 $self->{nc} <= 0x005A) { # A..Z
820
821 $self->{ct}
822 = {type => END_TAG_TOKEN,
823 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
824 line => $l, column => $c};
825 $self->{state} = TAG_NAME_STATE;
826
827 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
828 $self->{line_prev} = $self->{line};
829 $self->{column_prev} = $self->{column};
830 $self->{column}++;
831 $self->{nc}
832 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
833 } else {
834 $self->{set_nc}->($self);
835 }
836
837 redo A;
838 } elsif (0x0061 <= $self->{nc} and
839 $self->{nc} <= 0x007A) { # a..z
840
841 $self->{ct} = {type => END_TAG_TOKEN,
842 tag_name => chr ($self->{nc}),
843 line => $l, column => $c};
844 $self->{state} = TAG_NAME_STATE;
845
846 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
847 $self->{line_prev} = $self->{line};
848 $self->{column_prev} = $self->{column};
849 $self->{column}++;
850 $self->{nc}
851 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
852 } else {
853 $self->{set_nc}->($self);
854 }
855
856 redo A;
857 } elsif ($self->{nc} == 0x003E) { # >
858 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
859 line => $self->{line_prev}, ## "<" in "</>"
860 column => $self->{column_prev} - 1);
861 $self->{state} = DATA_STATE;
862 $self->{s_kwd} = '';
863 if ($self->{is_xml}) {
864
865 ## XML5: No parse error.
866
867 ## NOTE: This parser raises a parse error, since it supports
868 ## XML1, not XML5.
869
870 ## NOTE: A short end tag token.
871 my $ct = {type => END_TAG_TOKEN,
872 tag_name => '',
873 line => $self->{line_prev},
874 column => $self->{column_prev} - 1,
875 };
876
877 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
878 $self->{line_prev} = $self->{line};
879 $self->{column_prev} = $self->{column};
880 $self->{column}++;
881 $self->{nc}
882 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
883 } else {
884 $self->{set_nc}->($self);
885 }
886
887 return ($ct);
888 } else {
889
890
891 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
892 $self->{line_prev} = $self->{line};
893 $self->{column_prev} = $self->{column};
894 $self->{column}++;
895 $self->{nc}
896 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
897 } else {
898 $self->{set_nc}->($self);
899 }
900
901 }
902 redo A;
903 } elsif ($self->{nc} == -1) {
904
905 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
906 $self->{s_kwd} = '';
907 $self->{state} = DATA_STATE;
908 # reconsume
909
910 return ({type => CHARACTER_TOKEN, data => '</',
911 line => $l, column => $c,
912 });
913
914 redo A;
915 } elsif (not $self->{is_xml} or
916 $is_space->{$self->{nc}}) {
917
918 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
919 line => $self->{line_prev}, # "<" of "</"
920 column => $self->{column_prev} - 1);
921 $self->{state} = BOGUS_COMMENT_STATE;
922 $self->{ct} = {type => COMMENT_TOKEN, data => '',
923 line => $self->{line_prev}, # "<" of "</"
924 column => $self->{column_prev} - 1,
925 };
926 ## NOTE: $self->{nc} is intentionally left as is.
927 ## Although the "anything else" case of the spec not explicitly
928 ## states that the next input character is to be reconsumed,
929 ## it will be included to the |data| of the comment token
930 ## generated from the bogus end tag, as defined in the
931 ## "bogus comment state" entry.
932 redo A;
933 } else {
934 ## XML5: "</:" is a parse error.
935
936 $self->{ct} = {type => END_TAG_TOKEN,
937 tag_name => chr ($self->{nc}),
938 line => $l, column => $c};
939 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
940
941 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
942 $self->{line_prev} = $self->{line};
943 $self->{column_prev} = $self->{column};
944 $self->{column}++;
945 $self->{nc}
946 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
947 } else {
948 $self->{set_nc}->($self);
949 }
950
951 redo A;
952 }
953 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
954 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
955 if (length $ch) {
956 my $CH = $ch;
957 $ch =~ tr/a-z/A-Z/;
958 my $nch = chr $self->{nc};
959 if ($nch eq $ch or $nch eq $CH) {
960
961 ## Stay in the state.
962 $self->{kwd} .= $nch;
963
964 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
965 $self->{line_prev} = $self->{line};
966 $self->{column_prev} = $self->{column};
967 $self->{column}++;
968 $self->{nc}
969 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
970 } else {
971 $self->{set_nc}->($self);
972 }
973
974 redo A;
975 } else {
976
977 $self->{state} = DATA_STATE;
978 $self->{s_kwd} = '';
979 ## Reconsume.
980 return ({type => CHARACTER_TOKEN,
981 data => '</' . $self->{kwd},
982 line => $self->{line_prev},
983 column => $self->{column_prev} - 1 - length $self->{kwd},
984 });
985 redo A;
986 }
987 } else { # after "<{tag-name}"
988 unless ($is_space->{$self->{nc}} or
989 {
990 0x003E => 1, # >
991 0x002F => 1, # /
992 -1 => 1, # EOF
993 }->{$self->{nc}}) {
994
995 ## Reconsume.
996 $self->{state} = DATA_STATE;
997 $self->{s_kwd} = '';
998 return ({type => CHARACTER_TOKEN,
999 data => '</' . $self->{kwd},
1000 line => $self->{line_prev},
1001 column => $self->{column_prev} - 1 - length $self->{kwd},
1002 });
1003 redo A;
1004 } else {
1005
1006 $self->{ct}
1007 = {type => END_TAG_TOKEN,
1008 tag_name => $self->{last_stag_name},
1009 line => $self->{line_prev},
1010 column => $self->{column_prev} - 1 - length $self->{kwd}};
1011 $self->{state} = TAG_NAME_STATE;
1012 ## Reconsume.
1013 redo A;
1014 }
1015 }
1016 } elsif ($self->{state} == TAG_NAME_STATE) {
1017 if ($is_space->{$self->{nc}}) {
1018
1019 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1020
1021 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1022 $self->{line_prev} = $self->{line};
1023 $self->{column_prev} = $self->{column};
1024 $self->{column}++;
1025 $self->{nc}
1026 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1027 } else {
1028 $self->{set_nc}->($self);
1029 }
1030
1031 redo A;
1032 } elsif ($self->{nc} == 0x003E) { # >
1033 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1034
1035 $self->{last_stag_name} = $self->{ct}->{tag_name};
1036 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1037 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1038 #if ($self->{ct}->{attributes}) {
1039 # ## NOTE: This should never be reached.
1040 # !!! cp (36);
1041 # !!! parse-error (type => 'end tag attribute');
1042 #} else {
1043
1044 #}
1045 } else {
1046 die "$0: $self->{ct}->{type}: Unknown token type";
1047 }
1048 $self->{state} = DATA_STATE;
1049 $self->{s_kwd} = '';
1050
1051 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1052 $self->{line_prev} = $self->{line};
1053 $self->{column_prev} = $self->{column};
1054 $self->{column}++;
1055 $self->{nc}
1056 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1057 } else {
1058 $self->{set_nc}->($self);
1059 }
1060
1061
1062 return ($self->{ct}); # start tag or end tag
1063
1064 redo A;
1065 } elsif (0x0041 <= $self->{nc} and
1066 $self->{nc} <= 0x005A) { # A..Z
1067
1068 $self->{ct}->{tag_name}
1069 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1070 # start tag or end tag
1071 ## Stay in this state
1072
1073 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1074 $self->{line_prev} = $self->{line};
1075 $self->{column_prev} = $self->{column};
1076 $self->{column}++;
1077 $self->{nc}
1078 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1079 } else {
1080 $self->{set_nc}->($self);
1081 }
1082
1083 redo A;
1084 } elsif ($self->{nc} == -1) {
1085 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1086 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1087
1088 $self->{last_stag_name} = $self->{ct}->{tag_name};
1089 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1090 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1091 #if ($self->{ct}->{attributes}) {
1092 # ## NOTE: This state should never be reached.
1093 # !!! cp (40);
1094 # !!! parse-error (type => 'end tag attribute');
1095 #} else {
1096
1097 #}
1098 } else {
1099 die "$0: $self->{ct}->{type}: Unknown token type";
1100 }
1101 $self->{state} = DATA_STATE;
1102 $self->{s_kwd} = '';
1103 # reconsume
1104
1105 return ($self->{ct}); # start tag or end tag
1106
1107 redo A;
1108 } elsif ($self->{nc} == 0x002F) { # /
1109
1110 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1111
1112 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1113 $self->{line_prev} = $self->{line};
1114 $self->{column_prev} = $self->{column};
1115 $self->{column}++;
1116 $self->{nc}
1117 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1118 } else {
1119 $self->{set_nc}->($self);
1120 }
1121
1122 redo A;
1123 } else {
1124
1125 $self->{ct}->{tag_name} .= chr $self->{nc};
1126 # start tag or end tag
1127 ## Stay in the state
1128
1129 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1130 $self->{line_prev} = $self->{line};
1131 $self->{column_prev} = $self->{column};
1132 $self->{column}++;
1133 $self->{nc}
1134 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1135 } else {
1136 $self->{set_nc}->($self);
1137 }
1138
1139 redo A;
1140 }
1141 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1142 ## XML5: "Tag attribute name before state".
1143
1144 if ($is_space->{$self->{nc}}) {
1145
1146 ## Stay in the state
1147
1148 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1149 $self->{line_prev} = $self->{line};
1150 $self->{column_prev} = $self->{column};
1151 $self->{column}++;
1152 $self->{nc}
1153 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1154 } else {
1155 $self->{set_nc}->($self);
1156 }
1157
1158 redo A;
1159 } elsif ($self->{nc} == 0x003E) { # >
1160 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1161
1162 $self->{last_stag_name} = $self->{ct}->{tag_name};
1163 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1164 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1165 if ($self->{ct}->{attributes}) {
1166
1167 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1168 } else {
1169
1170 }
1171 } else {
1172 die "$0: $self->{ct}->{type}: Unknown token type";
1173 }
1174 $self->{state} = DATA_STATE;
1175 $self->{s_kwd} = '';
1176
1177 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1178 $self->{line_prev} = $self->{line};
1179 $self->{column_prev} = $self->{column};
1180 $self->{column}++;
1181 $self->{nc}
1182 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1183 } else {
1184 $self->{set_nc}->($self);
1185 }
1186
1187
1188 return ($self->{ct}); # start tag or end tag
1189
1190 redo A;
1191 } elsif (0x0041 <= $self->{nc} and
1192 $self->{nc} <= 0x005A) { # A..Z
1193
1194 $self->{ca}
1195 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1196 value => '',
1197 line => $self->{line}, column => $self->{column}};
1198 $self->{state} = ATTRIBUTE_NAME_STATE;
1199
1200 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1201 $self->{line_prev} = $self->{line};
1202 $self->{column_prev} = $self->{column};
1203 $self->{column}++;
1204 $self->{nc}
1205 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1206 } else {
1207 $self->{set_nc}->($self);
1208 }
1209
1210 redo A;
1211 } elsif ($self->{nc} == 0x002F) { # /
1212
1213 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1214
1215 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1216 $self->{line_prev} = $self->{line};
1217 $self->{column_prev} = $self->{column};
1218 $self->{column}++;
1219 $self->{nc}
1220 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1221 } else {
1222 $self->{set_nc}->($self);
1223 }
1224
1225 redo A;
1226 } elsif ($self->{nc} == -1) {
1227 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1228 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1229
1230 $self->{last_stag_name} = $self->{ct}->{tag_name};
1231 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1232 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1233 if ($self->{ct}->{attributes}) {
1234
1235 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1236 } else {
1237
1238 }
1239 } else {
1240 die "$0: $self->{ct}->{type}: Unknown token type";
1241 }
1242 $self->{state} = DATA_STATE;
1243 $self->{s_kwd} = '';
1244 # reconsume
1245
1246 return ($self->{ct}); # start tag or end tag
1247
1248 redo A;
1249 } else {
1250 if ({
1251 0x0022 => 1, # "
1252 0x0027 => 1, # '
1253 0x003C => 1, # <
1254 0x003D => 1, # =
1255 }->{$self->{nc}}) {
1256
1257 ## XML5: Not a parse error.
1258 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1259 } else {
1260
1261 ## XML5: ":" raises a parse error and is ignored.
1262 }
1263 $self->{ca}
1264 = {name => chr ($self->{nc}),
1265 value => '',
1266 line => $self->{line}, column => $self->{column}};
1267 $self->{state} = ATTRIBUTE_NAME_STATE;
1268
1269 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1270 $self->{line_prev} = $self->{line};
1271 $self->{column_prev} = $self->{column};
1272 $self->{column}++;
1273 $self->{nc}
1274 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1275 } else {
1276 $self->{set_nc}->($self);
1277 }
1278
1279 redo A;
1280 }
1281 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1282 ## XML5: "Tag attribute name state".
1283
1284 my $before_leave = sub { ## Called on every exit from this state (space, "=", ">", "/", EOF) to commit or reject $self->{ca}.
1285 if (exists $self->{ct}->{attributes} # start tag or end tag
1286 ->{$self->{ca}->{name}}) { # MUST
1287 ## An attribute with this name is already recorded on the current token: report it at the position where the duplicate began.
1288 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1289 ## Discard $self->{ca} # MUST
1290 } else {
1291 ## First attribute with this name: store it on the token, keyed by name, and stamp it with the token's pre-incremented last_index counter.
1292 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1293 = $self->{ca};
1294 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1295 }
1296 }; # $before_leave
1297
1298 if ($is_space->{$self->{nc}}) {
1299
1300 $before_leave->();
1301 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1302
1303 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1304 $self->{line_prev} = $self->{line};
1305 $self->{column_prev} = $self->{column};
1306 $self->{column}++;
1307 $self->{nc}
1308 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1309 } else {
1310 $self->{set_nc}->($self);
1311 }
1312
1313 redo A;
1314 } elsif ($self->{nc} == 0x003D) { # =
1315
1316 $before_leave->();
1317 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1318
1319 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1320 $self->{line_prev} = $self->{line};
1321 $self->{column_prev} = $self->{column};
1322 $self->{column}++;
1323 $self->{nc}
1324 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1325 } else {
1326 $self->{set_nc}->($self);
1327 }
1328
1329 redo A;
1330 } elsif ($self->{nc} == 0x003E) { # >
1331 if ($self->{is_xml}) {
1332
1333 ## XML5: Not a parse error.
1334 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1335 } else {
1336
1337 }
1338
1339 $before_leave->();
1340 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1341
1342 $self->{last_stag_name} = $self->{ct}->{tag_name};
1343 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1344
1345 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1346 if ($self->{ct}->{attributes}) {
1347 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1348 }
1349 } else {
1350 die "$0: $self->{ct}->{type}: Unknown token type";
1351 }
1352 $self->{state} = DATA_STATE;
1353 $self->{s_kwd} = '';
1354
1355 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1356 $self->{line_prev} = $self->{line};
1357 $self->{column_prev} = $self->{column};
1358 $self->{column}++;
1359 $self->{nc}
1360 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1361 } else {
1362 $self->{set_nc}->($self);
1363 }
1364
1365
1366 return ($self->{ct}); # start tag or end tag
1367
1368 redo A;
1369 } elsif (0x0041 <= $self->{nc} and
1370 $self->{nc} <= 0x005A) { # A..Z
1371
1372 $self->{ca}->{name}
1373 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1374 ## Stay in the state
1375
1376 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1377 $self->{line_prev} = $self->{line};
1378 $self->{column_prev} = $self->{column};
1379 $self->{column}++;
1380 $self->{nc}
1381 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1382 } else {
1383 $self->{set_nc}->($self);
1384 }
1385
1386 redo A;
1387 } elsif ($self->{nc} == 0x002F) { # /
1388 if ($self->{is_xml}) {
1389
1390 ## XML5: Not a parse error.
1391 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1392 } else {
1393
1394 }
1395
1396 $before_leave->();
1397 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1398
1399 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1400 $self->{line_prev} = $self->{line};
1401 $self->{column_prev} = $self->{column};
1402 $self->{column}++;
1403 $self->{nc}
1404 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1405 } else {
1406 $self->{set_nc}->($self);
1407 }
1408
1409 redo A;
1410 } elsif ($self->{nc} == -1) {
1411 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1412 $before_leave->();
1413 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1414
1415 $self->{last_stag_name} = $self->{ct}->{tag_name};
1416 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1417 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1418 if ($self->{ct}->{attributes}) {
1419
1420 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1421 } else {
1422 ## NOTE: This state should never be reached.
1423
1424 }
1425 } else {
1426 die "$0: $self->{ct}->{type}: Unknown token type";
1427 }
1428 $self->{state} = DATA_STATE;
1429 $self->{s_kwd} = '';
1430 # reconsume
1431
1432 return ($self->{ct}); # start tag or end tag
1433
1434 redo A;
1435 } else {
1436 if ({
1437 0x0022 => 1, # "
1438 0x0027 => 1, # '
1439 0x003C => 1, # <
1440 }->{$self->{nc}}) {
1441
1442 ## XML5: Not a parse error.
1443 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1444 } else {
1445
1446 }
1447 $self->{ca}->{name} .= chr ($self->{nc});
1448 ## Stay in the state
1449
1450 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1451 $self->{line_prev} = $self->{line};
1452 $self->{column_prev} = $self->{column};
1453 $self->{column}++;
1454 $self->{nc}
1455 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1456 } else {
1457 $self->{set_nc}->($self);
1458 }
1459
1460 redo A;
1461 }
1462 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1463 ## XML5: "Tag attribute name after state".
1464
1465 if ($is_space->{$self->{nc}}) {
1466
1467 ## Stay in the state
1468
1469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1470 $self->{line_prev} = $self->{line};
1471 $self->{column_prev} = $self->{column};
1472 $self->{column}++;
1473 $self->{nc}
1474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1475 } else {
1476 $self->{set_nc}->($self);
1477 }
1478
1479 redo A;
1480 } elsif ($self->{nc} == 0x003D) { # =
1481
1482 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1483
1484 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1485 $self->{line_prev} = $self->{line};
1486 $self->{column_prev} = $self->{column};
1487 $self->{column}++;
1488 $self->{nc}
1489 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1490 } else {
1491 $self->{set_nc}->($self);
1492 }
1493
1494 redo A;
1495 } elsif ($self->{nc} == 0x003E) { # >
1496 if ($self->{is_xml}) {
1497
1498 ## XML5: Not a parse error.
1499 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1500 } else {
1501
1502 }
1503
1504 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1505
1506 $self->{last_stag_name} = $self->{ct}->{tag_name};
1507 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1508 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1509 if ($self->{ct}->{attributes}) {
1510
1511 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1512 } else {
1513 ## NOTE: This state should never be reached.
1514
1515 }
1516 } else {
1517 die "$0: $self->{ct}->{type}: Unknown token type";
1518 }
1519 $self->{state} = DATA_STATE;
1520 $self->{s_kwd} = '';
1521
1522 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1523 $self->{line_prev} = $self->{line};
1524 $self->{column_prev} = $self->{column};
1525 $self->{column}++;
1526 $self->{nc}
1527 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1528 } else {
1529 $self->{set_nc}->($self);
1530 }
1531
1532
1533 return ($self->{ct}); # start tag or end tag
1534
1535 redo A;
1536 } elsif (0x0041 <= $self->{nc} and
1537 $self->{nc} <= 0x005A) { # A..Z
1538
1539 $self->{ca}
1540 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1541 value => '',
1542 line => $self->{line}, column => $self->{column}};
1543 $self->{state} = ATTRIBUTE_NAME_STATE;
1544
1545 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1546 $self->{line_prev} = $self->{line};
1547 $self->{column_prev} = $self->{column};
1548 $self->{column}++;
1549 $self->{nc}
1550 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1551 } else {
1552 $self->{set_nc}->($self);
1553 }
1554
1555 redo A;
1556 } elsif ($self->{nc} == 0x002F) { # /
1557 if ($self->{is_xml}) {
1558
1559 ## XML5: Not a parse error.
1560 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1561 } else {
1562
1563 }
1564
1565 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1566
1567 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1568 $self->{line_prev} = $self->{line};
1569 $self->{column_prev} = $self->{column};
1570 $self->{column}++;
1571 $self->{nc}
1572 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1573 } else {
1574 $self->{set_nc}->($self);
1575 }
1576
1577 redo A;
1578 } elsif ($self->{nc} == -1) {
1579 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1580 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1581
1582 $self->{last_stag_name} = $self->{ct}->{tag_name};
1583 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1584 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1585 if ($self->{ct}->{attributes}) {
1586
1587 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1588 } else {
1589 ## NOTE: This state should never be reached.
1590
1591 }
1592 } else {
1593 die "$0: $self->{ct}->{type}: Unknown token type";
1594 }
1595 $self->{s_kwd} = '';
1596 $self->{state} = DATA_STATE;
1597 # reconsume
1598
1599 return ($self->{ct}); # start tag or end tag
1600
1601 redo A;
1602 } else {
1603 if ($self->{is_xml}) {
1604
1605 ## XML5: Not a parse error.
1606 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1607 } else {
1608
1609 }
1610
1611 if ({
1612 0x0022 => 1, # "
1613 0x0027 => 1, # '
1614 0x003C => 1, # <
1615 }->{$self->{nc}}) {
1616
1617 ## XML5: Not a parse error.
1618 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1619 } else {
1620
1621 }
1622 $self->{ca}
1623 = {name => chr ($self->{nc}),
1624 value => '',
1625 line => $self->{line}, column => $self->{column}};
1626 $self->{state} = ATTRIBUTE_NAME_STATE;
1627
1628 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1629 $self->{line_prev} = $self->{line};
1630 $self->{column_prev} = $self->{column};
1631 $self->{column}++;
1632 $self->{nc}
1633 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1634 } else {
1635 $self->{set_nc}->($self);
1636 }
1637
1638 redo A;
1639 }
1640 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1641 ## XML5: "Tag attribute value before state".
1642
1643 if ($is_space->{$self->{nc}}) {
1644
1645 ## Stay in the state
1646
1647 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1648 $self->{line_prev} = $self->{line};
1649 $self->{column_prev} = $self->{column};
1650 $self->{column}++;
1651 $self->{nc}
1652 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1653 } else {
1654 $self->{set_nc}->($self);
1655 }
1656
1657 redo A;
1658 } elsif ($self->{nc} == 0x0022) { # "
1659
1660 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1661
1662 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1663 $self->{line_prev} = $self->{line};
1664 $self->{column_prev} = $self->{column};
1665 $self->{column}++;
1666 $self->{nc}
1667 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1668 } else {
1669 $self->{set_nc}->($self);
1670 }
1671
1672 redo A;
1673 } elsif ($self->{nc} == 0x0026) { # &
1674
1675 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1676 ## reconsume
1677 redo A;
1678 } elsif ($self->{nc} == 0x0027) { # '
1679
1680 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1681
1682 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1683 $self->{line_prev} = $self->{line};
1684 $self->{column_prev} = $self->{column};
1685 $self->{column}++;
1686 $self->{nc}
1687 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1688 } else {
1689 $self->{set_nc}->($self);
1690 }
1691
1692 redo A;
1693 } elsif ($self->{nc} == 0x003E) { # >
1694 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1695 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1696
1697 $self->{last_stag_name} = $self->{ct}->{tag_name};
1698 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1699 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1700 if ($self->{ct}->{attributes}) {
1701
1702 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1703 } else {
1704 ## NOTE: This state should never be reached.
1705
1706 }
1707 } else {
1708 die "$0: $self->{ct}->{type}: Unknown token type";
1709 }
1710 $self->{state} = DATA_STATE;
1711 $self->{s_kwd} = '';
1712
1713 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1714 $self->{line_prev} = $self->{line};
1715 $self->{column_prev} = $self->{column};
1716 $self->{column}++;
1717 $self->{nc}
1718 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1719 } else {
1720 $self->{set_nc}->($self);
1721 }
1722
1723
1724 return ($self->{ct}); # start tag or end tag
1725
1726 redo A;
1727 } elsif ($self->{nc} == -1) {
1728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1729 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1730
1731 $self->{last_stag_name} = $self->{ct}->{tag_name};
1732 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1733 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1734 if ($self->{ct}->{attributes}) {
1735
1736 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1737 } else {
1738 ## NOTE: This state should never be reached.
1739
1740 }
1741 } else {
1742 die "$0: $self->{ct}->{type}: Unknown token type";
1743 }
1744 $self->{state} = DATA_STATE;
1745 $self->{s_kwd} = '';
1746 ## reconsume
1747
1748 return ($self->{ct}); # start tag or end tag
1749
1750 redo A;
1751 } else {
1752 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1753
1754 ## XML5: Not a parse error.
1755 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1756 } elsif ($self->{is_xml}) {
1757
1758 ## XML5: No parse error.
1759 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1760 } else {
1761
1762 }
1763 $self->{ca}->{value} .= chr ($self->{nc});
1764 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1765
1766 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1767 $self->{line_prev} = $self->{line};
1768 $self->{column_prev} = $self->{column};
1769 $self->{column}++;
1770 $self->{nc}
1771 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1772 } else {
1773 $self->{set_nc}->($self);
1774 }
1775
1776 redo A;
1777 }
1778 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1779 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1780 ## ATTLIST attribute value double quoted state".
1781
1782 if ($self->{nc} == 0x0022) { # "
1783 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1784
1785 ## XML5: "DOCTYPE ATTLIST name after state".
1786 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1787 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1788 } else {
1789
1790 ## XML5: "Tag attribute name before state".
1791 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1792 }
1793
1794 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1795 $self->{line_prev} = $self->{line};
1796 $self->{column_prev} = $self->{column};
1797 $self->{column}++;
1798 $self->{nc}
1799 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1800 } else {
1801 $self->{set_nc}->($self);
1802 }
1803
1804 redo A;
1805 } elsif ($self->{nc} == 0x0026) { # &
1806
1807 ## XML5: Not defined yet.
1808
1809 ## NOTE: In the spec, the tokenizer is switched to the
1810 ## "entity in attribute value state". In this implementation, the
1811 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1812 ## implementation of the "consume a character reference" algorithm.
1813 $self->{prev_state} = $self->{state};
1814 $self->{entity_add} = 0x0022; # "
1815 $self->{state} = ENTITY_STATE;
1816
1817 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1818 $self->{line_prev} = $self->{line};
1819 $self->{column_prev} = $self->{column};
1820 $self->{column}++;
1821 $self->{nc}
1822 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1823 } else {
1824 $self->{set_nc}->($self);
1825 }
1826
1827 redo A;
1828 } elsif ($self->{is_xml} and
1829 $is_space->{$self->{nc}}) {
1830
1831 $self->{ca}->{value} .= ' ';
1832 ## Stay in the state.
1833
1834 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1835 $self->{line_prev} = $self->{line};
1836 $self->{column_prev} = $self->{column};
1837 $self->{column}++;
1838 $self->{nc}
1839 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1840 } else {
1841 $self->{set_nc}->($self);
1842 }
1843
1844 redo A;
1845 } elsif ($self->{nc} == -1) {
1846 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1847 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1848
1849 $self->{last_stag_name} = $self->{ct}->{tag_name};
1850
1851 $self->{state} = DATA_STATE;
1852 $self->{s_kwd} = '';
1853 ## reconsume
1854 return ($self->{ct}); # start tag
1855 redo A;
1856 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1857 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1858 if ($self->{ct}->{attributes}) {
1859
1860 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1861 } else {
1862 ## NOTE: This state should never be reached.
1863
1864 }
1865
1866 $self->{state} = DATA_STATE;
1867 $self->{s_kwd} = '';
1868 ## reconsume
1869 return ($self->{ct}); # end tag
1870 redo A;
1871 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1872 ## XML5: No parse error above; not defined yet.
1873 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1874 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1875 ## Reconsume.
1876 return ($self->{ct}); # ATTLIST
1877 redo A;
1878 } else {
1879 die "$0: $self->{ct}->{type}: Unknown token type";
1880 }
1881 } else {
1882 ## XML5 [ATTLIST]: Not defined yet.
1883 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1884
1885 ## XML5: Not a parse error.
1886 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1887 } else {
1888
1889 }
1890 $self->{ca}->{value} .= chr ($self->{nc});
1891 $self->{read_until}->($self->{ca}->{value},
1892 qq["&<\x09\x0C\x20],
1893 length $self->{ca}->{value});
1894
1895 ## Stay in the state
1896
1897 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1898 $self->{line_prev} = $self->{line};
1899 $self->{column_prev} = $self->{column};
1900 $self->{column}++;
1901 $self->{nc}
1902 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1903 } else {
1904 $self->{set_nc}->($self);
1905 }
1906
1907 redo A;
1908 }
1909 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1910 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1911 ## ATTLIST attribute value single quoted state".
1912
1913 if ($self->{nc} == 0x0027) { # '
1914 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1915
1916 ## XML5: "DOCTYPE ATTLIST name after state".
1917 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1918 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1919 } else {
1920
1921 ## XML5: "Before attribute name state" (sic).
1922 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1923 }
1924
1925 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1926 $self->{line_prev} = $self->{line};
1927 $self->{column_prev} = $self->{column};
1928 $self->{column}++;
1929 $self->{nc}
1930 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1931 } else {
1932 $self->{set_nc}->($self);
1933 }
1934
1935 redo A;
1936 } elsif ($self->{nc} == 0x0026) { # &
1937
1938 ## XML5: Not defined yet.
1939
1940 ## NOTE: In the spec, the tokenizer is switched to the
1941 ## "entity in attribute value state". In this implementation, the
1942 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1943 ## implementation of the "consume a character reference" algorithm.
1944 $self->{entity_add} = 0x0027; # '
1945 $self->{prev_state} = $self->{state};
1946 $self->{state} = ENTITY_STATE;
1947
1948 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1949 $self->{line_prev} = $self->{line};
1950 $self->{column_prev} = $self->{column};
1951 $self->{column}++;
1952 $self->{nc}
1953 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1954 } else {
1955 $self->{set_nc}->($self);
1956 }
1957
1958 redo A;
1959 } elsif ($self->{is_xml} and
1960 $is_space->{$self->{nc}}) {
1961
1962 $self->{ca}->{value} .= ' ';
1963 ## Stay in the state.
1964
1965 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1966 $self->{line_prev} = $self->{line};
1967 $self->{column_prev} = $self->{column};
1968 $self->{column}++;
1969 $self->{nc}
1970 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1971 } else {
1972 $self->{set_nc}->($self);
1973 }
1974
1975 redo A;
1976 } elsif ($self->{nc} == -1) {
1977 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1978 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1979
1980 $self->{last_stag_name} = $self->{ct}->{tag_name};
1981
1982 $self->{state} = DATA_STATE;
1983 $self->{s_kwd} = '';
1984 ## reconsume
1985 return ($self->{ct}); # start tag
1986 redo A;
1987 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1988 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1989 if ($self->{ct}->{attributes}) {
1990
1991 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1992 } else {
1993 ## NOTE: This state should never be reached.
1994
1995 }
1996
1997 $self->{state} = DATA_STATE;
1998 $self->{s_kwd} = '';
1999 ## reconsume
2000 return ($self->{ct}); # end tag
2001 redo A;
2002 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2003 ## XML5: No parse error above; not defined yet.
2004 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2005 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2006 ## Reconsume.
2007 return ($self->{ct}); # ATTLIST
2008 redo A;
2009 } else {
2010 die "$0: $self->{ct}->{type}: Unknown token type";
2011 }
2012 } else {
2013 ## XML5 [ATTLIST]: Not defined yet.
2014 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2015
2016 ## XML5: Not a parse error.
2017 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2018 } else {
2019
2020 }
2021 $self->{ca}->{value} .= chr ($self->{nc});
2022 $self->{read_until}->($self->{ca}->{value},
2023 qq['&<\x09\x0C\x20],
2024 length $self->{ca}->{value});
2025
2026 ## Stay in the state
2027
2028 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2029 $self->{line_prev} = $self->{line};
2030 $self->{column_prev} = $self->{column};
2031 $self->{column}++;
2032 $self->{nc}
2033 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2034 } else {
2035 $self->{set_nc}->($self);
2036 }
2037
2038 redo A;
2039 }
2040 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2041 ## XML5: "Tag attribute value unquoted state".
2042
2043 if ($is_space->{$self->{nc}}) {
2044 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2045
2046 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2047 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2048 } else {
2049
2050 ## XML5: "Tag attribute name before state".
2051 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2052 }
2053
2054 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055 $self->{line_prev} = $self->{line};
2056 $self->{column_prev} = $self->{column};
2057 $self->{column}++;
2058 $self->{nc}
2059 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2060 } else {
2061 $self->{set_nc}->($self);
2062 }
2063
2064 redo A;
2065 } elsif ($self->{nc} == 0x0026) { # &
2066
2067
2068 ## XML5: Not defined yet.
2069
2070 ## NOTE: In the spec, the tokenizer is switched to the
2071 ## "entity in attribute value state". In this implementation, the
2072 ## tokenizer is switched to the |ENTITY_STATE|, which is an
2073 ## implementation of the "consume a character reference" algorithm.
2074 $self->{entity_add} = -1;
2075 $self->{prev_state} = $self->{state};
2076 $self->{state} = ENTITY_STATE;
2077
2078 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2079 $self->{line_prev} = $self->{line};
2080 $self->{column_prev} = $self->{column};
2081 $self->{column}++;
2082 $self->{nc}
2083 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2084 } else {
2085 $self->{set_nc}->($self);
2086 }
2087
2088 redo A;
2089 } elsif ($self->{nc} == 0x003E) { # >
2090 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2091
2092 $self->{last_stag_name} = $self->{ct}->{tag_name};
2093
2094 $self->{state} = DATA_STATE;
2095 $self->{s_kwd} = '';
2096
2097 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2098 $self->{line_prev} = $self->{line};
2099 $self->{column_prev} = $self->{column};
2100 $self->{column}++;
2101 $self->{nc}
2102 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2103 } else {
2104 $self->{set_nc}->($self);
2105 }
2106
2107 return ($self->{ct}); # start tag
2108 redo A;
2109 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2110 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2111 if ($self->{ct}->{attributes}) {
2112
2113 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2114 } else {
2115 ## NOTE: This state should never be reached.
2116
2117 }
2118
2119 $self->{state} = DATA_STATE;
2120 $self->{s_kwd} = '';
2121
2122 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2123 $self->{line_prev} = $self->{line};
2124 $self->{column_prev} = $self->{column};
2125 $self->{column}++;
2126 $self->{nc}
2127 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2128 } else {
2129 $self->{set_nc}->($self);
2130 }
2131
2132 return ($self->{ct}); # end tag
2133 redo A;
2134 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2135 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2136 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2137
2138 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2139 $self->{line_prev} = $self->{line};
2140 $self->{column_prev} = $self->{column};
2141 $self->{column}++;
2142 $self->{nc}
2143 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2144 } else {
2145 $self->{set_nc}->($self);
2146 }
2147
2148 return ($self->{ct}); # ATTLIST
2149 redo A;
2150 } else {
2151 die "$0: $self->{ct}->{type}: Unknown token type";
2152 }
2153 } elsif ($self->{nc} == -1) {
2154 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2155
2156 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157 $self->{last_stag_name} = $self->{ct}->{tag_name};
2158
2159 $self->{state} = DATA_STATE;
2160 $self->{s_kwd} = '';
2161 ## reconsume
2162 return ($self->{ct}); # start tag
2163 redo A;
2164 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2165 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2166 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2167 if ($self->{ct}->{attributes}) {
2168
2169 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2170 } else {
2171 ## NOTE: This state should never be reached.
2172
2173 }
2174
2175 $self->{state} = DATA_STATE;
2176 $self->{s_kwd} = '';
2177 ## reconsume
2178 return ($self->{ct}); # end tag
2179 redo A;
2180 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2181 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2182 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2183 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2184 ## Reconsume.
2185 return ($self->{ct}); # ATTLIST
2186 redo A;
2187 } else {
2188 die "$0: $self->{ct}->{type}: Unknown token type";
2189 }
2190 } else {
2191 if ({
2192 0x0022 => 1, # "
2193 0x0027 => 1, # '
2194 0x003D => 1, # =
2195 0x003C => 1, # <
2196 }->{$self->{nc}}) {
2197
2198 ## XML5: Not a parse error.
2199 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2200 } else {
2201
2202 }
2203 $self->{ca}->{value} .= chr ($self->{nc});
2204 $self->{read_until}->($self->{ca}->{value},
2205 qq["'=& \x09\x0C>],
2206 length $self->{ca}->{value});
2207
2208 ## Stay in the state
2209
2210 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2211 $self->{line_prev} = $self->{line};
2212 $self->{column_prev} = $self->{column};
2213 $self->{column}++;
2214 $self->{nc}
2215 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2216 } else {
2217 $self->{set_nc}->($self);
2218 }
2219
2220 redo A;
2221 }
2222 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2223 if ($is_space->{$self->{nc}}) {
2224
2225 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2226
2227 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2228 $self->{line_prev} = $self->{line};
2229 $self->{column_prev} = $self->{column};
2230 $self->{column}++;
2231 $self->{nc}
2232 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2233 } else {
2234 $self->{set_nc}->($self);
2235 }
2236
2237 redo A;
2238 } elsif ($self->{nc} == 0x003E) { # >
2239 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2240
2241 $self->{last_stag_name} = $self->{ct}->{tag_name};
2242 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2243 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2244 if ($self->{ct}->{attributes}) {
2245
2246 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2247 } else {
2248 ## NOTE: This state should never be reached.
2249
2250 }
2251 } else {
2252 die "$0: $self->{ct}->{type}: Unknown token type";
2253 }
2254 $self->{state} = DATA_STATE;
2255 $self->{s_kwd} = '';
2256
2257 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2258 $self->{line_prev} = $self->{line};
2259 $self->{column_prev} = $self->{column};
2260 $self->{column}++;
2261 $self->{nc}
2262 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2263 } else {
2264 $self->{set_nc}->($self);
2265 }
2266
2267
2268 return ($self->{ct}); # start tag or end tag
2269
2270 redo A;
2271 } elsif ($self->{nc} == 0x002F) { # /
2272
2273 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2274
2275 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2276 $self->{line_prev} = $self->{line};
2277 $self->{column_prev} = $self->{column};
2278 $self->{column}++;
2279 $self->{nc}
2280 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2281 } else {
2282 $self->{set_nc}->($self);
2283 }
2284
2285 redo A;
2286 } elsif ($self->{nc} == -1) {
2287 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2288 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2289
2290 $self->{last_stag_name} = $self->{ct}->{tag_name};
2291 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2292 if ($self->{ct}->{attributes}) {
2293
2294 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2295 } else {
2296 ## NOTE: This state should never be reached.
2297
2298 }
2299 } else {
2300 die "$0: $self->{ct}->{type}: Unknown token type";
2301 }
2302 $self->{state} = DATA_STATE;
2303 $self->{s_kwd} = '';
2304 ## Reconsume.
2305 return ($self->{ct}); # start tag or end tag
2306 redo A;
2307 } else {
2308
2309 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2310 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2311 ## reconsume
2312 redo A;
2313 }
2314 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2315 ## XML5: "Empty tag state".
2316
2317 if ($self->{nc} == 0x003E) { # >
2318 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2319
2320 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2321 ## TODO: Different type than slash in start tag
2322 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2323 if ($self->{ct}->{attributes}) {
2324
2325 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2326 } else {
2327
2328 }
2329 ## TODO: Test |<title></title/>|
2330 } else {
2331
2332 $self->{self_closing} = 1;
2333 }
2334
2335 $self->{state} = DATA_STATE;
2336 $self->{s_kwd} = '';
2337
2338 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2339 $self->{line_prev} = $self->{line};
2340 $self->{column_prev} = $self->{column};
2341 $self->{column}++;
2342 $self->{nc}
2343 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2344 } else {
2345 $self->{set_nc}->($self);
2346 }
2347
2348
2349 return ($self->{ct}); # start tag or end tag
2350
2351 redo A;
2352 } elsif ($self->{nc} == -1) {
2353 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2354 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2355
2356 $self->{last_stag_name} = $self->{ct}->{tag_name};
2357 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2358 if ($self->{ct}->{attributes}) {
2359
2360 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2361 } else {
2362 ## NOTE: This state should never be reached.
2363
2364 }
2365 } else {
2366 die "$0: $self->{ct}->{type}: Unknown token type";
2367 }
2368 ## XML5: "Tag attribute name before state".
2369 $self->{state} = DATA_STATE;
2370 $self->{s_kwd} = '';
2371 ## Reconsume.
2372 return ($self->{ct}); # start tag or end tag
2373 redo A;
2374 } else {
2375
2376 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2377 ## TODO: This error type is wrong.
2378 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2379 ## Reconsume.
2380 redo A;
2381 }
2382 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2383 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2384
2385 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2386 ## consumes characters on a one-by-one basis.
2387
2388 if ($self->{nc} == 0x003E) { # >
2389 if ($self->{in_subset}) {
2390
2391 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2392 } else {
2393
2394 $self->{state} = DATA_STATE;
2395 $self->{s_kwd} = '';
2396 }
2397
2398 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2399 $self->{line_prev} = $self->{line};
2400 $self->{column_prev} = $self->{column};
2401 $self->{column}++;
2402 $self->{nc}
2403 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2404 } else {
2405 $self->{set_nc}->($self);
2406 }
2407
2408
2409 return ($self->{ct}); # comment
2410 redo A;
2411 } elsif ($self->{nc} == -1) {
2412 if ($self->{in_subset}) {
2413
2414 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2415 } else {
2416
2417 $self->{state} = DATA_STATE;
2418 $self->{s_kwd} = '';
2419 }
2420 ## reconsume
2421
2422 return ($self->{ct}); # comment
2423 redo A;
2424 } else {
2425
2426 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2427 $self->{read_until}->($self->{ct}->{data},
2428 q[>],
2429 length $self->{ct}->{data});
2430
2431 ## Stay in the state.
2432
2433 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2434 $self->{line_prev} = $self->{line};
2435 $self->{column_prev} = $self->{column};
2436 $self->{column}++;
2437 $self->{nc}
2438 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2439 } else {
2440 $self->{set_nc}->($self);
2441 }
2442
2443 redo A;
2444 }
2445 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2446 ## XML5: "Markup declaration state".
2447
2448 if ($self->{nc} == 0x002D) { # -
2449
2450 $self->{state} = MD_HYPHEN_STATE;
2451
2452 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2453 $self->{line_prev} = $self->{line};
2454 $self->{column_prev} = $self->{column};
2455 $self->{column}++;
2456 $self->{nc}
2457 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2458 } else {
2459 $self->{set_nc}->($self);
2460 }
2461
2462 redo A;
2463 } elsif ($self->{nc} == 0x0044 or # D
2464 $self->{nc} == 0x0064) { # d
2465 ## ASCII case-insensitive.
2466
2467 $self->{state} = MD_DOCTYPE_STATE;
2468 $self->{kwd} = chr $self->{nc};
2469
2470 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2471 $self->{line_prev} = $self->{line};
2472 $self->{column_prev} = $self->{column};
2473 $self->{column}++;
2474 $self->{nc}
2475 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2476 } else {
2477 $self->{set_nc}->($self);
2478 }
2479
2480 redo A;
2481 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2482 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2483 $self->{is_xml}) and
2484 $self->{nc} == 0x005B) { # [
2485
2486 $self->{state} = MD_CDATA_STATE;
2487 $self->{kwd} = '[';
2488
2489 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2490 $self->{line_prev} = $self->{line};
2491 $self->{column_prev} = $self->{column};
2492 $self->{column}++;
2493 $self->{nc}
2494 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2495 } else {
2496 $self->{set_nc}->($self);
2497 }
2498
2499 redo A;
2500 } else {
2501
2502 }
2503
2504 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2505 line => $self->{line_prev},
2506 column => $self->{column_prev} - 1);
2507 ## Reconsume.
2508 $self->{state} = BOGUS_COMMENT_STATE;
2509 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2510 line => $self->{line_prev},
2511 column => $self->{column_prev} - 1,
2512 };
2513 redo A;
2514 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2515 if ($self->{nc} == 0x002D) { # -
2516
2517 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2518 line => $self->{line_prev},
2519 column => $self->{column_prev} - 2,
2520 };
2521 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2522
2523 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2524 $self->{line_prev} = $self->{line};
2525 $self->{column_prev} = $self->{column};
2526 $self->{column}++;
2527 $self->{nc}
2528 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2529 } else {
2530 $self->{set_nc}->($self);
2531 }
2532
2533 redo A;
2534 } else {
2535
2536 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2537 line => $self->{line_prev},
2538 column => $self->{column_prev} - 2);
2539 $self->{state} = BOGUS_COMMENT_STATE;
2540 ## Reconsume.
2541 $self->{ct} = {type => COMMENT_TOKEN,
2542 data => '-',
2543 line => $self->{line_prev},
2544 column => $self->{column_prev} - 2,
2545 };
2546 redo A;
2547 }
2548 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2549 ## ASCII case-insensitive.
2550 if ($self->{nc} == [
2551 undef,
2552 0x004F, # O
2553 0x0043, # C
2554 0x0054, # T
2555 0x0059, # Y
2556 0x0050, # P
2557 ]->[length $self->{kwd}] or
2558 $self->{nc} == [
2559 undef,
2560 0x006F, # o
2561 0x0063, # c
2562 0x0074, # t
2563 0x0079, # y
2564 0x0070, # p
2565 ]->[length $self->{kwd}]) {
2566
2567 ## Stay in the state.
2568 $self->{kwd} .= chr $self->{nc};
2569
2570 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2571 $self->{line_prev} = $self->{line};
2572 $self->{column_prev} = $self->{column};
2573 $self->{column}++;
2574 $self->{nc}
2575 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2576 } else {
2577 $self->{set_nc}->($self);
2578 }
2579
2580 redo A;
2581 } elsif ((length $self->{kwd}) == 6 and
2582 ($self->{nc} == 0x0045 or # E
2583 $self->{nc} == 0x0065)) { # e
2584 if ($self->{is_xml} and
2585 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2586
2587 ## XML5: case-sensitive.
2588 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2589 text => 'DOCTYPE',
2590 line => $self->{line_prev},
2591 column => $self->{column_prev} - 5);
2592 } else {
2593
2594 }
2595 $self->{state} = DOCTYPE_STATE;
2596 $self->{ct} = {type => DOCTYPE_TOKEN,
2597 quirks => 1,
2598 line => $self->{line_prev},
2599 column => $self->{column_prev} - 7,
2600 };
2601
2602 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2603 $self->{line_prev} = $self->{line};
2604 $self->{column_prev} = $self->{column};
2605 $self->{column}++;
2606 $self->{nc}
2607 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2608 } else {
2609 $self->{set_nc}->($self);
2610 }
2611
2612 redo A;
2613 } else {
2614
2615 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2616 line => $self->{line_prev},
2617 column => $self->{column_prev} - 1 - length $self->{kwd});
2618 $self->{state} = BOGUS_COMMENT_STATE;
2619 ## Reconsume.
2620 $self->{ct} = {type => COMMENT_TOKEN,
2621 data => $self->{kwd},
2622 line => $self->{line_prev},
2623 column => $self->{column_prev} - 1 - length $self->{kwd},
2624 };
2625 redo A;
2626 }
2627 } elsif ($self->{state} == MD_CDATA_STATE) {
2628 if ($self->{nc} == {
2629 '[' => 0x0043, # C
2630 '[C' => 0x0044, # D
2631 '[CD' => 0x0041, # A
2632 '[CDA' => 0x0054, # T
2633 '[CDAT' => 0x0041, # A
2634 }->{$self->{kwd}}) {
2635
2636 ## Stay in the state.
2637 $self->{kwd} .= chr $self->{nc};
2638
2639 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2640 $self->{line_prev} = $self->{line};
2641 $self->{column_prev} = $self->{column};
2642 $self->{column}++;
2643 $self->{nc}
2644 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2645 } else {
2646 $self->{set_nc}->($self);
2647 }
2648
2649 redo A;
2650 } elsif ($self->{kwd} eq '[CDATA' and
2651 $self->{nc} == 0x005B) { # [
2652 if ($self->{is_xml} and
2653 not $self->{tainted} and
2654 @{$self->{open_elements} or []} == 0) {
2655
2656 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2657 line => $self->{line_prev},
2658 column => $self->{column_prev} - 7);
2659 $self->{tainted} = 1;
2660 } else {
2661
2662 }
2663
2664 $self->{ct} = {type => CHARACTER_TOKEN,
2665 data => '',
2666 line => $self->{line_prev},
2667 column => $self->{column_prev} - 7};
2668 $self->{state} = CDATA_SECTION_STATE;
2669
2670 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2671 $self->{line_prev} = $self->{line};
2672 $self->{column_prev} = $self->{column};
2673 $self->{column}++;
2674 $self->{nc}
2675 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2676 } else {
2677 $self->{set_nc}->($self);
2678 }
2679
2680 redo A;
2681 } else {
2682
2683 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2684 line => $self->{line_prev},
2685 column => $self->{column_prev} - 1 - length $self->{kwd});
2686 $self->{state} = BOGUS_COMMENT_STATE;
2687 ## Reconsume.
2688 $self->{ct} = {type => COMMENT_TOKEN,
2689 data => $self->{kwd},
2690 line => $self->{line_prev},
2691 column => $self->{column_prev} - 1 - length $self->{kwd},
2692 };
2693 redo A;
2694 }
2695 } elsif ($self->{state} == COMMENT_START_STATE) {
2696 if ($self->{nc} == 0x002D) { # -
2697
2698 $self->{state} = COMMENT_START_DASH_STATE;
2699
2700 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2701 $self->{line_prev} = $self->{line};
2702 $self->{column_prev} = $self->{column};
2703 $self->{column}++;
2704 $self->{nc}
2705 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2706 } else {
2707 $self->{set_nc}->($self);
2708 }
2709
2710 redo A;
2711 } elsif ($self->{nc} == 0x003E) { # >
2712 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2713 if ($self->{in_subset}) {
2714
2715 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2716 } else {
2717
2718 $self->{state} = DATA_STATE;
2719 $self->{s_kwd} = '';
2720 }
2721
2722 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2723 $self->{line_prev} = $self->{line};
2724 $self->{column_prev} = $self->{column};
2725 $self->{column}++;
2726 $self->{nc}
2727 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2728 } else {
2729 $self->{set_nc}->($self);
2730 }
2731
2732
2733 return ($self->{ct}); # comment
2734
2735 redo A;
2736 } elsif ($self->{nc} == -1) {
2737 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2738 if ($self->{in_subset}) {
2739
2740 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2741 } else {
2742
2743 $self->{state} = DATA_STATE;
2744 $self->{s_kwd} = '';
2745 }
2746 ## reconsume
2747
2748 return ($self->{ct}); # comment
2749
2750 redo A;
2751 } else {
2752
2753 $self->{ct}->{data} # comment
2754 .= chr ($self->{nc});
2755 $self->{state} = COMMENT_STATE;
2756
2757 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2758 $self->{line_prev} = $self->{line};
2759 $self->{column_prev} = $self->{column};
2760 $self->{column}++;
2761 $self->{nc}
2762 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2763 } else {
2764 $self->{set_nc}->($self);
2765 }
2766
2767 redo A;
2768 }
2769 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2770 if ($self->{nc} == 0x002D) { # -
2771
2772 $self->{state} = COMMENT_END_STATE;
2773
2774 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2775 $self->{line_prev} = $self->{line};
2776 $self->{column_prev} = $self->{column};
2777 $self->{column}++;
2778 $self->{nc}
2779 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2780 } else {
2781 $self->{set_nc}->($self);
2782 }
2783
2784 redo A;
2785 } elsif ($self->{nc} == 0x003E) { # >
2786 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2787 if ($self->{in_subset}) {
2788
2789 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2790 } else {
2791
2792 $self->{state} = DATA_STATE;
2793 $self->{s_kwd} = '';
2794 }
2795
2796 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2797 $self->{line_prev} = $self->{line};
2798 $self->{column_prev} = $self->{column};
2799 $self->{column}++;
2800 $self->{nc}
2801 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2802 } else {
2803 $self->{set_nc}->($self);
2804 }
2805
2806
2807 return ($self->{ct}); # comment
2808
2809 redo A;
2810 } elsif ($self->{nc} == -1) {
2811 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2812 if ($self->{in_subset}) {
2813
2814 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2815 } else {
2816
2817 $self->{state} = DATA_STATE;
2818 $self->{s_kwd} = '';
2819 }
2820 ## reconsume
2821
2822 return ($self->{ct}); # comment
2823
2824 redo A;
2825 } else {
2826
2827 $self->{ct}->{data} # comment
2828 .= '-' . chr ($self->{nc});
2829 $self->{state} = COMMENT_STATE;
2830
2831 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2832 $self->{line_prev} = $self->{line};
2833 $self->{column_prev} = $self->{column};
2834 $self->{column}++;
2835 $self->{nc}
2836 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2837 } else {
2838 $self->{set_nc}->($self);
2839 }
2840
2841 redo A;
2842 }
2843 } elsif ($self->{state} == COMMENT_STATE) {
2844 ## XML5: "Comment state" and "DOCTYPE comment state".
2845
2846 if ($self->{nc} == 0x002D) { # -
2847
2848 $self->{state} = COMMENT_END_DASH_STATE;
2849
2850 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2851 $self->{line_prev} = $self->{line};
2852 $self->{column_prev} = $self->{column};
2853 $self->{column}++;
2854 $self->{nc}
2855 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2856 } else {
2857 $self->{set_nc}->($self);
2858 }
2859
2860 redo A;
2861 } elsif ($self->{nc} == -1) {
2862 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2863 if ($self->{in_subset}) {
2864
2865 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2866 } else {
2867
2868 $self->{state} = DATA_STATE;
2869 $self->{s_kwd} = '';
2870 }
2871 ## reconsume
2872
2873 return ($self->{ct}); # comment
2874
2875 redo A;
2876 } else {
2877
2878 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2879 $self->{read_until}->($self->{ct}->{data},
2880 q[-],
2881 length $self->{ct}->{data});
2882
2883 ## Stay in the state
2884
2885 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2886 $self->{line_prev} = $self->{line};
2887 $self->{column_prev} = $self->{column};
2888 $self->{column}++;
2889 $self->{nc}
2890 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2891 } else {
2892 $self->{set_nc}->($self);
2893 }
2894
2895 redo A;
2896 }
2897 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2898 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2899
2900 if ($self->{nc} == 0x002D) { # -
2901
2902 $self->{state} = COMMENT_END_STATE;
2903
2904 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2905 $self->{line_prev} = $self->{line};
2906 $self->{column_prev} = $self->{column};
2907 $self->{column}++;
2908 $self->{nc}
2909 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2910 } else {
2911 $self->{set_nc}->($self);
2912 }
2913
2914 redo A;
2915 } elsif ($self->{nc} == -1) {
2916 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2917 if ($self->{in_subset}) {
2918
2919 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2920 } else {
2921
2922 $self->{state} = DATA_STATE;
2923 $self->{s_kwd} = '';
2924 }
2925 ## reconsume
2926
2927 return ($self->{ct}); # comment
2928
2929 redo A;
2930 } else {
2931
2932 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2933 $self->{state} = COMMENT_STATE;
2934
2935 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2936 $self->{line_prev} = $self->{line};
2937 $self->{column_prev} = $self->{column};
2938 $self->{column}++;
2939 $self->{nc}
2940 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2941 } else {
2942 $self->{set_nc}->($self);
2943 }
2944
2945 redo A;
2946 }
2947 } elsif ($self->{state} == COMMENT_END_STATE or
2948 $self->{state} == COMMENT_END_BANG_STATE) {
2949 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2950 ## (No comment end bang state.)
2951
2952 if ($self->{nc} == 0x003E) { # >
2953 if ($self->{in_subset}) {
2954
2955 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2956 } else {
2957
2958 $self->{state} = DATA_STATE;
2959 $self->{s_kwd} = '';
2960 }
2961
2962 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2963 $self->{line_prev} = $self->{line};
2964 $self->{column_prev} = $self->{column};
2965 $self->{column}++;
2966 $self->{nc}
2967 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2968 } else {
2969 $self->{set_nc}->($self);
2970 }
2971
2972
2973 return ($self->{ct}); # comment
2974
2975 redo A;
2976 } elsif ($self->{nc} == 0x002D) { # -
2977 if ($self->{state} == COMMENT_END_BANG_STATE) {
2978
2979 $self->{ct}->{data} .= '--!'; # comment
2980 $self->{state} = COMMENT_END_DASH_STATE;
2981 } else {
2982
2983 ## XML5: Not a parse error.
2984 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2985 line => $self->{line_prev},
2986 column => $self->{column_prev});
2987 $self->{ct}->{data} .= '-'; # comment
2988 ## Stay in the state
2989 }
2990
2991 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992 $self->{line_prev} = $self->{line};
2993 $self->{column_prev} = $self->{column};
2994 $self->{column}++;
2995 $self->{nc}
2996 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997 } else {
2998 $self->{set_nc}->($self);
2999 }
3000
3001 redo A;
3002 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3003 $is_space->{$self->{nc}}) {
3004
3005 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3006 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3007 $self->{state} = COMMENT_END_SPACE_STATE;
3008
3009 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3010 $self->{line_prev} = $self->{line};
3011 $self->{column_prev} = $self->{column};
3012 $self->{column}++;
3013 $self->{nc}
3014 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3015 } else {
3016 $self->{set_nc}->($self);
3017 }
3018
3019 redo A;
3020 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3021 $self->{nc} == 0x0021) { # !
3022
3023 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3024 $self->{state} = COMMENT_END_BANG_STATE;
3025
3026 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3027 $self->{line_prev} = $self->{line};
3028 $self->{column_prev} = $self->{column};
3029 $self->{column}++;
3030 $self->{nc}
3031 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3032 } else {
3033 $self->{set_nc}->($self);
3034 }
3035
3036 redo A;
3037 } elsif ($self->{nc} == -1) {
3038 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3039 if ($self->{in_subset}) {
3040
3041 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3042 } else {
3043
3044 $self->{state} = DATA_STATE;
3045 $self->{s_kwd} = '';
3046 }
3047 ## Reconsume.
3048
3049 return ($self->{ct}); # comment
3050
3051 redo A;
3052 } else {
3053
3054 if ($self->{state} == COMMENT_END_BANG_STATE) {
3055 $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3056 } else {
3057 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3058 }
3059 $self->{state} = COMMENT_STATE;
3060
3061 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3062 $self->{line_prev} = $self->{line};
3063 $self->{column_prev} = $self->{column};
3064 $self->{column}++;
3065 $self->{nc}
3066 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3067 } else {
3068 $self->{set_nc}->($self);
3069 }
3070
3071 redo A;
3072 }
3073 } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3074 ## XML5: Does not exist.
3075
3076 if ($self->{nc} == 0x003E) { # >
3077 if ($self->{in_subset}) {
3078
3079 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3080 } else {
3081
3082 $self->{state} = DATA_STATE;
3083 $self->{s_kwd} = '';
3084 }
3085
3086 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3087 $self->{line_prev} = $self->{line};
3088 $self->{column_prev} = $self->{column};
3089 $self->{column}++;
3090 $self->{nc}
3091 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3092 } else {
3093 $self->{set_nc}->($self);
3094 }
3095
3096
3097 return ($self->{ct}); # comment
3098
3099 redo A;
3100 } elsif ($is_space->{$self->{nc}}) {
3101
3102 $self->{ct}->{data} .= chr ($self->{nc}); # comment
3103 ## Stay in the state.
3104
3105 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106 $self->{line_prev} = $self->{line};
3107 $self->{column_prev} = $self->{column};
3108 $self->{column}++;
3109 $self->{nc}
3110 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111 } else {
3112 $self->{set_nc}->($self);
3113 }
3114
3115 redo A;
3116 } elsif ($self->{nc} == -1) {
3117 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3118 if ($self->{in_subset}) {
3119
3120 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3121 } else {
3122
3123 $self->{state} = DATA_STATE;
3124 $self->{s_kwd} = '';
3125 }
3126 ## Reconsume.
3127
3128 return ($self->{ct}); # comment
3129
3130 redo A;
3131 } else {
3132
3133 $self->{ct}->{data} .= chr ($self->{nc}); # comment
3134 $self->{state} = COMMENT_STATE;
3135
3136 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137 $self->{line_prev} = $self->{line};
3138 $self->{column_prev} = $self->{column};
3139 $self->{column}++;
3140 $self->{nc}
3141 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142 } else {
3143 $self->{set_nc}->($self);
3144 }
3145
3146 redo A;
3147 }
3148 } elsif ($self->{state} == DOCTYPE_STATE) {
3149 if ($is_space->{$self->{nc}}) {
3150
3151 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3152
3153 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3154 $self->{line_prev} = $self->{line};
3155 $self->{column_prev} = $self->{column};
3156 $self->{column}++;
3157 $self->{nc}
3158 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3159 } else {
3160 $self->{set_nc}->($self);
3161 }
3162
3163 redo A;
3164 } elsif ($self->{nc} == -1) {
3165
3166 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3167 $self->{ct}->{quirks} = 1;
3168
3169 $self->{state} = DATA_STATE;
3170 ## Reconsume.
3171 return ($self->{ct}); # DOCTYPE (quirks)
3172
3173 redo A;
3174 } else {
3175
3176 ## XML5: Switch to the bogus comment state.
3177 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3178 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3179 ## reconsume
3180 redo A;
3181 }
3182 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3183 ## XML5: "DOCTYPE root name before state".
3184
3185 if ($is_space->{$self->{nc}}) {
3186
3187 ## Stay in the state
3188
3189 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3190 $self->{line_prev} = $self->{line};
3191 $self->{column_prev} = $self->{column};
3192 $self->{column}++;
3193 $self->{nc}
3194 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3195 } else {
3196 $self->{set_nc}->($self);
3197 }
3198
3199 redo A;
3200 } elsif ($self->{nc} == 0x003E) { # >
3201
3202 ## XML5: No parse error.
3203 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3204 $self->{state} = DATA_STATE;
3205 $self->{s_kwd} = '';
3206
3207 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3208 $self->{line_prev} = $self->{line};
3209 $self->{column_prev} = $self->{column};
3210 $self->{column}++;
3211 $self->{nc}
3212 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3213 } else {
3214 $self->{set_nc}->($self);
3215 }
3216
3217
3218 return ($self->{ct}); # DOCTYPE (quirks)
3219
3220 redo A;
3221 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3222
3223 $self->{ct}->{name} # DOCTYPE
3224 = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3225 delete $self->{ct}->{quirks};
3226 $self->{state} = DOCTYPE_NAME_STATE;
3227
3228 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3229 $self->{line_prev} = $self->{line};
3230 $self->{column_prev} = $self->{column};
3231 $self->{column}++;
3232 $self->{nc}
3233 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3234 } else {
3235 $self->{set_nc}->($self);
3236 }
3237
3238 redo A;
3239 } elsif ($self->{nc} == -1) {
3240
3241 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3242 $self->{state} = DATA_STATE;
3243 $self->{s_kwd} = '';
3244 ## reconsume
3245
3246 return ($self->{ct}); # DOCTYPE (quirks)
3247
3248 redo A;
3249 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3250
3251 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3252 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3253 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3254 $self->{in_subset} = 1;
3255
3256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3257 $self->{line_prev} = $self->{line};
3258 $self->{column_prev} = $self->{column};
3259 $self->{column}++;
3260 $self->{nc}
3261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3262 } else {
3263 $self->{set_nc}->($self);
3264 }
3265
3266 return ($self->{ct}); # DOCTYPE
3267 redo A;
3268 } else {
3269
3270 $self->{ct}->{name} = chr $self->{nc};
3271 delete $self->{ct}->{quirks};
3272 $self->{state} = DOCTYPE_NAME_STATE;
3273
3274 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3275 $self->{line_prev} = $self->{line};
3276 $self->{column_prev} = $self->{column};
3277 $self->{column}++;
3278 $self->{nc}
3279 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3280 } else {
3281 $self->{set_nc}->($self);
3282 }
3283
3284 redo A;
3285 }
3286 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3287 ## XML5: "DOCTYPE root name state".
3288
3289 ## ISSUE: Redundant "First," in the spec.
3290
3291 if ($is_space->{$self->{nc}}) {
3292
3293 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3294
3295 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3296 $self->{line_prev} = $self->{line};
3297 $self->{column_prev} = $self->{column};
3298 $self->{column}++;
3299 $self->{nc}
3300 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3301 } else {
3302 $self->{set_nc}->($self);
3303 }
3304
3305 redo A;
3306 } elsif ($self->{nc} == 0x003E) { # >
3307
3308 $self->{state} = DATA_STATE;
3309 $self->{s_kwd} = '';
3310
3311 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3312 $self->{line_prev} = $self->{line};
3313 $self->{column_prev} = $self->{column};
3314 $self->{column}++;
3315 $self->{nc}
3316 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3317 } else {
3318 $self->{set_nc}->($self);
3319 }
3320
3321
3322 return ($self->{ct}); # DOCTYPE
3323
3324 redo A;
3325 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3326
3327 $self->{ct}->{name} # DOCTYPE
3328 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3329 delete $self->{ct}->{quirks};
3330 ## Stay in the state.
3331
3332 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3333 $self->{line_prev} = $self->{line};
3334 $self->{column_prev} = $self->{column};
3335 $self->{column}++;
3336 $self->{nc}
3337 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3338 } else {
3339 $self->{set_nc}->($self);
3340 }
3341
3342 redo A;
3343 } elsif ($self->{nc} == -1) {
3344
3345 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3346 $self->{state} = DATA_STATE;
3347 $self->{s_kwd} = '';
3348 ## reconsume
3349
3350 $self->{ct}->{quirks} = 1;
3351 return ($self->{ct}); # DOCTYPE
3352
3353 redo A;
3354 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3355
3356 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3357 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3358 $self->{in_subset} = 1;
3359
3360 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3361 $self->{line_prev} = $self->{line};
3362 $self->{column_prev} = $self->{column};
3363 $self->{column}++;
3364 $self->{nc}
3365 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3366 } else {
3367 $self->{set_nc}->($self);
3368 }
3369
3370 return ($self->{ct}); # DOCTYPE
3371 redo A;
3372 } else {
3373
3374 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3375 ## Stay in the state.
3376
3377 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3378 $self->{line_prev} = $self->{line};
3379 $self->{column_prev} = $self->{column};
3380 $self->{column}++;
3381 $self->{nc}
3382 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3383 } else {
3384 $self->{set_nc}->($self);
3385 }
3386
3387 redo A;
3388 }
3389 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3390 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3391 ## state", but implemented differently.
3392
3393 if ($is_space->{$self->{nc}}) {
3394
3395 ## Stay in the state
3396
3397 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3398 $self->{line_prev} = $self->{line};
3399 $self->{column_prev} = $self->{column};
3400 $self->{column}++;
3401 $self->{nc}
3402 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3403 } else {
3404 $self->{set_nc}->($self);
3405 }
3406
3407 redo A;
3408 } elsif ($self->{nc} == 0x003E) { # >
3409 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3410
3411 $self->{state} = DATA_STATE;
3412 $self->{s_kwd} = '';
3413 } else {
3414
3415 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3416 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3417 }
3418
3419
3420 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3421 $self->{line_prev} = $self->{line};
3422 $self->{column_prev} = $self->{column};
3423 $self->{column}++;
3424 $self->{nc}
3425 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3426 } else {
3427 $self->{set_nc}->($self);
3428 }
3429
3430 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3431 redo A;
3432 } elsif ($self->{nc} == -1) {
3433 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3434
3435 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3436 $self->{state} = DATA_STATE;
3437 $self->{s_kwd} = '';
3438 $self->{ct}->{quirks} = 1;
3439 } else {
3440
3441 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3442 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3443 }
3444
3445 ## Reconsume.
3446 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3447 redo A;
3448 } elsif ($self->{nc} == 0x0050 or # P
3449 $self->{nc} == 0x0070) { # p
3450
3451 $self->{state} = PUBLIC_STATE;
3452 $self->{kwd} = chr $self->{nc};
3453
3454 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3455 $self->{line_prev} = $self->{line};
3456 $self->{column_prev} = $self->{column};
3457 $self->{column}++;
3458 $self->{nc}
3459 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3460 } else {
3461 $self->{set_nc}->($self);
3462 }
3463
3464 redo A;
3465 } elsif ($self->{nc} == 0x0053 or # S
3466 $self->{nc} == 0x0073) { # s
3467
3468 $self->{state} = SYSTEM_STATE;
3469 $self->{kwd} = chr $self->{nc};
3470
3471 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3472 $self->{line_prev} = $self->{line};
3473 $self->{column_prev} = $self->{column};
3474 $self->{column}++;
3475 $self->{nc}
3476 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3477 } else {
3478 $self->{set_nc}->($self);
3479 }
3480
3481 redo A;
3482 } elsif ($self->{nc} == 0x0022 and # "
3483 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3484 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3485
3486 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3487 $self->{ct}->{value} = ''; # ENTITY
3488
3489 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3490 $self->{line_prev} = $self->{line};
3491 $self->{column_prev} = $self->{column};
3492 $self->{column}++;
3493 $self->{nc}
3494 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3495 } else {
3496 $self->{set_nc}->($self);
3497 }
3498
3499 redo A;
3500 } elsif ($self->{nc} == 0x0027 and # '
3501 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3502 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3503
3504 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3505 $self->{ct}->{value} = ''; # ENTITY
3506
3507 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3508 $self->{line_prev} = $self->{line};
3509 $self->{column_prev} = $self->{column};
3510 $self->{column}++;
3511 $self->{nc}
3512 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3513 } else {
3514 $self->{set_nc}->($self);
3515 }
3516
3517 redo A;
3518 } elsif ($self->{is_xml} and
3519 $self->{ct}->{type} == DOCTYPE_TOKEN and
3520 $self->{nc} == 0x005B) { # [
3521
3522 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3523 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3524 $self->{in_subset} = 1;
3525
3526 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3527 $self->{line_prev} = $self->{line};
3528 $self->{column_prev} = $self->{column};
3529 $self->{column}++;
3530 $self->{nc}
3531 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3532 } else {
3533 $self->{set_nc}->($self);
3534 }
3535
3536 return ($self->{ct}); # DOCTYPE
3537 redo A;
3538 } else {
3539 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3540
3541 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3542
3543 $self->{ct}->{quirks} = 1;
3544 $self->{state} = BOGUS_DOCTYPE_STATE;
3545 } else {
3546
3547 $self->{state} = BOGUS_MD_STATE;
3548 }
3549
3550
3551 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3552 $self->{line_prev} = $self->{line};
3553 $self->{column_prev} = $self->{column};
3554 $self->{column}++;
3555 $self->{nc}
3556 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3557 } else {
3558 $self->{set_nc}->($self);
3559 }
3560
3561 redo A;
3562 }
3563 } elsif ($self->{state} == PUBLIC_STATE) {
3564 ## ASCII case-insensitive
3565 if ($self->{nc} == [
3566 undef,
3567 0x0055, # U
3568 0x0042, # B
3569 0x004C, # L
3570 0x0049, # I
3571 ]->[length $self->{kwd}] or
3572 $self->{nc} == [
3573 undef,
3574 0x0075, # u
3575 0x0062, # b
3576 0x006C, # l
3577 0x0069, # i
3578 ]->[length $self->{kwd}]) {
3579
3580 ## Stay in the state.
3581 $self->{kwd} .= chr $self->{nc};
3582
3583 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3584 $self->{line_prev} = $self->{line};
3585 $self->{column_prev} = $self->{column};
3586 $self->{column}++;
3587 $self->{nc}
3588 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3589 } else {
3590 $self->{set_nc}->($self);
3591 }
3592
3593 redo A;
3594 } elsif ((length $self->{kwd}) == 5 and
3595 ($self->{nc} == 0x0043 or # C
3596 $self->{nc} == 0x0063)) { # c
3597 if ($self->{is_xml} and
3598 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3599
3600 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3601 text => 'PUBLIC',
3602 line => $self->{line_prev},
3603 column => $self->{column_prev} - 4);
3604 } else {
3605
3606 }
3607 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3608
3609 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3610 $self->{line_prev} = $self->{line};
3611 $self->{column_prev} = $self->{column};
3612 $self->{column}++;
3613 $self->{nc}
3614 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3615 } else {
3616 $self->{set_nc}->($self);
3617 }
3618
3619 redo A;
3620 } else {
3621 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3622 line => $self->{line_prev},
3623 column => $self->{column_prev} + 1 - length $self->{kwd});
3624 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3625
3626 $self->{ct}->{quirks} = 1;
3627 $self->{state} = BOGUS_DOCTYPE_STATE;
3628 } else {
3629
3630 $self->{state} = BOGUS_MD_STATE;
3631 }
3632 ## Reconsume.
3633 redo A;
3634 }
3635 } elsif ($self->{state} == SYSTEM_STATE) {
3636 ## ASCII case-insensitive
3637 if ($self->{nc} == [
3638 undef,
3639 0x0059, # Y
3640 0x0053, # S
3641 0x0054, # T
3642 0x0045, # E
3643 ]->[length $self->{kwd}] or
3644 $self->{nc} == [
3645 undef,
3646 0x0079, # y
3647 0x0073, # s
3648 0x0074, # t
3649 0x0065, # e
3650 ]->[length $self->{kwd}]) {
3651
3652 ## Stay in the state.
3653 $self->{kwd} .= chr $self->{nc};
3654
3655 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3656 $self->{line_prev} = $self->{line};
3657 $self->{column_prev} = $self->{column};
3658 $self->{column}++;
3659 $self->{nc}
3660 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3661 } else {
3662 $self->{set_nc}->($self);
3663 }
3664
3665 redo A;
3666 } elsif ((length $self->{kwd}) == 5 and
3667 ($self->{nc} == 0x004D or # M
3668 $self->{nc} == 0x006D)) { # m
3669 if ($self->{is_xml} and
3670 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3671
3672 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3673 text => 'SYSTEM',
3674 line => $self->{line_prev},
3675 column => $self->{column_prev} - 4);
3676 } else {
3677
3678 }
3679 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3680
3681 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3682 $self->{line_prev} = $self->{line};
3683 $self->{column_prev} = $self->{column};
3684 $self->{column}++;
3685 $self->{nc}
3686 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3687 } else {
3688 $self->{set_nc}->($self);
3689 }
3690
3691 redo A;
3692 } else {
3693 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3694 line => $self->{line_prev},
3695 column => $self->{column_prev} + 1 - length $self->{kwd});
3696 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3697
3698 $self->{ct}->{quirks} = 1;
3699 $self->{state} = BOGUS_DOCTYPE_STATE;
3700 } else {
3701
3702 $self->{state} = BOGUS_MD_STATE;
3703 }
3704 ## Reconsume.
3705 redo A;
3706 }
3707 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3708 if ($is_space->{$self->{nc}}) {
3709
3710 ## Stay in the state
3711
3712 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3713 $self->{line_prev} = $self->{line};
3714 $self->{column_prev} = $self->{column};
3715 $self->{column}++;
3716 $self->{nc}
3717 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3718 } else {
3719 $self->{set_nc}->($self);
3720 }
3721
3722 redo A;
3723 } elsif ($self->{nc} eq 0x0022) { # "
3724
3725 $self->{ct}->{pubid} = ''; # DOCTYPE
3726 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3727
3728 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3729 $self->{line_prev} = $self->{line};
3730 $self->{column_prev} = $self->{column};
3731 $self->{column}++;
3732 $self->{nc}
3733 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3734 } else {
3735 $self->{set_nc}->($self);
3736 }
3737
3738 redo A;
3739 } elsif ($self->{nc} eq 0x0027) { # '
3740
3741 $self->{ct}->{pubid} = ''; # DOCTYPE
3742 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3743
3744 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745 $self->{line_prev} = $self->{line};
3746 $self->{column_prev} = $self->{column};
3747 $self->{column}++;
3748 $self->{nc}
3749 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750 } else {
3751 $self->{set_nc}->($self);
3752 }
3753
3754 redo A;
3755 } elsif ($self->{nc} eq 0x003E) { # >
3756 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3757
3758 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3759
3760 $self->{state} = DATA_STATE;
3761 $self->{s_kwd} = '';
3762 $self->{ct}->{quirks} = 1;
3763 } else {
3764
3765 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3766 }
3767
3768
3769 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3770 $self->{line_prev} = $self->{line};
3771 $self->{column_prev} = $self->{column};
3772 $self->{column}++;
3773 $self->{nc}
3774 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3775 } else {
3776 $self->{set_nc}->($self);
3777 }
3778
3779 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3780 redo A;
3781 } elsif ($self->{nc} == -1) {
3782 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3783
3784 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3785 $self->{state} = DATA_STATE;
3786 $self->{s_kwd} = '';
3787 $self->{ct}->{quirks} = 1;
3788 } else {
3789
3790 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3791 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3792 }
3793
3794 ## reconsume
3795 return ($self->{ct}); # DOCTYPE
3796 redo A;
3797 } elsif ($self->{is_xml} and
3798 $self->{ct}->{type} == DOCTYPE_TOKEN and
3799 $self->{nc} == 0x005B) { # [
3800
3801 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3802 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3803 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3804 $self->{in_subset} = 1;
3805
3806 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3807 $self->{line_prev} = $self->{line};
3808 $self->{column_prev} = $self->{column};
3809 $self->{column}++;
3810 $self->{nc}
3811 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3812 } else {
3813 $self->{set_nc}->($self);
3814 }
3815
3816 return ($self->{ct}); # DOCTYPE
3817 redo A;
3818 } else {
3819 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3820
3821 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3822
3823 $self->{ct}->{quirks} = 1;
3824 $self->{state} = BOGUS_DOCTYPE_STATE;
3825 } else {
3826
3827 $self->{state} = BOGUS_MD_STATE;
3828 }
3829
3830
3831 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3832 $self->{line_prev} = $self->{line};
3833 $self->{column_prev} = $self->{column};
3834 $self->{column}++;
3835 $self->{nc}
3836 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3837 } else {
3838 $self->{set_nc}->($self);
3839 }
3840
3841 redo A;
3842 }
3843 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3844 if ($self->{nc} == 0x0022) { # "
3845
3846 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3847
3848 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3849 $self->{line_prev} = $self->{line};
3850 $self->{column_prev} = $self->{column};
3851 $self->{column}++;
3852 $self->{nc}
3853 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3854 } else {
3855 $self->{set_nc}->($self);
3856 }
3857
3858 redo A;
3859 } elsif ($self->{nc} == 0x003E) { # >
3860 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3861
3862 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3863
3864 $self->{state} = DATA_STATE;
3865 $self->{s_kwd} = '';
3866 $self->{ct}->{quirks} = 1;
3867 } else {
3868
3869 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3870 }
3871
3872
3873 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3874 $self->{line_prev} = $self->{line};
3875 $self->{column_prev} = $self->{column};
3876 $self->{column}++;
3877 $self->{nc}
3878 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3879 } else {
3880 $self->{set_nc}->($self);
3881 }
3882
3883 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3884 redo A;
3885 } elsif ($self->{nc} == -1) {
3886 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3887
3888 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3889
3890 $self->{state} = DATA_STATE;
3891 $self->{s_kwd} = '';
3892 $self->{ct}->{quirks} = 1;
3893 } else {
3894
3895 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3896 }
3897
3898 ## Reconsume.
3899 return ($self->{ct}); # DOCTYPE
3900 redo A;
3901 } else {
3902
3903 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3904 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3905 length $self->{ct}->{pubid});
3906
3907 ## Stay in the state
3908
3909 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3910 $self->{line_prev} = $self->{line};
3911 $self->{column_prev} = $self->{column};
3912 $self->{column}++;
3913 $self->{nc}
3914 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3915 } else {
3916 $self->{set_nc}->($self);
3917 }
3918
3919 redo A;
3920 }
3921 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3922 if ($self->{nc} == 0x0027) { # '
3923
3924 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3925
3926 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3927 $self->{line_prev} = $self->{line};
3928 $self->{column_prev} = $self->{column};
3929 $self->{column}++;
3930 $self->{nc}
3931 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3932 } else {
3933 $self->{set_nc}->($self);
3934 }
3935
3936 redo A;
3937 } elsif ($self->{nc} == 0x003E) { # >
3938 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3939
3940 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3941
3942 $self->{state} = DATA_STATE;
3943 $self->{s_kwd} = '';
3944 $self->{ct}->{quirks} = 1;
3945 } else {
3946
3947 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3948 }
3949
3950
3951 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3952 $self->{line_prev} = $self->{line};
3953 $self->{column_prev} = $self->{column};
3954 $self->{column}++;
3955 $self->{nc}
3956 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3957 } else {
3958 $self->{set_nc}->($self);
3959 }
3960
3961 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3962 redo A;
3963 } elsif ($self->{nc} == -1) {
3964 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3965
3966 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3967
3968 $self->{state} = DATA_STATE;
3969 $self->{s_kwd} = '';
3970 $self->{ct}->{quirks} = 1;
3971 } else {
3972
3973 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3974 }
3975
3976 ## reconsume
3977 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3978 redo A;
3979 } else {
3980
3981 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3982 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3983 length $self->{ct}->{pubid});
3984
3985 ## Stay in the state
3986
3987 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3988 $self->{line_prev} = $self->{line};
3989 $self->{column_prev} = $self->{column};
3990 $self->{column}++;
3991 $self->{nc}
3992 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3993 } else {
3994 $self->{set_nc}->($self);
3995 }
3996
3997 redo A;
3998 }
3999 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
4000 if ($is_space->{$self->{nc}}) {
4001
4002 ## Stay in the state
4003
4004 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4005 $self->{line_prev} = $self->{line};
4006 $self->{column_prev} = $self->{column};
4007 $self->{column}++;
4008 $self->{nc}
4009 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4010 } else {
4011 $self->{set_nc}->($self);
4012 }
4013
4014 redo A;
4015 } elsif ($self->{nc} == 0x0022) { # "
4016
4017 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4018 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4019
4020 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4021 $self->{line_prev} = $self->{line};
4022 $self->{column_prev} = $self->{column};
4023 $self->{column}++;
4024 $self->{nc}
4025 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4026 } else {
4027 $self->{set_nc}->($self);
4028 }
4029
4030 redo A;
4031 } elsif ($self->{nc} == 0x0027) { # '
4032
4033 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4034 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4035
4036 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4037 $self->{line_prev} = $self->{line};
4038 $self->{column_prev} = $self->{column};
4039 $self->{column}++;
4040 $self->{nc}
4041 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4042 } else {
4043 $self->{set_nc}->($self);
4044 }
4045
4046 redo A;
4047 } elsif ($self->{nc} == 0x003E) { # >
4048 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4049 if ($self->{is_xml}) {
4050
4051 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4052 } else {
4053
4054 }
4055 $self->{state} = DATA_STATE;
4056 $self->{s_kwd} = '';
4057 } else {
4058 if ($self->{ct}->{type} == NOTATION_TOKEN) {
4059
4060 } else {
4061
4062 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4063 }
4064 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4065 }
4066
4067
4068 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4069 $self->{line_prev} = $self->{line};
4070 $self->{column_prev} = $self->{column};
4071 $self->{column}++;
4072 $self->{nc}
4073 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4074 } else {
4075 $self->{set_nc}->($self);
4076 }
4077
4078 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4079 redo A;
4080 } elsif ($self->{nc} == -1) {
4081 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4082
4083 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4084
4085 $self->{state} = DATA_STATE;
4086 $self->{s_kwd} = '';
4087 $self->{ct}->{quirks} = 1;
4088 } else {
4089 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4090 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4091 }
4092
4093 ## reconsume
4094 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4095 redo A;
4096 } elsif ($self->{is_xml} and
4097 $self->{ct}->{type} == DOCTYPE_TOKEN and
4098 $self->{nc} == 0x005B) { # [
4099
4100 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4101 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4102 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4103 $self->{in_subset} = 1;
4104
4105 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4106 $self->{line_prev} = $self->{line};
4107 $self->{column_prev} = $self->{column};
4108 $self->{column}++;
4109 $self->{nc}
4110 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4111 } else {
4112 $self->{set_nc}->($self);
4113 }
4114
4115 return ($self->{ct}); # DOCTYPE
4116 redo A;
4117 } else {
4118 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
4119
4120 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4121
4122 $self->{ct}->{quirks} = 1;
4123 $self->{state} = BOGUS_DOCTYPE_STATE;
4124 } else {
4125
4126 $self->{state} = BOGUS_MD_STATE;
4127 }
4128
4129
4130 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4131 $self->{line_prev} = $self->{line};
4132 $self->{column_prev} = $self->{column};
4133 $self->{column}++;
4134 $self->{nc}
4135 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4136 } else {
4137 $self->{set_nc}->($self);
4138 }
4139
4140 redo A;
4141 }
4142 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4143 if ($is_space->{$self->{nc}}) {
4144
4145 ## Stay in the state
4146
4147 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4148 $self->{line_prev} = $self->{line};
4149 $self->{column_prev} = $self->{column};
4150 $self->{column}++;
4151 $self->{nc}
4152 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4153 } else {
4154 $self->{set_nc}->($self);
4155 }
4156
4157 redo A;
4158 } elsif ($self->{nc} == 0x0022) { # "
4159
4160 $self->{ct}->{sysid} = ''; # DOCTYPE
4161 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4162
4163 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4164 $self->{line_prev} = $self->{line};
4165 $self->{column_prev} = $self->{column};
4166 $self->{column}++;
4167 $self->{nc}
4168 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4169 } else {
4170 $self->{set_nc}->($self);
4171 }
4172
4173 redo A;
4174 } elsif ($self->{nc} == 0x0027) { # '
4175
4176 $self->{ct}->{sysid} = ''; # DOCTYPE
4177 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4178
4179 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4180 $self->{line_prev} = $self->{line};
4181 $self->{column_prev} = $self->{column};
4182 $self->{column}++;
4183 $self->{nc}
4184 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4185 } else {
4186 $self->{set_nc}->($self);
4187 }
4188
4189 redo A;
4190 } elsif ($self->{nc} == 0x003E) { # >
4191 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4192
4193 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4194 $self->{line_prev} = $self->{line};
4195 $self->{column_prev} = $self->{column};
4196 $self->{column}++;
4197 $self->{nc}
4198 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4199 } else {
4200 $self->{set_nc}->($self);
4201 }
4202
4203
4204 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4205
4206 $self->{state} = DATA_STATE;
4207 $self->{s_kwd} = '';
4208 $self->{ct}->{quirks} = 1;
4209 } else {
4210
4211 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4212 }
4213
4214 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4215 redo A;
4216 } elsif ($self->{nc} == -1) {
4217 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4218
4219 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4220 $self->{state} = DATA_STATE;
4221 $self->{s_kwd} = '';
4222 $self->{ct}->{quirks} = 1;
4223 } else {
4224
4225 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4226 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4227 }
4228
4229 ## reconsume
4230 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4231 redo A;
4232 } elsif ($self->{is_xml} and
4233 $self->{ct}->{type} == DOCTYPE_TOKEN and
4234 $self->{nc} == 0x005B) { # [
4235
4236 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4237
4238 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4239 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4240 $self->{in_subset} = 1;
4241
4242 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4243 $self->{line_prev} = $self->{line};
4244 $self->{column_prev} = $self->{column};
4245 $self->{column}++;
4246 $self->{nc}
4247 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4248 } else {
4249 $self->{set_nc}->($self);
4250 }
4251
4252 return ($self->{ct}); # DOCTYPE
4253 redo A;
4254 } else {
4255 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4256
4257 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4258
4259 $self->{ct}->{quirks} = 1;
4260 $self->{state} = BOGUS_DOCTYPE_STATE;
4261 } else {
4262
4263 $self->{state} = BOGUS_MD_STATE;
4264 }
4265
4266
4267 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4268 $self->{line_prev} = $self->{line};
4269 $self->{column_prev} = $self->{column};
4270 $self->{column}++;
4271 $self->{nc}
4272 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4273 } else {
4274 $self->{set_nc}->($self);
4275 }
4276
4277 redo A;
4278 }
} elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
  ## Inside a system identifier literal delimited by U+0022 (").  The
  ## text of the literal is collected in |$self->{ct}->{sysid}|.
  my $ch = $self->{nc};
  my $consume = 1; ## Cleared when the current character is reconsumed.
  my $emit = 0;    ## Set when |$self->{ct}| is returned to the caller.

  if ($ch == 0x0022) { # "
    ## Closing delimiter.
    $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
  } elsif ($ch == -1 or
           ($ch == 0x003E and not $self->{is_xml})) { # EOF, or > (non-XML)
    ## The literal is left unclosed; the token is emitted as it is.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    if ($ch == -1) {
      ## reconsume
      $consume = 0;
    }
    $emit = 1;
  } else {
    ## Any other character is part of the literal.  |read_until| is
    ## handed the literal, the characters |"| and |>|, and the current
    ## length of the literal (its definition is outside this part of
    ## the file).
    $self->{ct}->{sysid} .= chr $ch; # DOCTYPE/ENTITY/NOTATION
    $self->{read_until}->($self->{ct}->{sysid}, q[">],
                          length $self->{ct}->{sysid});
    ## Stay in the state
  }

  if ($consume) {
    ## Next input character: taken from the character buffer if any
    ## is left, otherwise supplied by the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
  ## Inside a system identifier literal delimited by U+0027 (').  The
  ## text of the literal is collected in |$self->{ct}->{sysid}|.
  ##
  ## NOTE: The ">" case dispatches on the token type, in the same way
  ## as the end-of-input case below and as the double-quoted state
  ## does.  Formerly it always switched to the data state and set the
  ## |quirks| flag, even if the current token was not a DOCTYPE token.
  my $ch = $self->{nc};
  my $consume = 1; ## Cleared when the current character is reconsumed.
  my $emit = 0;    ## Set when |$self->{ct}| is returned to the caller.

  if ($ch == 0x0027) { # '
    ## Closing delimiter.
    $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
  } elsif ($ch == -1 or
           ($ch == 0x003E and not $self->{is_xml})) { # EOF, or > (non-XML)
    ## The literal is left unclosed; the token is emitted as it is.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      ## ENTITY/NOTATION: continue with the internal subset.
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    if ($ch == -1) {
      ## reconsume
      $consume = 0;
    }
    $emit = 1;
  } else {
    ## Any other character is part of the literal.  |read_until| is
    ## handed the literal, the characters |'| and |>|, and the current
    ## length of the literal (its definition is outside this part of
    ## the file).
    $self->{ct}->{sysid} .= chr $ch; # DOCTYPE/ENTITY/NOTATION
    $self->{read_until}->($self->{ct}->{sysid}, q['>],
                          length $self->{ct}->{sysid});
    ## Stay in the state
  }

  if ($consume) {
    ## Next input character: taken from the character buffer if any
    ## is left, otherwise supplied by the |set_nc| callback.
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }
  }

  return ($self->{ct}) if $emit; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
  ## The system identifier literal has been closed.  What may follow
  ## depends on the type of the current token (|$self->{ct}|).
  my $ch = $self->{nc};
  my $token_type = $self->{ct}->{type};
  my $consume = 1; ## Cleared when the current character is reconsumed.
  my $emit = 0;    ## Set when |$self->{ct}| is returned to the caller.

  if ($is_space->{$ch}) {
    if ($token_type == GENERAL_ENTITY_TOKEN) {
      ## A general entity may have an NDATA part after the space.
      $self->{state} = BEFORE_NDATA_STATE;
    }
    ## Otherwise, stay in the state
  } elsif ($ch == 0x003E) { # >
    ## End of the declaration.
    if ($token_type != DOCTYPE_TOKEN) {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    } else {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
    }
    $emit = 1;
  } elsif ($token_type == GENERAL_ENTITY_TOKEN and
           ($ch == 0x004E or $ch == 0x006E)) { # N, n
    ## Start of a keyword that is not preceded by a space.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
    $self->{state} = NDATA_STATE;
    $self->{kwd} = chr $ch;
  } elsif ($ch == -1) {
    if ($token_type != DOCTYPE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    } else {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    }
    ## reconsume
    $consume = 0;
    $emit = 1;
  } elsif ($self->{is_xml} and
           $token_type == DOCTYPE_TOKEN and
           $ch == 0x005B) { # [
    ## The DOCTYPE token is emitted before its internal subset is
    ## processed.
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
    $self->{in_subset} = 1;
    $emit = 1;
  } else {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
    if ($token_type != DOCTYPE_TOKEN) {
      $self->{state} = BOGUS_MD_STATE;
    } else {
      ## NOTE: The |quirks| flag is not set here.
      $self->{state} = BOGUS_DOCTYPE_STATE;
    }
  }

  if ($consume) {
    ## Next input character: taken from the character buffer if any
    ## is left, otherwise supplied by the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == BEFORE_NDATA_STATE) {
  ## Entered from the state after the system identifier when a general
  ## entity token is followed by a space character.
  my $ch = $self->{nc};

  if ($ch == -1) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    ## reconsume
    return ($self->{ct}); # ENTITY
  }

  my $emit = 0; ## Set when |$self->{ct}| is returned to the caller.
  if ($is_space->{$ch}) {
    ## Stay in the state.
  } elsif ($ch == 0x003E) { # >
    ## End of the declaration.
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $emit = 1;
  } elsif ($ch == 0x004E or $ch == 0x006E) { # N, n
    ## First character of a keyword; it is kept in |$self->{kwd}|.
    $self->{state} = NDATA_STATE;
    $self->{kwd} = chr $ch;
  } else {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
    $self->{state} = BOGUS_MD_STATE;
  }

  ## Next input character: taken from the character buffer if any is
  ## left, otherwise supplied by the |set_nc| callback.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  }

  return ($self->{ct}) if $emit; # ENTITY
  redo A;
} elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
  ## Characters are skipped until ">", "[" (XML only), or the end of
  ## the input is found, at which point the DOCTYPE token is emitted.
  my $ch = $self->{nc};
  my $consume = 1; ## Cleared when the current character is reconsumed.
  my $emit = 1;    ## Cleared when the tokenizer stays in this state.

  if ($ch == 0x003E) { # >
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  } elsif ($self->{is_xml} and $ch == 0x005B) { # [
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
    $self->{in_subset} = 1;
  } elsif ($ch == -1) {
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## reconsume
    $consume = 0;
  } else {
    ## The string filled in by |read_until| is thrown away.
    my $skipped = '';
    $self->{read_until}->($skipped, q{>[}, 0);
    ## Stay in the state
    $emit = 0;
  }

  if ($consume) {
    ## Next input character: taken from the character buffer if any
    ## is left, otherwise supplied by the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  return ($self->{ct}) if $emit; # DOCTYPE
  redo A;
} elsif ($self->{state} == CDATA_SECTION_STATE) {
  ## NOTE: "CDATA section state" in the state is jointly implemented
  ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
  ## and |CDATA_SECTION_MSE2_STATE|.

  ## XML5: "CDATA state".

  if ($self->{nc} == -1) {
    ## End of input inside the section.  Reported in XML only.
    if ($self->{is_xml}) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
    }

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    ## Reconsume.
    return ($self->{ct}) if length $self->{ct}->{data}; # character
    ## No token to emit. $self->{ct} is discarded.
    redo A;
  }

  if ($self->{nc} == 0x005D) { # ]
    ## Possibly the first character of "]]>".
    $self->{state} = CDATA_SECTION_MSE1_STATE;
  } else {
    ## Section content.  |read_until| is handed the data, the
    ## character "]", and the current length of the data (its
    ## definition is outside this part of the file).
    $self->{ct}->{data} .= chr $self->{nc};
    $self->{read_until}->($self->{ct}->{data},
                          q<]>,
                          length $self->{ct}->{data});
    ## Stay in the state.
  }

  ## Next input character: taken from the character buffer if any is
  ## left, otherwise supplied by the |set_nc| callback.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  }
  redo A;

  ## ISSUE: "text tokens" in spec.
} elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
  ## XML5: "CDATA bracket state".
  ## A single "]" has been seen in the CDATA section.

  if ($self->{nc} != 0x005D) { # not ]
    ## The "]" was ordinary data.
    ## XML5: If EOF, "]" is not appended and changed to the data state.
    $self->{ct}->{data} .= ']';
    $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
    ## Reconsume.
    redo A;
  }

  $self->{state} = CDATA_SECTION_MSE2_STATE;

  ## Next input character: taken from the character buffer if any is
  ## left, otherwise supplied by the |set_nc| callback.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  }
  redo A;
} elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
  ## XML5: "CDATA end state".
  ## "]]" has been seen in the CDATA section.
  my $ch = $self->{nc};

  if ($ch != 0x003E and $ch != 0x005D) { # neither > nor ]
    ## Both "]" characters were ordinary data.
    $self->{ct}->{data} .= ']]'; # character
    $self->{state} = CDATA_SECTION_STATE;
    ## Reconsume. ## XML5: Emit.
    redo A;
  }

  if ($ch == 0x003E) { # >
    ## End of the section.
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
  } else { # ]
    # character
    $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
    ## Stay in the state.
  }

  ## Next input character: taken from the character buffer if any is
  ## left, otherwise supplied by the |set_nc| callback.
  if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
    $self->{set_nc}->($self);
  } else {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  }

  if ($ch == 0x003E and length $self->{ct}->{data}) { # character
    return ($self->{ct}); # character
  }
  ## Otherwise, no token to emit; if the section has ended with no
  ## data, $self->{ct} is discarded.
  redo A;
} elsif ($self->{state} == ENTITY_STATE) {
  ## The character after "&" decides whether a character reference
  ## starts here.
  my $ch = $self->{nc};
  my $is_reference = 0; ## Set when a reference has been started.

  if ($is_space->{$ch} or
      {
        0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
        $self->{entity_add} => 1,
      }->{$ch}) {
    ## Not a reference.  This is an error in XML only.
    if ($self->{is_xml}) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
                             line => $self->{line_prev},
                             column => $self->{column_prev}
                                 + ($ch == -1 ? 1 : 0));
    }
    ## Don't consume
    ## Return nothing.
  } elsif ($ch == 0x0023) { # #
    $self->{state} = ENTITY_HASH_STATE;
    $self->{kwd} = '#';
    $is_reference = 1;
  } elsif ($self->{is_xml} or
           (0x0041 <= $ch and $ch <= 0x005A) or # A..Z
           (0x0061 <= $ch and $ch <= 0x007A)) { # a..z
    ## First character of an entity name.
    require Whatpm::_NamedEntityList;
    $self->{state} = ENTITY_NAME_STATE;
    $self->{kwd} = chr $ch;
    $self->{entity__value} = $self->{kwd};
    $self->{entity__match} = 0;
    $is_reference = 1;
  } else {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
    ## Return nothing.
  }

  if ($is_reference) {
    ## Next input character: taken from the character buffer if any
    ## is left, otherwise supplied by the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
    redo A;
  }

  ## NOTE: No character is consumed by the "consume a character
  ## reference" algorithm.  In other words, there is an "&" character
  ## that does not introduce a character reference, which would be
  ## appended to the parent element or the attribute value in later
  ## process of the tokenizer.

  $self->{state} = $self->{prev_state};
  $self->{s_kwd} = '';
  ## Reconsume.
  if ($self->{prev_state} == DATA_STATE) {
    ## In content, the "&" becomes a character token.
    return ({type => CHARACTER_TOKEN, data => '&',
             line => $self->{line_prev},
             column => $self->{column_prev},
            });
  }
  ## Otherwise, the "&" is added to the value of |$self->{ca}|.
  $self->{ca}->{value} .= '&';
  redo A;
} elsif ($self->{state} == ENTITY_HASH_STATE) {
  ## "&#" has been seen.
  my $ch = $self->{nc};
  my $is_reference = 1; ## Cleared when "&#" turns out to be plain text.

  if ($ch == 0x0078 or $ch == 0x0058) { # x, X
    ## Hexadecimal reference.  The uppercase form is an error in XML.
    if ($ch == 0x0058 and $self->{is_xml}) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
    }
    $self->{state} = HEXREF_X_STATE;
    $self->{kwd} .= chr $ch;
  } elsif (0x0030 <= $ch and $ch <= 0x0039) { # 0..9
    ## Decimal reference.  From now on |$self->{kwd}| holds the number.
    $self->{state} = NCR_NUM_STATE;
    $self->{kwd} = $ch - 0x0030;
  } else {
    $is_reference = 0;
  }

  if ($is_reference) {
    ## Next input character: taken from the character buffer if any
    ## is left, otherwise supplied by the |set_nc| callback.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
    redo A;
  }

  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 1);

  ## NOTE: According to the spec algorithm, nothing is returned,
  ## and then "&#" is appended to the parent element or the attribute
  ## value in the later processing.

  $self->{state} = $self->{prev_state};
  $self->{s_kwd} = '';
  ## Reconsume.
  if ($self->{prev_state} == DATA_STATE) {
    return ({type => CHARACTER_TOKEN,
             data => '&#',
             line => $self->{line_prev},
             column => $self->{column_prev} - 1,
            });
  }
  $self->{ca}->{value} .= '&#';
  redo A;
} elsif ($self->{state} == NCR_NUM_STATE) {
  ## Digits of a decimal character reference.  |$self->{kwd}| holds
  ## the number read so far.
  my $ch = $self->{nc};

  if (0x0030 <= $ch and $ch <= 0x0039) { # 0..9
    $self->{kwd} = $self->{kwd} * 10 + ($ch - 0x0030);

    ## Stay in the state.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
    redo A;
  }

  if ($ch != 0x003B) { # not ;
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
    ## Reconsume.
  } else {
    ## The ";" is consumed.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  ## The reference is complete.  The position is taken after the ";"
  ## (if any) has been consumed.
  my $code = $self->{kwd};
  my ($ref_line, $ref_column) = @$self{qw(line_prev column_prev)};

  ## In XML, U+0000 and U+D800..U+DFFF are invalid; otherwise, any
  ## code point with a true entry in |$charref_map| is invalid.  In
  ## both cases the code point is replaced by the |$charref_map| entry.
  my $is_mapped = $self->{is_xml}
      ? ($code == 0x0000 || (0xD800 <= $code && $code <= 0xDFFF))
      : $charref_map->{$code};
  if ($is_mapped) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
                           text => (sprintf 'U+%04X', $code),
                           line => $ref_line, column => $ref_column);
    $code = $charref_map->{$code};
  } elsif ($code > 0x10FFFF) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
                           text => (sprintf 'U-%08X', $code),
                           line => $ref_line, column => $ref_column);
    $code = 0xFFFD;
  }

  $self->{state} = $self->{prev_state};
  $self->{s_kwd} = '';
  ## Reconsume.
  if ($self->{prev_state} == DATA_STATE) {
    return ({type => CHARACTER_TOKEN, data => chr $code,
             has_reference => 1,
             line => $ref_line, column => $ref_column,
            });
  }
  $self->{ca}->{value} .= chr $code;
  $self->{ca}->{has_reference} = 1;
  redo A;
} elsif ($self->{state} == HEXREF_X_STATE) {
  ## "&#x" or "&#X" has been seen; |$self->{kwd}| holds the characters
  ## after the "&".
  my $ch = $self->{nc};
  my $is_hex_digit = ((0x0030 <= $ch and $ch <= 0x0039) or
                      (0x0041 <= $ch and $ch <= 0x0046) or
                      (0x0061 <= $ch and $ch <= 0x0066));
      # 0..9, A..F, a..f

  if ($is_hex_digit) {
    ## From now on |$self->{kwd}| holds the number.
    $self->{state} = HEXREF_HEX_STATE;
    $self->{kwd} = 0;
    ## Reconsume.
    redo A;
  }

  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 2);

  ## NOTE: According to the spec algorithm, nothing is returned,
  ## and then "&#" followed by "X" or "x" is appended to the parent
  ## element or the attribute value in the later processing.

  $self->{state} = $self->{prev_state};
  $self->{s_kwd} = '';
  ## Reconsume.
  if ($self->{prev_state} == DATA_STATE) {
    return ({type => CHARACTER_TOKEN,
             data => '&' . $self->{kwd},
             line => $self->{line_prev},
             column => $self->{column_prev} - length $self->{kwd},
            });
  }
  $self->{ca}->{value} .= '&' . $self->{kwd};
  redo A;
} elsif ($self->{state} == HEXREF_HEX_STATE) {
  ## Digits of a hexadecimal character reference.  |$self->{kwd}|
  ## holds the number read so far.
  my $ch = $self->{nc};

  ## Value of the current character as a hexadecimal digit, if it is one.
  my $digit;
  if (0x0030 <= $ch and $ch <= 0x0039) { # 0..9
    $digit = $ch - 0x0030;
  } elsif (0x0061 <= $ch and $ch <= 0x0066) { # a..f
    $digit = $ch - 0x0060 + 9;
  } elsif (0x0041 <= $ch and $ch <= 0x0046) { # A..F
    $digit = $ch - 0x0040 + 9;
  }

  if (defined $digit) {
    $self->{kwd} = $self->{kwd} * 0x10 + $digit;

    ## Stay in the state.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
    redo A;
  }

  if ($ch != 0x003B) { # not ;
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
                           line => $self->{line},
                           column => $self->{column});
    ## Reconsume.
  } else {
    ## The ";" is consumed.
    if ($self->{char_buffer_pos} >= length $self->{char_buffer}) {
      $self->{set_nc}->($self);
    } else {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    }
  }

  ## The reference is complete.  The position is taken after the ";"
  ## (if any) has been consumed.
  my $code = $self->{kwd};
  my ($ref_line, $ref_column) = @$self{qw(line_prev column_prev)};

  ## In XML, U+0000 and U+D800..U+DFFF are invalid; otherwise, any
  ## code point with a true entry in |$charref_map| is invalid.  In
  ## both cases the code point is replaced by the |$charref_map| entry.
  my $is_mapped = $self->{is_xml}
      ? ($code == 0x0000 || (0xD800 <= $code && $code <= 0xDFFF))
      : $charref_map->{$code};
  if ($is_mapped) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
                           text => (sprintf 'U+%04X', $code),
                           line => $ref_line, column => $ref_column);
    $code = $charref_map->{$code};
  } elsif ($code > 0x10FFFF) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
                           text => (sprintf 'U-%08X', $code),
                           line => $ref_line, column => $ref_column);
    $code = 0xFFFD;
  }

  $self->{state} = $self->{prev_state};
  $self->{s_kwd} = '';
  ## Reconsume.
  if ($self->{prev_state} == DATA_STATE) {
    return ({type => CHARACTER_TOKEN, data => chr $code,
             has_reference => 1,
             line => $ref_line, column => $ref_column,
            });
  }
  $self->{ca}->{value} .= chr $code;
  $self->{ca}->{has_reference} = 1;
  redo A;
5237 } elsif ($self->{state} == ENTITY_NAME_STATE) {
5238 if ((0x0041 <= $self->{nc} and # a
5239 $self->{nc} <= 0x005A) or # x
5240 (0x0061 <= $self->{nc} and # a
5241 $self->{nc} <= 0x007A) or # z
5242 (0x0030 <= $self->{nc} and # 0
5243 $self->{nc} <= 0x0039) or # 9
5244 $self->{nc} == 0x003B or # ;
5245 ($self->{is_xml} and
5246 not ($is_space->{$self->{nc}} or
5247 {
5248 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5249 $self->{entity_add} => 1,
5250 }->{$self->{nc}}))) {
5251 our $EntityChar;
5252 $self->{kwd} .= chr $self->{nc};
5253 if (defined $EntityChar->{$self->{kwd}} or
5254 $self->{ge}->{$self->{kwd}}) {
5255 if ($self->{nc} == 0x003B) { # ;
5256 if (defined $self->{ge}->{$self->{kwd}}) {
5257 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5258
5259 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5260 } else {
5261 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5262
5263 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5264 value => $self->{kwd});
5265 } else {
5266
5267 }
5268 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5269 }
5270 } else {
5271 if ($self->{is_xml}) {
5272
5273 $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5274 value => $self->{kwd},
5275 level => {
5276 'amp;' => $self->{level}->{warn},
5277 'quot;' => $self->{level}->{warn},
5278 'lt;' => $self->{level}->{warn},
5279 'gt;' => $self->{level}->{warn},
5280 'apos;' => $self->{level}->{warn},
5281 }->{$self->{kwd}} ||
5282 $self->{level}->{must});
5283 } else {
5284
5285 }
5286 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5287 }
5288 $self->{entity__match} = 1;
5289
5290 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5291 $self->{line_prev} = $self->{line};
5292 $self->{column_prev} = $self->{column};
5293 $self->{column}++;
5294 $self->{nc}
5295 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5296 } else {
5297 $self->{set_nc}->($self);
5298 }
5299
5300 #
5301 } else {
5302
5303 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5304 $self->{entity__match} = -1;
5305 ## Stay in the state.
5306
5307 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5308 $self->{line_prev} = $self->{line};
5309 $self->{column_prev} = $self->{column};
5310 $self->{column}++;
5311 $self->{nc}
5312 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5313 } else {
5314 $self->{set_nc}->($self);
5315 }
5316
5317 redo A;
5318 }
5319 } else {
5320
5321 $self->{entity__value} .= chr $self->{nc};
5322 $self->{entity__match} *= 2;
5323 ## Stay in the state.
5324
5325 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5326 $self->{line_prev} = $self->{line};
5327 $self->{column_prev} = $self->{column};
5328 $self->{column}++;
5329 $self->{nc}
5330 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5331 } else {
5332 $self->{set_nc}->($self);
5333 }
5334
5335 redo A;
5336 }
5337 }
5338
5339 my $data;
5340 my $has_ref;
5341 if ($self->{entity__match} > 0) {
5342
5343 $data = $self->{entity__value};
5344 $has_ref = 1;
5345 #
5346 } elsif ($self->{entity__match} < 0) {
5347 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5348 if ($self->{prev_state} != DATA_STATE and # in attribute
5349 $self->{entity__match} < -1) {
5350
5351 $data = '&' . $self->{kwd};
5352 #
5353 } else {
5354
5355 $data = $self->{entity__value};
5356 $has_ref = 1;
5357 #
5358 }
5359 } else {
5360
5361 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5362 line => $self->{line_prev},
5363 column => $self->{column_prev} - length $self->{kwd});
5364 $data = '&' . $self->{kwd};
5365 #
5366 }
5367
5368 ## NOTE: In these cases, according to the spec algorithm, when a
5369 ## character reference is found, it is consumed and a character token
5370 ## is returned; otherwise, nothing is consumed and nothing is returned.
5371 ## In this implementation, everything the tokenizer has examined is
5372 ## appended at this stage to the parent element or the attribute value
5373 ## as a string: the literal text when there is no character reference,
5374 ## or the entity-replaced text otherwise. This is equivalent, since any
5375 ## characters that would not have been consumed are appended in the
5376 ## data state or in the appropriate attribute value state anyway.
5377
5378 if ($self->{prev_state} == DATA_STATE) {
5379
5380 $self->{state} = $self->{prev_state};
5381 $self->{s_kwd} = '';
5382 ## Reconsume.
5383 return ({type => CHARACTER_TOKEN,
5384 data => $data,
5385 has_reference => $has_ref,
5386 line => $self->{line_prev},
5387 column => $self->{column_prev} + 1 - length $self->{kwd},
5388 });
5389 redo A;
5390 } else {
5391
5392 $self->{ca}->{value} .= $data;
5393 $self->{ca}->{has_reference} = 1 if $has_ref;
5394 $self->{state} = $self->{prev_state};
5395 $self->{s_kwd} = '';
5396 ## Reconsume.
5397 redo A;
5398 }
5399
5400 ## XML-only states
5401
5402 } elsif ($self->{state} == PI_STATE) {
5403 ## XML5: "Pi state" and "DOCTYPE pi state".
5404
5405 if ($is_space->{$self->{nc}} or
5406 $self->{nc} == 0x003F or # ?
5407 $self->{nc} == -1) {
5408 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5409 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5410 ## "DOCTYPE pi state": Parse error, switch to the "data
5411 ## state".
5412 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5413 line => $self->{line_prev},
5414 column => $self->{column_prev}
5415 - 1 * ($self->{nc} != -1));
5416 $self->{state} = BOGUS_COMMENT_STATE;
5417 ## Reconsume.
5418 $self->{ct} = {type => COMMENT_TOKEN,
5419 data => '?',
5420 line => $self->{line_prev},
5421 column => $self->{column_prev}
5422 - 1 * ($self->{nc} != -1),
5423 };
5424 redo A;
5425 } else {
5426 ## XML5: "DOCTYPE pi state": Stay in the state.
5427 $self->{ct} = {type => PI_TOKEN,
5428 target => chr $self->{nc},
5429 data => '',
5430 line => $self->{line_prev},
5431 column => $self->{column_prev} - 1,
5432 };
5433 $self->{state} = PI_TARGET_STATE;
5434
5435 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5436 $self->{line_prev} = $self->{line};
5437 $self->{column_prev} = $self->{column};
5438 $self->{column}++;
5439 $self->{nc}
5440 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5441 } else {
5442 $self->{set_nc}->($self);
5443 }
5444
5445 redo A;
5446 }
5447 } elsif ($self->{state} == PI_TARGET_STATE) {
5448 if ($is_space->{$self->{nc}}) {
5449 $self->{state} = PI_TARGET_AFTER_STATE;
5450
5451 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5452 $self->{line_prev} = $self->{line};
5453 $self->{column_prev} = $self->{column};
5454 $self->{column}++;
5455 $self->{nc}
5456 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5457 } else {
5458 $self->{set_nc}->($self);
5459 }
5460
5461 redo A;
5462 } elsif ($self->{nc} == -1) {
5463 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5464 if ($self->{in_subset}) {
5465 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5466 } else {
5467 $self->{state} = DATA_STATE;
5468 $self->{s_kwd} = '';
5469 }
5470 ## Reconsume.
5471 return ($self->{ct}); # pi
5472 redo A;
5473 } elsif ($self->{nc} == 0x003F) { # ?
5474 $self->{state} = PI_AFTER_STATE;
5475
5476 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5477 $self->{line_prev} = $self->{line};
5478 $self->{column_prev} = $self->{column};
5479 $self->{column}++;
5480 $self->{nc}
5481 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5482 } else {
5483 $self->{set_nc}->($self);
5484 }
5485
5486 redo A;
5487 } else {
5488 ## XML5: typo ("tag name" -> "target")
5489 $self->{ct}->{target} .= chr $self->{nc}; # pi
5490
5491 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5492 $self->{line_prev} = $self->{line};
5493 $self->{column_prev} = $self->{column};
5494 $self->{column}++;
5495 $self->{nc}
5496 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5497 } else {
5498 $self->{set_nc}->($self);
5499 }
5500
5501 redo A;
5502 }
5503 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5504 if ($is_space->{$self->{nc}}) {
5505 ## Stay in the state.
5506
5507 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5508 $self->{line_prev} = $self->{line};
5509 $self->{column_prev} = $self->{column};
5510 $self->{column}++;
5511 $self->{nc}
5512 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5513 } else {
5514 $self->{set_nc}->($self);
5515 }
5516
5517 redo A;
5518 } else {
5519 $self->{state} = PI_DATA_STATE;
5520 ## Reprocess.
5521 redo A;
5522 }
5523 } elsif ($self->{state} == PI_DATA_STATE) {
5524 if ($self->{nc} == 0x003F) { # ?
5525 $self->{state} = PI_DATA_AFTER_STATE;
5526
5527 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5528 $self->{line_prev} = $self->{line};
5529 $self->{column_prev} = $self->{column};
5530 $self->{column}++;
5531 $self->{nc}
5532 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5533 } else {
5534 $self->{set_nc}->($self);
5535 }
5536
5537 redo A;
5538 } elsif ($self->{nc} == -1) {
5539 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5540 if ($self->{in_subset}) {
5541 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5542 } else {
5543 $self->{state} = DATA_STATE;
5544 $self->{s_kwd} = '';
5545 }
5546 ## Reprocess.
5547 return ($self->{ct}); # pi
5548 redo A;
5549 } else {
5550 $self->{ct}->{data} .= chr $self->{nc}; # pi
5551 $self->{read_until}->($self->{ct}->{data}, q[?],
5552 length $self->{ct}->{data});
5553 ## Stay in the state.
5554
5555 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5556 $self->{line_prev} = $self->{line};
5557 $self->{column_prev} = $self->{column};
5558 $self->{column}++;
5559 $self->{nc}
5560 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5561 } else {
5562 $self->{set_nc}->($self);
5563 }
5564
5565 ## Continue with the next input character.
5566 redo A;
5567 }
5568 } elsif ($self->{state} == PI_AFTER_STATE) {
5569 ## XML5: Part of "Pi after state".
5570
5571 if ($self->{nc} == 0x003E) { # >
5572 if ($self->{in_subset}) {
5573 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5574 } else {
5575 $self->{state} = DATA_STATE;
5576 $self->{s_kwd} = '';
5577 }
5578
5579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5580 $self->{line_prev} = $self->{line};
5581 $self->{column_prev} = $self->{column};
5582 $self->{column}++;
5583 $self->{nc}
5584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5585 } else {
5586 $self->{set_nc}->($self);
5587 }
5588
5589 return ($self->{ct}); # pi
5590 redo A;
5591 } elsif ($self->{nc} == 0x003F) { # ?
5592 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5593 line => $self->{line_prev},
5594 column => $self->{column_prev}); ## XML5: no error
5595 $self->{ct}->{data} .= '?';
5596 $self->{state} = PI_DATA_AFTER_STATE;
5597
5598 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5599 $self->{line_prev} = $self->{line};
5600 $self->{column_prev} = $self->{column};
5601 $self->{column}++;
5602 $self->{nc}
5603 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5604 } else {
5605 $self->{set_nc}->($self);
5606 }
5607
5608 redo A;
5609 } else {
5610 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5611 line => $self->{line_prev},
5612 column => $self->{column_prev}
5613 + 1 * ($self->{nc} == -1)); ## XML5: no error
5614 $self->{ct}->{data} .= '?'; ## XML5: not appended
5615 $self->{state} = PI_DATA_STATE;
5616 ## Reprocess.
5617 redo A;
5618 }
5619 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5620 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5621
5622 if ($self->{nc} == 0x003E) { # >
5623 if ($self->{in_subset}) {
5624 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5625 } else {
5626 $self->{state} = DATA_STATE;
5627 $self->{s_kwd} = '';
5628 }
5629
5630 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5631 $self->{line_prev} = $self->{line};
5632 $self->{column_prev} = $self->{column};
5633 $self->{column}++;
5634 $self->{nc}
5635 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5636 } else {
5637 $self->{set_nc}->($self);
5638 }
5639
5640 return ($self->{ct}); # pi
5641 redo A;
5642 } elsif ($self->{nc} == 0x003F) { # ?
5643 $self->{ct}->{data} .= '?';
5644 ## Stay in the state.
5645
5646 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5647 $self->{line_prev} = $self->{line};
5648 $self->{column_prev} = $self->{column};
5649 $self->{column}++;
5650 $self->{nc}
5651 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5652 } else {
5653 $self->{set_nc}->($self);
5654 }
5655
5656 redo A;
5657 } else {
5658 $self->{ct}->{data} .= '?'; ## XML5: not appended
5659 $self->{state} = PI_DATA_STATE;
5660 ## Reprocess.
5661 redo A;
5662 }
5663
5664 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5665 if ($self->{nc} == 0x003C) { # <
5666 $self->{state} = DOCTYPE_TAG_STATE;
5667
5668 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5669 $self->{line_prev} = $self->{line};
5670 $self->{column_prev} = $self->{column};
5671 $self->{column}++;
5672 $self->{nc}
5673 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5674 } else {
5675 $self->{set_nc}->($self);
5676 }
5677
5678 redo A;
5679 } elsif ($self->{nc} == 0x0025) { # %
5680 ## XML5: Not defined yet.
5681
5682 ## TODO:
5683
5684 if (not $self->{stop_processing} and
5685 not $self->{document}->xml_standalone) {
5686 $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5687 level => $self->{level}->{info});
5688 $self->{stop_processing} = 1;
5689 }
5690
5691
5692 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5693 $self->{line_prev} = $self->{line};
5694 $self->{column_prev} = $self->{column};
5695 $self->{column}++;
5696 $self->{nc}
5697 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5698 } else {
5699 $self->{set_nc}->($self);
5700 }
5701
5702 redo A;
5703 } elsif ($self->{nc} == 0x005D) { # ]
5704 delete $self->{in_subset};
5705 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5706
5707 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5708 $self->{line_prev} = $self->{line};
5709 $self->{column_prev} = $self->{column};
5710 $self->{column}++;
5711 $self->{nc}
5712 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5713 } else {
5714 $self->{set_nc}->($self);
5715 }
5716
5717 redo A;
5718 } elsif ($is_space->{$self->{nc}}) {
5719 ## Stay in the state.
5720
5721 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5722 $self->{line_prev} = $self->{line};
5723 $self->{column_prev} = $self->{column};
5724 $self->{column}++;
5725 $self->{nc}
5726 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5727 } else {
5728 $self->{set_nc}->($self);
5729 }
5730
5731 redo A;
5732 } elsif ($self->{nc} == -1) {
5733 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5734 delete $self->{in_subset};
5735 $self->{state} = DATA_STATE;
5736 $self->{s_kwd} = '';
5737 ## Reconsume.
5738 return ({type => END_OF_DOCTYPE_TOKEN});
5739 redo A;
5740 } else {
5741 unless ($self->{internal_subset_tainted}) {
5742 ## XML5: No parse error.
5743 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5744 $self->{internal_subset_tainted} = 1;
5745 }
5746 ## Stay in the state.
5747
5748 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5749 $self->{line_prev} = $self->{line};
5750 $self->{column_prev} = $self->{column};
5751 $self->{column}++;
5752 $self->{nc}
5753 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5754 } else {
5755 $self->{set_nc}->($self);
5756 }
5757
5758 redo A;
5759 }
5760 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5761 if ($self->{nc} == 0x003E) { # >
5762 $self->{state} = DATA_STATE;
5763 $self->{s_kwd} = '';
5764
5765 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5766 $self->{line_prev} = $self->{line};
5767 $self->{column_prev} = $self->{column};
5768 $self->{column}++;
5769 $self->{nc}
5770 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5771 } else {
5772 $self->{set_nc}->($self);
5773 }
5774
5775 return ({type => END_OF_DOCTYPE_TOKEN});
5776 redo A;
5777 } elsif ($self->{nc} == -1) {
5778 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5779 $self->{state} = DATA_STATE;
5780 $self->{s_kwd} = '';
5781 ## Reconsume.
5782 return ({type => END_OF_DOCTYPE_TOKEN});
5783 redo A;
5784 } else {
5785 ## XML5: No parse error and stay in the state.
5786 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5787
5788 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5789
5790 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5791 $self->{line_prev} = $self->{line};
5792 $self->{column_prev} = $self->{column};
5793 $self->{column}++;
5794 $self->{nc}
5795 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5796 } else {
5797 $self->{set_nc}->($self);
5798 }
5799
5800 redo A;
5801 }
5802 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5803 if ($self->{nc} == 0x003E) { # >
5804 $self->{state} = DATA_STATE;
5805 $self->{s_kwd} = '';
5806
5807 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5808 $self->{line_prev} = $self->{line};
5809 $self->{column_prev} = $self->{column};
5810 $self->{column}++;
5811 $self->{nc}
5812 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5813 } else {
5814 $self->{set_nc}->($self);
5815 }
5816
5817 return ({type => END_OF_DOCTYPE_TOKEN});
5818 redo A;
5819 } elsif ($self->{nc} == -1) {
5820 $self->{state} = DATA_STATE;
5821 $self->{s_kwd} = '';
5822 ## Reconsume.
5823 return ({type => END_OF_DOCTYPE_TOKEN});
5824 redo A;
5825 } else {
5826 ## Stay in the state.
5827
5828 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5829 $self->{line_prev} = $self->{line};
5830 $self->{column_prev} = $self->{column};
5831 $self->{column}++;
5832 $self->{nc}
5833 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5834 } else {
5835 $self->{set_nc}->($self);
5836 }
5837
5838 redo A;
5839 }
5840 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5841 if ($self->{nc} == 0x0021) { # !
5842 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5843
5844 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5845 $self->{line_prev} = $self->{line};
5846 $self->{column_prev} = $self->{column};
5847 $self->{column}++;
5848 $self->{nc}
5849 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5850 } else {
5851 $self->{set_nc}->($self);
5852 }
5853
5854 redo A;
5855 } elsif ($self->{nc} == 0x003F) { # ?
5856 $self->{state} = PI_STATE;
5857
5858 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5859 $self->{line_prev} = $self->{line};
5860 $self->{column_prev} = $self->{column};
5861 $self->{column}++;
5862 $self->{nc}
5863 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5864 } else {
5865 $self->{set_nc}->($self);
5866 }
5867
5868 redo A;
5869 } elsif ($self->{nc} == -1) {
5870 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5871 $self->{state} = DATA_STATE;
5872 $self->{s_kwd} = '';
5873 ## Reconsume.
5874 redo A;
5875 } else {
5876 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5877 line => $self->{line_prev},
5878 column => $self->{column_prev});
5879 $self->{state} = BOGUS_COMMENT_STATE;
5880 $self->{ct} = {type => COMMENT_TOKEN,
5881 data => '',
5882 }; ## NOTE: Will be discarded.
5883
5884 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5885 $self->{line_prev} = $self->{line};
5886 $self->{column_prev} = $self->{column};
5887 $self->{column}++;
5888 $self->{nc}
5889 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5890 } else {
5891 $self->{set_nc}->($self);
5892 }
5893
5894 redo A;
5895 }
5896 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5897 ## XML5: "DOCTYPE markup declaration state".
5898
5899 if ($self->{nc} == 0x002D) { # -
5900 $self->{state} = MD_HYPHEN_STATE;
5901
5902 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5903 $self->{line_prev} = $self->{line};
5904 $self->{column_prev} = $self->{column};
5905 $self->{column}++;
5906 $self->{nc}
5907 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5908 } else {
5909 $self->{set_nc}->($self);
5910 }
5911
5912 redo A;
5913 } elsif ($self->{nc} == 0x0045 or # E
5914 $self->{nc} == 0x0065) { # e
5915 $self->{state} = MD_E_STATE;
5916 $self->{kwd} = chr $self->{nc};
5917
5918 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5919 $self->{line_prev} = $self->{line};
5920 $self->{column_prev} = $self->{column};
5921 $self->{column}++;
5922 $self->{nc}
5923 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5924 } else {
5925 $self->{set_nc}->($self);
5926 }
5927
5928 redo A;
5929 } elsif ($self->{nc} == 0x0041 or # A
5930 $self->{nc} == 0x0061) { # a
5931 $self->{state} = MD_ATTLIST_STATE;
5932 $self->{kwd} = chr $self->{nc};
5933
5934 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5935 $self->{line_prev} = $self->{line};
5936 $self->{column_prev} = $self->{column};
5937 $self->{column}++;
5938 $self->{nc}
5939 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5940 } else {
5941 $self->{set_nc}->($self);
5942 }
5943
5944 redo A;
5945 } elsif ($self->{nc} == 0x004E or # N
5946 $self->{nc} == 0x006E) { # n
5947 $self->{state} = MD_NOTATION_STATE;
5948 $self->{kwd} = chr $self->{nc};
5949
5950 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5951 $self->{line_prev} = $self->{line};
5952 $self->{column_prev} = $self->{column};
5953 $self->{column}++;
5954 $self->{nc}
5955 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5956 } else {
5957 $self->{set_nc}->($self);
5958 }
5959
5960 redo A;
5961 } else {
5962 #
5963 }
5964
5965 ## XML5: No parse error.
5966 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5967 line => $self->{line_prev},
5968 column => $self->{column_prev} - 1);
5969 ## Reconsume.
5970 $self->{state} = BOGUS_COMMENT_STATE;
5971 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5972 redo A;
5973 } elsif ($self->{state} == MD_E_STATE) {
5974 if ($self->{nc} == 0x004E or # N
5975 $self->{nc} == 0x006E) { # n
5976 $self->{state} = MD_ENTITY_STATE;
5977 $self->{kwd} .= chr $self->{nc};
5978
5979 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5980 $self->{line_prev} = $self->{line};
5981 $self->{column_prev} = $self->{column};
5982 $self->{column}++;
5983 $self->{nc}
5984 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5985 } else {
5986 $self->{set_nc}->($self);
5987 }
5988
5989 redo A;
5990 } elsif ($self->{nc} == 0x004C or # L
5991 $self->{nc} == 0x006C) { # l
5992 ## XML5: <!ELEMENT> not supported.
5993 $self->{state} = MD_ELEMENT_STATE;
5994 $self->{kwd} .= chr $self->{nc};
5995
5996 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5997 $self->{line_prev} = $self->{line};
5998 $self->{column_prev} = $self->{column};
5999 $self->{column}++;
6000 $self->{nc}
6001 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6002 } else {
6003 $self->{set_nc}->($self);
6004 }
6005
6006 redo A;
6007 } else {
6008 ## XML5: No parse error.
6009 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6010 line => $self->{line_prev},
6011 column => $self->{column_prev} - 2
6012 + 1 * ($self->{nc} == -1));
6013 ## Reconsume.
6014 $self->{state} = BOGUS_COMMENT_STATE;
6015 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6016 redo A;
6017 }
6018 } elsif ($self->{state} == MD_ENTITY_STATE) {
6019 if ($self->{nc} == [
6020 undef,
6021 undef,
6022 0x0054, # T
6023 0x0049, # I
6024 0x0054, # T
6025 ]->[length $self->{kwd}] or
6026 $self->{nc} == [
6027 undef,
6028 undef,
6029 0x0074, # t
6030 0x0069, # i
6031 0x0074, # t
6032 ]->[length $self->{kwd}]) {
6033 ## Stay in the state.
6034 $self->{kwd} .= chr $self->{nc};
6035
6036 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6037 $self->{line_prev} = $self->{line};
6038 $self->{column_prev} = $self->{column};
6039 $self->{column}++;
6040 $self->{nc}
6041 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6042 } else {
6043 $self->{set_nc}->($self);
6044 }
6045
6046 redo A;
6047 } elsif ((length $self->{kwd}) == 5 and
6048 ($self->{nc} == 0x0059 or # Y
6049 $self->{nc} == 0x0079)) { # y
6050 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
6051 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6052 text => 'ENTITY',
6053 line => $self->{line_prev},
6054 column => $self->{column_prev} - 4);
6055 }
6056 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
6057 line => $self->{line_prev},
6058 column => $self->{column_prev} - 6};
6059 $self->{state} = DOCTYPE_MD_STATE;
6060
6061 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6062 $self->{line_prev} = $self->{line};
6063 $self->{column_prev} = $self->{column};
6064 $self->{column}++;
6065 $self->{nc}
6066 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6067 } else {
6068 $self->{set_nc}->($self);
6069 }
6070
6071 redo A;
6072 } else {
6073 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6074 line => $self->{line_prev},
6075 column => $self->{column_prev} - 1
6076 - (length $self->{kwd})
6077 + 1 * ($self->{nc} == -1));
6078 $self->{state} = BOGUS_COMMENT_STATE;
6079 ## Reconsume.
6080 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6081 redo A;
6082 }
6083 } elsif ($self->{state} == MD_ELEMENT_STATE) {
6084 if ($self->{nc} == [
6085 undef,
6086 undef,
6087 0x0045, # E
6088 0x004D, # M
6089 0x0045, # E
6090 0x004E, # N
6091 ]->[length $self->{kwd}] or
6092 $self->{nc} == [
6093 undef,
6094 undef,
6095 0x0065, # e
6096 0x006D, # m
6097 0x0065, # e
6098 0x006E, # n
6099 ]->[length $self->{kwd}]) {
6100 ## Stay in the state.
6101 $self->{kwd} .= chr $self->{nc};
6102
6103 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6104 $self->{line_prev} = $self->{line};
6105 $self->{column_prev} = $self->{column};
6106 $self->{column}++;
6107 $self->{nc}
6108 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6109 } else {
6110 $self->{set_nc}->($self);
6111 }
6112
6113 redo A;
6114 } elsif ((length $self->{kwd}) == 6 and
6115 ($self->{nc} == 0x0054 or # T
6116 $self->{nc} == 0x0074)) { # t
6117 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
6118 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6119 text => 'ELEMENT',
6120 line => $self->{line_prev},
6121 column => $self->{column_prev} - 5);
6122 }
6123 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6124 line => $self->{line_prev},
6125 column => $self->{column_prev} - 7};
6126 $self->{state} = DOCTYPE_MD_STATE;
6127
6128 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6129 $self->{line_prev} = $self->{line};
6130 $self->{column_prev} = $self->{column};
6131 $self->{column}++;
6132 $self->{nc}
6133 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6134 } else {
6135 $self->{set_nc}->($self);
6136 }
6137
6138 redo A;
6139 } else {
6140 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6141 line => $self->{line_prev},
6142 column => $self->{column_prev} - 1
6143 - (length $self->{kwd})
6144 + 1 * ($self->{nc} == -1));
6145 $self->{state} = BOGUS_COMMENT_STATE;
6146 ## Reconsume.
6147 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6148 redo A;
6149 }
6150 } elsif ($self->{state} == MD_ATTLIST_STATE) {
6151 if ($self->{nc} == [
6152 undef,
6153 0x0054, # T
6154 0x0054, # T
6155 0x004C, # L
6156 0x0049, # I
6157 0x0053, # S
6158 ]->[length $self->{kwd}] or
6159 $self->{nc} == [
6160 undef,
6161 0x0074, # t
6162 0x0074, # t
6163 0x006C, # l
6164 0x0069, # i
6165 0x0073, # s
6166 ]->[length $self->{kwd}]) {
6167 ## Stay in the state.
6168 $self->{kwd} .= chr $self->{nc};
6169
6170 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6171 $self->{line_prev} = $self->{line};
6172 $self->{column_prev} = $self->{column};
6173 $self->{column}++;
6174 $self->{nc}
6175 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6176 } else {
6177 $self->{set_nc}->($self);
6178 }
6179
6180 redo A;
6181 } elsif ((length $self->{kwd}) == 6 and
6182 ($self->{nc} == 0x0054 or # T
6183 $self->{nc} == 0x0074)) { # t
6184 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6185 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6186 text => 'ATTLIST',
6187 line => $self->{line_prev},
6188 column => $self->{column_prev} - 5);
6189 }
6190 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6191 attrdefs => [],
6192 line => $self->{line_prev},
6193 column => $self->{column_prev} - 7};
6194 $self->{state} = DOCTYPE_MD_STATE;
6195
6196 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6197 $self->{line_prev} = $self->{line};
6198 $self->{column_prev} = $self->{column};
6199 $self->{column}++;
6200 $self->{nc}
6201 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6202 } else {
6203 $self->{set_nc}->($self);
6204 }
6205
6206 redo A;
6207 } else {
6208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6209 line => $self->{line_prev},
6210 column => $self->{column_prev} - 1
6211 - (length $self->{kwd})
6212 + 1 * ($self->{nc} == -1));
6213 $self->{state} = BOGUS_COMMENT_STATE;
6214 ## Reconsume.
6215 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6216 redo A;
6217 }
6218 } elsif ($self->{state} == MD_NOTATION_STATE) {
6219 if ($self->{nc} == [
6220 undef,
6221 0x004F, # O
6222 0x0054, # T
6223 0x0041, # A
6224 0x0054, # T
6225 0x0049, # I
6226 0x004F, # O
6227 ]->[length $self->{kwd}] or
6228 $self->{nc} == [
6229 undef,
6230 0x006F, # o
6231 0x0074, # t
6232 0x0061, # a
6233 0x0074, # t
6234 0x0069, # i
6235 0x006F, # o
6236 ]->[length $self->{kwd}]) {
6237 ## Stay in the state.
6238 $self->{kwd} .= chr $self->{nc};
6239
6240 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6241 $self->{line_prev} = $self->{line};
6242 $self->{column_prev} = $self->{column};
6243 $self->{column}++;
6244 $self->{nc}
6245 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6246 } else {
6247 $self->{set_nc}->($self);
6248 }
6249
6250 redo A;
6251 } elsif ((length $self->{kwd}) == 7 and
6252 ($self->{nc} == 0x004E or # N
6253 $self->{nc} == 0x006E)) { # n
6254 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6255 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6256 text => 'NOTATION',
6257 line => $self->{line_prev},
6258 column => $self->{column_prev} - 6);
6259 }
6260 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6261 line => $self->{line_prev},
6262 column => $self->{column_prev} - 8};
6263 $self->{state} = DOCTYPE_MD_STATE;
6264
6265 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6266 $self->{line_prev} = $self->{line};
6267 $self->{column_prev} = $self->{column};
6268 $self->{column}++;
6269 $self->{nc}
6270 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6271 } else {
6272 $self->{set_nc}->($self);
6273 }
6274
6275 redo A;
6276 } else {
6277 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6278 line => $self->{line_prev},
6279 column => $self->{column_prev} - 1
6280 - (length $self->{kwd})
6281 + 1 * ($self->{nc} == -1));
6282 $self->{state} = BOGUS_COMMENT_STATE;
6283 ## Reconsume.
6284 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6285 redo A;
6286 }
6287 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6288 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6289 ## "DOCTYPE NOTATION state".
6290
6291 if ($is_space->{$self->{nc}}) {
6292 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6293 $self->{state} = BEFORE_MD_NAME_STATE;
6294
6295 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6296 $self->{line_prev} = $self->{line};
6297 $self->{column_prev} = $self->{column};
6298 $self->{column}++;
6299 $self->{nc}
6300 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6301 } else {
6302 $self->{set_nc}->($self);
6303 }
6304
6305 redo A;
6306 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6307 $self->{nc} == 0x0025) { # %
6308 ## XML5: Switch to the "DOCTYPE bogus comment state".
6309 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6310 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6311
6312 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6313 $self->{line_prev} = $self->{line};
6314 $self->{column_prev} = $self->{column};
6315 $self->{column}++;
6316 $self->{nc}
6317 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6318 } else {
6319 $self->{set_nc}->($self);
6320 }
6321
6322 redo A;
6323 } elsif ($self->{nc} == -1) {
6324 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6325 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6326 ## Reconsume.
6327 redo A;
6328 } elsif ($self->{nc} == 0x003E) { # >
6329 ## XML5: Switch to the "DOCTYPE bogus comment state".
6330 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6331 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6332
6333 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6334 $self->{line_prev} = $self->{line};
6335 $self->{column_prev} = $self->{column};
6336 $self->{column}++;
6337 $self->{nc}
6338 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6339 } else {
6340 $self->{set_nc}->($self);
6341 }
6342
6343 redo A;
6344 } else {
6345 ## XML5: Switch to the "DOCTYPE bogus comment state".
6346 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6347 $self->{state} = BEFORE_MD_NAME_STATE;
6348 redo A;
6349 }
6350 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6351 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6352 ## before state", "DOCTYPE ATTLIST name before state".
6353
6354 if ($is_space->{$self->{nc}}) {
6355 ## Stay in the state.
6356
6357 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6358 $self->{line_prev} = $self->{line};
6359 $self->{column_prev} = $self->{column};
6360 $self->{column}++;
6361 $self->{nc}
6362 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6363 } else {
6364 $self->{set_nc}->($self);
6365 }
6366
6367 redo A;
6368 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6369 $self->{nc} == 0x0025) { # %
6370 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6371
6372 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6373 $self->{line_prev} = $self->{line};
6374 $self->{column_prev} = $self->{column};
6375 $self->{column}++;
6376 $self->{nc}
6377 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6378 } else {
6379 $self->{set_nc}->($self);
6380 }
6381
6382 redo A;
6383 } elsif ($self->{nc} == 0x003E) { # >
6384 ## XML5: Same as "Anything else".
6385 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6386 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6387
6388 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6389 $self->{line_prev} = $self->{line};
6390 $self->{column_prev} = $self->{column};
6391 $self->{column}++;
6392 $self->{nc}
6393 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6394 } else {
6395 $self->{set_nc}->($self);
6396 }
6397
6398 redo A;
6399 } elsif ($self->{nc} == -1) {
6400 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6401 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6402 ## Reconsume.
6403 redo A;
6404 } else {
6405 ## XML5: [ATTLIST] Not defined yet.
6406 $self->{ct}->{name} .= chr $self->{nc};
6407 $self->{state} = MD_NAME_STATE;
6408
6409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6410 $self->{line_prev} = $self->{line};
6411 $self->{column_prev} = $self->{column};
6412 $self->{column}++;
6413 $self->{nc}
6414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6415 } else {
6416 $self->{set_nc}->($self);
6417 }
6418
6419 redo A;
6420 }
6421 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6422 if ($is_space->{$self->{nc}}) {
6423 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6424 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6425 $self->{state} = BEFORE_MD_NAME_STATE;
6426
6427 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6428 $self->{line_prev} = $self->{line};
6429 $self->{column_prev} = $self->{column};
6430 $self->{column}++;
6431 $self->{nc}
6432 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6433 } else {
6434 $self->{set_nc}->($self);
6435 }
6436
6437 redo A;
6438 } elsif ($self->{nc} == 0x003E) { # >
6439 ## XML5: Same as "Anything else".
6440 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6441 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6442
6443 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6444 $self->{line_prev} = $self->{line};
6445 $self->{column_prev} = $self->{column};
6446 $self->{column}++;
6447 $self->{nc}
6448 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6449 } else {
6450 $self->{set_nc}->($self);
6451 }
6452
6453 redo A;
6454 } elsif ($self->{nc} == -1) {
6455 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6456 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6457 ## Reconsume.
6458 redo A;
6459 } else {
6460 ## XML5: No parse error.
6461 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6462 $self->{state} = BOGUS_COMMENT_STATE;
6463 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6464 ## Reconsume.
6465 redo A;
6466 }
6467 } elsif ($self->{state} == MD_NAME_STATE) {
6468 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6469
6470 if ($is_space->{$self->{nc}}) {
6471 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6472 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6473 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6474 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6475 } else { # ENTITY/NOTATION
6476 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6477 }
6478
6479 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6480 $self->{line_prev} = $self->{line};
6481 $self->{column_prev} = $self->{column};
6482 $self->{column}++;
6483 $self->{nc}
6484 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6485 } else {
6486 $self->{set_nc}->($self);
6487 }
6488
6489 redo A;
6490 } elsif ($self->{nc} == 0x003E) { # >
6491 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6492 #
6493 } else {
6494 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6495 }
6496 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6497
6498 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6499 $self->{line_prev} = $self->{line};
6500 $self->{column_prev} = $self->{column};
6501 $self->{column}++;
6502 $self->{nc}
6503 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6504 } else {
6505 $self->{set_nc}->($self);
6506 }
6507
6508 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6509 redo A;
6510 } elsif ($self->{nc} == -1) {
6511 ## XML5: [ATTLIST] No parse error.
6512 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6513 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6514 ## Reconsume.
6515 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6516 redo A;
6517 } else {
6518 ## XML5: [ATTLIST] Not defined yet.
6519 $self->{ct}->{name} .= chr $self->{nc};
6520 ## Stay in the state.
6521
6522 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6523 $self->{line_prev} = $self->{line};
6524 $self->{column_prev} = $self->{column};
6525 $self->{column}++;
6526 $self->{nc}
6527 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6528 } else {
6529 $self->{set_nc}->($self);
6530 }
6531
6532 redo A;
6533 }
6534 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6535 if ($is_space->{$self->{nc}}) {
6536 ## Stay in the state.
6537
6538 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6539 $self->{line_prev} = $self->{line};
6540 $self->{column_prev} = $self->{column};
6541 $self->{column}++;
6542 $self->{nc}
6543 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6544 } else {
6545 $self->{set_nc}->($self);
6546 }
6547
6548 redo A;
6549 } elsif ($self->{nc} == 0x003E) { # >
6550 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6551
6552 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6553 $self->{line_prev} = $self->{line};
6554 $self->{column_prev} = $self->{column};
6555 $self->{column}++;
6556 $self->{nc}
6557 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6558 } else {
6559 $self->{set_nc}->($self);
6560 }
6561
6562 return ($self->{ct}); # ATTLIST
6563 redo A;
6564 } elsif ($self->{nc} == -1) {
6565 ## XML5: No parse error.
6566 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6567 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6568 return ($self->{ct});
6569 redo A;
6570 } else {
6571 ## XML5: Not defined yet.
6572 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6573 tokens => [],
6574 line => $self->{line}, column => $self->{column}};
6575 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6576
6577 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6578 $self->{line_prev} = $self->{line};
6579 $self->{column_prev} = $self->{column};
6580 $self->{column}++;
6581 $self->{nc}
6582 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6583 } else {
6584 $self->{set_nc}->($self);
6585 }
6586
6587 redo A;
6588 }
6589 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6590 if ($is_space->{$self->{nc}}) {
6591 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6592
6593 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6594 $self->{line_prev} = $self->{line};
6595 $self->{column_prev} = $self->{column};
6596 $self->{column}++;
6597 $self->{nc}
6598 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6599 } else {
6600 $self->{set_nc}->($self);
6601 }
6602
6603 redo A;
6604 } elsif ($self->{nc} == 0x003E) { # >
6605 ## XML5: Same as "anything else".
6606 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6607 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6608
6609 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6610 $self->{line_prev} = $self->{line};
6611 $self->{column_prev} = $self->{column};
6612 $self->{column}++;
6613 $self->{nc}
6614 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6615 } else {
6616 $self->{set_nc}->($self);
6617 }
6618
6619 return ($self->{ct}); # ATTLIST
6620 redo A;
6621 } elsif ($self->{nc} == 0x0028) { # (
6622 ## XML5: Same as "anything else".
6623 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6624 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6625
6626 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6627 $self->{line_prev} = $self->{line};
6628 $self->{column_prev} = $self->{column};
6629 $self->{column}++;
6630 $self->{nc}
6631 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6632 } else {
6633 $self->{set_nc}->($self);
6634 }
6635
6636 redo A;
6637 } elsif ($self->{nc} == -1) {
6638 ## XML5: No parse error.
6639 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6640 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6641
6642 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6643 $self->{line_prev} = $self->{line};
6644 $self->{column_prev} = $self->{column};
6645 $self->{column}++;
6646 $self->{nc}
6647 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6648 } else {
6649 $self->{set_nc}->($self);
6650 }
6651
6652 return ($self->{ct}); # ATTLIST
6653 redo A;
6654 } else {
6655 ## XML5: Not defined yet.
6656 $self->{ca}->{name} .= chr $self->{nc};
6657 ## Stay in the state.
6658
6659 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6660 $self->{line_prev} = $self->{line};
6661 $self->{column_prev} = $self->{column};
6662 $self->{column}++;
6663 $self->{nc}
6664 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6665 } else {
6666 $self->{set_nc}->($self);
6667 }
6668
6669 redo A;
6670 }
6671 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6672 if ($is_space->{$self->{nc}}) {
6673 ## Stay in the state.
6674
6675 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6676 $self->{line_prev} = $self->{line};
6677 $self->{column_prev} = $self->{column};
6678 $self->{column}++;
6679 $self->{nc}
6680 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6681 } else {
6682 $self->{set_nc}->($self);
6683 }
6684
6685 redo A;
6686 } elsif ($self->{nc} == 0x003E) { # >
6687 ## XML5: Same as "anything else".
6688 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6689 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6690
6691 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6692 $self->{line_prev} = $self->{line};
6693 $self->{column_prev} = $self->{column};
6694 $self->{column}++;
6695 $self->{nc}
6696 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6697 } else {
6698 $self->{set_nc}->($self);
6699 }
6700
6701 return ($self->{ct}); # ATTLIST
6702 redo A;
6703 } elsif ($self->{nc} == 0x0028) { # (
6704 ## XML5: Same as "anything else".
6705 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6706
6707 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6708 $self->{line_prev} = $self->{line};
6709 $self->{column_prev} = $self->{column};
6710 $self->{column}++;
6711 $self->{nc}
6712 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6713 } else {
6714 $self->{set_nc}->($self);
6715 }
6716
6717 redo A;
6718 } elsif ($self->{nc} == -1) {
6719 ## XML5: No parse error.
6720 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6721 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6722
6723 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6724 $self->{line_prev} = $self->{line};
6725 $self->{column_prev} = $self->{column};
6726 $self->{column}++;
6727 $self->{nc}
6728 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6729 } else {
6730 $self->{set_nc}->($self);
6731 }
6732
6733 return ($self->{ct});
6734 redo A;
6735 } else {
6736 ## XML5: Not defined yet.
6737 $self->{ca}->{type} = chr $self->{nc};
6738 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6739
6740 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6741 $self->{line_prev} = $self->{line};
6742 $self->{column_prev} = $self->{column};
6743 $self->{column}++;
6744 $self->{nc}
6745 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6746 } else {
6747 $self->{set_nc}->($self);
6748 }
6749
6750 redo A;
6751 }
6752 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6753 if ($is_space->{$self->{nc}}) {
6754 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6755
6756 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6757 $self->{line_prev} = $self->{line};
6758 $self->{column_prev} = $self->{column};
6759 $self->{column}++;
6760 $self->{nc}
6761 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6762 } else {
6763 $self->{set_nc}->($self);
6764 }
6765
6766 redo A;
6767 } elsif ($self->{nc} == 0x0023) { # #
6768 ## XML5: Same as "anything else".
6769 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6770 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6771
6772 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6773 $self->{line_prev} = $self->{line};
6774 $self->{column_prev} = $self->{column};
6775 $self->{column}++;
6776 $self->{nc}
6777 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6778 } else {
6779 $self->{set_nc}->($self);
6780 }
6781
6782 redo A;
6783 } elsif ($self->{nc} == 0x0022) { # "
6784 ## XML5: Same as "anything else".
6785 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6786 $self->{ca}->{value} = '';
6787 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6788
6789 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6790 $self->{line_prev} = $self->{line};
6791 $self->{column_prev} = $self->{column};
6792 $self->{column}++;
6793 $self->{nc}
6794 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6795 } else {
6796 $self->{set_nc}->($self);
6797 }
6798
6799 redo A;
6800 } elsif ($self->{nc} == 0x0027) { # '
6801 ## XML5: Same as "anything else".
6802 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6803 $self->{ca}->{value} = '';
6804 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6805
6806 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6807 $self->{line_prev} = $self->{line};
6808 $self->{column_prev} = $self->{column};
6809 $self->{column}++;
6810 $self->{nc}
6811 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6812 } else {
6813 $self->{set_nc}->($self);
6814 }
6815
6816 redo A;
6817 } elsif ($self->{nc} == 0x003E) { # >
6818 ## XML5: Same as "anything else".
6819 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6820 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6821
6822 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6823 $self->{line_prev} = $self->{line};
6824 $self->{column_prev} = $self->{column};
6825 $self->{column}++;
6826 $self->{nc}
6827 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6828 } else {
6829 $self->{set_nc}->($self);
6830 }
6831
6832 return ($self->{ct}); # ATTLIST
6833 redo A;
6834 } elsif ($self->{nc} == 0x0028) { # (
6835 ## XML5: Same as "anything else".
6836 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6837 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6838
6839 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6840 $self->{line_prev} = $self->{line};
6841 $self->{column_prev} = $self->{column};
6842 $self->{column}++;
6843 $self->{nc}
6844 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6845 } else {
6846 $self->{set_nc}->($self);
6847 }
6848
6849 redo A;
6850 } elsif ($self->{nc} == -1) {
6851 ## XML5: No parse error.
6852 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6853 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6854
6855 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6856 $self->{line_prev} = $self->{line};
6857 $self->{column_prev} = $self->{column};
6858 $self->{column}++;
6859 $self->{nc}
6860 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6861 } else {
6862 $self->{set_nc}->($self);
6863 }
6864
6865 return ($self->{ct});
6866 redo A;
6867 } else {
6868 ## XML5: Not defined yet.
6869 $self->{ca}->{type} .= chr $self->{nc};
6870 ## Stay in the state.
6871
6872 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6873 $self->{line_prev} = $self->{line};
6874 $self->{column_prev} = $self->{column};
6875 $self->{column}++;
6876 $self->{nc}
6877 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6878 } else {
6879 $self->{set_nc}->($self);
6880 }
6881
6882 redo A;
6883 }
6884 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6885 if ($is_space->{$self->{nc}}) {
6886 ## Stay in the state.
6887
6888 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6889 $self->{line_prev} = $self->{line};
6890 $self->{column_prev} = $self->{column};
6891 $self->{column}++;
6892 $self->{nc}
6893 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6894 } else {
6895 $self->{set_nc}->($self);
6896 }
6897
6898 redo A;
6899 } elsif ($self->{nc} == 0x0028) { # (
6900 ## XML5: Same as "anything else".
6901 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6902
6903 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6904 $self->{line_prev} = $self->{line};
6905 $self->{column_prev} = $self->{column};
6906 $self->{column}++;
6907 $self->{nc}
6908 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6909 } else {
6910 $self->{set_nc}->($self);
6911 }
6912
6913 redo A;
6914 } elsif ($self->{nc} == 0x0023) { # #
6915 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6916
6917 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6918 $self->{line_prev} = $self->{line};
6919 $self->{column_prev} = $self->{column};
6920 $self->{column}++;
6921 $self->{nc}
6922 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6923 } else {
6924 $self->{set_nc}->($self);
6925 }
6926
6927 redo A;
6928 } elsif ($self->{nc} == 0x0022) { # "
6929 ## XML5: Same as "anything else".
6930 $self->{ca}->{value} = '';
6931 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6932
6933 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6934 $self->{line_prev} = $self->{line};
6935 $self->{column_prev} = $self->{column};
6936 $self->{column}++;
6937 $self->{nc}
6938 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6939 } else {
6940 $self->{set_nc}->($self);
6941 }
6942
6943 redo A;
6944 } elsif ($self->{nc} == 0x0027) { # '
6945 ## XML5: Same as "anything else".
6946 $self->{ca}->{value} = '';
6947 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6948
6949 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6950 $self->{line_prev} = $self->{line};
6951 $self->{column_prev} = $self->{column};
6952 $self->{column}++;
6953 $self->{nc}
6954 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6955 } else {
6956 $self->{set_nc}->($self);
6957 }
6958
6959 redo A;
6960 } elsif ($self->{nc} == 0x003E) { # >
6961 ## XML5: Same as "anything else".
6962 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6963 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6964
6965 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6966 $self->{line_prev} = $self->{line};
6967 $self->{column_prev} = $self->{column};
6968 $self->{column}++;
6969 $self->{nc}
6970 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6971 } else {
6972 $self->{set_nc}->($self);
6973 }
6974
6975 return ($self->{ct}); # ATTLIST
6976 redo A;
6977 } elsif ($self->{nc} == -1) {
6978 ## XML5: No parse error.
6979 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6980 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6981
6982 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6983 $self->{line_prev} = $self->{line};
6984 $self->{column_prev} = $self->{column};
6985 $self->{column}++;
6986 $self->{nc}
6987 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6988 } else {
6989 $self->{set_nc}->($self);
6990 }
6991
6992 return ($self->{ct});
6993 redo A;
6994 } else {
6995 ## XML5: Switch to the "DOCTYPE bogus comment state".
6996 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6997 $self->{ca}->{value} = '';
6998 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6999 ## Reconsume.
7000 redo A;
7001 }
7002 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
7003 if ($is_space->{$self->{nc}}) {
7004 ## Stay in the state.
7005
7006 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7007 $self->{line_prev} = $self->{line};
7008 $self->{column_prev} = $self->{column};
7009 $self->{column}++;
7010 $self->{nc}
7011 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7012 } else {
7013 $self->{set_nc}->($self);
7014 }
7015
7016 redo A;
7017 } elsif ($self->{nc} == 0x007C) { # |
7018 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7019 ## Stay in the state.
7020
7021 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7022 $self->{line_prev} = $self->{line};
7023 $self->{column_prev} = $self->{column};
7024 $self->{column}++;
7025 $self->{nc}
7026 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7027 } else {
7028 $self->{set_nc}->($self);
7029 }
7030
7031 redo A;
7032 } elsif ($self->{nc} == 0x0029) { # )
7033 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7034 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7035
7036 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7037 $self->{line_prev} = $self->{line};
7038 $self->{column_prev} = $self->{column};
7039 $self->{column}++;
7040 $self->{nc}
7041 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7042 } else {
7043 $self->{set_nc}->($self);
7044 }
7045
7046 redo A;
7047 } elsif ($self->{nc} == 0x003E) { # >
7048 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7049 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7050
7051 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7052 $self->{line_prev} = $self->{line};
7053 $self->{column_prev} = $self->{column};
7054 $self->{column}++;
7055 $self->{nc}
7056 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7057 } else {
7058 $self->{set_nc}->($self);
7059 }
7060
7061 return ($self->{ct}); # ATTLIST
7062 redo A;
7063 } elsif ($self->{nc} == -1) {
7064 ## XML5: No parse error.
7065 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7066 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7067
7068 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069 $self->{line_prev} = $self->{line};
7070 $self->{column_prev} = $self->{column};
7071 $self->{column}++;
7072 $self->{nc}
7073 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074 } else {
7075 $self->{set_nc}->($self);
7076 }
7077
7078 return ($self->{ct});
7079 redo A;
7080 } else {
7081 push @{$self->{ca}->{tokens}}, chr $self->{nc};
7082 $self->{state} = ALLOWED_TOKEN_STATE;
7083
7084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7085 $self->{line_prev} = $self->{line};
7086 $self->{column_prev} = $self->{column};
7087 $self->{column}++;
7088 $self->{nc}
7089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7090 } else {
7091 $self->{set_nc}->($self);
7092 }
7093
7094 redo A;
7095 }
7096 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
7097 if ($is_space->{$self->{nc}}) {
7098 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7099
7100 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7101 $self->{line_prev} = $self->{line};
7102 $self->{column_prev} = $self->{column};
7103 $self->{column}++;
7104 $self->{nc}
7105 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7106 } else {
7107 $self->{set_nc}->($self);
7108 }
7109
7110 redo A;
7111 } elsif ($self->{nc} == 0x007C) { # |
7112 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7113
7114 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7115 $self->{line_prev} = $self->{line};
7116 $self->{column_prev} = $self->{column};
7117 $self->{column}++;
7118 $self->{nc}
7119 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7120 } else {
7121 $self->{set_nc}->($self);
7122 }
7123
7124 redo A;
7125 } elsif ($self->{nc} == 0x0029) { # )
7126 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7127
7128 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7129 $self->{line_prev} = $self->{line};
7130 $self->{column_prev} = $self->{column};
7131 $self->{column}++;
7132 $self->{nc}
7133 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7134 } else {
7135 $self->{set_nc}->($self);
7136 }
7137
7138 redo A;
7139 } elsif ($self->{nc} == 0x003E) { # >
7140 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7141 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7142
7143 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7144 $self->{line_prev} = $self->{line};
7145 $self->{column_prev} = $self->{column};
7146 $self->{column}++;
7147 $self->{nc}
7148 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7149 } else {
7150 $self->{set_nc}->($self);
7151 }
7152
7153 return ($self->{ct}); # ATTLIST
7154 redo A;
7155 } elsif ($self->{nc} == -1) {
7156 ## XML5: No parse error.
7157 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7158 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7159
7160 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7161 $self->{line_prev} = $self->{line};
7162 $self->{column_prev} = $self->{column};
7163 $self->{column}++;
7164 $self->{nc}
7165 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7166 } else {
7167 $self->{set_nc}->($self);
7168 }
7169
7170 return ($self->{ct});
7171 redo A;
7172 } else {
7173 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7174 ## Stay in the state.
7175
7176 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7177 $self->{line_prev} = $self->{line};
7178 $self->{column_prev} = $self->{column};
7179 $self->{column}++;
7180 $self->{nc}
7181 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7182 } else {
7183 $self->{set_nc}->($self);
7184 }
7185
7186 redo A;
7187 }
7188 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7189 if ($is_space->{$self->{nc}}) {
7190 ## Stay in the state.
7191
7192 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7193 $self->{line_prev} = $self->{line};
7194 $self->{column_prev} = $self->{column};
7195 $self->{column}++;
7196 $self->{nc}
7197 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7198 } else {
7199 $self->{set_nc}->($self);
7200 }
7201
7202 redo A;
7203 } elsif ($self->{nc} == 0x007C) { # |
7204 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7205
7206 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7207 $self->{line_prev} = $self->{line};
7208 $self->{column_prev} = $self->{column};
7209 $self->{column}++;
7210 $self->{nc}
7211 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7212 } else {
7213 $self->{set_nc}->($self);
7214 }
7215
7216 redo A;
7217 } elsif ($self->{nc} == 0x0029) { # )
7218 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7219
7220 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7221 $self->{line_prev} = $self->{line};
7222 $self->{column_prev} = $self->{column};
7223 $self->{column}++;
7224 $self->{nc}
7225 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7226 } else {
7227 $self->{set_nc}->($self);
7228 }
7229
7230 redo A;
7231 } elsif ($self->{nc} == 0x003E) { # >
7232 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7233 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7234
7235 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7236 $self->{line_prev} = $self->{line};
7237 $self->{column_prev} = $self->{column};
7238 $self->{column}++;
7239 $self->{nc}
7240 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7241 } else {
7242 $self->{set_nc}->($self);
7243 }
7244
7245 return ($self->{ct}); # ATTLIST
7246 redo A;
7247 } elsif ($self->{nc} == -1) {
7248 ## XML5: No parse error.
7249 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7250 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7251
7252 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7253 $self->{line_prev} = $self->{line};
7254 $self->{column_prev} = $self->{column};
7255 $self->{column}++;
7256 $self->{nc}
7257 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7258 } else {
7259 $self->{set_nc}->($self);
7260 }
7261
7262 return ($self->{ct});
7263 redo A;
7264 } else {
7265 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7266 line => $self->{line_prev},
7267 column => $self->{column_prev});
7268 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7269 $self->{state} = ALLOWED_TOKEN_STATE;
7270
7271 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7272 $self->{line_prev} = $self->{line};
7273 $self->{column_prev} = $self->{column};
7274 $self->{column}++;
7275 $self->{nc}
7276 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7277 } else {
7278 $self->{set_nc}->($self);
7279 }
7280
7281 redo A;
7282 }
7283 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7284 if ($is_space->{$self->{nc}}) {
7285 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7286
7287 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7288 $self->{line_prev} = $self->{line};
7289 $self->{column_prev} = $self->{column};
7290 $self->{column}++;
7291 $self->{nc}
7292 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7293 } else {
7294 $self->{set_nc}->($self);
7295 }
7296
7297 redo A;
7298 } elsif ($self->{nc} == 0x0023) { # #
7299 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7300 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7301
7302 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7303 $self->{line_prev} = $self->{line};
7304 $self->{column_prev} = $self->{column};
7305 $self->{column}++;
7306 $self->{nc}
7307 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7308 } else {
7309 $self->{set_nc}->($self);
7310 }
7311
7312 redo A;
7313 } elsif ($self->{nc} == 0x0022) { # "
7314 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7315 $self->{ca}->{value} = '';
7316 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7317
7318 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7319 $self->{line_prev} = $self->{line};
7320 $self->{column_prev} = $self->{column};
7321 $self->{column}++;
7322 $self->{nc}
7323 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7324 } else {
7325 $self->{set_nc}->($self);
7326 }
7327
7328 redo A;
7329 } elsif ($self->{nc} == 0x0027) { # '
7330 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7331 $self->{ca}->{value} = '';
7332 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7333
7334 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7335 $self->{line_prev} = $self->{line};
7336 $self->{column_prev} = $self->{column};
7337 $self->{column}++;
7338 $self->{nc}
7339 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7340 } else {
7341 $self->{set_nc}->($self);
7342 }
7343
7344 redo A;
7345 } elsif ($self->{nc} == 0x003E) { # >
7346 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7347 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7348
7349 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7350 $self->{line_prev} = $self->{line};
7351 $self->{column_prev} = $self->{column};
7352 $self->{column}++;
7353 $self->{nc}
7354 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7355 } else {
7356 $self->{set_nc}->($self);
7357 }
7358
7359 return ($self->{ct}); # ATTLIST
7360 redo A;
7361 } elsif ($self->{nc} == -1) {
7362 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7363 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7364
7365 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7366 $self->{line_prev} = $self->{line};
7367 $self->{column_prev} = $self->{column};
7368 $self->{column}++;
7369 $self->{nc}
7370 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7371 } else {
7372 $self->{set_nc}->($self);
7373 }
7374
7375 return ($self->{ct});
7376 redo A;
7377 } else {
7378 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7379 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7380 ## Reconsume.
7381 redo A;
7382 }
7383 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7384 if ($is_space->{$self->{nc}}) {
7385 ## Stay in the state.
7386
7387 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7388 $self->{line_prev} = $self->{line};
7389 $self->{column_prev} = $self->{column};
7390 $self->{column}++;
7391 $self->{nc}
7392 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7393 } else {
7394 $self->{set_nc}->($self);
7395 }
7396
7397 redo A;
7398 } elsif ($self->{nc} == 0x0023) { # #
7399 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7400
7401 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7402 $self->{line_prev} = $self->{line};
7403 $self->{column_prev} = $self->{column};
7404 $self->{column}++;
7405 $self->{nc}
7406 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7407 } else {
7408 $self->{set_nc}->($self);
7409 }
7410
7411 redo A;
7412 } elsif ($self->{nc} == 0x0022) { # "
7413 $self->{ca}->{value} = '';
7414 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7415
7416 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7417 $self->{line_prev} = $self->{line};
7418 $self->{column_prev} = $self->{column};
7419 $self->{column}++;
7420 $self->{nc}
7421 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7422 } else {
7423 $self->{set_nc}->($self);
7424 }
7425
7426 redo A;
7427 } elsif ($self->{nc} == 0x0027) { # '
7428 $self->{ca}->{value} = '';
7429 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7430
7431 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7432 $self->{line_prev} = $self->{line};
7433 $self->{column_prev} = $self->{column};
7434 $self->{column}++;
7435 $self->{nc}
7436 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7437 } else {
7438 $self->{set_nc}->($self);
7439 }
7440
7441 redo A;
7442 } elsif ($self->{nc} == 0x003E) { # >
7443 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7444 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7445
7446 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7447 $self->{line_prev} = $self->{line};
7448 $self->{column_prev} = $self->{column};
7449 $self->{column}++;
7450 $self->{nc}
7451 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7452 } else {
7453 $self->{set_nc}->($self);
7454 }
7455
7456 return ($self->{ct}); # ATTLIST
7457 redo A;
7458 } elsif ($self->{nc} == -1) {
7459 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7460 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7461
7462 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7463 $self->{line_prev} = $self->{line};
7464 $self->{column_prev} = $self->{column};
7465 $self->{column}++;
7466 $self->{nc}
7467 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7468 } else {
7469 $self->{set_nc}->($self);
7470 }
7471
7472 return ($self->{ct});
7473 redo A;
7474 } else {
7475 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7476 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7477 ## Reconsume.
7478 redo A;
7479 }
7480 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7481 if ($is_space->{$self->{nc}}) {
7482 ## XML5: No parse error.
7483 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7484 $self->{state} = BOGUS_MD_STATE;
7485 ## Reconsume.
7486 redo A;
7487 } elsif ($self->{nc} == 0x0022) { # "
7488 ## XML5: Same as "anything else".
7489 $self->{ca}->{value} = '';
7490 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7491
7492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7493 $self->{line_prev} = $self->{line};
7494 $self->{column_prev} = $self->{column};
7495 $self->{column}++;
7496 $self->{nc}
7497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7498 } else {
7499 $self->{set_nc}->($self);
7500 }
7501
7502 redo A;
7503 } elsif ($self->{nc} == 0x0027) { # '
7504 ## XML5: Same as "anything else".
7505 $self->{ca}->{value} = '';
7506 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7507
7508 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7509 $self->{line_prev} = $self->{line};
7510 $self->{column_prev} = $self->{column};
7511 $self->{column}++;
7512 $self->{nc}
7513 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7514 } else {
7515 $self->{set_nc}->($self);
7516 }
7517
7518 redo A;
7519 } elsif ($self->{nc} == 0x003E) { # >
7520 ## XML5: Same as "anything else".
7521 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7522 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7523
7524 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7525 $self->{line_prev} = $self->{line};
7526 $self->{column_prev} = $self->{column};
7527 $self->{column}++;
7528 $self->{nc}
7529 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7530 } else {
7531 $self->{set_nc}->($self);
7532 }
7533
7534 return ($self->{ct}); # ATTLIST
7535 redo A;
7536 } elsif ($self->{nc} == -1) {
7537 ## XML5: No parse error.
7538 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7539 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7540
7541 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7542 $self->{line_prev} = $self->{line};
7543 $self->{column_prev} = $self->{column};
7544 $self->{column}++;
7545 $self->{nc}
7546 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7547 } else {
7548 $self->{set_nc}->($self);
7549 }
7550
7551 return ($self->{ct});
7552 redo A;
7553 } else {
7554 $self->{ca}->{default} = chr $self->{nc};
7555 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7556
7557 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7558 $self->{line_prev} = $self->{line};
7559 $self->{column_prev} = $self->{column};
7560 $self->{column}++;
7561 $self->{nc}
7562 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7563 } else {
7564 $self->{set_nc}->($self);
7565 }
7566
7567 redo A;
7568 }
7569 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7570 if ($is_space->{$self->{nc}}) {
7571 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7572
7573 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7574 $self->{line_prev} = $self->{line};
7575 $self->{column_prev} = $self->{column};
7576 $self->{column}++;
7577 $self->{nc}
7578 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7579 } else {
7580 $self->{set_nc}->($self);
7581 }
7582
7583 redo A;
7584 } elsif ($self->{nc} == 0x0022) { # "
7585 ## XML5: Same as "anything else".
7586 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7587 $self->{ca}->{value} = '';
7588 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7589
7590 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7591 $self->{line_prev} = $self->{line};
7592 $self->{column_prev} = $self->{column};
7593 $self->{column}++;
7594 $self->{nc}
7595 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7596 } else {
7597 $self->{set_nc}->($self);
7598 }
7599
7600 redo A;
7601 } elsif ($self->{nc} == 0x0027) { # '
7602 ## XML5: Same as "anything else".
7603 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7604 $self->{ca}->{value} = '';
7605 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7606
7607 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7608 $self->{line_prev} = $self->{line};
7609 $self->{column_prev} = $self->{column};
7610 $self->{column}++;
7611 $self->{nc}
7612 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7613 } else {
7614 $self->{set_nc}->($self);
7615 }
7616
7617 redo A;
7618 } elsif ($self->{nc} == 0x003E) { # >
7619 ## XML5: Same as "anything else".
7620 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7621 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7622
7623 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7624 $self->{line_prev} = $self->{line};
7625 $self->{column_prev} = $self->{column};
7626 $self->{column}++;
7627 $self->{nc}
7628 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7629 } else {
7630 $self->{set_nc}->($self);
7631 }
7632
7633 return ($self->{ct}); # ATTLIST
7634 redo A;
7635 } elsif ($self->{nc} == -1) {
7636 ## XML5: No parse error.
7637 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7638 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7639 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7640
7641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7642 $self->{line_prev} = $self->{line};
7643 $self->{column_prev} = $self->{column};
7644 $self->{column}++;
7645 $self->{nc}
7646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7647 } else {
7648 $self->{set_nc}->($self);
7649 }
7650
7651 return ($self->{ct});
7652 redo A;
7653 } else {
7654 $self->{ca}->{default} .= chr $self->{nc};
7655 ## Stay in the state.
7656
7657 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7658 $self->{line_prev} = $self->{line};
7659 $self->{column_prev} = $self->{column};
7660 $self->{column}++;
7661 $self->{nc}
7662 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7663 } else {
7664 $self->{set_nc}->($self);
7665 }
7666
7667 redo A;
7668 }
7669 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7670 if ($is_space->{$self->{nc}}) {
7671 ## Stay in the state.
7672
7673 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7674 $self->{line_prev} = $self->{line};
7675 $self->{column_prev} = $self->{column};
7676 $self->{column}++;
7677 $self->{nc}
7678 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7679 } else {
7680 $self->{set_nc}->($self);
7681 }
7682
7683 redo A;
7684 } elsif ($self->{nc} == 0x0022) { # "
7685 $self->{ca}->{value} = '';
7686 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7687
7688 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7689 $self->{line_prev} = $self->{line};
7690 $self->{column_prev} = $self->{column};
7691 $self->{column}++;
7692 $self->{nc}
7693 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7694 } else {
7695 $self->{set_nc}->($self);
7696 }
7697
7698 redo A;
7699 } elsif ($self->{nc} == 0x0027) { # '
7700 $self->{ca}->{value} = '';
7701 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7702
7703 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7704 $self->{line_prev} = $self->{line};
7705 $self->{column_prev} = $self->{column};
7706 $self->{column}++;
7707 $self->{nc}
7708 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7709 } else {
7710 $self->{set_nc}->($self);
7711 }
7712
7713 redo A;
7714 } elsif ($self->{nc} == 0x003E) { # >
7715 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7716 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7717
7718 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7719 $self->{line_prev} = $self->{line};
7720 $self->{column_prev} = $self->{column};
7721 $self->{column}++;
7722 $self->{nc}
7723 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7724 } else {
7725 $self->{set_nc}->($self);
7726 }
7727
7728 return ($self->{ct}); # ATTLIST
7729 redo A;
7730 } elsif ($self->{nc} == -1) {
7731 ## XML5: No parse error.
7732 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7733 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7734 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7735
7736 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7737 $self->{line_prev} = $self->{line};
7738 $self->{column_prev} = $self->{column};
7739 $self->{column}++;
7740 $self->{nc}
7741 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7742 } else {
7743 $self->{set_nc}->($self);
7744 }
7745
7746 return ($self->{ct});
7747 redo A;
7748 } else {
7749 ## XML5: Not defined yet.
7750 if ($self->{ca}->{default} eq 'FIXED') {
7751 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7752 } else {
7753 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7754 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7755 }
7756 ## Reconsume.
7757 redo A;
7758 }
7759 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7760 if ($is_space->{$self->{nc}} or
7761 $self->{nc} == -1 or
7762 $self->{nc} == 0x003E) { # >
7763 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7764 ## Reconsume.
7765 redo A;
7766 } else {
7767 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7768 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7769 ## Reconsume.
7770 redo A;
7771 }
7772 } elsif ($self->{state} == NDATA_STATE) {
7773 ## ASCII case-insensitive
7774 if ($self->{nc} == [
7775 undef,
7776 0x0044, # D
7777 0x0041, # A
7778 0x0054, # T
7779 ]->[length $self->{kwd}] or
7780 $self->{nc} == [
7781 undef,
7782 0x0064, # d
7783 0x0061, # a
7784 0x0074, # t
7785 ]->[length $self->{kwd}]) {
7786
7787 ## Stay in the state.
7788 $self->{kwd} .= chr $self->{nc};
7789
7790 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7791 $self->{line_prev} = $self->{line};
7792 $self->{column_prev} = $self->{column};
7793 $self->{column}++;
7794 $self->{nc}
7795 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7796 } else {
7797 $self->{set_nc}->($self);
7798 }
7799
7800 redo A;
7801 } elsif ((length $self->{kwd}) == 4 and
7802 ($self->{nc} == 0x0041 or # A
7803 $self->{nc} == 0x0061)) { # a
7804 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7805
7806 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7807 text => 'NDATA',
7808 line => $self->{line_prev},
7809 column => $self->{column_prev} - 4);
7810 } else {
7811
7812 }
7813 $self->{state} = AFTER_NDATA_STATE;
7814
7815 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7816 $self->{line_prev} = $self->{line};
7817 $self->{column_prev} = $self->{column};
7818 $self->{column}++;
7819 $self->{nc}
7820 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7821 } else {
7822 $self->{set_nc}->($self);
7823 }
7824
7825 redo A;
7826 } else {
7827 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7828 line => $self->{line_prev},
7829 column => $self->{column_prev} + 1
7830 - length $self->{kwd});
7831
7832 $self->{state} = BOGUS_MD_STATE;
7833 ## Reconsume.
7834 redo A;
7835 }
7836 } elsif ($self->{state} == AFTER_NDATA_STATE) {
7837 if ($is_space->{$self->{nc}}) {
7838 $self->{state} = BEFORE_NOTATION_NAME_STATE;
7839
7840 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7841 $self->{line_prev} = $self->{line};
7842 $self->{column_prev} = $self->{column};
7843 $self->{column}++;
7844 $self->{nc}
7845 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7846 } else {
7847 $self->{set_nc}->($self);
7848 }
7849
7850 redo A;
7851 } elsif ($self->{nc} == 0x003E) { # >
7852 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7853 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7854
7855 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7856 $self->{line_prev} = $self->{line};
7857 $self->{column_prev} = $self->{column};
7858 $self->{column}++;
7859 $self->{nc}
7860 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7861 } else {
7862 $self->{set_nc}->($self);
7863 }
7864
7865 return ($self->{ct}); # ENTITY
7866 redo A;
7867 } elsif ($self->{nc} == -1) {
7868 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7869 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7870
7871 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7872 $self->{line_prev} = $self->{line};
7873 $self->{column_prev} = $self->{column};
7874 $self->{column}++;
7875 $self->{nc}
7876 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7877 } else {
7878 $self->{set_nc}->($self);
7879 }
7880
7881 return ($self->{ct}); # ENTITY
7882 redo A;
7883 } else {
7884 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7885 line => $self->{line_prev},
7886 column => $self->{column_prev} + 1
7887 - length $self->{kwd});
7888 $self->{state} = BOGUS_MD_STATE;
7889 ## Reconsume.
7890 redo A;
7891 }
7892 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7893 if ($is_space->{$self->{nc}}) {
7894 ## Stay in the state.
7895
7896 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7897 $self->{line_prev} = $self->{line};
7898 $self->{column_prev} = $self->{column};
7899 $self->{column}++;
7900 $self->{nc}
7901 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7902 } else {
7903 $self->{set_nc}->($self);
7904 }
7905
7906 redo A;
7907 } elsif ($self->{nc} == 0x003E) { # >
7908 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7909 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7910
7911 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7912 $self->{line_prev} = $self->{line};
7913 $self->{column_prev} = $self->{column};
7914 $self->{column}++;
7915 $self->{nc}
7916 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7917 } else {
7918 $self->{set_nc}->($self);
7919 }
7920
7921 return ($self->{ct}); # ENTITY
7922 redo A;
7923 } elsif ($self->{nc} == -1) {
7924 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7925 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7926
7927 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7928 $self->{line_prev} = $self->{line};
7929 $self->{column_prev} = $self->{column};
7930 $self->{column}++;
7931 $self->{nc}
7932 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7933 } else {
7934 $self->{set_nc}->($self);
7935 }
7936
7937 return ($self->{ct}); # ENTITY
7938 redo A;
7939 } else {
7940 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7941 $self->{state} = NOTATION_NAME_STATE;
7942
7943 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7944 $self->{line_prev} = $self->{line};
7945 $self->{column_prev} = $self->{column};
7946 $self->{column}++;
7947 $self->{nc}
7948 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7949 } else {
7950 $self->{set_nc}->($self);
7951 }
7952
7953 redo A;
7954 }
7955 } elsif ($self->{state} == NOTATION_NAME_STATE) {
7956 if ($is_space->{$self->{nc}}) {
7957 $self->{state} = AFTER_MD_DEF_STATE;
7958
7959 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7960 $self->{line_prev} = $self->{line};
7961 $self->{column_prev} = $self->{column};
7962 $self->{column}++;
7963 $self->{nc}
7964 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7965 } else {
7966 $self->{set_nc}->($self);
7967 }
7968
7969 redo A;
7970 } elsif ($self->{nc} == 0x003E) { # >
7971 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7972
7973 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7974 $self->{line_prev} = $self->{line};
7975 $self->{column_prev} = $self->{column};
7976 $self->{column}++;
7977 $self->{nc}
7978 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7979 } else {
7980 $self->{set_nc}->($self);
7981 }
7982
7983 return ($self->{ct}); # ENTITY
7984 redo A;
7985 } elsif ($self->{nc} == -1) {
7986 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7987 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7988
7989 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7990 $self->{line_prev} = $self->{line};
7991 $self->{column_prev} = $self->{column};
7992 $self->{column}++;
7993 $self->{nc}
7994 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7995 } else {
7996 $self->{set_nc}->($self);
7997 }
7998
7999 return ($self->{ct}); # ENTITY
8000 redo A;
8001 } else {
8002 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
8003 ## Stay in the state.
8004
8005 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8006 $self->{line_prev} = $self->{line};
8007 $self->{column_prev} = $self->{column};
8008 $self->{column}++;
8009 $self->{nc}
8010 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8011 } else {
8012 $self->{set_nc}->($self);
8013 }
8014
8015 redo A;
8016 }
8017 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
8018 if ($self->{nc} == 0x0022) { # "
8019 $self->{state} = AFTER_MD_DEF_STATE;
8020
8021 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8022 $self->{line_prev} = $self->{line};
8023 $self->{column_prev} = $self->{column};
8024 $self->{column}++;
8025 $self->{nc}
8026 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8027 } else {
8028 $self->{set_nc}->($self);
8029 }
8030
8031 redo A;
8032 } elsif ($self->{nc} == 0x0026) { # &
8033 $self->{prev_state} = $self->{state};
8034 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8035 $self->{entity_add} = 0x0022; # "
8036
8037 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8038 $self->{line_prev} = $self->{line};
8039 $self->{column_prev} = $self->{column};
8040 $self->{column}++;
8041 $self->{nc}
8042 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8043 } else {
8044 $self->{set_nc}->($self);
8045 }
8046
8047 redo A;
8048 ## TODO: %
8049 } elsif ($self->{nc} == -1) {
8050 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8051 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8052 ## Reconsume.
8053 return ($self->{ct}); # ENTITY
8054 redo A;
8055 } else {
8056 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8057
8058 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8059 $self->{line_prev} = $self->{line};
8060 $self->{column_prev} = $self->{column};
8061 $self->{column}++;
8062 $self->{nc}
8063 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8064 } else {
8065 $self->{set_nc}->($self);
8066 }
8067
8068 redo A;
8069 }
8070 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8071 if ($self->{nc} == 0x0027) { # '
8072 $self->{state} = AFTER_MD_DEF_STATE;
8073
8074 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8075 $self->{line_prev} = $self->{line};
8076 $self->{column_prev} = $self->{column};
8077 $self->{column}++;
8078 $self->{nc}
8079 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8080 } else {
8081 $self->{set_nc}->($self);
8082 }
8083
8084 redo A;
8085 } elsif ($self->{nc} == 0x0026) { # &
8086 $self->{prev_state} = $self->{state};
8087 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8088 $self->{entity_add} = 0x0027; # '
8089
8090 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8091 $self->{line_prev} = $self->{line};
8092 $self->{column_prev} = $self->{column};
8093 $self->{column}++;
8094 $self->{nc}
8095 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8096 } else {
8097 $self->{set_nc}->($self);
8098 }
8099
8100 redo A;
8101 ## TODO: %
8102 } elsif ($self->{nc} == -1) {
8103 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8104 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8105 ## Reconsume.
8106 return ($self->{ct}); # ENTITY
8107 redo A;
8108 } else {
8109 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8110
8111 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8112 $self->{line_prev} = $self->{line};
8113 $self->{column_prev} = $self->{column};
8114 $self->{column}++;
8115 $self->{nc}
8116 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8117 } else {
8118 $self->{set_nc}->($self);
8119 }
8120
8121 redo A;
8122 }
8123 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8124 if ($is_space->{$self->{nc}} or
8125 {
8126 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8127 $self->{entity_add} => 1,
8128 }->{$self->{nc}}) {
8129 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8130 line => $self->{line_prev},
8131 column => $self->{column_prev}
8132 + ($self->{nc} == -1 ? 1 : 0));
8133 ## Don't consume
8134 ## Return nothing.
8135 #
8136 } elsif ($self->{nc} == 0x0023) { # #
8137 $self->{ca} = $self->{ct};
8138 $self->{state} = ENTITY_HASH_STATE;
8139 $self->{kwd} = '#';
8140
8141 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8142 $self->{line_prev} = $self->{line};
8143 $self->{column_prev} = $self->{column};
8144 $self->{column}++;
8145 $self->{nc}
8146 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8147 } else {
8148 $self->{set_nc}->($self);
8149 }
8150
8151 redo A;
8152 } else {
8153 #
8154 }
8155
8156 $self->{ct}->{value} .= '&';
8157 $self->{state} = $self->{prev_state};
8158 ## Reconsume.
8159 redo A;
8160 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8161 if ($is_space->{$self->{nc}}) {
8162 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8163
8164 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8165 $self->{line_prev} = $self->{line};
8166 $self->{column_prev} = $self->{column};
8167 $self->{column}++;
8168 $self->{nc}
8169 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8170 } else {
8171 $self->{set_nc}->($self);
8172 }
8173
8174 redo A;
8175 } elsif ($self->{nc} == 0x0028) { # (
8176 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8177 $self->{ct}->{content} = ['('];
8178 $self->{group_depth} = 1;
8179
8180 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8181 $self->{line_prev} = $self->{line};
8182 $self->{column_prev} = $self->{column};
8183 $self->{column}++;
8184 $self->{nc}
8185 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8186 } else {
8187 $self->{set_nc}->($self);
8188 }
8189
8190 redo A;
8191 } elsif ($self->{nc} == 0x003E) { # >
8192 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8193 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8194
8195 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8196 $self->{line_prev} = $self->{line};
8197 $self->{column_prev} = $self->{column};
8198 $self->{column}++;
8199 $self->{nc}
8200 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8201 } else {
8202 $self->{set_nc}->($self);
8203 }
8204
8205 return ($self->{ct}); # ELEMENT
8206 redo A;
8207 } elsif ($self->{nc} == -1) {
8208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8209 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8210
8211 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8212 $self->{line_prev} = $self->{line};
8213 $self->{column_prev} = $self->{column};
8214 $self->{column}++;
8215 $self->{nc}
8216 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8217 } else {
8218 $self->{set_nc}->($self);
8219 }
8220
8221 return ($self->{ct}); # ELEMENT
8222 redo A;
8223 } else {
8224 $self->{ct}->{content} = [chr $self->{nc}];
8225 $self->{state} = CONTENT_KEYWORD_STATE;
8226
8227 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8228 $self->{line_prev} = $self->{line};
8229 $self->{column_prev} = $self->{column};
8230 $self->{column}++;
8231 $self->{nc}
8232 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8233 } else {
8234 $self->{set_nc}->($self);
8235 }
8236
8237 redo A;
8238 }
8239 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8240 if ($is_space->{$self->{nc}}) {
8241 $self->{state} = AFTER_MD_DEF_STATE;
8242
8243 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8244 $self->{line_prev} = $self->{line};
8245 $self->{column_prev} = $self->{column};
8246 $self->{column}++;
8247 $self->{nc}
8248 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8249 } else {
8250 $self->{set_nc}->($self);
8251 }
8252
8253 redo A;
8254 } elsif ($self->{nc} == 0x003E) { # >
8255 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8256
8257 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8258 $self->{line_prev} = $self->{line};
8259 $self->{column_prev} = $self->{column};
8260 $self->{column}++;
8261 $self->{nc}
8262 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8263 } else {
8264 $self->{set_nc}->($self);
8265 }
8266
8267 return ($self->{ct}); # ELEMENT
8268 redo A;
8269 } elsif ($self->{nc} == -1) {
8270 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8271 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8272
8273 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8274 $self->{line_prev} = $self->{line};
8275 $self->{column_prev} = $self->{column};
8276 $self->{column}++;
8277 $self->{nc}
8278 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8279 } else {
8280 $self->{set_nc}->($self);
8281 }
8282
8283 return ($self->{ct}); # ELEMENT
8284 redo A;
8285 } else {
8286 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8287 ## Stay in the state.
8288
8289 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8290 $self->{line_prev} = $self->{line};
8291 $self->{column_prev} = $self->{column};
8292 $self->{column}++;
8293 $self->{nc}
8294 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8295 } else {
8296 $self->{set_nc}->($self);
8297 }
8298
8299 redo A;
8300 }
8301 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8302 if ($is_space->{$self->{nc}}) {
8303 ## Stay in the state.
8304
8305 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8306 $self->{line_prev} = $self->{line};
8307 $self->{column_prev} = $self->{column};
8308 $self->{column}++;
8309 $self->{nc}
8310 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8311 } else {
8312 $self->{set_nc}->($self);
8313 }
8314
8315 redo A;
8316 } elsif ($self->{nc} == 0x0028) { # (
8317 $self->{group_depth}++;
8318 push @{$self->{ct}->{content}}, chr $self->{nc};
8319 ## Stay in the state.
8320
8321 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8322 $self->{line_prev} = $self->{line};
8323 $self->{column_prev} = $self->{column};
8324 $self->{column}++;
8325 $self->{nc}
8326 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8327 } else {
8328 $self->{set_nc}->($self);
8329 }
8330
8331 redo A;
8332 } elsif ($self->{nc} == 0x007C or # |
8333 $self->{nc} == 0x002C) { # ,
8334 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8335 ## Stay in the state.
8336
8337 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8338 $self->{line_prev} = $self->{line};
8339 $self->{column_prev} = $self->{column};
8340 $self->{column}++;
8341 $self->{nc}
8342 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8343 } else {
8344 $self->{set_nc}->($self);
8345 }
8346
8347 redo A;
8348 } elsif ($self->{nc} == 0x0029) { # )
8349 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8350 push @{$self->{ct}->{content}}, chr $self->{nc};
8351 $self->{group_depth}--;
8352 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8353
8354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8355 $self->{line_prev} = $self->{line};
8356 $self->{column_prev} = $self->{column};
8357 $self->{column}++;
8358 $self->{nc}
8359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8360 } else {
8361 $self->{set_nc}->($self);
8362 }
8363
8364 redo A;
8365 } elsif ($self->{nc} == 0x003E) { # >
8366 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8367 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8368 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8369
8370 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8371 $self->{line_prev} = $self->{line};
8372 $self->{column_prev} = $self->{column};
8373 $self->{column}++;
8374 $self->{nc}
8375 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8376 } else {
8377 $self->{set_nc}->($self);
8378 }
8379
8380 return ($self->{ct}); # ELEMENT
8381 redo A;
8382 } elsif ($self->{nc} == -1) {
8383 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8384 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8385 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8386
8387 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8388 $self->{line_prev} = $self->{line};
8389 $self->{column_prev} = $self->{column};
8390 $self->{column}++;
8391 $self->{nc}
8392 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8393 } else {
8394 $self->{set_nc}->($self);
8395 }
8396
8397 return ($self->{ct}); # ELEMENT
8398 redo A;
8399 } else {
8400 push @{$self->{ct}->{content}}, chr $self->{nc};
8401 $self->{state} = CM_ELEMENT_NAME_STATE;
8402
8403 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8404 $self->{line_prev} = $self->{line};
8405 $self->{column_prev} = $self->{column};
8406 $self->{column}++;
8407 $self->{nc}
8408 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8409 } else {
8410 $self->{set_nc}->($self);
8411 }
8412
8413 redo A;
8414 }
8415 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8416 if ($is_space->{$self->{nc}}) {
8417 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8418
8419 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8420 $self->{line_prev} = $self->{line};
8421 $self->{column_prev} = $self->{column};
8422 $self->{column}++;
8423 $self->{nc}
8424 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8425 } else {
8426 $self->{set_nc}->($self);
8427 }
8428
8429 redo A;
8430 } elsif ($self->{nc} == 0x002A or # *
8431 $self->{nc} == 0x002B or # +
8432 $self->{nc} == 0x003F) { # ?
8433 push @{$self->{ct}->{content}}, chr $self->{nc};
8434 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8435
8436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8437 $self->{line_prev} = $self->{line};
8438 $self->{column_prev} = $self->{column};
8439 $self->{column}++;
8440 $self->{nc}
8441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8442 } else {
8443 $self->{set_nc}->($self);
8444 }
8445
8446 redo A;
8447 } elsif ($self->{nc} == 0x007C or # |
8448 $self->{nc} == 0x002C) { # ,
8449 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8450 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8451
8452 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8453 $self->{line_prev} = $self->{line};
8454 $self->{column_prev} = $self->{column};
8455 $self->{column}++;
8456 $self->{nc}
8457 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8458 } else {
8459 $self->{set_nc}->($self);
8460 }
8461
8462 redo A;
8463 } elsif ($self->{nc} == 0x0029) { # )
8464 $self->{group_depth}--;
8465 push @{$self->{ct}->{content}}, chr $self->{nc};
8466 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8467
8468 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8469 $self->{line_prev} = $self->{line};
8470 $self->{column_prev} = $self->{column};
8471 $self->{column}++;
8472 $self->{nc}
8473 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8474 } else {
8475 $self->{set_nc}->($self);
8476 }
8477
8478 redo A;
8479 } elsif ($self->{nc} == 0x003E) { # >
8480 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8481 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8482 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8483
8484 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8485 $self->{line_prev} = $self->{line};
8486 $self->{column_prev} = $self->{column};
8487 $self->{column}++;
8488 $self->{nc}
8489 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8490 } else {
8491 $self->{set_nc}->($self);
8492 }
8493
8494 return ($self->{ct}); # ELEMENT
8495 redo A;
8496 } elsif ($self->{nc} == -1) {
8497 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8498 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8499 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8500
8501 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8502 $self->{line_prev} = $self->{line};
8503 $self->{column_prev} = $self->{column};
8504 $self->{column}++;
8505 $self->{nc}
8506 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8507 } else {
8508 $self->{set_nc}->($self);
8509 }
8510
8511 return ($self->{ct}); # ELEMENT
8512 redo A;
8513 } else {
8514 $self->{ct}->{content}->[-1] .= chr $self->{nc};
8515 ## Stay in the state.
8516
8517 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8518 $self->{line_prev} = $self->{line};
8519 $self->{column_prev} = $self->{column};
8520 $self->{column}++;
8521 $self->{nc}
8522 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8523 } else {
8524 $self->{set_nc}->($self);
8525 }
8526
8527 redo A;
8528 }
8529 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8530 if ($is_space->{$self->{nc}}) {
8531 ## Stay in the state.
8532
8533 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8534 $self->{line_prev} = $self->{line};
8535 $self->{column_prev} = $self->{column};
8536 $self->{column}++;
8537 $self->{nc}
8538 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8539 } else {
8540 $self->{set_nc}->($self);
8541 }
8542
8543 redo A;
8544 } elsif ($self->{nc} == 0x007C or # |
8545 $self->{nc} == 0x002C) { # ,
8546 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8547 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8548
8549 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8550 $self->{line_prev} = $self->{line};
8551 $self->{column_prev} = $self->{column};
8552 $self->{column}++;
8553 $self->{nc}
8554 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8555 } else {
8556 $self->{set_nc}->($self);
8557 }
8558
8559 redo A;
8560 } elsif ($self->{nc} == 0x0029) { # )
8561 $self->{group_depth}--;
8562 push @{$self->{ct}->{content}}, chr $self->{nc};
8563 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8564
8565 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8566 $self->{line_prev} = $self->{line};
8567 $self->{column_prev} = $self->{column};
8568 $self->{column}++;
8569 $self->{nc}
8570 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8571 } else {
8572 $self->{set_nc}->($self);
8573 }
8574
8575 redo A;
8576 } elsif ($self->{nc} == 0x003E) { # >
8577 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8578 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8579 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8580
8581 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8582 $self->{line_prev} = $self->{line};
8583 $self->{column_prev} = $self->{column};
8584 $self->{column}++;
8585 $self->{nc}
8586 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8587 } else {
8588 $self->{set_nc}->($self);
8589 }
8590
8591 return ($self->{ct}); # ELEMENT
8592 redo A;
8593 } elsif ($self->{nc} == -1) {
8594 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8595 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8596 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8597
8598 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8599 $self->{line_prev} = $self->{line};
8600 $self->{column_prev} = $self->{column};
8601 $self->{column}++;
8602 $self->{nc}
8603 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8604 } else {
8605 $self->{set_nc}->($self);
8606 }
8607
8608 return ($self->{ct}); # ELEMENT
8609 redo A;
8610 } else {
8611 $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8612 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8613 $self->{state} = BOGUS_MD_STATE;
8614
8615 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8616 $self->{line_prev} = $self->{line};
8617 $self->{column_prev} = $self->{column};
8618 $self->{column}++;
8619 $self->{nc}
8620 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8621 } else {
8622 $self->{set_nc}->($self);
8623 }
8624
8625 redo A;
8626 }
8627 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8628 if ($is_space->{$self->{nc}}) {
8629 if ($self->{group_depth}) {
8630 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8631 } else {
8632 $self->{state} = AFTER_MD_DEF_STATE;
8633 }
8634
8635 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8636 $self->{line_prev} = $self->{line};
8637 $self->{column_prev} = $self->{column};
8638 $self->{column}++;
8639 $self->{nc}
8640 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8641 } else {
8642 $self->{set_nc}->($self);
8643 }
8644
8645 redo A;
8646 } elsif ($self->{nc} == 0x002A or # *
8647 $self->{nc} == 0x002B or # +
8648 $self->{nc} == 0x003F) { # ?
8649 push @{$self->{ct}->{content}}, chr $self->{nc};
8650 if ($self->{group_depth}) {
8651 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8652 } else {
8653 $self->{state} = AFTER_MD_DEF_STATE;
8654 }
8655
8656 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8657 $self->{line_prev} = $self->{line};
8658 $self->{column_prev} = $self->{column};
8659 $self->{column}++;
8660 $self->{nc}
8661 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8662 } else {
8663 $self->{set_nc}->($self);
8664 }
8665
8666 redo A;
8667 } elsif ($self->{nc} == 0x0029) { # )
8668 if ($self->{group_depth}) {
8669 $self->{group_depth}--;
8670 push @{$self->{ct}->{content}}, chr $self->{nc};
8671 ## Stay in the state.
8672
8673 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8674 $self->{line_prev} = $self->{line};
8675 $self->{column_prev} = $self->{column};
8676 $self->{column}++;
8677 $self->{nc}
8678 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8679 } else {
8680 $self->{set_nc}->($self);
8681 }
8682
8683 redo A;
8684 } else {
8685 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8686 $self->{state} = BOGUS_MD_STATE;
8687 ## Reconsume.
8688 redo A;
8689 }
8690 } elsif ($self->{nc} == 0x003E) { # >
8691 if ($self->{group_depth}) {
8692 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8693 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8694 }
8695 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8696
8697 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8698 $self->{line_prev} = $self->{line};
8699 $self->{column_prev} = $self->{column};
8700 $self->{column}++;
8701 $self->{nc}
8702 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8703 } else {
8704 $self->{set_nc}->($self);
8705 }
8706
8707 return ($self->{ct}); # ELEMENT
8708 redo A;
8709 } elsif ($self->{nc} == -1) {
8710 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8711 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8712 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8713
8714 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8715 $self->{line_prev} = $self->{line};
8716 $self->{column_prev} = $self->{column};
8717 $self->{column}++;
8718 $self->{nc}
8719 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8720 } else {
8721 $self->{set_nc}->($self);
8722 }
8723
8724 return ($self->{ct}); # ELEMENT
8725 redo A;
8726 } else {
8727 if ($self->{group_depth}) {
8728 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8729 } else {
8730 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8731 $self->{state} = BOGUS_MD_STATE;
8732 }
8733 ## Reconsume.
8734 redo A;
8735 }
8736 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8737 if ($is_space->{$self->{nc}}) {
8738 ## Stay in the state.
8739
8740 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8741 $self->{line_prev} = $self->{line};
8742 $self->{column_prev} = $self->{column};
8743 $self->{column}++;
8744 $self->{nc}
8745 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8746 } else {
8747 $self->{set_nc}->($self);
8748 }
8749
8750 redo A;
8751 } elsif ($self->{nc} == 0x003E) { # >
8752 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8753
8754 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8755 $self->{line_prev} = $self->{line};
8756 $self->{column_prev} = $self->{column};
8757 $self->{column}++;
8758 $self->{nc}
8759 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8760 } else {
8761 $self->{set_nc}->($self);
8762 }
8763
8764 return ($self->{ct}); # ENTITY/ELEMENT
8765 redo A;
8766 } elsif ($self->{nc} == -1) {
8767 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8768 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8769
8770 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8771 $self->{line_prev} = $self->{line};
8772 $self->{column_prev} = $self->{column};
8773 $self->{column}++;
8774 $self->{nc}
8775 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8776 } else {
8777 $self->{set_nc}->($self);
8778 }
8779
8780 return ($self->{ct}); # ENTITY/ELEMENT
8781 redo A;
8782 } else {
8783 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8784 $self->{state} = BOGUS_MD_STATE;
8785 ## Reconsume.
8786 redo A;
8787 }
8788 } elsif ($self->{state} == BOGUS_MD_STATE) {
8789 if ($self->{nc} == 0x003E) { # >
8790 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8791
8792 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8793 $self->{line_prev} = $self->{line};
8794 $self->{column_prev} = $self->{column};
8795 $self->{column}++;
8796 $self->{nc}
8797 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8798 } else {
8799 $self->{set_nc}->($self);
8800 }
8801
8802 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8803 redo A;
8804 } elsif ($self->{nc} == -1) {
8805 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8806 ## Reconsume.
8807 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8808 redo A;
8809 } else {
8810 ## Stay in the state.
8811
8812 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8813 $self->{line_prev} = $self->{line};
8814 $self->{column_prev} = $self->{column};
8815 $self->{column}++;
8816 $self->{nc}
8817 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8818 } else {
8819 $self->{set_nc}->($self);
8820 }
8821
8822 redo A;
8823 }
8824 } else {
8825 die "$0: $self->{state}: Unknown state";
8826 }
8827 } # A
8828
8829 die "$0: _get_next_token: unexpected case";
8830 } # _get_next_token
8831
8832 1;
8833 ## $Date: 2009/09/05 09:26:55 $
8834

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24