/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.25 - (show annotations) (download)
Sun Oct 19 15:17:01 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.24: +45 -7 lines
++ whatpm/t/xml/ChangeLog	19 Oct 2008 15:16:55 -0000
2008-10-20  Wakaba  <wakaba@suika.fam.cx>

	* attlists-1.dat, attrs-1.dat: Normalization tests added.  Test
	results updated.

	* charrefs-1.dat: Character reference parse error/mapping tests
	added.

	* attlists-1.dat, eldecls-1.dat, entities-1.dat, entities-2.dat,
++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 15:13:57 -0000
	* Tokenizer.pm.src: Normalize white space characters in attribute
	value literals in XML documents.  Don't apply character reference
	mapping table for non-NULL non-surrogate code points.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.24 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; ## Built from the digit groups of the CVS |Revision| keyword.
4
5 BEGIN { ## Export lists are filled in at compile time: |Whatpm::HTML| imports ':token' from its own BEGIN block later in this file.
6 require Exporter;
7 push our @ISA, 'Exporter';
8 ## Every token type constant can be imported by name ...
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 ATTLIST_TOKEN
20 ELEMENT_TOKEN
21 GENERAL_ENTITY_TOKEN
22 PARAMETER_ENTITY_TOKEN
23 NOTATION_TOKEN
24 );
25 ## ... or all at once through the ':token' tag, which lists the same names.
26 our %EXPORT_TAGS = (
27 token => [qw(
28 DOCTYPE_TOKEN
29 COMMENT_TOKEN
30 START_TAG_TOKEN
31 END_TAG_TOKEN
32 END_OF_FILE_TOKEN
33 CHARACTER_TOKEN
34 PI_TOKEN
35 ABORT_TOKEN
36 END_OF_DOCTYPE_TOKEN
37 ATTLIST_TOKEN
38 ELEMENT_TOKEN
39 GENERAL_ENTITY_TOKEN
40 PARAMETER_ENTITY_TOKEN
41 NOTATION_TOKEN
42 )],
43 );
44 }
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49 ## Each constant is an empty-prototype sub returning the integer stored in a token's |type| field.
50 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 sub COMMENT_TOKEN () { 2 }
52 sub START_TAG_TOKEN () { 3 }
53 sub END_TAG_TOKEN () { 4 } ## NOTE: An empty |tag_name| denotes an XML short end tag ("</>").
54 sub END_OF_FILE_TOKEN () { 5 }
55 sub CHARACTER_TOKEN () { 6 }
56 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59 sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60 sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61 sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62 sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63 sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token whose $token->{tag_name} is set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74 ## The definitions below belong to |Whatpm::HTML|; the token type constants are imported at compile time.
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78 ## Bit flags tested against |$self->{content_model}| by the tokenizer.
79 sub CM_ENTITY () { 0b001 } # & markup in data
80 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82 ## Content models, expressed as combinations of the flags above.
83 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set: neither & nor < markup
84 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
85 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup and limited < markup
86 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup and any < markup
87
88 ## Tokenizer states
89 ## Values compared against |$self->{state}| in |_get_next_token|. Values 1 and 12 are unused (their subs are commented out).
90 sub DATA_STATE () { 0 }
91 #sub ENTITY_DATA_STATE () { 1 } ## Unused: see the NOTE above |ENTITY_STATE|.
92 sub TAG_OPEN_STATE () { 2 }
93 sub CLOSE_TAG_OPEN_STATE () { 3 }
94 sub TAG_NAME_STATE () { 4 }
95 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96 sub ATTRIBUTE_NAME_STATE () { 6 }
97 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } ## Unused: see the NOTE above |ENTITY_STATE|.
103 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104 sub COMMENT_START_STATE () { 14 }
105 sub COMMENT_START_DASH_STATE () { 15 }
106 sub COMMENT_STATE () { 16 }
107 sub COMMENT_END_STATE () { 17 }
108 sub COMMENT_END_DASH_STATE () { 18 }
109 sub BOGUS_COMMENT_STATE () { 19 }
110 sub DOCTYPE_STATE () { 20 }
111 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112 sub DOCTYPE_NAME_STATE () { 22 }
113 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122 sub BOGUS_DOCTYPE_STATE () { 32 }
123 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124 sub SELF_CLOSING_START_TAG_STATE () { 34 }
125 sub CDATA_SECTION_STATE () { 35 }
126 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134 ## NOTE: "Entity data state", "entity in attribute value state", and
135 ## "consume a character reference" algorithm are jointly implemented
136 ## using the following six states:
137 sub ENTITY_STATE () { 44 }
138 sub ENTITY_HASH_STATE () { 45 }
139 sub NCR_NUM_STATE () { 46 }
140 sub HEXREF_X_STATE () { 47 }
141 sub HEXREF_HEX_STATE () { 48 }
142 sub ENTITY_NAME_STATE () { 49 }
143 sub PCDATA_STATE () { 50 } # "data state" in the spec; entered from |DATA_STATE| only for non-XML input in the PCDATA content model
144
145 ## XML-only states (continuing the |$self->{state}| numbering above)
146 sub PI_STATE () { 51 } # entered from |TAG_OPEN_STATE| on "<?" when |$self->{is_xml}| is true
147 sub PI_TARGET_STATE () { 52 }
148 sub PI_TARGET_AFTER_STATE () { 53 }
149 sub PI_DATA_STATE () { 54 }
150 sub PI_AFTER_STATE () { 55 }
151 sub PI_DATA_AFTER_STATE () { 56 }
152 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155 sub DOCTYPE_TAG_STATE () { 60 }
156 sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157 sub MD_ATTLIST_STATE () { 62 }
158 sub MD_E_STATE () { 63 }
159 sub MD_ELEMENT_STATE () { 64 }
160 sub MD_ENTITY_STATE () { 65 }
161 sub MD_NOTATION_STATE () { 66 }
162 sub DOCTYPE_MD_STATE () { 67 }
163 sub BEFORE_MD_NAME_STATE () { 68 }
164 sub MD_NAME_STATE () { 69 }
165 sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166 sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170 sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171 sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172 sub ALLOWED_TOKEN_STATE () { 77 }
173 sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174 sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175 sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178 sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179 sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 sub BEFORE_NDATA_STATE () { 85 }
181 sub NDATA_STATE () { 86 }
182 sub AFTER_NDATA_STATE () { 87 }
183 sub BEFORE_NOTATION_NAME_STATE () { 88 }
184 sub NOTATION_NAME_STATE () { 89 }
185 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186 sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187 sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188 sub AFTER_ELEMENT_NAME_STATE () { 93 }
189 sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190 sub CONTENT_KEYWORD_STATE () { 95 }
191 sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192 sub CM_ELEMENT_NAME_STATE () { 97 }
193 sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194 sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195 sub AFTER_MD_DEF_STATE () { 100 }
196 sub BOGUS_MD_STATE () { 101 }
197
198 ## Tree constructor state constants (see Whatpm::HTML for the full
199 ## list and descriptions)
200 ## NOTE(review): both literals below evaluate to 2048 (0x800); presumably they are tested against different fields -- confirm in Whatpm::HTML.
201 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
202 sub FOREIGN_EL () { 0b1_00000000000 }
203
204 ## Character reference mappings
205 ## Maps a code point to the code point to use in its place (presumably looked up for numeric character references; the lookup is outside this part of the file).
206 my $charref_map = {
207 0x0D => 0x000A, # CR -> LF
208 0x80 => 0x20AC, # 0x80..0x9F: values match the Windows-1252 reading of these bytes; 0xFFFD where that code page has no character
209 0x81 => 0xFFFD,
210 0x82 => 0x201A,
211 0x83 => 0x0192,
212 0x84 => 0x201E,
213 0x85 => 0x2026,
214 0x86 => 0x2020,
215 0x87 => 0x2021,
216 0x88 => 0x02C6,
217 0x89 => 0x2030,
218 0x8A => 0x0160,
219 0x8B => 0x2039,
220 0x8C => 0x0152,
221 0x8D => 0xFFFD,
222 0x8E => 0x017D,
223 0x8F => 0xFFFD,
224 0x90 => 0xFFFD,
225 0x91 => 0x2018,
226 0x92 => 0x2019,
227 0x93 => 0x201C,
228 0x94 => 0x201D,
229 0x95 => 0x2022,
230 0x96 => 0x2013,
231 0x97 => 0x2014,
232 0x98 => 0x02DC,
233 0x99 => 0x2122,
234 0x9A => 0x0161,
235 0x9B => 0x203A,
236 0x9C => 0x0153,
237 0x9D => 0xFFFD,
238 0x9E => 0x017E,
239 0x9F => 0x0178,
240 }; # $charref_map
241 $charref_map->{$_} = 0xFFFD # Everything listed below is replaced by U+FFFD.
242 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F, # NULL and control characters (HT, LF, FF are left alone; CR is mapped above)
243 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (surrogates, then noncharacters; the Unicode noncharacter block runs through U+FDEF, so U+FDE0..U+FDEF are not remapped here)
244 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, # last two code points of each plane
245 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
246 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
247 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
248 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
249
250 ## Implementations MUST act as if they used the state machine in the spec.
251
252 sub _initialize_tokenizer ($) {
253 my $self = shift;
254 ## Resets the per-parse tokenizer fields on |$self| and then requests the first input character.
255 ## NOTE: Fields set by |new| constructor:
256 #$self->{level}
257 #$self->{set_nc}
258 #$self->{parse_error}
259 #$self->{is_xml} (if XML)
260
261 $self->{state} = DATA_STATE; # MUST
262 $self->{s_kwd} = ''; # Data state keyword
263 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264 #$self->{entity__value}; # initialized when used
265 #$self->{entity__match}; # initialized when used
266 $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
267 undef $self->{ct}; # current token
268 undef $self->{ca}; # current attribute
269 undef $self->{last_stag_name}; # last emitted start tag name
270 #$self->{prev_state}; # initialized when used
271 delete $self->{self_closing};
272 $self->{char_buffer} = '';
273 $self->{char_buffer_pos} = 0;
274 $self->{nc} = -1; # next input character (-1 is what the data states test for end of file)
275 #$self->{next_nc}
276 ## NOTE: |char_buffer| was emptied just above, so the first branch below can never be taken here; the |set_nc| callback is always called.
277 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
278 $self->{line_prev} = $self->{line};
279 $self->{column_prev} = $self->{column};
280 $self->{column}++;
281 $self->{nc}
282 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
283 } else {
284 $self->{set_nc}->($self); # presumably stores the first character in |$self->{nc}| -- confirm against the |set_nc| supplied by |new|
285 }
286
287 $self->{token} = []; # queue of already-built tokens; |_get_next_token| returns these before reading more input
288 # $self->{escape}
289 } # _initialize_tokenizer
290
291 ## A token has:
292 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 ## ->{name} (DOCTYPE_TOKEN)
295 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 ## ->{target} (PI_TOKEN)
297 ## ->{pubid} (DOCTYPE_TOKEN)
298 ## ->{sysid} (DOCTYPE_TOKEN)
299 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301 ## ->{name}
302 ## ->{value}
303 ## ->{has_reference} == 1 or 0
304 ## ->{index}: Index of the attribute in a tag.
305 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309
310 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311 ## A token's own |->{self_closing}| is used to save the value of |$self->{self_closing}|
312 ## while the token is pushed back to the stack.
313
314 ## Emitted token MUST immediately be handled by the tree construction state.
315
316 ## Before each step, UA MAY check to see if either one of the scripts in
317 ## "list of scripts that will execute as soon as possible" or the first
318 ## script in the "list of scripts that will execute asynchronously",
319 ## has completed loading. If one has, then it MUST be executed
320 ## and removed from the list.
321
322 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323 ## (This requirement was dropped from HTML5 spec, unfortunately.)
324
325 my $is_space = { ## Code points the tokenizer treats as white space; looked up as |$is_space->{$self->{nc}}|.
326 0x0009 => 1, # CHARACTER TABULATION (HT)
327 0x000A => 1, # LINE FEED (LF)
328 #0x000B => 0, # LINE TABULATION (VT)
329 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
330 #0x000D => 1, # CARRIAGE RETURN (CR) -- presumably CR never reaches the tokenizer (normalized earlier); TODO confirm
331 0x0020 => 1, # SPACE (SP)
332 };
333
334 sub _get_next_token ($) {
335 my $self = shift;
336
337 if ($self->{self_closing}) {
338 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339 ## NOTE: The |self_closing| flag is only set by start tag token.
340 ## In addition, when a start tag token is emitted, it is always set to
341 ## |ct|.
342 delete $self->{self_closing};
343 }
344
345 if (@{$self->{token}}) {
346 $self->{self_closing} = $self->{token}->[0]->{self_closing};
347 return shift @{$self->{token}};
348 }
349
350 A: {
351 if ($self->{state} == PCDATA_STATE) {
352 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353
354 if ($self->{nc} == 0x0026) { # &
355
356 ## NOTE: In the spec, the tokenizer is switched to the
357 ## "entity data state". In this implementation, the tokenizer
358 ## is switched to the |ENTITY_STATE|, which is an implementation
359 ## of the "consume a character reference" algorithm.
360 $self->{entity_add} = -1;
361 $self->{prev_state} = DATA_STATE;
362 $self->{state} = ENTITY_STATE;
363
364 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365 $self->{line_prev} = $self->{line};
366 $self->{column_prev} = $self->{column};
367 $self->{column}++;
368 $self->{nc}
369 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370 } else {
371 $self->{set_nc}->($self);
372 }
373
374 redo A;
375 } elsif ($self->{nc} == 0x003C) { # <
376
377 $self->{state} = TAG_OPEN_STATE;
378
379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380 $self->{line_prev} = $self->{line};
381 $self->{column_prev} = $self->{column};
382 $self->{column}++;
383 $self->{nc}
384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385 } else {
386 $self->{set_nc}->($self);
387 }
388
389 redo A;
390 } elsif ($self->{nc} == -1) {
391
392 return ({type => END_OF_FILE_TOKEN,
393 line => $self->{line}, column => $self->{column}});
394 last A; ## TODO: ok?
395 } else {
396
397 #
398 }
399
400 # Anything else
401 my $token = {type => CHARACTER_TOKEN,
402 data => chr $self->{nc},
403 line => $self->{line}, column => $self->{column},
404 };
405 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406
407 ## Stay in the state.
408
409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410 $self->{line_prev} = $self->{line};
411 $self->{column_prev} = $self->{column};
412 $self->{column}++;
413 $self->{nc}
414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415 } else {
416 $self->{set_nc}->($self);
417 }
418
419 return ($token);
420 redo A;
421 } elsif ($self->{state} == DATA_STATE) {
422 $self->{s_kwd} = '' unless defined $self->{s_kwd};
423 if ($self->{nc} == 0x0026) { # &
424 $self->{s_kwd} = '';
425 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426 not $self->{escape}) {
427
428 ## NOTE: In the spec, the tokenizer is switched to the
429 ## "entity data state". In this implementation, the tokenizer
430 ## is switched to the |ENTITY_STATE|, which is an implementation
431 ## of the "consume a character reference" algorithm.
432 $self->{entity_add} = -1;
433 $self->{prev_state} = DATA_STATE;
434 $self->{state} = ENTITY_STATE;
435
436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437 $self->{line_prev} = $self->{line};
438 $self->{column_prev} = $self->{column};
439 $self->{column}++;
440 $self->{nc}
441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442 } else {
443 $self->{set_nc}->($self);
444 }
445
446 redo A;
447 } else {
448
449 #
450 }
451 } elsif ($self->{nc} == 0x002D) { # -
452 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 if ($self->{s_kwd} eq '<!-') {
454
455 $self->{escape} = 1; # unless $self->{escape};
456 $self->{s_kwd} = '--';
457 #
458 } elsif ($self->{s_kwd} eq '-') {
459
460 $self->{s_kwd} = '--';
461 #
462 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463
464 $self->{s_kwd} .= '-';
465 #
466 } else {
467
468 $self->{s_kwd} = '-';
469 #
470 }
471 }
472
473 #
474 } elsif ($self->{nc} == 0x0021) { # !
475 if (length $self->{s_kwd}) {
476
477 $self->{s_kwd} .= '!';
478 #
479 } else {
480
481 #$self->{s_kwd} = '';
482 #
483 }
484 #
485 } elsif ($self->{nc} == 0x003C) { # <
486 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488 not $self->{escape})) {
489
490 $self->{state} = TAG_OPEN_STATE;
491
492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493 $self->{line_prev} = $self->{line};
494 $self->{column_prev} = $self->{column};
495 $self->{column}++;
496 $self->{nc}
497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498 } else {
499 $self->{set_nc}->($self);
500 }
501
502 redo A;
503 } else {
504
505 $self->{s_kwd} = '';
506 #
507 }
508 } elsif ($self->{nc} == 0x003E) { # >
509 if ($self->{escape} and
510 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511 if ($self->{s_kwd} eq '--') {
512
513 delete $self->{escape};
514 #
515 } else {
516
517 #
518 }
519 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520
521 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522 line => $self->{line_prev},
523 column => $self->{column_prev} - 1);
524 #
525 } else {
526
527 #
528 }
529
530 $self->{s_kwd} = '';
531 #
532 } elsif ($self->{nc} == 0x005D) { # ]
533 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534
535 $self->{s_kwd} .= ']';
536 } elsif ($self->{s_kwd} eq ']]') {
537
538 #
539 } else {
540
541 $self->{s_kwd} = '';
542 }
543 #
544 } elsif ($self->{nc} == -1) {
545
546 $self->{s_kwd} = '';
547 return ({type => END_OF_FILE_TOKEN,
548 line => $self->{line}, column => $self->{column}});
549 last A; ## TODO: ok?
550 } else {
551
552 $self->{s_kwd} = '';
553 #
554 }
555
556 # Anything else
557 my $token = {type => CHARACTER_TOKEN,
558 data => chr $self->{nc},
559 line => $self->{line}, column => $self->{column},
560 };
561 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 length $token->{data})) {
563 $self->{s_kwd} = '';
564 }
565
566 ## Stay in the data state.
567 if (not $self->{is_xml} and
568 $self->{content_model} == PCDATA_CONTENT_MODEL) {
569
570 $self->{state} = PCDATA_STATE;
571 } else {
572
573 ## Stay in the state.
574 }
575
576 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577 $self->{line_prev} = $self->{line};
578 $self->{column_prev} = $self->{column};
579 $self->{column}++;
580 $self->{nc}
581 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582 } else {
583 $self->{set_nc}->($self);
584 }
585
586 return ($token);
587 redo A;
588 } elsif ($self->{state} == TAG_OPEN_STATE) {
589 ## XML5: "tag state".
590
591 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592 if ($self->{nc} == 0x002F) { # /
593
594
595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596 $self->{line_prev} = $self->{line};
597 $self->{column_prev} = $self->{column};
598 $self->{column}++;
599 $self->{nc}
600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601 } else {
602 $self->{set_nc}->($self);
603 }
604
605 $self->{state} = CLOSE_TAG_OPEN_STATE;
606 redo A;
607 } elsif ($self->{nc} == 0x0021) { # !
608
609 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 #
611 } else {
612
613 $self->{s_kwd} = '';
614 #
615 }
616
617 ## reconsume
618 $self->{state} = DATA_STATE;
619 return ({type => CHARACTER_TOKEN, data => '<',
620 line => $self->{line_prev},
621 column => $self->{column_prev},
622 });
623 redo A;
624 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625 if ($self->{nc} == 0x0021) { # !
626
627 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628
629 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630 $self->{line_prev} = $self->{line};
631 $self->{column_prev} = $self->{column};
632 $self->{column}++;
633 $self->{nc}
634 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635 } else {
636 $self->{set_nc}->($self);
637 }
638
639 redo A;
640 } elsif ($self->{nc} == 0x002F) { # /
641
642 $self->{state} = CLOSE_TAG_OPEN_STATE;
643
644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645 $self->{line_prev} = $self->{line};
646 $self->{column_prev} = $self->{column};
647 $self->{column}++;
648 $self->{nc}
649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650 } else {
651 $self->{set_nc}->($self);
652 }
653
654 redo A;
655 } elsif (0x0041 <= $self->{nc} and
656 $self->{nc} <= 0x005A) { # A..Z
657
658 $self->{ct}
659 = {type => START_TAG_TOKEN,
660 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 line => $self->{line_prev},
662 column => $self->{column_prev}};
663 $self->{state} = TAG_NAME_STATE;
664
665 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666 $self->{line_prev} = $self->{line};
667 $self->{column_prev} = $self->{column};
668 $self->{column}++;
669 $self->{nc}
670 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671 } else {
672 $self->{set_nc}->($self);
673 }
674
675 redo A;
676 } elsif (0x0061 <= $self->{nc} and
677 $self->{nc} <= 0x007A) { # a..z
678
679 $self->{ct} = {type => START_TAG_TOKEN,
680 tag_name => chr ($self->{nc}),
681 line => $self->{line_prev},
682 column => $self->{column_prev}};
683 $self->{state} = TAG_NAME_STATE;
684
685 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686 $self->{line_prev} = $self->{line};
687 $self->{column_prev} = $self->{column};
688 $self->{column}++;
689 $self->{nc}
690 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691 } else {
692 $self->{set_nc}->($self);
693 }
694
695 redo A;
696 } elsif ($self->{nc} == 0x003E) { # >
697
698 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699 line => $self->{line_prev},
700 column => $self->{column_prev});
701 $self->{state} = DATA_STATE;
702 $self->{s_kwd} = '';
703
704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705 $self->{line_prev} = $self->{line};
706 $self->{column_prev} = $self->{column};
707 $self->{column}++;
708 $self->{nc}
709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710 } else {
711 $self->{set_nc}->($self);
712 }
713
714
715 return ({type => CHARACTER_TOKEN, data => '<>',
716 line => $self->{line_prev},
717 column => $self->{column_prev},
718 });
719
720 redo A;
721 } elsif ($self->{nc} == 0x003F) { # ?
722 if ($self->{is_xml}) {
723
724 $self->{state} = PI_STATE;
725
726 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727 $self->{line_prev} = $self->{line};
728 $self->{column_prev} = $self->{column};
729 $self->{column}++;
730 $self->{nc}
731 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732 } else {
733 $self->{set_nc}->($self);
734 }
735
736 redo A;
737 } else {
738
739 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740 line => $self->{line_prev},
741 column => $self->{column_prev});
742 $self->{state} = BOGUS_COMMENT_STATE;
743 $self->{ct} = {type => COMMENT_TOKEN, data => '',
744 line => $self->{line_prev},
745 column => $self->{column_prev},
746 };
747 ## $self->{nc} is intentionally left as is
748 redo A;
749 }
750 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751
752 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753 line => $self->{line_prev},
754 column => $self->{column_prev});
755 $self->{state} = DATA_STATE;
756 $self->{s_kwd} = '';
757 ## reconsume
758
759 return ({type => CHARACTER_TOKEN, data => '<',
760 line => $self->{line_prev},
761 column => $self->{column_prev},
762 });
763
764 redo A;
765 } else {
766 ## XML5: "<:" is a parse error.
767
768 $self->{ct} = {type => START_TAG_TOKEN,
769 tag_name => chr ($self->{nc}),
770 line => $self->{line_prev},
771 column => $self->{column_prev}};
772 $self->{state} = TAG_NAME_STATE;
773
774 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775 $self->{line_prev} = $self->{line};
776 $self->{column_prev} = $self->{column};
777 $self->{column}++;
778 $self->{nc}
779 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780 } else {
781 $self->{set_nc}->($self);
782 }
783
784 redo A;
785 }
786 } else {
787 die "$0: $self->{content_model} in tag open";
788 }
789 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790 ## NOTE: The "close tag open state" in the spec is implemented as
791 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792
793 ## XML5: "end tag state".
794
795 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797 if (defined $self->{last_stag_name}) {
798 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 $self->{kwd} = '';
800 ## Reconsume.
801 redo A;
802 } else {
803 ## No start tag token has ever been emitted
804 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805
806 $self->{state} = DATA_STATE;
807 $self->{s_kwd} = '';
808 ## Reconsume.
809 return ({type => CHARACTER_TOKEN, data => '</',
810 line => $l, column => $c,
811 });
812 redo A;
813 }
814 }
815
816 if (0x0041 <= $self->{nc} and
817 $self->{nc} <= 0x005A) { # A..Z
818
819 $self->{ct}
820 = {type => END_TAG_TOKEN,
821 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 line => $l, column => $c};
823 $self->{state} = TAG_NAME_STATE;
824
825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826 $self->{line_prev} = $self->{line};
827 $self->{column_prev} = $self->{column};
828 $self->{column}++;
829 $self->{nc}
830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831 } else {
832 $self->{set_nc}->($self);
833 }
834
835 redo A;
836 } elsif (0x0061 <= $self->{nc} and
837 $self->{nc} <= 0x007A) { # a..z
838
839 $self->{ct} = {type => END_TAG_TOKEN,
840 tag_name => chr ($self->{nc}),
841 line => $l, column => $c};
842 $self->{state} = TAG_NAME_STATE;
843
844 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845 $self->{line_prev} = $self->{line};
846 $self->{column_prev} = $self->{column};
847 $self->{column}++;
848 $self->{nc}
849 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850 } else {
851 $self->{set_nc}->($self);
852 }
853
854 redo A;
855 } elsif ($self->{nc} == 0x003E) { # >
856 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857 line => $self->{line_prev}, ## "<" in "</>"
858 column => $self->{column_prev} - 1);
859 $self->{state} = DATA_STATE;
860 $self->{s_kwd} = '';
861 if ($self->{is_xml}) {
862
863 ## XML5: No parse error.
864
865 ## NOTE: This parser raises a parse error, since it supports
866 ## XML1, not XML5.
867
868 ## NOTE: A short end tag token.
869 my $ct = {type => END_TAG_TOKEN,
870 tag_name => '',
871 line => $self->{line_prev},
872 column => $self->{column_prev} - 1,
873 };
874
875 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876 $self->{line_prev} = $self->{line};
877 $self->{column_prev} = $self->{column};
878 $self->{column}++;
879 $self->{nc}
880 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881 } else {
882 $self->{set_nc}->($self);
883 }
884
885 return ($ct);
886 } else {
887
888
889 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890 $self->{line_prev} = $self->{line};
891 $self->{column_prev} = $self->{column};
892 $self->{column}++;
893 $self->{nc}
894 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895 } else {
896 $self->{set_nc}->($self);
897 }
898
899 }
900 redo A;
901 } elsif ($self->{nc} == -1) {
902
903 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 $self->{s_kwd} = '';
905 $self->{state} = DATA_STATE;
906 # reconsume
907
908 return ({type => CHARACTER_TOKEN, data => '</',
909 line => $l, column => $c,
910 });
911
912 redo A;
913 } elsif (not $self->{is_xml} or
914 $is_space->{$self->{nc}}) {
915
916 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917 line => $self->{line_prev}, # "<" of "</"
918 column => $self->{column_prev} - 1);
919 $self->{state} = BOGUS_COMMENT_STATE;
920 $self->{ct} = {type => COMMENT_TOKEN, data => '',
921 line => $self->{line_prev}, # "<" of "</"
922 column => $self->{column_prev} - 1,
923 };
924 ## NOTE: $self->{nc} is intentionally left as is.
925 ## Although the "anything else" case of the spec not explicitly
926 ## states that the next input character is to be reconsumed,
927 ## it will be included to the |data| of the comment token
928 ## generated from the bogus end tag, as defined in the
929 ## "bogus comment state" entry.
930 redo A;
931 } else {
932 ## XML5: "</:" is a parse error.
933
934 $self->{ct} = {type => END_TAG_TOKEN,
935 tag_name => chr ($self->{nc}),
936 line => $l, column => $c};
937 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938
939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940 $self->{line_prev} = $self->{line};
941 $self->{column_prev} = $self->{column};
942 $self->{column}++;
943 $self->{nc}
944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945 } else {
946 $self->{set_nc}->($self);
947 }
948
949 redo A;
950 }
951 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 if (length $ch) {
954 my $CH = $ch;
955 $ch =~ tr/a-z/A-Z/;
956 my $nch = chr $self->{nc};
957 if ($nch eq $ch or $nch eq $CH) {
958
959 ## Stay in the state.
960 $self->{kwd} .= $nch;
961
962 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963 $self->{line_prev} = $self->{line};
964 $self->{column_prev} = $self->{column};
965 $self->{column}++;
966 $self->{nc}
967 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968 } else {
969 $self->{set_nc}->($self);
970 }
971
972 redo A;
973 } else {
974
975 $self->{state} = DATA_STATE;
976 $self->{s_kwd} = '';
977 ## Reconsume.
978 return ({type => CHARACTER_TOKEN,
979 data => '</' . $self->{kwd},
980 line => $self->{line_prev},
981 column => $self->{column_prev} - 1 - length $self->{kwd},
982 });
983 redo A;
984 }
985 } else { # after "<{tag-name}"
986 unless ($is_space->{$self->{nc}} or
987 {
988 0x003E => 1, # >
989 0x002F => 1, # /
990 -1 => 1, # EOF
991 }->{$self->{nc}}) {
992
993 ## Reconsume.
994 $self->{state} = DATA_STATE;
995 $self->{s_kwd} = '';
996 return ({type => CHARACTER_TOKEN,
997 data => '</' . $self->{kwd},
998 line => $self->{line_prev},
999 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 });
1001 redo A;
1002 } else {
1003
1004 $self->{ct}
1005 = {type => END_TAG_TOKEN,
1006 tag_name => $self->{last_stag_name},
1007 line => $self->{line_prev},
1008 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 $self->{state} = TAG_NAME_STATE;
1010 ## Reconsume.
1011 redo A;
1012 }
1013 }
1014 } elsif ($self->{state} == TAG_NAME_STATE) {
1015 if ($is_space->{$self->{nc}}) {
1016
1017 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018
1019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020 $self->{line_prev} = $self->{line};
1021 $self->{column_prev} = $self->{column};
1022 $self->{column}++;
1023 $self->{nc}
1024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025 } else {
1026 $self->{set_nc}->($self);
1027 }
1028
1029 redo A;
1030 } elsif ($self->{nc} == 0x003E) { # >
1031 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032
1033 $self->{last_stag_name} = $self->{ct}->{tag_name};
1034 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036 #if ($self->{ct}->{attributes}) {
1037 # ## NOTE: This should never be reached.
1038 # !!! cp (36);
1039 # !!! parse-error (type => 'end tag attribute');
1040 #} else {
1041
1042 #}
1043 } else {
1044 die "$0: $self->{ct}->{type}: Unknown token type";
1045 }
1046 $self->{state} = DATA_STATE;
1047 $self->{s_kwd} = '';
1048
1049 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050 $self->{line_prev} = $self->{line};
1051 $self->{column_prev} = $self->{column};
1052 $self->{column}++;
1053 $self->{nc}
1054 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055 } else {
1056 $self->{set_nc}->($self);
1057 }
1058
1059
1060 return ($self->{ct}); # start tag or end tag
1061
1062 redo A;
1063 } elsif (0x0041 <= $self->{nc} and
1064 $self->{nc} <= 0x005A) { # A..Z
1065
1066 $self->{ct}->{tag_name}
1067 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 # start tag or end tag
1069 ## Stay in this state
1070
1071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072 $self->{line_prev} = $self->{line};
1073 $self->{column_prev} = $self->{column};
1074 $self->{column}++;
1075 $self->{nc}
1076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077 } else {
1078 $self->{set_nc}->($self);
1079 }
1080
1081 redo A;
1082 } elsif ($self->{nc} == -1) {
1083 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085
1086 $self->{last_stag_name} = $self->{ct}->{tag_name};
1087 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089 #if ($self->{ct}->{attributes}) {
1090 # ## NOTE: This state should never be reached.
1091 # !!! cp (40);
1092 # !!! parse-error (type => 'end tag attribute');
1093 #} else {
1094
1095 #}
1096 } else {
1097 die "$0: $self->{ct}->{type}: Unknown token type";
1098 }
1099 $self->{state} = DATA_STATE;
1100 $self->{s_kwd} = '';
1101 # reconsume
1102
1103 return ($self->{ct}); # start tag or end tag
1104
1105 redo A;
1106 } elsif ($self->{nc} == 0x002F) { # /
1107
1108 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109
1110 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111 $self->{line_prev} = $self->{line};
1112 $self->{column_prev} = $self->{column};
1113 $self->{column}++;
1114 $self->{nc}
1115 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116 } else {
1117 $self->{set_nc}->($self);
1118 }
1119
1120 redo A;
1121 } else {
1122
1123 $self->{ct}->{tag_name} .= chr $self->{nc};
1124 # start tag or end tag
1125 ## Stay in the state
1126
1127 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128 $self->{line_prev} = $self->{line};
1129 $self->{column_prev} = $self->{column};
1130 $self->{column}++;
1131 $self->{nc}
1132 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133 } else {
1134 $self->{set_nc}->($self);
1135 }
1136
1137 redo A;
1138 }
1139 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 ## XML5: "Tag attribute name before state".
1141
1142 if ($is_space->{$self->{nc}}) {
1143
1144 ## Stay in the state
1145
1146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147 $self->{line_prev} = $self->{line};
1148 $self->{column_prev} = $self->{column};
1149 $self->{column}++;
1150 $self->{nc}
1151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152 } else {
1153 $self->{set_nc}->($self);
1154 }
1155
1156 redo A;
1157 } elsif ($self->{nc} == 0x003E) { # >
1158 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159
1160 $self->{last_stag_name} = $self->{ct}->{tag_name};
1161 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163 if ($self->{ct}->{attributes}) {
1164
1165 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166 } else {
1167
1168 }
1169 } else {
1170 die "$0: $self->{ct}->{type}: Unknown token type";
1171 }
1172 $self->{state} = DATA_STATE;
1173 $self->{s_kwd} = '';
1174
1175 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176 $self->{line_prev} = $self->{line};
1177 $self->{column_prev} = $self->{column};
1178 $self->{column}++;
1179 $self->{nc}
1180 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181 } else {
1182 $self->{set_nc}->($self);
1183 }
1184
1185
1186 return ($self->{ct}); # start tag or end tag
1187
1188 redo A;
1189 } elsif (0x0041 <= $self->{nc} and
1190 $self->{nc} <= 0x005A) { # A..Z
1191
1192 $self->{ca}
1193 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 value => '',
1195 line => $self->{line}, column => $self->{column}};
1196 $self->{state} = ATTRIBUTE_NAME_STATE;
1197
1198 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199 $self->{line_prev} = $self->{line};
1200 $self->{column_prev} = $self->{column};
1201 $self->{column}++;
1202 $self->{nc}
1203 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204 } else {
1205 $self->{set_nc}->($self);
1206 }
1207
1208 redo A;
1209 } elsif ($self->{nc} == 0x002F) { # /
1210
1211 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212
1213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214 $self->{line_prev} = $self->{line};
1215 $self->{column_prev} = $self->{column};
1216 $self->{column}++;
1217 $self->{nc}
1218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219 } else {
1220 $self->{set_nc}->($self);
1221 }
1222
1223 redo A;
1224 } elsif ($self->{nc} == -1) {
1225 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227
1228 $self->{last_stag_name} = $self->{ct}->{tag_name};
1229 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231 if ($self->{ct}->{attributes}) {
1232
1233 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234 } else {
1235
1236 }
1237 } else {
1238 die "$0: $self->{ct}->{type}: Unknown token type";
1239 }
1240 $self->{state} = DATA_STATE;
1241 $self->{s_kwd} = '';
1242 # reconsume
1243
1244 return ($self->{ct}); # start tag or end tag
1245
1246 redo A;
1247 } else {
1248 if ({
1249 0x0022 => 1, # "
1250 0x0027 => 1, # '
1251 0x003D => 1, # =
1252 }->{$self->{nc}}) {
1253
1254 ## XML5: Not a parse error.
1255 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256 } else {
1257
1258 ## XML5: ":" raises a parse error and is ignored.
1259 }
1260 $self->{ca}
1261 = {name => chr ($self->{nc}),
1262 value => '',
1263 line => $self->{line}, column => $self->{column}};
1264 $self->{state} = ATTRIBUTE_NAME_STATE;
1265
1266 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267 $self->{line_prev} = $self->{line};
1268 $self->{column_prev} = $self->{column};
1269 $self->{column}++;
1270 $self->{nc}
1271 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272 } else {
1273 $self->{set_nc}->($self);
1274 }
1275
1276 redo A;
1277 }
1278 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 ## XML5: "Tag attribute name state".
1280
my $before_leave = sub {
  ## Called whenever the tokenizer leaves the attribute name state:
  ## files the attribute under construction (|$self->{ca}|) into the
  ## current tag token (|$self->{ct}|, a start tag or an end tag),
  ## unless an attribute with the same name has already been recorded.
  my $name = $self->{ca}->{name};

  if (not exists $self->{ct}->{attributes}->{$name}) {
    ## First attribute with this name: store it on the token and
    ## give it the next value of the token's |last_index| counter.
    $self->{ct}->{attributes}->{$name} = $self->{ca};
    $self->{ca}->{index} = ++$self->{ct}->{last_index};
  } else { # MUST
    ## Same name seen before on this token: report the error at the
    ## position saved in the attribute; the attribute is not added
    ## to the token (i.e. it is discarded). # MUST
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'duplicate attribute',
                           text => $name,
                           line => $self->{ca}->{line},
                           column => $self->{ca}->{column});
  }
}; # $before_leave
1294
1295 if ($is_space->{$self->{nc}}) {
1296
1297 $before_leave->();
1298 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299
1300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301 $self->{line_prev} = $self->{line};
1302 $self->{column_prev} = $self->{column};
1303 $self->{column}++;
1304 $self->{nc}
1305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306 } else {
1307 $self->{set_nc}->($self);
1308 }
1309
1310 redo A;
1311 } elsif ($self->{nc} == 0x003D) { # =
1312
1313 $before_leave->();
1314 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315
1316 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317 $self->{line_prev} = $self->{line};
1318 $self->{column_prev} = $self->{column};
1319 $self->{column}++;
1320 $self->{nc}
1321 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322 } else {
1323 $self->{set_nc}->($self);
1324 }
1325
1326 redo A;
1327 } elsif ($self->{nc} == 0x003E) { # >
1328 if ($self->{is_xml}) {
1329
1330 ## XML5: Not a parse error.
1331 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332 } else {
1333
1334 }
1335
1336 $before_leave->();
1337 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338
1339 $self->{last_stag_name} = $self->{ct}->{tag_name};
1340 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341
1342 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343 if ($self->{ct}->{attributes}) {
1344 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345 }
1346 } else {
1347 die "$0: $self->{ct}->{type}: Unknown token type";
1348 }
1349 $self->{state} = DATA_STATE;
1350 $self->{s_kwd} = '';
1351
1352 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353 $self->{line_prev} = $self->{line};
1354 $self->{column_prev} = $self->{column};
1355 $self->{column}++;
1356 $self->{nc}
1357 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358 } else {
1359 $self->{set_nc}->($self);
1360 }
1361
1362
1363 return ($self->{ct}); # start tag or end tag
1364
1365 redo A;
1366 } elsif (0x0041 <= $self->{nc} and
1367 $self->{nc} <= 0x005A) { # A..Z
1368
1369 $self->{ca}->{name}
1370 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 ## Stay in the state
1372
1373 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374 $self->{line_prev} = $self->{line};
1375 $self->{column_prev} = $self->{column};
1376 $self->{column}++;
1377 $self->{nc}
1378 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379 } else {
1380 $self->{set_nc}->($self);
1381 }
1382
1383 redo A;
1384 } elsif ($self->{nc} == 0x002F) { # /
1385 if ($self->{is_xml}) {
1386
1387 ## XML5: Not a parse error.
1388 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389 } else {
1390
1391 }
1392
1393 $before_leave->();
1394 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395
1396 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397 $self->{line_prev} = $self->{line};
1398 $self->{column_prev} = $self->{column};
1399 $self->{column}++;
1400 $self->{nc}
1401 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402 } else {
1403 $self->{set_nc}->($self);
1404 }
1405
1406 redo A;
1407 } elsif ($self->{nc} == -1) {
1408 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409 $before_leave->();
1410 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411
1412 $self->{last_stag_name} = $self->{ct}->{tag_name};
1413 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415 if ($self->{ct}->{attributes}) {
1416
1417 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418 } else {
1419 ## NOTE: This state should never be reached.
1420
1421 }
1422 } else {
1423 die "$0: $self->{ct}->{type}: Unknown token type";
1424 }
1425 $self->{state} = DATA_STATE;
1426 $self->{s_kwd} = '';
1427 # reconsume
1428
1429 return ($self->{ct}); # start tag or end tag
1430
1431 redo A;
1432 } else {
1433 if ($self->{nc} == 0x0022 or # "
1434 $self->{nc} == 0x0027) { # '
1435
1436 ## XML5: Not a parse error.
1437 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438 } else {
1439
1440 }
1441 $self->{ca}->{name} .= chr ($self->{nc});
1442 ## Stay in the state
1443
1444 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445 $self->{line_prev} = $self->{line};
1446 $self->{column_prev} = $self->{column};
1447 $self->{column}++;
1448 $self->{nc}
1449 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450 } else {
1451 $self->{set_nc}->($self);
1452 }
1453
1454 redo A;
1455 }
1456 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 ## XML5: "Tag attribute name after state".
1458
1459 if ($is_space->{$self->{nc}}) {
1460
1461 ## Stay in the state
1462
1463 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464 $self->{line_prev} = $self->{line};
1465 $self->{column_prev} = $self->{column};
1466 $self->{column}++;
1467 $self->{nc}
1468 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469 } else {
1470 $self->{set_nc}->($self);
1471 }
1472
1473 redo A;
1474 } elsif ($self->{nc} == 0x003D) { # =
1475
1476 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477
1478 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479 $self->{line_prev} = $self->{line};
1480 $self->{column_prev} = $self->{column};
1481 $self->{column}++;
1482 $self->{nc}
1483 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484 } else {
1485 $self->{set_nc}->($self);
1486 }
1487
1488 redo A;
1489 } elsif ($self->{nc} == 0x003E) { # >
1490 if ($self->{is_xml}) {
1491
1492 ## XML5: Not a parse error.
1493 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494 } else {
1495
1496 }
1497
1498 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499
1500 $self->{last_stag_name} = $self->{ct}->{tag_name};
1501 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503 if ($self->{ct}->{attributes}) {
1504
1505 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506 } else {
1507 ## NOTE: This state should never be reached.
1508
1509 }
1510 } else {
1511 die "$0: $self->{ct}->{type}: Unknown token type";
1512 }
1513 $self->{state} = DATA_STATE;
1514 $self->{s_kwd} = '';
1515
1516 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517 $self->{line_prev} = $self->{line};
1518 $self->{column_prev} = $self->{column};
1519 $self->{column}++;
1520 $self->{nc}
1521 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522 } else {
1523 $self->{set_nc}->($self);
1524 }
1525
1526
1527 return ($self->{ct}); # start tag or end tag
1528
1529 redo A;
1530 } elsif (0x0041 <= $self->{nc} and
1531 $self->{nc} <= 0x005A) { # A..Z
1532
1533 $self->{ca}
1534 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 value => '',
1536 line => $self->{line}, column => $self->{column}};
1537 $self->{state} = ATTRIBUTE_NAME_STATE;
1538
1539 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540 $self->{line_prev} = $self->{line};
1541 $self->{column_prev} = $self->{column};
1542 $self->{column}++;
1543 $self->{nc}
1544 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545 } else {
1546 $self->{set_nc}->($self);
1547 }
1548
1549 redo A;
1550 } elsif ($self->{nc} == 0x002F) { # /
1551 if ($self->{is_xml}) {
1552
1553 ## XML5: Not a parse error.
1554 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555 } else {
1556
1557 }
1558
1559 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560
1561 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562 $self->{line_prev} = $self->{line};
1563 $self->{column_prev} = $self->{column};
1564 $self->{column}++;
1565 $self->{nc}
1566 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567 } else {
1568 $self->{set_nc}->($self);
1569 }
1570
1571 redo A;
1572 } elsif ($self->{nc} == -1) {
1573 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575
1576 $self->{last_stag_name} = $self->{ct}->{tag_name};
1577 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579 if ($self->{ct}->{attributes}) {
1580
1581 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582 } else {
1583 ## NOTE: This state should never be reached.
1584
1585 }
1586 } else {
1587 die "$0: $self->{ct}->{type}: Unknown token type";
1588 }
1589 $self->{s_kwd} = '';
1590 $self->{state} = DATA_STATE;
1591 # reconsume
1592
1593 return ($self->{ct}); # start tag or end tag
1594
1595 redo A;
1596 } else {
1597 if ($self->{is_xml}) {
1598
1599 ## XML5: Not a parse error.
1600 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601 } else {
1602
1603 }
1604
1605 if ($self->{nc} == 0x0022 or # "
1606 $self->{nc} == 0x0027) { # '
1607
1608 ## XML5: Not a parse error.
1609 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610 } else {
1611
1612 }
1613 $self->{ca}
1614 = {name => chr ($self->{nc}),
1615 value => '',
1616 line => $self->{line}, column => $self->{column}};
1617 $self->{state} = ATTRIBUTE_NAME_STATE;
1618
1619 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620 $self->{line_prev} = $self->{line};
1621 $self->{column_prev} = $self->{column};
1622 $self->{column}++;
1623 $self->{nc}
1624 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625 } else {
1626 $self->{set_nc}->($self);
1627 }
1628
1629 redo A;
1630 }
1631 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 ## XML5: "Tag attribute value before state".
1633
1634 if ($is_space->{$self->{nc}}) {
1635
1636 ## Stay in the state
1637
1638 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639 $self->{line_prev} = $self->{line};
1640 $self->{column_prev} = $self->{column};
1641 $self->{column}++;
1642 $self->{nc}
1643 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644 } else {
1645 $self->{set_nc}->($self);
1646 }
1647
1648 redo A;
1649 } elsif ($self->{nc} == 0x0022) { # "
1650
1651 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652
1653 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654 $self->{line_prev} = $self->{line};
1655 $self->{column_prev} = $self->{column};
1656 $self->{column}++;
1657 $self->{nc}
1658 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659 } else {
1660 $self->{set_nc}->($self);
1661 }
1662
1663 redo A;
1664 } elsif ($self->{nc} == 0x0026) { # &
1665
1666 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667 ## reconsume
1668 redo A;
1669 } elsif ($self->{nc} == 0x0027) { # '
1670
1671 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672
1673 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674 $self->{line_prev} = $self->{line};
1675 $self->{column_prev} = $self->{column};
1676 $self->{column}++;
1677 $self->{nc}
1678 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679 } else {
1680 $self->{set_nc}->($self);
1681 }
1682
1683 redo A;
1684 } elsif ($self->{nc} == 0x003E) { # >
1685 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687
1688 $self->{last_stag_name} = $self->{ct}->{tag_name};
1689 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691 if ($self->{ct}->{attributes}) {
1692
1693 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694 } else {
1695 ## NOTE: This state should never be reached.
1696
1697 }
1698 } else {
1699 die "$0: $self->{ct}->{type}: Unknown token type";
1700 }
1701 $self->{state} = DATA_STATE;
1702 $self->{s_kwd} = '';
1703
1704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705 $self->{line_prev} = $self->{line};
1706 $self->{column_prev} = $self->{column};
1707 $self->{column}++;
1708 $self->{nc}
1709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710 } else {
1711 $self->{set_nc}->($self);
1712 }
1713
1714
1715 return ($self->{ct}); # start tag or end tag
1716
1717 redo A;
1718 } elsif ($self->{nc} == -1) {
1719 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721
1722 $self->{last_stag_name} = $self->{ct}->{tag_name};
1723 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725 if ($self->{ct}->{attributes}) {
1726
1727 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728 } else {
1729 ## NOTE: This state should never be reached.
1730
1731 }
1732 } else {
1733 die "$0: $self->{ct}->{type}: Unknown token type";
1734 }
1735 $self->{state} = DATA_STATE;
1736 $self->{s_kwd} = '';
1737 ## reconsume
1738
1739 return ($self->{ct}); # start tag or end tag
1740
1741 redo A;
1742 } else {
1743 if ($self->{nc} == 0x003D) { # =
1744
1745 ## XML5: Not a parse error.
1746 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 } elsif ($self->{is_xml}) {
1748
1749 ## XML5: No parse error.
1750 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 } else {
1752
1753 }
1754 $self->{ca}->{value} .= chr ($self->{nc});
1755 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756
1757 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758 $self->{line_prev} = $self->{line};
1759 $self->{column_prev} = $self->{column};
1760 $self->{column}++;
1761 $self->{nc}
1762 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763 } else {
1764 $self->{set_nc}->($self);
1765 }
1766
1767 redo A;
1768 }
1769 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771 ## ATTLIST attribute value double quoted state".
1772
1773 if ($self->{nc} == 0x0022) { # "
1774 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775
1776 ## XML5: "DOCTYPE ATTLIST name after state".
1777 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779 } else {
1780
1781 ## XML5: "Tag attribute name before state".
1782 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783 }
1784
1785 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786 $self->{line_prev} = $self->{line};
1787 $self->{column_prev} = $self->{column};
1788 $self->{column}++;
1789 $self->{nc}
1790 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791 } else {
1792 $self->{set_nc}->($self);
1793 }
1794
1795 redo A;
1796 } elsif ($self->{nc} == 0x0026) { # &
1797
1798 ## XML5: Not defined yet.
1799
1800 ## NOTE: In the spec, the tokenizer is switched to the
1801 ## "entity in attribute value state". In this implementation, the
1802 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803 ## implementation of the "consume a character reference" algorithm.
1804 $self->{prev_state} = $self->{state};
1805 $self->{entity_add} = 0x0022; # "
1806 $self->{state} = ENTITY_STATE;
1807
1808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809 $self->{line_prev} = $self->{line};
1810 $self->{column_prev} = $self->{column};
1811 $self->{column}++;
1812 $self->{nc}
1813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814 } else {
1815 $self->{set_nc}->($self);
1816 }
1817
1818 redo A;
1819 } elsif ($self->{is_xml} and
1820 $is_space->{$self->{nc}}) {
1821
1822 $self->{ca}->{value} .= ' ';
1823 ## Stay in the state.
1824
1825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1826 $self->{line_prev} = $self->{line};
1827 $self->{column_prev} = $self->{column};
1828 $self->{column}++;
1829 $self->{nc}
1830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1831 } else {
1832 $self->{set_nc}->($self);
1833 }
1834
1835 redo A;
1836 } elsif ($self->{nc} == -1) {
1837 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1838 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1839
1840 $self->{last_stag_name} = $self->{ct}->{tag_name};
1841
1842 $self->{state} = DATA_STATE;
1843 $self->{s_kwd} = '';
1844 ## reconsume
1845 return ($self->{ct}); # start tag
1846 redo A;
1847 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1848 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1849 if ($self->{ct}->{attributes}) {
1850
1851 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1852 } else {
1853 ## NOTE: This state should never be reached.
1854
1855 }
1856
1857 $self->{state} = DATA_STATE;
1858 $self->{s_kwd} = '';
1859 ## reconsume
1860 return ($self->{ct}); # end tag
1861 redo A;
1862 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1863 ## XML5: No parse error above; not defined yet.
1864 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1865 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1866 ## Reconsume.
1867 return ($self->{ct}); # ATTLIST
1868 redo A;
1869 } else {
1870 die "$0: $self->{ct}->{type}: Unknown token type";
1871 }
1872 } else {
1873 ## XML5 [ATTLIST]: Not defined yet.
1874 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1875
1876 ## XML5: Not a parse error.
1877 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1878 } else {
1879
1880 }
1881 $self->{ca}->{value} .= chr ($self->{nc});
1882 $self->{read_until}->($self->{ca}->{value},
1883 qq["&<\x09\x0C\x20],
1884 length $self->{ca}->{value});
1885
1886 ## Stay in the state
1887
1888 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1889 $self->{line_prev} = $self->{line};
1890 $self->{column_prev} = $self->{column};
1891 $self->{column}++;
1892 $self->{nc}
1893 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1894 } else {
1895 $self->{set_nc}->($self);
1896 }
1897
1898 redo A;
1899 }
1900 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1901 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1902 ## ATTLIST attribute value single quoted state".
1903
1904 if ($self->{nc} == 0x0027) { # '
1905 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1906
1907 ## XML5: "DOCTYPE ATTLIST name after state".
1908 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1909 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1910 } else {
1911
1912 ## XML5: "Before attribute name state" (sic).
1913 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1914 }
1915
1916 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1917 $self->{line_prev} = $self->{line};
1918 $self->{column_prev} = $self->{column};
1919 $self->{column}++;
1920 $self->{nc}
1921 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1922 } else {
1923 $self->{set_nc}->($self);
1924 }
1925
1926 redo A;
1927 } elsif ($self->{nc} == 0x0026) { # &
1928
1929 ## XML5: Not defined yet.
1930
1931 ## NOTE: In the spec, the tokenizer is switched to the
1932 ## "entity in attribute value state". In this implementation, the
1933 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1934 ## implementation of the "consume a character reference" algorithm.
1935 $self->{entity_add} = 0x0027; # '
1936 $self->{prev_state} = $self->{state};
1937 $self->{state} = ENTITY_STATE;
1938
1939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1940 $self->{line_prev} = $self->{line};
1941 $self->{column_prev} = $self->{column};
1942 $self->{column}++;
1943 $self->{nc}
1944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1945 } else {
1946 $self->{set_nc}->($self);
1947 }
1948
1949 redo A;
1950 } elsif ($self->{is_xml} and
1951 $is_space->{$self->{nc}}) {
1952
1953 $self->{ca}->{value} .= ' ';
1954 ## Stay in the state.
1955
1956 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1957 $self->{line_prev} = $self->{line};
1958 $self->{column_prev} = $self->{column};
1959 $self->{column}++;
1960 $self->{nc}
1961 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1962 } else {
1963 $self->{set_nc}->($self);
1964 }
1965
1966 redo A;
1967 } elsif ($self->{nc} == -1) {
1968 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1969 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1970
1971 $self->{last_stag_name} = $self->{ct}->{tag_name};
1972
1973 $self->{state} = DATA_STATE;
1974 $self->{s_kwd} = '';
1975 ## reconsume
1976 return ($self->{ct}); # start tag
1977 redo A;
1978 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1979 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1980 if ($self->{ct}->{attributes}) {
1981
1982 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1983 } else {
1984 ## NOTE: This state should never be reached.
1985
1986 }
1987
1988 $self->{state} = DATA_STATE;
1989 $self->{s_kwd} = '';
1990 ## reconsume
1991 return ($self->{ct}); # end tag
1992 redo A;
1993 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1994 ## XML5: No parse error above; not defined yet.
1995 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1996 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1997 ## Reconsume.
1998 return ($self->{ct}); # ATTLIST
1999 redo A;
2000 } else {
2001 die "$0: $self->{ct}->{type}: Unknown token type";
2002 }
2003 } else {
2004 ## XML5 [ATTLIST]: Not defined yet.
2005 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2006
2007 ## XML5: Not a parse error.
2008 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2009 } else {
2010
2011 }
2012 $self->{ca}->{value} .= chr ($self->{nc});
2013 $self->{read_until}->($self->{ca}->{value},
2014 qq['&<\x09\x0C\x20],
2015 length $self->{ca}->{value});
2016
2017 ## Stay in the state
2018
2019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2020 $self->{line_prev} = $self->{line};
2021 $self->{column_prev} = $self->{column};
2022 $self->{column}++;
2023 $self->{nc}
2024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2025 } else {
2026 $self->{set_nc}->($self);
2027 }
2028
2029 redo A;
2030 }
2031 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2032 ## XML5: "Tag attribute value unquoted state".
2033
2034 if ($is_space->{$self->{nc}}) {
2035 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2036
2037 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2038 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2039 } else {
2040
2041 ## XML5: "Tag attribute name before state".
2042 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2043 }
2044
2045 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2046 $self->{line_prev} = $self->{line};
2047 $self->{column_prev} = $self->{column};
2048 $self->{column}++;
2049 $self->{nc}
2050 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2051 } else {
2052 $self->{set_nc}->($self);
2053 }
2054
2055 redo A;
2056 } elsif ($self->{nc} == 0x0026) { # &
2057
2058
2059 ## XML5: Not defined yet.
2060
2061 ## NOTE: In the spec, the tokenizer is switched to the
2062 ## "entity in attribute value state". In this implementation, the
2063 ## tokenizer is switched to the |ENTITY_STATE|, which is an
2064 ## implementation of the "consume a character reference" algorithm.
2065 $self->{entity_add} = -1;
2066 $self->{prev_state} = $self->{state};
2067 $self->{state} = ENTITY_STATE;
2068
2069 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2070 $self->{line_prev} = $self->{line};
2071 $self->{column_prev} = $self->{column};
2072 $self->{column}++;
2073 $self->{nc}
2074 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2075 } else {
2076 $self->{set_nc}->($self);
2077 }
2078
2079 redo A;
2080 } elsif ($self->{nc} == 0x003E) { # >
2081 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2082
2083 $self->{last_stag_name} = $self->{ct}->{tag_name};
2084
2085 $self->{state} = DATA_STATE;
2086 $self->{s_kwd} = '';
2087
2088 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2089 $self->{line_prev} = $self->{line};
2090 $self->{column_prev} = $self->{column};
2091 $self->{column}++;
2092 $self->{nc}
2093 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2094 } else {
2095 $self->{set_nc}->($self);
2096 }
2097
2098 return ($self->{ct}); # start tag
2099 redo A;
2100 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2101 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2102 if ($self->{ct}->{attributes}) {
2103
2104 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2105 } else {
2106 ## NOTE: This state should never be reached.
2107
2108 }
2109
2110 $self->{state} = DATA_STATE;
2111 $self->{s_kwd} = '';
2112
2113 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2114 $self->{line_prev} = $self->{line};
2115 $self->{column_prev} = $self->{column};
2116 $self->{column}++;
2117 $self->{nc}
2118 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2119 } else {
2120 $self->{set_nc}->($self);
2121 }
2122
2123 return ($self->{ct}); # end tag
2124 redo A;
2125 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2126 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2127 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2128
2129 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2130 $self->{line_prev} = $self->{line};
2131 $self->{column_prev} = $self->{column};
2132 $self->{column}++;
2133 $self->{nc}
2134 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2135 } else {
2136 $self->{set_nc}->($self);
2137 }
2138
2139 return ($self->{ct}); # ATTLIST
2140 redo A;
2141 } else {
2142 die "$0: $self->{ct}->{type}: Unknown token type";
2143 }
2144 } elsif ($self->{nc} == -1) {
2145 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2146
2147 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2148 $self->{last_stag_name} = $self->{ct}->{tag_name};
2149
2150 $self->{state} = DATA_STATE;
2151 $self->{s_kwd} = '';
2152 ## reconsume
2153 return ($self->{ct}); # start tag
2154 redo A;
2155 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2156 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2158 if ($self->{ct}->{attributes}) {
2159
2160 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2161 } else {
2162 ## NOTE: This state should never be reached.
2163
2164 }
2165
2166 $self->{state} = DATA_STATE;
2167 $self->{s_kwd} = '';
2168 ## reconsume
2169 return ($self->{ct}); # end tag
2170 redo A;
2171 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2172 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2173 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2174 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2175 ## Reconsume.
2176 return ($self->{ct}); # ATTLIST
2177 redo A;
2178 } else {
2179 die "$0: $self->{ct}->{type}: Unknown token type";
2180 }
2181 } else {
2182 if ({
2183 0x0022 => 1, # "
2184 0x0027 => 1, # '
2185 0x003D => 1, # =
2186 }->{$self->{nc}}) {
2187
2188 ## XML5: Not a parse error.
2189 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2190 } else {
2191
2192 }
2193 $self->{ca}->{value} .= chr ($self->{nc});
2194 $self->{read_until}->($self->{ca}->{value},
2195 qq["'=& \x09\x0C>],
2196 length $self->{ca}->{value});
2197
2198 ## Stay in the state
2199
2200 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2201 $self->{line_prev} = $self->{line};
2202 $self->{column_prev} = $self->{column};
2203 $self->{column}++;
2204 $self->{nc}
2205 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2206 } else {
2207 $self->{set_nc}->($self);
2208 }
2209
2210 redo A;
2211 }
2212 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2213 if ($is_space->{$self->{nc}}) {
2214
2215 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2216
2217 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2218 $self->{line_prev} = $self->{line};
2219 $self->{column_prev} = $self->{column};
2220 $self->{column}++;
2221 $self->{nc}
2222 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2223 } else {
2224 $self->{set_nc}->($self);
2225 }
2226
2227 redo A;
2228 } elsif ($self->{nc} == 0x003E) { # >
2229 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2230
2231 $self->{last_stag_name} = $self->{ct}->{tag_name};
2232 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2233 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2234 if ($self->{ct}->{attributes}) {
2235
2236 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2237 } else {
2238 ## NOTE: This state should never be reached.
2239
2240 }
2241 } else {
2242 die "$0: $self->{ct}->{type}: Unknown token type";
2243 }
2244 $self->{state} = DATA_STATE;
2245 $self->{s_kwd} = '';
2246
2247 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2248 $self->{line_prev} = $self->{line};
2249 $self->{column_prev} = $self->{column};
2250 $self->{column}++;
2251 $self->{nc}
2252 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2253 } else {
2254 $self->{set_nc}->($self);
2255 }
2256
2257
2258 return ($self->{ct}); # start tag or end tag
2259
2260 redo A;
2261 } elsif ($self->{nc} == 0x002F) { # /
2262
2263 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2264
2265 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2266 $self->{line_prev} = $self->{line};
2267 $self->{column_prev} = $self->{column};
2268 $self->{column}++;
2269 $self->{nc}
2270 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2271 } else {
2272 $self->{set_nc}->($self);
2273 }
2274
2275 redo A;
2276 } elsif ($self->{nc} == -1) {
2277 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2278 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2279
2280 $self->{last_stag_name} = $self->{ct}->{tag_name};
2281 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2282 if ($self->{ct}->{attributes}) {
2283
2284 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2285 } else {
2286 ## NOTE: This state should never be reached.
2287
2288 }
2289 } else {
2290 die "$0: $self->{ct}->{type}: Unknown token type";
2291 }
2292 $self->{state} = DATA_STATE;
2293 $self->{s_kwd} = '';
2294 ## Reconsume.
2295 return ($self->{ct}); # start tag or end tag
2296 redo A;
2297 } else {
2298
2299 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2300 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2301 ## reconsume
2302 redo A;
2303 }
2304 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2305 ## XML5: "Empty tag state".
2306
2307 if ($self->{nc} == 0x003E) { # >
2308 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2309
2310 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2311 ## TODO: Different type than slash in start tag
2312 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2313 if ($self->{ct}->{attributes}) {
2314
2315 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2316 } else {
2317
2318 }
2319 ## TODO: Test |<title></title/>|
2320 } else {
2321
2322 $self->{self_closing} = 1;
2323 }
2324
2325 $self->{state} = DATA_STATE;
2326 $self->{s_kwd} = '';
2327
2328 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2329 $self->{line_prev} = $self->{line};
2330 $self->{column_prev} = $self->{column};
2331 $self->{column}++;
2332 $self->{nc}
2333 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2334 } else {
2335 $self->{set_nc}->($self);
2336 }
2337
2338
2339 return ($self->{ct}); # start tag or end tag
2340
2341 redo A;
2342 } elsif ($self->{nc} == -1) {
2343 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2344 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2345
2346 $self->{last_stag_name} = $self->{ct}->{tag_name};
2347 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2348 if ($self->{ct}->{attributes}) {
2349
2350 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2351 } else {
2352 ## NOTE: This state should never be reached.
2353
2354 }
2355 } else {
2356 die "$0: $self->{ct}->{type}: Unknown token type";
2357 }
2358 ## XML5: "Tag attribute name before state".
2359 $self->{state} = DATA_STATE;
2360 $self->{s_kwd} = '';
2361 ## Reconsume.
2362 return ($self->{ct}); # start tag or end tag
2363 redo A;
2364 } else {
2365
2366 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2367 ## TODO: This error type is wrong.
2368 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2369 ## Reconsume.
2370 redo A;
2371 }
2372 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2373 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2374
2375 ## NOTE: Unlike spec's "bogus comment state", this implementation
2376 ## consumes characters on a one-by-one basis.
2377
2378 if ($self->{nc} == 0x003E) { # >
2379 if ($self->{in_subset}) {
2380
2381 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2382 } else {
2383
2384 $self->{state} = DATA_STATE;
2385 $self->{s_kwd} = '';
2386 }
2387
2388 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2389 $self->{line_prev} = $self->{line};
2390 $self->{column_prev} = $self->{column};
2391 $self->{column}++;
2392 $self->{nc}
2393 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2394 } else {
2395 $self->{set_nc}->($self);
2396 }
2397
2398
2399 return ($self->{ct}); # comment
2400 redo A;
2401 } elsif ($self->{nc} == -1) {
2402 if ($self->{in_subset}) {
2403
2404 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2405 } else {
2406
2407 $self->{state} = DATA_STATE;
2408 $self->{s_kwd} = '';
2409 }
2410 ## reconsume
2411
2412 return ($self->{ct}); # comment
2413 redo A;
2414 } else {
2415
2416 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2417 $self->{read_until}->($self->{ct}->{data},
2418 q[>],
2419 length $self->{ct}->{data});
2420
2421 ## Stay in the state.
2422
2423 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2424 $self->{line_prev} = $self->{line};
2425 $self->{column_prev} = $self->{column};
2426 $self->{column}++;
2427 $self->{nc}
2428 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2429 } else {
2430 $self->{set_nc}->($self);
2431 }
2432
2433 redo A;
2434 }
2435 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2436 ## XML5: "Markup declaration state".
2437
2438 if ($self->{nc} == 0x002D) { # -
2439
2440 $self->{state} = MD_HYPHEN_STATE;
2441
2442 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2443 $self->{line_prev} = $self->{line};
2444 $self->{column_prev} = $self->{column};
2445 $self->{column}++;
2446 $self->{nc}
2447 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2448 } else {
2449 $self->{set_nc}->($self);
2450 }
2451
2452 redo A;
2453 } elsif ($self->{nc} == 0x0044 or # D
2454 $self->{nc} == 0x0064) { # d
2455 ## ASCII case-insensitive.
2456
2457 $self->{state} = MD_DOCTYPE_STATE;
2458 $self->{kwd} = chr $self->{nc};
2459
2460 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2461 $self->{line_prev} = $self->{line};
2462 $self->{column_prev} = $self->{column};
2463 $self->{column}++;
2464 $self->{nc}
2465 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2466 } else {
2467 $self->{set_nc}->($self);
2468 }
2469
2470 redo A;
2471 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2472 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2473 $self->{is_xml}) and
2474 $self->{nc} == 0x005B) { # [
2475
2476 $self->{state} = MD_CDATA_STATE;
2477 $self->{kwd} = '[';
2478
2479 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480 $self->{line_prev} = $self->{line};
2481 $self->{column_prev} = $self->{column};
2482 $self->{column}++;
2483 $self->{nc}
2484 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485 } else {
2486 $self->{set_nc}->($self);
2487 }
2488
2489 redo A;
2490 } else {
2491
2492 }
2493
2494 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2495 line => $self->{line_prev},
2496 column => $self->{column_prev} - 1);
2497 ## Reconsume.
2498 $self->{state} = BOGUS_COMMENT_STATE;
2499 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2500 line => $self->{line_prev},
2501 column => $self->{column_prev} - 1,
2502 };
2503 redo A;
2504 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2505 if ($self->{nc} == 0x002D) { # -
2506
2507 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2508 line => $self->{line_prev},
2509 column => $self->{column_prev} - 2,
2510 };
2511 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2512
2513 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2514 $self->{line_prev} = $self->{line};
2515 $self->{column_prev} = $self->{column};
2516 $self->{column}++;
2517 $self->{nc}
2518 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2519 } else {
2520 $self->{set_nc}->($self);
2521 }
2522
2523 redo A;
2524 } else {
2525
2526 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2527 line => $self->{line_prev},
2528 column => $self->{column_prev} - 2);
2529 $self->{state} = BOGUS_COMMENT_STATE;
2530 ## Reconsume.
2531 $self->{ct} = {type => COMMENT_TOKEN,
2532 data => '-',
2533 line => $self->{line_prev},
2534 column => $self->{column_prev} - 2,
2535 };
2536 redo A;
2537 }
2538 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2539 ## ASCII case-insensitive.
2540 if ($self->{nc} == [
2541 undef,
2542 0x004F, # O
2543 0x0043, # C
2544 0x0054, # T
2545 0x0059, # Y
2546 0x0050, # P
2547 ]->[length $self->{kwd}] or
2548 $self->{nc} == [
2549 undef,
2550 0x006F, # o
2551 0x0063, # c
2552 0x0074, # t
2553 0x0079, # y
2554 0x0070, # p
2555 ]->[length $self->{kwd}]) {
2556
2557 ## Stay in the state.
2558 $self->{kwd} .= chr $self->{nc};
2559
2560 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2561 $self->{line_prev} = $self->{line};
2562 $self->{column_prev} = $self->{column};
2563 $self->{column}++;
2564 $self->{nc}
2565 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2566 } else {
2567 $self->{set_nc}->($self);
2568 }
2569
2570 redo A;
2571 } elsif ((length $self->{kwd}) == 6 and
2572 ($self->{nc} == 0x0045 or # E
2573 $self->{nc} == 0x0065)) { # e
2574 if ($self->{is_xml} and
2575 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2576
2577 ## XML5: case-sensitive.
2578 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2579 text => 'DOCTYPE',
2580 line => $self->{line_prev},
2581 column => $self->{column_prev} - 5);
2582 } else {
2583
2584 }
2585 $self->{state} = DOCTYPE_STATE;
2586 $self->{ct} = {type => DOCTYPE_TOKEN,
2587 quirks => 1,
2588 line => $self->{line_prev},
2589 column => $self->{column_prev} - 7,
2590 };
2591
2592 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2593 $self->{line_prev} = $self->{line};
2594 $self->{column_prev} = $self->{column};
2595 $self->{column}++;
2596 $self->{nc}
2597 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2598 } else {
2599 $self->{set_nc}->($self);
2600 }
2601
2602 redo A;
2603 } else {
2604
2605 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2606 line => $self->{line_prev},
2607 column => $self->{column_prev} - 1 - length $self->{kwd});
2608 $self->{state} = BOGUS_COMMENT_STATE;
2609 ## Reconsume.
2610 $self->{ct} = {type => COMMENT_TOKEN,
2611 data => $self->{kwd},
2612 line => $self->{line_prev},
2613 column => $self->{column_prev} - 1 - length $self->{kwd},
2614 };
2615 redo A;
2616 }
2617 } elsif ($self->{state} == MD_CDATA_STATE) {
2618 if ($self->{nc} == {
2619 '[' => 0x0043, # C
2620 '[C' => 0x0044, # D
2621 '[CD' => 0x0041, # A
2622 '[CDA' => 0x0054, # T
2623 '[CDAT' => 0x0041, # A
2624 }->{$self->{kwd}}) {
2625
2626 ## Stay in the state.
2627 $self->{kwd} .= chr $self->{nc};
2628
2629 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2630 $self->{line_prev} = $self->{line};
2631 $self->{column_prev} = $self->{column};
2632 $self->{column}++;
2633 $self->{nc}
2634 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2635 } else {
2636 $self->{set_nc}->($self);
2637 }
2638
2639 redo A;
2640 } elsif ($self->{kwd} eq '[CDATA' and
2641 $self->{nc} == 0x005B) { # [
2642 if ($self->{is_xml} and
2643 not $self->{tainted} and
2644 @{$self->{open_elements} or []} == 0) {
2645
2646 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2647 line => $self->{line_prev},
2648 column => $self->{column_prev} - 7);
2649 $self->{tainted} = 1;
2650 } else {
2651
2652 }
2653
2654 $self->{ct} = {type => CHARACTER_TOKEN,
2655 data => '',
2656 line => $self->{line_prev},
2657 column => $self->{column_prev} - 7};
2658 $self->{state} = CDATA_SECTION_STATE;
2659
2660 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2661 $self->{line_prev} = $self->{line};
2662 $self->{column_prev} = $self->{column};
2663 $self->{column}++;
2664 $self->{nc}
2665 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2666 } else {
2667 $self->{set_nc}->($self);
2668 }
2669
2670 redo A;
2671 } else {
2672
2673 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2674 line => $self->{line_prev},
2675 column => $self->{column_prev} - 1 - length $self->{kwd});
2676 $self->{state} = BOGUS_COMMENT_STATE;
2677 ## Reconsume.
2678 $self->{ct} = {type => COMMENT_TOKEN,
2679 data => $self->{kwd},
2680 line => $self->{line_prev},
2681 column => $self->{column_prev} - 1 - length $self->{kwd},
2682 };
2683 redo A;
2684 }
2685 } elsif ($self->{state} == COMMENT_START_STATE) {
2686 if ($self->{nc} == 0x002D) { # -
2687
2688 $self->{state} = COMMENT_START_DASH_STATE;
2689
2690 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2691 $self->{line_prev} = $self->{line};
2692 $self->{column_prev} = $self->{column};
2693 $self->{column}++;
2694 $self->{nc}
2695 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2696 } else {
2697 $self->{set_nc}->($self);
2698 }
2699
2700 redo A;
2701 } elsif ($self->{nc} == 0x003E) { # >
2702 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2703 if ($self->{in_subset}) {
2704
2705 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2706 } else {
2707
2708 $self->{state} = DATA_STATE;
2709 $self->{s_kwd} = '';
2710 }
2711
2712 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2713 $self->{line_prev} = $self->{line};
2714 $self->{column_prev} = $self->{column};
2715 $self->{column}++;
2716 $self->{nc}
2717 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2718 } else {
2719 $self->{set_nc}->($self);
2720 }
2721
2722
2723 return ($self->{ct}); # comment
2724
2725 redo A;
2726 } elsif ($self->{nc} == -1) {
2727 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2728 if ($self->{in_subset}) {
2729
2730 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2731 } else {
2732
2733 $self->{state} = DATA_STATE;
2734 $self->{s_kwd} = '';
2735 }
2736 ## reconsume
2737
2738 return ($self->{ct}); # comment
2739
2740 redo A;
2741 } else {
2742
2743 $self->{ct}->{data} # comment
2744 .= chr ($self->{nc});
2745 $self->{state} = COMMENT_STATE;
2746
2747 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2748 $self->{line_prev} = $self->{line};
2749 $self->{column_prev} = $self->{column};
2750 $self->{column}++;
2751 $self->{nc}
2752 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2753 } else {
2754 $self->{set_nc}->($self);
2755 }
2756
2757 redo A;
2758 }
2759 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2760 if ($self->{nc} == 0x002D) { # -
2761
2762 $self->{state} = COMMENT_END_STATE;
2763
2764 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2765 $self->{line_prev} = $self->{line};
2766 $self->{column_prev} = $self->{column};
2767 $self->{column}++;
2768 $self->{nc}
2769 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2770 } else {
2771 $self->{set_nc}->($self);
2772 }
2773
2774 redo A;
2775 } elsif ($self->{nc} == 0x003E) { # >
2776 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2777 if ($self->{in_subset}) {
2778
2779 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2780 } else {
2781
2782 $self->{state} = DATA_STATE;
2783 $self->{s_kwd} = '';
2784 }
2785
2786 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2787 $self->{line_prev} = $self->{line};
2788 $self->{column_prev} = $self->{column};
2789 $self->{column}++;
2790 $self->{nc}
2791 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2792 } else {
2793 $self->{set_nc}->($self);
2794 }
2795
2796
2797 return ($self->{ct}); # comment
2798
2799 redo A;
2800 } elsif ($self->{nc} == -1) {
2801 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2802 if ($self->{in_subset}) {
2803
2804 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2805 } else {
2806
2807 $self->{state} = DATA_STATE;
2808 $self->{s_kwd} = '';
2809 }
2810 ## reconsume
2811
2812 return ($self->{ct}); # comment
2813
2814 redo A;
2815 } else {
2816
2817 $self->{ct}->{data} # comment
2818 .= '-' . chr ($self->{nc});
2819 $self->{state} = COMMENT_STATE;
2820
2821 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2822 $self->{line_prev} = $self->{line};
2823 $self->{column_prev} = $self->{column};
2824 $self->{column}++;
2825 $self->{nc}
2826 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2827 } else {
2828 $self->{set_nc}->($self);
2829 }
2830
2831 redo A;
2832 }
2833 } elsif ($self->{state} == COMMENT_STATE) {
2834 ## XML5: "Comment state" and "DOCTYPE comment state".
2835
2836 if ($self->{nc} == 0x002D) { # -
2837
2838 $self->{state} = COMMENT_END_DASH_STATE;
2839
2840 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2841 $self->{line_prev} = $self->{line};
2842 $self->{column_prev} = $self->{column};
2843 $self->{column}++;
2844 $self->{nc}
2845 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2846 } else {
2847 $self->{set_nc}->($self);
2848 }
2849
2850 redo A;
2851 } elsif ($self->{nc} == -1) {
2852 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2853 if ($self->{in_subset}) {
2854
2855 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2856 } else {
2857
2858 $self->{state} = DATA_STATE;
2859 $self->{s_kwd} = '';
2860 }
2861 ## reconsume
2862
2863 return ($self->{ct}); # comment
2864
2865 redo A;
2866 } else {
2867
2868 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2869 $self->{read_until}->($self->{ct}->{data},
2870 q[-],
2871 length $self->{ct}->{data});
2872
2873 ## Stay in the state
2874
2875 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2876 $self->{line_prev} = $self->{line};
2877 $self->{column_prev} = $self->{column};
2878 $self->{column}++;
2879 $self->{nc}
2880 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2881 } else {
2882 $self->{set_nc}->($self);
2883 }
2884
2885 redo A;
2886 }
2887 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2888 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2889
2890 if ($self->{nc} == 0x002D) { # -
2891
2892 $self->{state} = COMMENT_END_STATE;
2893
2894 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2895 $self->{line_prev} = $self->{line};
2896 $self->{column_prev} = $self->{column};
2897 $self->{column}++;
2898 $self->{nc}
2899 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2900 } else {
2901 $self->{set_nc}->($self);
2902 }
2903
2904 redo A;
2905 } elsif ($self->{nc} == -1) {
2906 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2907 if ($self->{in_subset}) {
2908
2909 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910 } else {
2911
2912 $self->{state} = DATA_STATE;
2913 $self->{s_kwd} = '';
2914 }
2915 ## reconsume
2916
2917 return ($self->{ct}); # comment
2918
2919 redo A;
2920 } else {
2921
2922 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2923 $self->{state} = COMMENT_STATE;
2924
2925 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2926 $self->{line_prev} = $self->{line};
2927 $self->{column_prev} = $self->{column};
2928 $self->{column}++;
2929 $self->{nc}
2930 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2931 } else {
2932 $self->{set_nc}->($self);
2933 }
2934
2935 redo A;
2936 }
2937 } elsif ($self->{state} == COMMENT_END_STATE) {
2938 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2939
2940 if ($self->{nc} == 0x003E) { # >
2941 if ($self->{in_subset}) {
2942
2943 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2944 } else {
2945
2946 $self->{state} = DATA_STATE;
2947 $self->{s_kwd} = '';
2948 }
2949
2950 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2951 $self->{line_prev} = $self->{line};
2952 $self->{column_prev} = $self->{column};
2953 $self->{column}++;
2954 $self->{nc}
2955 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2956 } else {
2957 $self->{set_nc}->($self);
2958 }
2959
2960
2961 return ($self->{ct}); # comment
2962
2963 redo A;
2964 } elsif ($self->{nc} == 0x002D) { # -
2965
2966 ## XML5: Not a parse error.
2967 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2968 line => $self->{line_prev},
2969 column => $self->{column_prev});
2970 $self->{ct}->{data} .= '-'; # comment
2971 ## Stay in the state
2972
2973 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2974 $self->{line_prev} = $self->{line};
2975 $self->{column_prev} = $self->{column};
2976 $self->{column}++;
2977 $self->{nc}
2978 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2979 } else {
2980 $self->{set_nc}->($self);
2981 }
2982
2983 redo A;
2984 } elsif ($self->{nc} == -1) {
2985 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2986 if ($self->{in_subset}) {
2987
2988 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2989 } else {
2990
2991 $self->{state} = DATA_STATE;
2992 $self->{s_kwd} = '';
2993 }
2994 ## reconsume
2995
2996 return ($self->{ct}); # comment
2997
2998 redo A;
2999 } else {
3000
3001 ## XML5: Not a parse error.
3002 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3003 line => $self->{line_prev},
3004 column => $self->{column_prev});
3005 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3006 $self->{state} = COMMENT_STATE;
3007
3008 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3009 $self->{line_prev} = $self->{line};
3010 $self->{column_prev} = $self->{column};
3011 $self->{column}++;
3012 $self->{nc}
3013 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3014 } else {
3015 $self->{set_nc}->($self);
3016 }
3017
3018 redo A;
3019 }
3020 } elsif ($self->{state} == DOCTYPE_STATE) {
3021 if ($is_space->{$self->{nc}}) {
3022
3023 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3024
3025 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3026 $self->{line_prev} = $self->{line};
3027 $self->{column_prev} = $self->{column};
3028 $self->{column}++;
3029 $self->{nc}
3030 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3031 } else {
3032 $self->{set_nc}->($self);
3033 }
3034
3035 redo A;
3036 } else {
3037
3038 ## XML5: Unless EOF, switch to the bogus comment state.
3039 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3040 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3041 ## reconsume
3042 redo A;
3043 }
3044 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3045 ## XML5: "DOCTYPE root name before state".
3046
3047 if ($is_space->{$self->{nc}}) {
3048
3049 ## Stay in the state
3050
3051 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3052 $self->{line_prev} = $self->{line};
3053 $self->{column_prev} = $self->{column};
3054 $self->{column}++;
3055 $self->{nc}
3056 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3057 } else {
3058 $self->{set_nc}->($self);
3059 }
3060
3061 redo A;
3062 } elsif ($self->{nc} == 0x003E) { # >
3063
3064 ## XML5: No parse error.
3065 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3066 $self->{state} = DATA_STATE;
3067 $self->{s_kwd} = '';
3068
3069 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3070 $self->{line_prev} = $self->{line};
3071 $self->{column_prev} = $self->{column};
3072 $self->{column}++;
3073 $self->{nc}
3074 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3075 } else {
3076 $self->{set_nc}->($self);
3077 }
3078
3079
3080 return ($self->{ct}); # DOCTYPE (quirks)
3081
3082 redo A;
3083 } elsif ($self->{nc} == -1) {
3084
3085 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3086 $self->{state} = DATA_STATE;
3087 $self->{s_kwd} = '';
3088 ## reconsume
3089
3090 return ($self->{ct}); # DOCTYPE (quirks)
3091
3092 redo A;
3093 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3094
3095 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3096 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3097 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3098 $self->{in_subset} = 1;
3099
3100 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3101 $self->{line_prev} = $self->{line};
3102 $self->{column_prev} = $self->{column};
3103 $self->{column}++;
3104 $self->{nc}
3105 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3106 } else {
3107 $self->{set_nc}->($self);
3108 }
3109
3110 return ($self->{ct}); # DOCTYPE
3111 redo A;
3112 } else {
3113
3114 $self->{ct}->{name} = chr $self->{nc};
3115 delete $self->{ct}->{quirks};
3116 $self->{state} = DOCTYPE_NAME_STATE;
3117
3118 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3119 $self->{line_prev} = $self->{line};
3120 $self->{column_prev} = $self->{column};
3121 $self->{column}++;
3122 $self->{nc}
3123 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3124 } else {
3125 $self->{set_nc}->($self);
3126 }
3127
3128 redo A;
3129 }
} elsif ($self->{state} == DOCTYPE_NAME_STATE) {
  ## XML5: "DOCTYPE root name state".
  ##
  ## Appends the current character to the DOCTYPE name until white
  ## space, ">", "[" (XML only) or nc == -1 is seen.  Branches that
  ## finish the token set $emit_token; the token is then returned after
  ## the next character has been consumed.  All other branches loop
  ## through |redo A|.

  ## ISSUE: Redundant "First," in the spec.

  my $emit_token = 0;

  if ($is_space->{$self->{nc}}) {
    $self->{state} = AFTER_DOCTYPE_NAME_STATE;
  } elsif ($self->{nc} == 0x003E) { # >
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    $emit_token = 1;
  } elsif ($self->{nc} == -1) {
    ## nc == -1 is reported as an unclosed DOCTYPE (presumably the end
    ## of the input -- confirm against the set_nc implementation).
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unclosed DOCTYPE');
    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';
    $self->{ct}->{quirks} = 1;
    ## Reconsume: the current character is not advanced.
    return ($self->{ct}); # DOCTYPE
  } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
    $self->{in_subset} = 1;
    $emit_token = 1;
  } else {
    ## Stay in the state.
    $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
  }

  ## Consume the next input character: read it from the character
  ## buffer when one is buffered, otherwise let the set_nc callback
  ## supply it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  return ($self->{ct}) if $emit_token; # DOCTYPE
  redo A;
} elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
  ## XML5: Corresponding to XML5's "DOCTYPE root name after
  ## state", but implemented differently.
  ##
  ## Dispatches on the character that follows the name.  The current
  ## token is either a DOCTYPE or a markup declaration, so several
  ## branches pick the follow-up state from $self->{ct}->{type}.
  ## Branches that finish the token set $emit_token; the token is
  ## returned after the next character has been consumed.

  my $emit_token = 0;

  if ($is_space->{$self->{nc}}) {
    ## Stay in the state.
  } elsif ($self->{nc} == 0x003E) { # >
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
    } else {
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'no md def'); ## TODO: type
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $emit_token = 1;
  } elsif ($self->{nc} == -1) {
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'unclosed DOCTYPE');
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'unclosed md'); ## TODO: type
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    ## Reconsume: the current character is not advanced.
    return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
  } elsif ($self->{nc} == 0x0050 or # P
           $self->{nc} == 0x0070) { # p
    ## First letter of a possible "PUBLIC" keyword; remember it as typed.
    $self->{state} = PUBLIC_STATE;
    $self->{kwd} = chr $self->{nc};
  } elsif ($self->{nc} == 0x0053 or # S
           $self->{nc} == 0x0073) { # s
    ## First letter of a possible "SYSTEM" keyword; remember it as typed.
    $self->{state} = SYSTEM_STATE;
    $self->{kwd} = chr $self->{nc};
  } elsif ($self->{nc} == 0x0022 and # "
           ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
            $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
    ## A quoted literal directly after an entity name starts its value.
    $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
    $self->{ct}->{value} = ''; # ENTITY
  } elsif ($self->{nc} == 0x0027 and # '
           ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
            $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
    $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
    $self->{ct}->{value} = ''; # ENTITY
  } elsif ($self->{is_xml} and
           $self->{ct}->{type} == DOCTYPE_TOKEN and
           $self->{nc} == 0x005B) { # [
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
    $self->{in_subset} = 1;
    $emit_token = 1;
  } else {
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'string after DOCTYPE name'); ## TODO: type

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{ct}->{quirks} = 1;
      $self->{state} = BOGUS_DOCTYPE_STATE;
    } else {
      $self->{state} = BOGUS_MD_STATE;
    }
  }

  ## Consume the next input character: read it from the character
  ## buffer when one is buffered, otherwise let the set_nc callback
  ## supply it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  return ($self->{ct}) if $emit_token; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == PUBLIC_STATE) {
  ## Matches the remainder of the keyword "PUBLIC", ASCII
  ## case-insensitively.  $self->{kwd} holds the characters matched so
  ## far, exactly as they were typed.

  my $kwd_length = length $self->{kwd};

  ## Upper-case code point expected at the current position.  This is
  ## undef once "PUBLI" is complete (or for an empty keyword).  The
  ## |defined| guard below matters: undef is 0 in numeric context, so
  ## comparing nc against it directly would accept a U+0000 character
  ## as a keyword letter.
  my $expected = [
    undef,
    0x0055, # U
    0x0042, # B
    0x004C, # L
    0x0049, # I
  ]->[$kwd_length];

  if (defined $expected and
      ($self->{nc} == $expected or           # upper-case letter
       $self->{nc} == $expected + 0x0020)) { # lower-case letter
    ## Stay in the state.
    $self->{kwd} .= chr $self->{nc};
  } elsif ($kwd_length == 5 and
           ($self->{nc} == 0x0043 or # C
            $self->{nc} == 0x0063)) { # c
    ## Keyword complete.  In XML anything but the exact spelling
    ## "PUBLIC" is reported; the column points at the first letter.
    if ($self->{is_xml} and
        ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
                             text => 'PUBLIC',
                             line => $self->{line_prev},
                             column => $self->{column_prev} - 4);
    }
    $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
  } else {
    ## Not the keyword: report the stray string at the position where
    ## the partial keyword began and switch to a bogus state.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
                           line => $self->{line_prev},
                           column => $self->{column_prev} + 1 - $kwd_length);
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{ct}->{quirks} = 1;
      $self->{state} = BOGUS_DOCTYPE_STATE;
    } else {
      $self->{state} = BOGUS_MD_STATE;
    }
    ## Reconsume: the current character is not advanced.
    redo A;
  }

  ## Consume the next input character: read it from the character
  ## buffer when one is buffered, otherwise let the set_nc callback
  ## supply it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ($self->{state} == SYSTEM_STATE) {
  ## Matches the remainder of the keyword "SYSTEM", ASCII
  ## case-insensitively.  $self->{kwd} holds the characters matched so
  ## far, exactly as they were typed.

  my $kwd_length = length $self->{kwd};

  ## Upper-case code point expected at the current position.  This is
  ## undef once "SYSTE" is complete (or for an empty keyword).  The
  ## |defined| guard below matters: undef is 0 in numeric context, so
  ## comparing nc against it directly would accept a U+0000 character
  ## as a keyword letter.
  my $expected = [
    undef,
    0x0059, # Y
    0x0053, # S
    0x0054, # T
    0x0045, # E
  ]->[$kwd_length];

  if (defined $expected and
      ($self->{nc} == $expected or           # upper-case letter
       $self->{nc} == $expected + 0x0020)) { # lower-case letter
    ## Stay in the state.
    $self->{kwd} .= chr $self->{nc};
  } elsif ($kwd_length == 5 and
           ($self->{nc} == 0x004D or # M
            $self->{nc} == 0x006D)) { # m
    ## Keyword complete.  In XML anything but the exact spelling
    ## "SYSTEM" is reported; the column points at the first letter.
    if ($self->{is_xml} and
        ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
                             text => 'SYSTEM',
                             line => $self->{line_prev},
                             column => $self->{column_prev} - 4);
    }
    $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
  } else {
    ## Not the keyword: report the stray string at the position where
    ## the partial keyword began and switch to a bogus state.
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
                           line => $self->{line_prev},
                           column => $self->{column_prev} + 1 - $kwd_length);
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{ct}->{quirks} = 1;
      $self->{state} = BOGUS_DOCTYPE_STATE;
    } else {
      $self->{state} = BOGUS_MD_STATE;
    }
    ## Reconsume: the current character is not advanced.
    redo A;
  }

  ## Consume the next input character: read it from the character
  ## buffer when one is buffered, otherwise let the set_nc callback
  ## supply it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
  ## Entered after the keyword "PUBLIC": skips white space and expects
  ## the opening quote of the public identifier.  Branches that finish
  ## the token set $emit_token; the token is returned after the next
  ## character has been consumed.
  ##
  ## Code points are compared with the numeric |==| operator
  ## throughout, as in the other tokenizer states.

  my $emit_token = 0;

  if ($is_space->{$self->{nc}}) {
    ## Stay in the state.
  } elsif ($self->{nc} == 0x0022) { # "
    $self->{ct}->{pubid} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
  } elsif ($self->{nc} == 0x0027) { # '
    $self->{ct}->{pubid} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
  } elsif ($self->{nc} == 0x003E) { # >
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'no PUBLIC literal');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $emit_token = 1;
  } elsif ($self->{nc} == -1) {
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'unclosed DOCTYPE');
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'unclosed md'); ## TODO: type
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    ## Reconsume: the current character is not advanced.
    return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
  } elsif ($self->{is_xml} and
           $self->{ct}->{type} == DOCTYPE_TOKEN and
           $self->{nc} == 0x005B) { # [
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'no PUBLIC literal');
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
    $self->{in_subset} = 1;
    $emit_token = 1;
  } else {
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'string after PUBLIC');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{ct}->{quirks} = 1;
      $self->{state} = BOGUS_DOCTYPE_STATE;
    } else {
      $self->{state} = BOGUS_MD_STATE;
    }
  }

  ## Consume the next input character: read it from the character
  ## buffer when one is buffered, otherwise let the set_nc callback
  ## supply it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  return ($self->{ct}) if $emit_token; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
  ## Collects the public identifier up to the closing '"'.  A ">" or
  ## nc == -1 before the closing quote ends the token with an
  ## 'unclosed PUBLIC literal' error.

  my $emit_token = 0;

  if ($self->{nc} == 0x0022) { # "
    $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
  } elsif ($self->{nc} == 0x003E) { # >
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unclosed PUBLIC literal');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $emit_token = 1;
  } elsif ($self->{nc} == -1) {
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unclosed PUBLIC literal');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    ## Reconsume: the current character is not advanced.
    return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
  } else {
    ## Stay in the state.
    $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
    ## Hand the identifier to the read_until callback together with the
    ## delimiter set and the current length (presumably it appends the
    ## following run of non-delimiter characters in one go -- confirm
    ## against the read_until implementation).
    $self->{read_until}->($self->{ct}->{pubid}, q[">],
                          length $self->{ct}->{pubid});
  }

  ## Consume the next input character: read it from the character
  ## buffer when one is buffered, otherwise let the set_nc callback
  ## supply it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  return ($self->{ct}) if $emit_token; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
  ## Collects the public identifier up to the closing "'".  A ">" or
  ## nc == -1 before the closing quote ends the token with an
  ## 'unclosed PUBLIC literal' error.

  my $emit_token = 0;

  if ($self->{nc} == 0x0027) { # '
    $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
  } elsif ($self->{nc} == 0x003E) { # >
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unclosed PUBLIC literal');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $emit_token = 1;
  } elsif ($self->{nc} == -1) {
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unclosed PUBLIC literal');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    ## Reconsume: the current character is not advanced.
    return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
  } else {
    ## Stay in the state.
    $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
    ## Hand the identifier to the read_until callback together with the
    ## delimiter set and the current length (presumably it appends the
    ## following run of non-delimiter characters in one go -- confirm
    ## against the read_until implementation).
    $self->{read_until}->($self->{ct}->{pubid}, q['>],
                          length $self->{ct}->{pubid});
  }

  ## Consume the next input character: read it from the character
  ## buffer when one is buffered, otherwise let the set_nc callback
  ## supply it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  return ($self->{ct}) if $emit_token; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
  ## Entered after the closing quote of the public identifier: skips
  ## white space and expects either the opening quote of a system
  ## identifier or the end of the declaration.  Branches that finish
  ## the token set $emit_token; the token is returned after the next
  ## character has been consumed.

  my $emit_token = 0;

  if ($is_space->{$self->{nc}}) {
    ## Stay in the state.
  } elsif ($self->{nc} == 0x0022) { # "
    $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
    $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
  } elsif ($self->{nc} == 0x0027) { # '
    $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
    $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
  } elsif ($self->{nc} == 0x003E) { # >
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      ## For a DOCTYPE the missing system identifier is reported in
      ## XML only.
      if ($self->{is_xml}) {
        $self->{parse_error}->(level => $self->{level}->{must},
                               type => 'no SYSTEM literal');
      }
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
    } else {
      ## For markup declarations it is reported for every token type
      ## except NOTATION.
      if ($self->{ct}->{type} != NOTATION_TOKEN) {
        $self->{parse_error}->(level => $self->{level}->{must},
                               type => 'no SYSTEM literal');
      }
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $emit_token = 1;
  } elsif ($self->{nc} == -1) {
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'unclosed DOCTYPE');

      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'unclosed md'); ## TODO: type
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    ## Reconsume: the current character is not advanced.
    return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
  } elsif ($self->{is_xml} and
           $self->{ct}->{type} == DOCTYPE_TOKEN and
           $self->{nc} == 0x005B) { # [
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'no SYSTEM literal');
    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
    $self->{in_subset} = 1;
    $emit_token = 1;
  } else {
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'string after PUBLIC literal');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{ct}->{quirks} = 1;
      $self->{state} = BOGUS_DOCTYPE_STATE;
    } else {
      $self->{state} = BOGUS_MD_STATE;
    }
  }

  ## Consume the next input character: read it from the character
  ## buffer when one is buffered, otherwise let the set_nc callback
  ## supply it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  return ($self->{ct}) if $emit_token; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
  ## Entered after the keyword "SYSTEM": skips white space and expects
  ## the opening quote of the system identifier.  Branches that finish
  ## the token set $emit_token; the token is returned after the next
  ## character has been consumed.

  my $emit_token = 0;

  if ($is_space->{$self->{nc}}) {
    ## Stay in the state.
  } elsif ($self->{nc} == 0x0022) { # "
    $self->{ct}->{sysid} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
  } elsif ($self->{nc} == 0x0027) { # '
    $self->{ct}->{sysid} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
  } elsif ($self->{nc} == 0x003E) { # >
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'no SYSTEM literal');

    ## This branch consumes the next character *before* it switches
    ## state, so it carries its own copy of the consume step instead
    ## of sharing the one at the end of the arm.
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
    } else {
      $self->{set_nc}->($self);
    }

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }

    return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
  } elsif ($self->{nc} == -1) {
    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'unclosed DOCTYPE');
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{parse_error}->(level => $self->{level}->{must},
                             type => 'unclosed md'); ## TODO: type
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    ## Reconsume: the current character is not advanced.
    return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
  } elsif ($self->{is_xml} and
           $self->{ct}->{type} == DOCTYPE_TOKEN and
           $self->{nc} == 0x005B) { # [
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'no SYSTEM literal');

    $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
    $self->{in_subset} = 1;
    $emit_token = 1;
  } else {
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'string after SYSTEM');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{ct}->{quirks} = 1;
      $self->{state} = BOGUS_DOCTYPE_STATE;
    } else {
      $self->{state} = BOGUS_MD_STATE;
    }
  }

  ## Consume the next input character: read it from the character
  ## buffer when one is buffered, otherwise let the set_nc callback
  ## supply it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  return ($self->{ct}) if $emit_token; # DOCTYPE
  redo A;
} elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
  ## Collects the system identifier up to the closing '"'.  Outside
  ## XML a ">" before the closing quote ends the token with an
  ## 'unclosed SYSTEM literal' error; in XML a ">" is collected like
  ## any other character.  nc == -1 always ends the token.

  my $emit_token = 0;

  if ($self->{nc} == 0x0022) { # "
    $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
  } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unclosed SYSTEM literal');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    $emit_token = 1;
  } elsif ($self->{nc} == -1) {
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unclosed SYSTEM literal');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    ## Reconsume: the current character is not advanced.
    return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
  } else {
    ## Stay in the state.
    $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
    ## Hand the identifier to the read_until callback together with the
    ## delimiter set and the current length (presumably it appends the
    ## following run of non-delimiter characters in one go -- confirm
    ## against the read_until implementation).
    $self->{read_until}->($self->{ct}->{sysid}, q[">],
                          length $self->{ct}->{sysid});
  }

  ## Consume the next input character: read it from the character
  ## buffer when one is buffered, otherwise let the set_nc callback
  ## supply it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  return ($self->{ct}) if $emit_token; # DOCTYPE/ENTITY/NOTATION
  redo A;
} elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
  ## Collects the system identifier up to the closing "'".  Outside
  ## XML a ">" before the closing quote ends the token with an
  ## 'unclosed SYSTEM literal' error; in XML a ">" is collected like
  ## any other character.  nc == -1 always ends the token.

  if ($self->{nc} == 0x0027) { # '
    $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
  } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
    ## NOTE(review): unlike the double-quoted counterpart, this branch
    ## does not dispatch on the token type and always goes to
    ## DATA_STATE.  Presumably harmless because it is limited to
    ## non-XML input -- confirm that only DOCTYPE tokens can reach it.
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unclosed SYSTEM literal');

    $self->{state} = DATA_STATE;
    $self->{s_kwd} = '';

    ## The quirks flag is set only after the next character has been
    ## consumed, so this branch carries its own consume step.
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
      $self->{column}++;
      $self->{nc}
          = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
    } else {
      $self->{set_nc}->($self);
    }

    $self->{ct}->{quirks} = 1;
    return ($self->{ct}); # DOCTYPE
  } elsif ($self->{nc} == -1) {
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'unclosed SYSTEM literal');

    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
      $self->{state} = DATA_STATE;
      $self->{s_kwd} = '';
      $self->{ct}->{quirks} = 1;
    } else {
      $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
    }
    ## Reconsume: the current character is not advanced.
    return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
  } else {
    ## Stay in the state.
    $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
    ## Hand the identifier to the read_until callback together with the
    ## delimiter set and the current length (presumably it appends the
    ## following run of non-delimiter characters in one go -- confirm
    ## against the read_until implementation).
    $self->{read_until}->($self->{ct}->{sysid}, q['>],
                          length $self->{ct}->{sysid});
  }

  ## Consume the next input character: read it from the character
  ## buffer when one is buffered, otherwise let the set_nc callback
  ## supply it.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord(substr($self->{char_buffer}, $self->{char_buffer_pos}++, 1));
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
4258 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4259 if ($is_space->{$self->{nc}}) {
4260 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4261
4262 $self->{state} = BEFORE_NDATA_STATE;
4263 } else {
4264
4265 ## Stay in the state
4266 }
4267
4268 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4269 $self->{line_prev} = $self->{line};
4270 $self->{column_prev} = $self->{column};
4271 $self->{column}++;
4272 $self->{nc}
4273 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4274 } else {
4275 $self->{set_nc}->($self);
4276 }
4277
4278 redo A;
4279 } elsif ($self->{nc} == 0x003E) { # >
4280 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4281
4282 $self->{state} = DATA_STATE;
4283 $self->{s_kwd} = '';
4284 } else {
4285
4286 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4287 }
4288
4289
4290 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4291 $self->{line_prev} = $self->{line};
4292 $self->{column_prev} = $self->{column};
4293 $self->{column}++;
4294 $self->{nc}
4295 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4296 } else {
4297 $self->{set_nc}->($self);
4298 }
4299
4300 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4301 redo A;
4302 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4303 ($self->{nc} == 0x004E or # N
4304 $self->{nc} == 0x006E)) { # n
4305
4306 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4307 $self->{state} = NDATA_STATE;
4308 $self->{kwd} = chr $self->{nc};
4309
4310 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4311 $self->{line_prev} = $self->{line};
4312 $self->{column_prev} = $self->{column};
4313 $self->{column}++;
4314 $self->{nc}
4315 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4316 } else {
4317 $self->{set_nc}->($self);
4318 }
4319
4320 redo A;
4321 } elsif ($self->{nc} == -1) {
4322 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4323
4324 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4325 $self->{state} = DATA_STATE;
4326 $self->{s_kwd} = '';
4327 $self->{ct}->{quirks} = 1;
4328 } else {
4329
4330 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4331 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4332 }
4333
4334 ## reconsume
4335 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4336 redo A;
4337 } elsif ($self->{is_xml} and
4338 $self->{ct}->{type} == DOCTYPE_TOKEN and
4339 $self->{nc} == 0x005B) { # [
4340
4341 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4342 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4343 $self->{in_subset} = 1;
4344
4345 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4346 $self->{line_prev} = $self->{line};
4347 $self->{column_prev} = $self->{column};
4348 $self->{column}++;
4349 $self->{nc}
4350 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4351 } else {
4352 $self->{set_nc}->($self);
4353 }
4354
4355 return ($self->{ct}); # DOCTYPE
4356 redo A;
4357 } else {
4358 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4359
4360 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4361
4362 #$self->{ct}->{quirks} = 1;
4363 $self->{state} = BOGUS_DOCTYPE_STATE;
4364 } else {
4365
4366 $self->{state} = BOGUS_MD_STATE;
4367 }
4368
4369
4370 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4371 $self->{line_prev} = $self->{line};
4372 $self->{column_prev} = $self->{column};
4373 $self->{column}++;
4374 $self->{nc}
4375 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4376 } else {
4377 $self->{set_nc}->($self);
4378 }
4379
4380 redo A;
4381 }
4382 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4383 if ($is_space->{$self->{nc}}) {
4384
4385 ## Stay in the state.
4386
4387 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4388 $self->{line_prev} = $self->{line};
4389 $self->{column_prev} = $self->{column};
4390 $self->{column}++;
4391 $self->{nc}
4392 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4393 } else {
4394 $self->{set_nc}->($self);
4395 }
4396
4397 redo A;
4398 } elsif ($self->{nc} == 0x003E) { # >
4399
4400 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4401
4402 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4403 $self->{line_prev} = $self->{line};
4404 $self->{column_prev} = $self->{column};
4405 $self->{column}++;
4406 $self->{nc}
4407 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4408 } else {
4409 $self->{set_nc}->($self);
4410 }
4411
4412 return ($self->{ct}); # ENTITY
4413 redo A;
4414 } elsif ($self->{nc} == 0x004E or # N
4415 $self->{nc} == 0x006E) { # n
4416
4417 $self->{state} = NDATA_STATE;
4418 $self->{kwd} = chr $self->{nc};
4419
4420 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4421 $self->{line_prev} = $self->{line};
4422 $self->{column_prev} = $self->{column};
4423 $self->{column}++;
4424 $self->{nc}
4425 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4426 } else {
4427 $self->{set_nc}->($self);
4428 }
4429
4430 redo A;
4431 } elsif ($self->{nc} == -1) {
4432
4433 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4434 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4435 ## reconsume
4436 return ($self->{ct}); # ENTITY
4437 redo A;
4438 } else {
4439
4440 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4441 $self->{state} = BOGUS_MD_STATE;
4442
4443 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4444 $self->{line_prev} = $self->{line};
4445 $self->{column_prev} = $self->{column};
4446 $self->{column}++;
4447 $self->{nc}
4448 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4449 } else {
4450 $self->{set_nc}->($self);
4451 }
4452
4453 redo A;
4454 }
4455 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4456 if ($self->{nc} == 0x003E) { # >
4457
4458 $self->{state} = DATA_STATE;
4459 $self->{s_kwd} = '';
4460
4461 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4462 $self->{line_prev} = $self->{line};
4463 $self->{column_prev} = $self->{column};
4464 $self->{column}++;
4465 $self->{nc}
4466 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4467 } else {
4468 $self->{set_nc}->($self);
4469 }
4470
4471
4472 return ($self->{ct}); # DOCTYPE
4473
4474 redo A;
4475 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4476
4477 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4478 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4479 $self->{in_subset} = 1;
4480
4481 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4482 $self->{line_prev} = $self->{line};
4483 $self->{column_prev} = $self->{column};
4484 $self->{column}++;
4485 $self->{nc}
4486 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4487 } else {
4488 $self->{set_nc}->($self);
4489 }
4490
4491 return ($self->{ct}); # DOCTYPE
4492 redo A;
4493 } elsif ($self->{nc} == -1) {
4494
4495 $self->{state} = DATA_STATE;
4496 $self->{s_kwd} = '';
4497 ## reconsume
4498
4499 return ($self->{ct}); # DOCTYPE
4500
4501 redo A;
4502 } else {
4503
4504 my $s = '';
4505 $self->{read_until}->($s, q{>[}, 0);
4506
4507 ## Stay in the state
4508
4509 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4510 $self->{line_prev} = $self->{line};
4511 $self->{column_prev} = $self->{column};
4512 $self->{column}++;
4513 $self->{nc}
4514 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4515 } else {
4516 $self->{set_nc}->($self);
4517 }
4518
4519 redo A;
4520 }
4521 } elsif ($self->{state} == CDATA_SECTION_STATE) {
4522 ## NOTE: "CDATA section state" in the spec is jointly implemented
4523 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4524 ## and |CDATA_SECTION_MSE2_STATE|.
4525
4526 ## XML5: "CDATA state".
4527
4528 if ($self->{nc} == 0x005D) { # ]
4529
4530 $self->{state} = CDATA_SECTION_MSE1_STATE;
4531
4532 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4533 $self->{line_prev} = $self->{line};
4534 $self->{column_prev} = $self->{column};
4535 $self->{column}++;
4536 $self->{nc}
4537 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4538 } else {
4539 $self->{set_nc}->($self);
4540 }
4541
4542 redo A;
4543 } elsif ($self->{nc} == -1) {
4544 if ($self->{is_xml}) {
4545
4546 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4547 } else {
4548
4549 }
4550
4551 $self->{state} = DATA_STATE;
4552 $self->{s_kwd} = '';
4553 ## Reconsume.
4554 if (length $self->{ct}->{data}) { # character
4555
4556 return ($self->{ct}); # character
4557 } else {
4558
4559 ## No token to emit. $self->{ct} is discarded.
4560 }
4561 redo A;
4562 } else {
4563
4564 $self->{ct}->{data} .= chr $self->{nc};
4565 $self->{read_until}->($self->{ct}->{data},
4566 q<]>,
4567 length $self->{ct}->{data});
4568
4569 ## Stay in the state.
4570
4571 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4572 $self->{line_prev} = $self->{line};
4573 $self->{column_prev} = $self->{column};
4574 $self->{column}++;
4575 $self->{nc}
4576 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4577 } else {
4578 $self->{set_nc}->($self);
4579 }
4580
4581 redo A;
4582 }
4583
4584 ## ISSUE: "text tokens" in spec.
4585 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4586 ## XML5: "CDATA bracket state".
4587
4588 if ($self->{nc} == 0x005D) { # ]
4589
4590 $self->{state} = CDATA_SECTION_MSE2_STATE;
4591
4592 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4593 $self->{line_prev} = $self->{line};
4594 $self->{column_prev} = $self->{column};
4595 $self->{column}++;
4596 $self->{nc}
4597 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4598 } else {
4599 $self->{set_nc}->($self);
4600 }
4601
4602 redo A;
4603 } else {
4604
4605 ## XML5: If EOF, "]" is not appended and changed to the data state.
4606 $self->{ct}->{data} .= ']';
4607 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4608 ## Reconsume.
4609 redo A;
4610 }
4611 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4612 ## XML5: "CDATA end state".
4613
4614 if ($self->{nc} == 0x003E) { # >
4615 $self->{state} = DATA_STATE;
4616 $self->{s_kwd} = '';
4617
4618 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4619 $self->{line_prev} = $self->{line};
4620 $self->{column_prev} = $self->{column};
4621 $self->{column}++;
4622 $self->{nc}
4623 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4624 } else {
4625 $self->{set_nc}->($self);
4626 }
4627
4628 if (length $self->{ct}->{data}) { # character
4629
4630 return ($self->{ct}); # character
4631 } else {
4632
4633 ## No token to emit. $self->{ct} is discarded.
4634 }
4635 redo A;
4636 } elsif ($self->{nc} == 0x005D) { # ]
4637 # character
4638 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4639 ## Stay in the state.
4640
4641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4642 $self->{line_prev} = $self->{line};
4643 $self->{column_prev} = $self->{column};
4644 $self->{column}++;
4645 $self->{nc}
4646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4647 } else {
4648 $self->{set_nc}->($self);
4649 }
4650
4651 redo A;
4652 } else {
4653
4654 $self->{ct}->{data} .= ']]'; # character
4655 $self->{state} = CDATA_SECTION_STATE;
4656 ## Reconsume. ## XML5: Emit.
4657 redo A;
4658 }
4659 } elsif ($self->{state} == ENTITY_STATE) {
4660 if ($is_space->{$self->{nc}} or
4661 {
4662 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4663 $self->{entity_add} => 1,
4664 }->{$self->{nc}}) {
4665 if ($self->{is_xml}) {
4666
4667 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4668 line => $self->{line_prev},
4669 column => $self->{column_prev}
4670 + ($self->{nc} == -1 ? 1 : 0));
4671 } else {
4672
4673 ## No error
4674 }
4675 ## Don't consume
4676 ## Return nothing.
4677 #
4678 } elsif ($self->{nc} == 0x0023) { # #
4679
4680 $self->{state} = ENTITY_HASH_STATE;
4681 $self->{kwd} = '#';
4682
4683 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4684 $self->{line_prev} = $self->{line};
4685 $self->{column_prev} = $self->{column};
4686 $self->{column}++;
4687 $self->{nc}
4688 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4689 } else {
4690 $self->{set_nc}->($self);
4691 }
4692
4693 redo A;
4694 } elsif ($self->{is_xml} or
4695 (0x0041 <= $self->{nc} and
4696 $self->{nc} <= 0x005A) or # A..Z
4697 (0x0061 <= $self->{nc} and
4698 $self->{nc} <= 0x007A)) { # a..z
4699
4700 require Whatpm::_NamedEntityList;
4701 $self->{state} = ENTITY_NAME_STATE;
4702 $self->{kwd} = chr $self->{nc};
4703 $self->{entity__value} = $self->{kwd};
4704 $self->{entity__match} = 0;
4705
4706 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4707 $self->{line_prev} = $self->{line};
4708 $self->{column_prev} = $self->{column};
4709 $self->{column}++;
4710 $self->{nc}
4711 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4712 } else {
4713 $self->{set_nc}->($self);
4714 }
4715
4716 redo A;
4717 } else {
4718
4719 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4720 ## Return nothing.
4721 #
4722 }
4723
4724 ## NOTE: No character is consumed by the "consume a character
4725 ## reference" algorithm. In other words, there is an "&" character
4726 ## that does not introduce a character reference, which would be
4727 ## appended to the parent element or the attribute value in later
4728 ## processing by the tokenizer.
4729
4730 if ($self->{prev_state} == DATA_STATE) {
4731
4732 $self->{state} = $self->{prev_state};
4733 $self->{s_kwd} = '';
4734 ## Reconsume.
4735 return ({type => CHARACTER_TOKEN, data => '&',
4736 line => $self->{line_prev},
4737 column => $self->{column_prev},
4738 });
4739 redo A;
4740 } else {
4741
4742 $self->{ca}->{value} .= '&';
4743 $self->{state} = $self->{prev_state};
4744 $self->{s_kwd} = '';
4745 ## Reconsume.
4746 redo A;
4747 }
4748 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4749 if ($self->{nc} == 0x0078) { # x
4750
4751 $self->{state} = HEXREF_X_STATE;
4752 $self->{kwd} .= chr $self->{nc};
4753
4754 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4755 $self->{line_prev} = $self->{line};
4756 $self->{column_prev} = $self->{column};
4757 $self->{column}++;
4758 $self->{nc}
4759 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4760 } else {
4761 $self->{set_nc}->($self);
4762 }
4763
4764 redo A;
4765 } elsif ($self->{nc} == 0x0058) { # X
4766
4767 if ($self->{is_xml}) {
4768 $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4769 }
4770 $self->{state} = HEXREF_X_STATE;
4771 $self->{kwd} .= chr $self->{nc};
4772
4773 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4774 $self->{line_prev} = $self->{line};
4775 $self->{column_prev} = $self->{column};
4776 $self->{column}++;
4777 $self->{nc}
4778 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4779 } else {
4780 $self->{set_nc}->($self);
4781 }
4782
4783 redo A;
4784 } elsif (0x0030 <= $self->{nc} and
4785 $self->{nc} <= 0x0039) { # 0..9
4786
4787 $self->{state} = NCR_NUM_STATE;
4788 $self->{kwd} = $self->{nc} - 0x0030;
4789
4790 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4791 $self->{line_prev} = $self->{line};
4792 $self->{column_prev} = $self->{column};
4793 $self->{column}++;
4794 $self->{nc}
4795 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4796 } else {
4797 $self->{set_nc}->($self);
4798 }
4799
4800 redo A;
4801 } else {
4802 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4803 line => $self->{line_prev},
4804 column => $self->{column_prev} - 1);
4805
4806 ## NOTE: According to the spec algorithm, nothing is returned,
4807 ## and then "&#" is appended to the parent element or the attribute
4808 ## value in the later processing.
4809
4810 if ($self->{prev_state} == DATA_STATE) {
4811
4812 $self->{state} = $self->{prev_state};
4813 $self->{s_kwd} = '';
4814 ## Reconsume.
4815 return ({type => CHARACTER_TOKEN,
4816 data => '&#',
4817 line => $self->{line_prev},
4818 column => $self->{column_prev} - 1,
4819 });
4820 redo A;
4821 } else {
4822
4823 $self->{ca}->{value} .= '&#';
4824 $self->{state} = $self->{prev_state};
4825 $self->{s_kwd} = '';
4826 ## Reconsume.
4827 redo A;
4828 }
4829 }
4830 } elsif ($self->{state} == NCR_NUM_STATE) {
4831 if (0x0030 <= $self->{nc} and
4832 $self->{nc} <= 0x0039) { # 0..9
4833
4834 $self->{kwd} *= 10;
4835 $self->{kwd} += $self->{nc} - 0x0030;
4836
4837 ## Stay in the state.
4838
4839 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4840 $self->{line_prev} = $self->{line};
4841 $self->{column_prev} = $self->{column};
4842 $self->{column}++;
4843 $self->{nc}
4844 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4845 } else {
4846 $self->{set_nc}->($self);
4847 }
4848
4849 redo A;
4850 } elsif ($self->{nc} == 0x003B) { # ;
4851
4852
4853 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4854 $self->{line_prev} = $self->{line};
4855 $self->{column_prev} = $self->{column};
4856 $self->{column}++;
4857 $self->{nc}
4858 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4859 } else {
4860 $self->{set_nc}->($self);
4861 }
4862
4863 #
4864 } else {
4865
4866 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4867 ## Reconsume.
4868 #
4869 }
4870
4871 my $code = $self->{kwd};
4872 my $l = $self->{line_prev};
4873 my $c = $self->{column_prev};
4874 if ((not $self->{is_xml} and $charref_map->{$code}) or
4875 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4876 ($self->{is_xml} and $code == 0x0000)) {
4877
4878 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4879 text => (sprintf 'U+%04X', $code),
4880 line => $l, column => $c);
4881 $code = $charref_map->{$code};
4882 } elsif ($code > 0x10FFFF) {
4883
4884 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4885 text => (sprintf 'U-%08X', $code),
4886 line => $l, column => $c);
4887 $code = 0xFFFD;
4888 }
4889
4890 if ($self->{prev_state} == DATA_STATE) {
4891
4892 $self->{state} = $self->{prev_state};
4893 $self->{s_kwd} = '';
4894 ## Reconsume.
4895 return ({type => CHARACTER_TOKEN, data => chr $code,
4896 has_reference => 1,
4897 line => $l, column => $c,
4898 });
4899 redo A;
4900 } else {
4901
4902 $self->{ca}->{value} .= chr $code;
4903 $self->{ca}->{has_reference} = 1;
4904 $self->{state} = $self->{prev_state};
4905 $self->{s_kwd} = '';
4906 ## Reconsume.
4907 redo A;
4908 }
4909 } elsif ($self->{state} == HEXREF_X_STATE) {
4910 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4911 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4912 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4913 # 0..9, A..F, a..f
4914
4915 $self->{state} = HEXREF_HEX_STATE;
4916 $self->{kwd} = 0;
4917 ## Reconsume.
4918 redo A;
4919 } else {
4920 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4921 line => $self->{line_prev},
4922 column => $self->{column_prev} - 2);
4923
4924 ## NOTE: According to the spec algorithm, nothing is returned,
4925 ## and then "&#" followed by "X" or "x" is appended to the parent
4926 ## element or the attribute value in the later processing.
4927
4928 if ($self->{prev_state} == DATA_STATE) {
4929
4930 $self->{state} = $self->{prev_state};
4931 $self->{s_kwd} = '';
4932 ## Reconsume.
4933 return ({type => CHARACTER_TOKEN,
4934 data => '&' . $self->{kwd},
4935 line => $self->{line_prev},
4936 column => $self->{column_prev} - length $self->{kwd},
4937 });
4938 redo A;
4939 } else {
4940
4941 $self->{ca}->{value} .= '&' . $self->{kwd};
4942 $self->{state} = $self->{prev_state};
4943 $self->{s_kwd} = '';
4944 ## Reconsume.
4945 redo A;
4946 }
4947 }
4948 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4949 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4950 # 0..9
4951
4952 $self->{kwd} *= 0x10;
4953 $self->{kwd} += $self->{nc} - 0x0030;
4954 ## Stay in the state.
4955
4956 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4957 $self->{line_prev} = $self->{line};
4958 $self->{column_prev} = $self->{column};
4959 $self->{column}++;
4960 $self->{nc}
4961 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4962 } else {
4963 $self->{set_nc}->($self);
4964 }
4965
4966 redo A;
4967 } elsif (0x0061 <= $self->{nc} and
4968 $self->{nc} <= 0x0066) { # a..f
4969
4970 $self->{kwd} *= 0x10;
4971 $self->{kwd} += $self->{nc} - 0x0060 + 9;
4972 ## Stay in the state.
4973
4974 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4975 $self->{line_prev} = $self->{line};
4976 $self->{column_prev} = $self->{column};
4977 $self->{column}++;
4978 $self->{nc}
4979 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4980 } else {
4981 $self->{set_nc}->($self);
4982 }
4983
4984 redo A;
4985 } elsif (0x0041 <= $self->{nc} and
4986 $self->{nc} <= 0x0046) { # A..F
4987
4988 $self->{kwd} *= 0x10;
4989 $self->{kwd} += $self->{nc} - 0x0040 + 9;
4990 ## Stay in the state.
4991
4992 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4993 $self->{line_prev} = $self->{line};
4994 $self->{column_prev} = $self->{column};
4995 $self->{column}++;
4996 $self->{nc}
4997 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4998 } else {
4999 $self->{set_nc}->($self);
5000 }
5001
5002 redo A;
5003 } elsif ($self->{nc} == 0x003B) { # ;
5004
5005
5006 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5007 $self->{line_prev} = $self->{line};
5008 $self->{column_prev} = $self->{column};
5009 $self->{column}++;
5010 $self->{nc}
5011 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5012 } else {
5013 $self->{set_nc}->($self);
5014 }
5015
5016 #
5017 } else {
5018
5019 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5020 line => $self->{line},
5021 column => $self->{column});
5022 ## Reconsume.
5023 #
5024 }
5025
5026 my $code = $self->{kwd};
5027 my $l = $self->{line_prev};
5028 my $c = $self->{column_prev};
5029 if ((not $self->{is_xml} and $charref_map->{$code}) or
5030 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5031 ($self->{is_xml} and $code == 0x0000)) {
5032
5033 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5034 text => (sprintf 'U+%04X', $code),
5035 line => $l, column => $c);
5036 $code = $charref_map->{$code};
5037 } elsif ($code > 0x10FFFF) {
5038
5039 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5040 text => (sprintf 'U-%08X', $code),
5041 line => $l, column => $c);
5042 $code = 0xFFFD;
5043 }
5044
5045 if ($self->{prev_state} == DATA_STATE) {
5046
5047 $self->{state} = $self->{prev_state};
5048 $self->{s_kwd} = '';
5049 ## Reconsume.
5050 return ({type => CHARACTER_TOKEN, data => chr $code,
5051 has_reference => 1,
5052 line => $l, column => $c,
5053 });
5054 redo A;
5055 } else {
5056
5057 $self->{ca}->{value} .= chr $code;
5058 $self->{ca}->{has_reference} = 1;
5059 $self->{state} = $self->{prev_state};
5060 $self->{s_kwd} = '';
5061 ## Reconsume.
5062 redo A;
5063 }
5064 } elsif ($self->{state} == ENTITY_NAME_STATE) {
5065 if ((0x0041 <= $self->{nc} and # A
5066 $self->{nc} <= 0x005A) or # Z
5067 (0x0061 <= $self->{nc} and # a
5068 $self->{nc} <= 0x007A) or # z
5069 (0x0030 <= $self->{nc} and # 0
5070 $self->{nc} <= 0x0039) or # 9
5071 $self->{nc} == 0x003B or # ;
5072 ($self->{is_xml} and
5073 not ($is_space->{$self->{nc}} or
5074 {
5075 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5076 $self->{entity_add} => 1,
5077 }->{$self->{nc}}))) {
5078 our $EntityChar;
5079 $self->{kwd} .= chr $self->{nc};
5080 if (defined $EntityChar->{$self->{kwd}} or
5081 $self->{ge}->{$self->{kwd}}) {
5082 if ($self->{nc} == 0x003B) { # ;
5083 if (defined $self->{ge}->{$self->{kwd}}) {
5084 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5085
5086 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5087 } else {
5088 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5089
5090 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5091 value => $self->{kwd});
5092 } else {
5093
5094 }
5095 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5096 }
5097 } else {
5098 if ($self->{is_xml}) {
5099
5100 $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5101 value => $self->{kwd},
5102 level => {
5103 'amp;' => $self->{level}->{warn},
5104 'quot;' => $self->{level}->{warn},
5105 'lt;' => $self->{level}->{warn},
5106 'gt;' => $self->{level}->{warn},
5107 'apos;' => $self->{level}->{warn},
5108 }->{$self->{kwd}} ||
5109 $self->{level}->{must});
5110 } else {
5111
5112 }
5113 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5114 }
5115 $self->{entity__match} = 1;
5116
5117 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5118 $self->{line_prev} = $self->{line};
5119 $self->{column_prev} = $self->{column};
5120 $self->{column}++;
5121 $self->{nc}
5122 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5123 } else {
5124 $self->{set_nc}->($self);
5125 }
5126
5127 #
5128 } else {
5129
5130 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5131 $self->{entity__match} = -1;
5132 ## Stay in the state.
5133
5134 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5135 $self->{line_prev} = $self->{line};
5136 $self->{column_prev} = $self->{column};
5137 $self->{column}++;
5138 $self->{nc}
5139 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5140 } else {
5141 $self->{set_nc}->($self);
5142 }
5143
5144 redo A;
5145 }
5146 } else {
5147
5148 $self->{entity__value} .= chr $self->{nc};
5149 $self->{entity__match} *= 2;
5150 ## Stay in the state.
5151
5152 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5153 $self->{line_prev} = $self->{line};
5154 $self->{column_prev} = $self->{column};
5155 $self->{column}++;
5156 $self->{nc}
5157 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5158 } else {
5159 $self->{set_nc}->($self);
5160 }
5161
5162 redo A;
5163 }
5164 }
5165
5166 my $data;
5167 my $has_ref;
5168 if ($self->{entity__match} > 0) {
5169
5170 $data = $self->{entity__value};
5171 $has_ref = 1;
5172 #
5173 } elsif ($self->{entity__match} < 0) {
5174 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5175 if ($self->{prev_state} != DATA_STATE and # in attribute
5176 $self->{entity__match} < -1) {
5177
5178 $data = '&' . $self->{kwd};
5179 #
5180 } else {
5181
5182 $data = $self->{entity__value};
5183 $has_ref = 1;
5184 #
5185 }
5186 } else {
5187
5188 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5189 line => $self->{line_prev},
5190 column => $self->{column_prev} - length $self->{kwd});
5191 $data = '&' . $self->{kwd};
5192 #
5193 }
5194
5195 ## NOTE: According to the spec algorithm, when a character reference
5196 ## is found in these cases, it is consumed and a character token is
5197 ## returned; otherwise, nothing is consumed and nothing is returned.
5198 ## In this implementation, anything that has been examined by the
5199 ## tokenizer is appended to the parent element or the attribute value
5200 ## as a string at this stage (the literal string when there is no
5201 ## character reference, or the entity-replaced string otherwise), since
5202 ## any characters that would not be consumed are appended in the data
5203 ## state or in an appropriate attribute value state anyway.
5204
5205 if ($self->{prev_state} == DATA_STATE) {
5206
5207 $self->{state} = $self->{prev_state};
5208 $self->{s_kwd} = '';
5209 ## Reconsume.
5210 return ({type => CHARACTER_TOKEN,
5211 data => $data,
5212 has_reference => $has_ref,
5213 line => $self->{line_prev},
5214 column => $self->{column_prev} + 1 - length $self->{kwd},
5215 });
5216 redo A;
5217 } else {
5218
5219 $self->{ca}->{value} .= $data;
5220 $self->{ca}->{has_reference} = 1 if $has_ref;
5221 $self->{state} = $self->{prev_state};
5222 $self->{s_kwd} = '';
5223 ## Reconsume.
5224 redo A;
5225 }
5226
5227 ## XML-only states
5228
5229 } elsif ($self->{state} == PI_STATE) {
5230 ## XML5: "Pi state" and "DOCTYPE pi state".
5231
5232 if ($is_space->{$self->{nc}} or
5233 $self->{nc} == 0x003F or # ?
5234 $self->{nc} == -1) {
5235 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5236 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5237 ## "DOCTYPE pi state": Parse error, switch to the "data
5238 ## state".
5239 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5240 line => $self->{line_prev},
5241 column => $self->{column_prev}
5242 - 1 * ($self->{nc} != -1));
5243 $self->{state} = BOGUS_COMMENT_STATE;
5244 ## Reconsume.
5245 $self->{ct} = {type => COMMENT_TOKEN,
5246 data => '?',
5247 line => $self->{line_prev},
5248 column => $self->{column_prev}
5249 - 1 * ($self->{nc} != -1),
5250 };
5251 redo A;
5252 } else {
5253 ## XML5: "DOCTYPE pi state": Stay in the state.
5254 $self->{ct} = {type => PI_TOKEN,
5255 target => chr $self->{nc},
5256 data => '',
5257 line => $self->{line_prev},
5258 column => $self->{column_prev} - 1,
5259 };
5260 $self->{state} = PI_TARGET_STATE;
5261
5262 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5263 $self->{line_prev} = $self->{line};
5264 $self->{column_prev} = $self->{column};
5265 $self->{column}++;
5266 $self->{nc}
5267 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5268 } else {
5269 $self->{set_nc}->($self);
5270 }
5271
5272 redo A;
5273 }
5274 } elsif ($self->{state} == PI_TARGET_STATE) {
5275 if ($is_space->{$self->{nc}}) {
5276 $self->{state} = PI_TARGET_AFTER_STATE;
5277
5278 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5279 $self->{line_prev} = $self->{line};
5280 $self->{column_prev} = $self->{column};
5281 $self->{column}++;
5282 $self->{nc}
5283 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5284 } else {
5285 $self->{set_nc}->($self);
5286 }
5287
5288 redo A;
5289 } elsif ($self->{nc} == -1) {
5290 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5291 if ($self->{in_subset}) {
5292 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5293 } else {
5294 $self->{state} = DATA_STATE;
5295 $self->{s_kwd} = '';
5296 }
5297 ## Reconsume.
5298 return ($self->{ct}); # pi
5299 redo A;
5300 } elsif ($self->{nc} == 0x003F) { # ?
5301 $self->{state} = PI_AFTER_STATE;
5302
5303 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5304 $self->{line_prev} = $self->{line};
5305 $self->{column_prev} = $self->{column};
5306 $self->{column}++;
5307 $self->{nc}
5308 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5309 } else {
5310 $self->{set_nc}->($self);
5311 }
5312
5313 redo A;
5314 } else {
5315 ## XML5: typo ("tag name" -> "target")
5316 $self->{ct}->{target} .= chr $self->{nc}; # pi
5317
5318 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5319 $self->{line_prev} = $self->{line};
5320 $self->{column_prev} = $self->{column};
5321 $self->{column}++;
5322 $self->{nc}
5323 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5324 } else {
5325 $self->{set_nc}->($self);
5326 }
5327
5328 redo A;
5329 }
5330 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5331 if ($is_space->{$self->{nc}}) {
5332 ## Stay in the state.
5333
5334 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5335 $self->{line_prev} = $self->{line};
5336 $self->{column_prev} = $self->{column};
5337 $self->{column}++;
5338 $self->{nc}
5339 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5340 } else {
5341 $self->{set_nc}->($self);
5342 }
5343
5344 redo A;
5345 } else {
5346 $self->{state} = PI_DATA_STATE;
5347 ## Reprocess.
5348 redo A;
5349 }
5350 } elsif ($self->{state} == PI_DATA_STATE) {
5351 if ($self->{nc} == 0x003F) { # ?
5352 $self->{state} = PI_DATA_AFTER_STATE;
5353
5354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5355 $self->{line_prev} = $self->{line};
5356 $self->{column_prev} = $self->{column};
5357 $self->{column}++;
5358 $self->{nc}
5359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5360 } else {
5361 $self->{set_nc}->($self);
5362 }
5363
5364 redo A;
5365 } elsif ($self->{nc} == -1) {
5366 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5367 if ($self->{in_subset}) {
5368 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5369 } else {
5370 $self->{state} = DATA_STATE;
5371 $self->{s_kwd} = '';
5372 }
5373 ## Reprocess.
5374 return ($self->{ct}); # pi
5375 redo A;
5376 } else {
5377 $self->{ct}->{data} .= chr $self->{nc}; # pi
5378 $self->{read_until}->($self->{ct}->{data}, q[?],
5379 length $self->{ct}->{data});
5380 ## Stay in the state.
5381
5382 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5383 $self->{line_prev} = $self->{line};
5384 $self->{column_prev} = $self->{column};
5385 $self->{column}++;
5386 $self->{nc}
5387 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5388 } else {
5389 $self->{set_nc}->($self);
5390 }
5391
5392 ## Reprocess.
5393 redo A;
5394 }
5395 } elsif ($self->{state} == PI_AFTER_STATE) {
5396 ## XML5: Part of "Pi after state".
5397
5398 if ($self->{nc} == 0x003E) { # >
5399 if ($self->{in_subset}) {
5400 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5401 } else {
5402 $self->{state} = DATA_STATE;
5403 $self->{s_kwd} = '';
5404 }
5405
5406 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5407 $self->{line_prev} = $self->{line};
5408 $self->{column_prev} = $self->{column};
5409 $self->{column}++;
5410 $self->{nc}
5411 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5412 } else {
5413 $self->{set_nc}->($self);
5414 }
5415
5416 return ($self->{ct}); # pi
5417 redo A;
5418 } elsif ($self->{nc} == 0x003F) { # ?
5419 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5420 line => $self->{line_prev},
5421 column => $self->{column_prev}); ## XML5: no error
5422 $self->{ct}->{data} .= '?';
5423 $self->{state} = PI_DATA_AFTER_STATE;
5424
5425 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5426 $self->{line_prev} = $self->{line};
5427 $self->{column_prev} = $self->{column};
5428 $self->{column}++;
5429 $self->{nc}
5430 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5431 } else {
5432 $self->{set_nc}->($self);
5433 }
5434
5435 redo A;
5436 } else {
5437 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5438 line => $self->{line_prev},
5439 column => $self->{column_prev}
5440 + 1 * ($self->{nc} == -1)); ## XML5: no error
5441 $self->{ct}->{data} .= '?'; ## XML5: not appended
5442 $self->{state} = PI_DATA_STATE;
5443 ## Reprocess.
5444 redo A;
5445 }
5446 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5447 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5448
5449 if ($self->{nc} == 0x003E) { # >
5450 if ($self->{in_subset}) {
5451 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5452 } else {
5453 $self->{state} = DATA_STATE;
5454 $self->{s_kwd} = '';
5455 }
5456
5457 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5458 $self->{line_prev} = $self->{line};
5459 $self->{column_prev} = $self->{column};
5460 $self->{column}++;
5461 $self->{nc}
5462 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5463 } else {
5464 $self->{set_nc}->($self);
5465 }
5466
5467 return ($self->{ct}); # pi
5468 redo A;
5469 } elsif ($self->{nc} == 0x003F) { # ?
5470 $self->{ct}->{data} .= '?';
5471 ## Stay in the state.
5472
5473 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5474 $self->{line_prev} = $self->{line};
5475 $self->{column_prev} = $self->{column};
5476 $self->{column}++;
5477 $self->{nc}
5478 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5479 } else {
5480 $self->{set_nc}->($self);
5481 }
5482
5483 redo A;
5484 } else {
5485 $self->{ct}->{data} .= '?'; ## XML5: not appended
5486 $self->{state} = PI_DATA_STATE;
5487 ## Reprocess.
5488 redo A;
5489 }
5490
5491 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5492 if ($self->{nc} == 0x003C) { # <
5493 $self->{state} = DOCTYPE_TAG_STATE;
5494
5495 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5496 $self->{line_prev} = $self->{line};
5497 $self->{column_prev} = $self->{column};
5498 $self->{column}++;
5499 $self->{nc}
5500 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5501 } else {
5502 $self->{set_nc}->($self);
5503 }
5504
5505 redo A;
5506 } elsif ($self->{nc} == 0x0025) { # %
5507 ## XML5: Not defined yet.
5508
5509 ## TODO:
5510
5511 if (not $self->{stop_processing} and
5512 not $self->{document}->xml_standalone) {
5513 $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5514 level => $self->{level}->{info});
5515 $self->{stop_processing} = 1;
5516 }
5517
5518
5519 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5520 $self->{line_prev} = $self->{line};
5521 $self->{column_prev} = $self->{column};
5522 $self->{column}++;
5523 $self->{nc}
5524 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5525 } else {
5526 $self->{set_nc}->($self);
5527 }
5528
5529 redo A;
5530 } elsif ($self->{nc} == 0x005D) { # ]
5531 delete $self->{in_subset};
5532 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5533
5534 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5535 $self->{line_prev} = $self->{line};
5536 $self->{column_prev} = $self->{column};
5537 $self->{column}++;
5538 $self->{nc}
5539 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5540 } else {
5541 $self->{set_nc}->($self);
5542 }
5543
5544 redo A;
5545 } elsif ($is_space->{$self->{nc}}) {
5546 ## Stay in the state.
5547
5548 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5549 $self->{line_prev} = $self->{line};
5550 $self->{column_prev} = $self->{column};
5551 $self->{column}++;
5552 $self->{nc}
5553 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5554 } else {
5555 $self->{set_nc}->($self);
5556 }
5557
5558 redo A;
5559 } elsif ($self->{nc} == -1) {
5560 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5561 delete $self->{in_subset};
5562 $self->{state} = DATA_STATE;
5563 $self->{s_kwd} = '';
5564 ## Reconsume.
5565 return ({type => END_OF_DOCTYPE_TOKEN});
5566 redo A;
5567 } else {
5568 unless ($self->{internal_subset_tainted}) {
5569 ## XML5: No parse error.
5570 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5571 $self->{internal_subset_tainted} = 1;
5572 }
5573 ## Stay in the state.
5574
5575 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5576 $self->{line_prev} = $self->{line};
5577 $self->{column_prev} = $self->{column};
5578 $self->{column}++;
5579 $self->{nc}
5580 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5581 } else {
5582 $self->{set_nc}->($self);
5583 }
5584
5585 redo A;
5586 }
5587 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5588 if ($self->{nc} == 0x003E) { # >
5589 $self->{state} = DATA_STATE;
5590 $self->{s_kwd} = '';
5591
5592 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5593 $self->{line_prev} = $self->{line};
5594 $self->{column_prev} = $self->{column};
5595 $self->{column}++;
5596 $self->{nc}
5597 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5598 } else {
5599 $self->{set_nc}->($self);
5600 }
5601
5602 return ({type => END_OF_DOCTYPE_TOKEN});
5603 redo A;
5604 } elsif ($self->{nc} == -1) {
5605 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5606 $self->{state} = DATA_STATE;
5607 $self->{s_kwd} = '';
5608 ## Reconsume.
5609 return ({type => END_OF_DOCTYPE_TOKEN});
5610 redo A;
5611 } else {
5612 ## XML5: No parse error and stay in the state.
5613 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5614
5615 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5616
5617 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5618 $self->{line_prev} = $self->{line};
5619 $self->{column_prev} = $self->{column};
5620 $self->{column}++;
5621 $self->{nc}
5622 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5623 } else {
5624 $self->{set_nc}->($self);
5625 }
5626
5627 redo A;
5628 }
5629 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5630 if ($self->{nc} == 0x003E) { # >
5631 $self->{state} = DATA_STATE;
5632 $self->{s_kwd} = '';
5633
5634 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5635 $self->{line_prev} = $self->{line};
5636 $self->{column_prev} = $self->{column};
5637 $self->{column}++;
5638 $self->{nc}
5639 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5640 } else {
5641 $self->{set_nc}->($self);
5642 }
5643
5644 return ({type => END_OF_DOCTYPE_TOKEN});
5645 redo A;
5646 } elsif ($self->{nc} == -1) {
5647 $self->{state} = DATA_STATE;
5648 $self->{s_kwd} = '';
5649 ## Reconsume.
5650 return ({type => END_OF_DOCTYPE_TOKEN});
5651 redo A;
5652 } else {
5653 ## Stay in the state.
5654
5655 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5656 $self->{line_prev} = $self->{line};
5657 $self->{column_prev} = $self->{column};
5658 $self->{column}++;
5659 $self->{nc}
5660 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5661 } else {
5662 $self->{set_nc}->($self);
5663 }
5664
5665 redo A;
5666 }
5667 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5668 if ($self->{nc} == 0x0021) { # !
5669 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5670
5671 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5672 $self->{line_prev} = $self->{line};
5673 $self->{column_prev} = $self->{column};
5674 $self->{column}++;
5675 $self->{nc}
5676 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5677 } else {
5678 $self->{set_nc}->($self);
5679 }
5680
5681 redo A;
5682 } elsif ($self->{nc} == 0x003F) { # ?
5683 $self->{state} = PI_STATE;
5684
5685 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5686 $self->{line_prev} = $self->{line};
5687 $self->{column_prev} = $self->{column};
5688 $self->{column}++;
5689 $self->{nc}
5690 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5691 } else {
5692 $self->{set_nc}->($self);
5693 }
5694
5695 redo A;
5696 } elsif ($self->{nc} == -1) {
5697 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5698 $self->{state} = DATA_STATE;
5699 $self->{s_kwd} = '';
5700 ## Reconsume.
5701 redo A;
5702 } else {
5703 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5704 line => $self->{line_prev},
5705 column => $self->{column_prev});
5706 $self->{state} = BOGUS_COMMENT_STATE;
5707 $self->{ct} = {type => COMMENT_TOKEN,
5708 data => '',
5709 }; ## NOTE: Will be discarded.
5710
5711 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5712 $self->{line_prev} = $self->{line};
5713 $self->{column_prev} = $self->{column};
5714 $self->{column}++;
5715 $self->{nc}
5716 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5717 } else {
5718 $self->{set_nc}->($self);
5719 }
5720
5721 redo A;
5722 }
5723 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5724 ## XML5: "DOCTYPE markup declaration state".
5725
5726 if ($self->{nc} == 0x002D) { # -
5727 $self->{state} = MD_HYPHEN_STATE;
5728
5729 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5730 $self->{line_prev} = $self->{line};
5731 $self->{column_prev} = $self->{column};
5732 $self->{column}++;
5733 $self->{nc}
5734 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5735 } else {
5736 $self->{set_nc}->($self);
5737 }
5738
5739 redo A;
5740 } elsif ($self->{nc} == 0x0045 or # E
5741 $self->{nc} == 0x0065) { # e
5742 $self->{state} = MD_E_STATE;
5743 $self->{kwd} = chr $self->{nc};
5744
5745 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5746 $self->{line_prev} = $self->{line};
5747 $self->{column_prev} = $self->{column};
5748 $self->{column}++;
5749 $self->{nc}
5750 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5751 } else {
5752 $self->{set_nc}->($self);
5753 }
5754
5755 redo A;
5756 } elsif ($self->{nc} == 0x0041 or # A
5757 $self->{nc} == 0x0061) { # a
5758 $self->{state} = MD_ATTLIST_STATE;
5759 $self->{kwd} = chr $self->{nc};
5760
5761 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5762 $self->{line_prev} = $self->{line};
5763 $self->{column_prev} = $self->{column};
5764 $self->{column}++;
5765 $self->{nc}
5766 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5767 } else {
5768 $self->{set_nc}->($self);
5769 }
5770
5771 redo A;
5772 } elsif ($self->{nc} == 0x004E or # N
5773 $self->{nc} == 0x006E) { # n
5774 $self->{state} = MD_NOTATION_STATE;
5775 $self->{kwd} = chr $self->{nc};
5776
5777 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5778 $self->{line_prev} = $self->{line};
5779 $self->{column_prev} = $self->{column};
5780 $self->{column}++;
5781 $self->{nc}
5782 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5783 } else {
5784 $self->{set_nc}->($self);
5785 }
5786
5787 redo A;
5788 } else {
5789 #
5790 }
5791
5792 ## XML5: No parse error.
5793 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5794 line => $self->{line_prev},
5795 column => $self->{column_prev} - 1);
5796 ## Reconsume.
5797 $self->{state} = BOGUS_COMMENT_STATE;
5798 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5799 redo A;
5800 } elsif ($self->{state} == MD_E_STATE) {
5801 if ($self->{nc} == 0x004E or # N
5802 $self->{nc} == 0x006E) { # n
5803 $self->{state} = MD_ENTITY_STATE;
5804 $self->{kwd} .= chr $self->{nc};
5805
5806 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5807 $self->{line_prev} = $self->{line};
5808 $self->{column_prev} = $self->{column};
5809 $self->{column}++;
5810 $self->{nc}
5811 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5812 } else {
5813 $self->{set_nc}->($self);
5814 }
5815
5816 redo A;
5817 } elsif ($self->{nc} == 0x004C or # L
5818 $self->{nc} == 0x006C) { # l
5819 ## XML5: <!ELEMENT> not supported.
5820 $self->{state} = MD_ELEMENT_STATE;
5821 $self->{kwd} .= chr $self->{nc};
5822
5823 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5824 $self->{line_prev} = $self->{line};
5825 $self->{column_prev} = $self->{column};
5826 $self->{column}++;
5827 $self->{nc}
5828 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5829 } else {
5830 $self->{set_nc}->($self);
5831 }
5832
5833 redo A;
5834 } else {
5835 ## XML5: No parse error.
5836 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5837 line => $self->{line_prev},
5838 column => $self->{column_prev} - 2
5839 + 1 * ($self->{nc} == -1));
5840 ## Reconsume.
5841 $self->{state} = BOGUS_COMMENT_STATE;
5842 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5843 redo A;
5844 }
5845 } elsif ($self->{state} == MD_ENTITY_STATE) {
5846 if ($self->{nc} == [
5847 undef,
5848 undef,
5849 0x0054, # T
5850 0x0049, # I
5851 0x0054, # T
5852 ]->[length $self->{kwd}] or
5853 $self->{nc} == [
5854 undef,
5855 undef,
5856 0x0074, # t
5857 0x0069, # i
5858 0x0074, # t
5859 ]->[length $self->{kwd}]) {
5860 ## Stay in the state.
5861 $self->{kwd} .= chr $self->{nc};
5862
5863 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5864 $self->{line_prev} = $self->{line};
5865 $self->{column_prev} = $self->{column};
5866 $self->{column}++;
5867 $self->{nc}
5868 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5869 } else {
5870 $self->{set_nc}->($self);
5871 }
5872
5873 redo A;
5874 } elsif ((length $self->{kwd}) == 5 and
5875 ($self->{nc} == 0x0059 or # Y
5876 $self->{nc} == 0x0079)) { # y
5877 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5878 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5879 text => 'ENTITY',
5880 line => $self->{line_prev},
5881 column => $self->{column_prev} - 4);
5882 }
5883 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5884 line => $self->{line_prev},
5885 column => $self->{column_prev} - 6};
5886 $self->{state} = DOCTYPE_MD_STATE;
5887
5888 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5889 $self->{line_prev} = $self->{line};
5890 $self->{column_prev} = $self->{column};
5891 $self->{column}++;
5892 $self->{nc}
5893 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5894 } else {
5895 $self->{set_nc}->($self);
5896 }
5897
5898 redo A;
5899 } else {
5900 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5901 line => $self->{line_prev},
5902 column => $self->{column_prev} - 1
5903 - (length $self->{kwd})
5904 + 1 * ($self->{nc} == -1));
5905 $self->{state} = BOGUS_COMMENT_STATE;
5906 ## Reconsume.
5907 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5908 redo A;
5909 }
5910 } elsif ($self->{state} == MD_ELEMENT_STATE) {
5911 if ($self->{nc} == [
5912 undef,
5913 undef,
5914 0x0045, # E
5915 0x004D, # M
5916 0x0045, # E
5917 0x004E, # N
5918 ]->[length $self->{kwd}] or
5919 $self->{nc} == [
5920 undef,
5921 undef,
5922 0x0065, # e
5923 0x006D, # m
5924 0x0065, # e
5925 0x006E, # n
5926 ]->[length $self->{kwd}]) {
5927 ## Stay in the state.
5928 $self->{kwd} .= chr $self->{nc};
5929
5930 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5931 $self->{line_prev} = $self->{line};
5932 $self->{column_prev} = $self->{column};
5933 $self->{column}++;
5934 $self->{nc}
5935 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5936 } else {
5937 $self->{set_nc}->($self);
5938 }
5939
5940 redo A;
5941 } elsif ((length $self->{kwd}) == 6 and
5942 ($self->{nc} == 0x0054 or # T
5943 $self->{nc} == 0x0074)) { # t
5944 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5945 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5946 text => 'ELEMENT',
5947 line => $self->{line_prev},
5948 column => $self->{column_prev} - 5);
5949 }
5950 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5951 line => $self->{line_prev},
5952 column => $self->{column_prev} - 7};
5953 $self->{state} = DOCTYPE_MD_STATE;
5954
5955 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5956 $self->{line_prev} = $self->{line};
5957 $self->{column_prev} = $self->{column};
5958 $self->{column}++;
5959 $self->{nc}
5960 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5961 } else {
5962 $self->{set_nc}->($self);
5963 }
5964
5965 redo A;
5966 } else {
5967 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5968 line => $self->{line_prev},
5969 column => $self->{column_prev} - 1
5970 - (length $self->{kwd})
5971 + 1 * ($self->{nc} == -1));
5972 $self->{state} = BOGUS_COMMENT_STATE;
5973 ## Reconsume.
5974 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5975 redo A;
5976 }
5977 } elsif ($self->{state} == MD_ATTLIST_STATE) {
5978 if ($self->{nc} == [
5979 undef,
5980 0x0054, # T
5981 0x0054, # T
5982 0x004C, # L
5983 0x0049, # I
5984 0x0053, # S
5985 ]->[length $self->{kwd}] or
5986 $self->{nc} == [
5987 undef,
5988 0x0074, # t
5989 0x0074, # t
5990 0x006C, # l
5991 0x0069, # i
5992 0x0073, # s
5993 ]->[length $self->{kwd}]) {
5994 ## Stay in the state.
5995 $self->{kwd} .= chr $self->{nc};
5996
5997 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5998 $self->{line_prev} = $self->{line};
5999 $self->{column_prev} = $self->{column};
6000 $self->{column}++;
6001 $self->{nc}
6002 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6003 } else {
6004 $self->{set_nc}->($self);
6005 }
6006
6007 redo A;
6008 } elsif ((length $self->{kwd}) == 6 and
6009 ($self->{nc} == 0x0054 or # T
6010 $self->{nc} == 0x0074)) { # t
6011 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6012 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6013 text => 'ATTLIST',
6014 line => $self->{line_prev},
6015 column => $self->{column_prev} - 5);
6016 }
6017 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6018 attrdefs => [],
6019 line => $self->{line_prev},
6020 column => $self->{column_prev} - 7};
6021 $self->{state} = DOCTYPE_MD_STATE;
6022
6023 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6024 $self->{line_prev} = $self->{line};
6025 $self->{column_prev} = $self->{column};
6026 $self->{column}++;
6027 $self->{nc}
6028 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6029 } else {
6030 $self->{set_nc}->($self);
6031 }
6032
6033 redo A;
6034 } else {
6035 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6036 line => $self->{line_prev},
6037 column => $self->{column_prev} - 1
6038 - (length $self->{kwd})
6039 + 1 * ($self->{nc} == -1));
6040 $self->{state} = BOGUS_COMMENT_STATE;
6041 ## Reconsume.
6042 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6043 redo A;
6044 }
6045 } elsif ($self->{state} == MD_NOTATION_STATE) {
6046 if ($self->{nc} == [
6047 undef,
6048 0x004F, # O
6049 0x0054, # T
6050 0x0041, # A
6051 0x0054, # T
6052 0x0049, # I
6053 0x004F, # O
6054 ]->[length $self->{kwd}] or
6055 $self->{nc} == [
6056 undef,
6057 0x006F, # o
6058 0x0074, # t
6059 0x0061, # a
6060 0x0074, # t
6061 0x0069, # i
6062 0x006F, # o
6063 ]->[length $self->{kwd}]) {
6064 ## Stay in the state.
6065 $self->{kwd} .= chr $self->{nc};
6066
6067 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6068 $self->{line_prev} = $self->{line};
6069 $self->{column_prev} = $self->{column};
6070 $self->{column}++;
6071 $self->{nc}
6072 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6073 } else {
6074 $self->{set_nc}->($self);
6075 }
6076
6077 redo A;
6078 } elsif ((length $self->{kwd}) == 7 and
6079 ($self->{nc} == 0x004E or # N
6080 $self->{nc} == 0x006E)) { # n
6081 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6082 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6083 text => 'NOTATION',
6084 line => $self->{line_prev},
6085 column => $self->{column_prev} - 6);
6086 }
6087 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6088 line => $self->{line_prev},
6089 column => $self->{column_prev} - 8};
6090 $self->{state} = DOCTYPE_MD_STATE;
6091
6092 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6093 $self->{line_prev} = $self->{line};
6094 $self->{column_prev} = $self->{column};
6095 $self->{column}++;
6096 $self->{nc}
6097 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6098 } else {
6099 $self->{set_nc}->($self);
6100 }
6101
6102 redo A;
6103 } else {
6104 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6105 line => $self->{line_prev},
6106 column => $self->{column_prev} - 1
6107 - (length $self->{kwd})
6108 + 1 * ($self->{nc} == -1));
6109 $self->{state} = BOGUS_COMMENT_STATE;
6110 ## Reconsume.
6111 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6112 redo A;
6113 }
6114 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6115 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6116 ## "DOCTYPE NOTATION state".
6117
6118 if ($is_space->{$self->{nc}}) {
6119 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6120 $self->{state} = BEFORE_MD_NAME_STATE;
6121
6122 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6123 $self->{line_prev} = $self->{line};
6124 $self->{column_prev} = $self->{column};
6125 $self->{column}++;
6126 $self->{nc}
6127 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6128 } else {
6129 $self->{set_nc}->($self);
6130 }
6131
6132 redo A;
6133 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6134 $self->{nc} == 0x0025) { # %
6135 ## XML5: Switch to the "DOCTYPE bogus comment state".
6136 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6137 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6138
6139 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6140 $self->{line_prev} = $self->{line};
6141 $self->{column_prev} = $self->{column};
6142 $self->{column}++;
6143 $self->{nc}
6144 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6145 } else {
6146 $self->{set_nc}->($self);
6147 }
6148
6149 redo A;
6150 } elsif ($self->{nc} == -1) {
6151 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6152 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6153 ## Reconsume.
6154 redo A;
6155 } elsif ($self->{nc} == 0x003E) { # >
6156 ## XML5: Switch to the "DOCTYPE bogus comment state".
6157 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6158 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6159
6160 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6161 $self->{line_prev} = $self->{line};
6162 $self->{column_prev} = $self->{column};
6163 $self->{column}++;
6164 $self->{nc}
6165 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6166 } else {
6167 $self->{set_nc}->($self);
6168 }
6169
6170 redo A;
6171 } else {
6172 ## XML5: Switch to the "DOCTYPE bogus comment state".
6173 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6174 $self->{state} = BEFORE_MD_NAME_STATE;
6175 redo A;
6176 }
6177 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6178 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6179 ## before state", "DOCTYPE ATTLIST name before state".
6180
6181 if ($is_space->{$self->{nc}}) {
6182 ## Stay in the state.
6183
6184 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6185 $self->{line_prev} = $self->{line};
6186 $self->{column_prev} = $self->{column};
6187 $self->{column}++;
6188 $self->{nc}
6189 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6190 } else {
6191 $self->{set_nc}->($self);
6192 }
6193
6194 redo A;
6195 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6196 $self->{nc} == 0x0025) { # %
6197 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6198
6199 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6200 $self->{line_prev} = $self->{line};
6201 $self->{column_prev} = $self->{column};
6202 $self->{column}++;
6203 $self->{nc}
6204 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6205 } else {
6206 $self->{set_nc}->($self);
6207 }
6208
6209 redo A;
6210 } elsif ($self->{nc} == 0x003E) { # >
6211 ## XML5: Same as "Anything else".
6212 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6213 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6214
6215 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6216 $self->{line_prev} = $self->{line};
6217 $self->{column_prev} = $self->{column};
6218 $self->{column}++;
6219 $self->{nc}
6220 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6221 } else {
6222 $self->{set_nc}->($self);
6223 }
6224
6225 redo A;
6226 } elsif ($self->{nc} == -1) {
6227 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6228 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6229 ## Reconsume.
6230 redo A;
6231 } else {
6232 ## XML5: [ATTLIST] Not defined yet.
6233 $self->{ct}->{name} .= chr $self->{nc};
6234 $self->{state} = MD_NAME_STATE;
6235
6236 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6237 $self->{line_prev} = $self->{line};
6238 $self->{column_prev} = $self->{column};
6239 $self->{column}++;
6240 $self->{nc}
6241 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6242 } else {
6243 $self->{set_nc}->($self);
6244 }
6245
6246 redo A;
6247 }
6248 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6249 if ($is_space->{$self->{nc}}) {
6250 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6251 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6252 $self->{state} = BEFORE_MD_NAME_STATE;
6253
6254 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6255 $self->{line_prev} = $self->{line};
6256 $self->{column_prev} = $self->{column};
6257 $self->{column}++;
6258 $self->{nc}
6259 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6260 } else {
6261 $self->{set_nc}->($self);
6262 }
6263
6264 redo A;
6265 } elsif ($self->{nc} == 0x003E) { # >
6266 ## XML5: Same as "Anything else".
6267 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6268 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6269
6270 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6271 $self->{line_prev} = $self->{line};
6272 $self->{column_prev} = $self->{column};
6273 $self->{column}++;
6274 $self->{nc}
6275 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6276 } else {
6277 $self->{set_nc}->($self);
6278 }
6279
6280 redo A;
6281 } elsif ($self->{nc} == -1) {
6282 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6283 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6284 ## Reconsume.
6285 redo A;
6286 } else {
6287 ## XML5: No parse error.
6288 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6289 $self->{state} = BOGUS_COMMENT_STATE;
6290 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6291 ## Reconsume.
6292 redo A;
6293 }
6294 } elsif ($self->{state} == MD_NAME_STATE) {
6295 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6296
6297 if ($is_space->{$self->{nc}}) {
6298 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6299 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6300 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6301 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6302 } else { # ENTITY/NOTATION
6303 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6304 }
6305
6306 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6307 $self->{line_prev} = $self->{line};
6308 $self->{column_prev} = $self->{column};
6309 $self->{column}++;
6310 $self->{nc}
6311 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6312 } else {
6313 $self->{set_nc}->($self);
6314 }
6315
6316 redo A;
6317 } elsif ($self->{nc} == 0x003E) { # >
6318 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6319 #
6320 } else {
6321 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6322 }
6323 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6324
6325 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6326 $self->{line_prev} = $self->{line};
6327 $self->{column_prev} = $self->{column};
6328 $self->{column}++;
6329 $self->{nc}
6330 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6331 } else {
6332 $self->{set_nc}->($self);
6333 }
6334
6335 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6336 redo A;
6337 } elsif ($self->{nc} == -1) {
6338 ## XML5: [ATTLIST] No parse error.
6339 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6340 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6341 ## Reconsume.
6342 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6343 redo A;
6344 } else {
6345 ## XML5: [ATTLIST] Not defined yet.
6346 $self->{ct}->{name} .= chr $self->{nc};
6347 ## Stay in the state.
6348
6349 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6350 $self->{line_prev} = $self->{line};
6351 $self->{column_prev} = $self->{column};
6352 $self->{column}++;
6353 $self->{nc}
6354 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6355 } else {
6356 $self->{set_nc}->($self);
6357 }
6358
6359 redo A;
6360 }
6361 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6362 if ($is_space->{$self->{nc}}) {
6363 ## Stay in the state.
6364
6365 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6366 $self->{line_prev} = $self->{line};
6367 $self->{column_prev} = $self->{column};
6368 $self->{column}++;
6369 $self->{nc}
6370 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6371 } else {
6372 $self->{set_nc}->($self);
6373 }
6374
6375 redo A;
6376 } elsif ($self->{nc} == 0x003E) { # >
6377 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6378
6379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6380 $self->{line_prev} = $self->{line};
6381 $self->{column_prev} = $self->{column};
6382 $self->{column}++;
6383 $self->{nc}
6384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6385 } else {
6386 $self->{set_nc}->($self);
6387 }
6388
6389 return ($self->{ct}); # ATTLIST
6390 redo A;
6391 } elsif ($self->{nc} == -1) {
6392 ## XML5: No parse error.
6393 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6394 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6395 return ($self->{ct});
6396 redo A;
6397 } else {
6398 ## XML5: Not defined yet.
6399 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6400 tokens => [],
6401 line => $self->{line}, column => $self->{column}};
6402 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6403
6404 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6405 $self->{line_prev} = $self->{line};
6406 $self->{column_prev} = $self->{column};
6407 $self->{column}++;
6408 $self->{nc}
6409 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6410 } else {
6411 $self->{set_nc}->($self);
6412 }
6413
6414 redo A;
6415 }
6416 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6417 if ($is_space->{$self->{nc}}) {
6418 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6419
6420 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6421 $self->{line_prev} = $self->{line};
6422 $self->{column_prev} = $self->{column};
6423 $self->{column}++;
6424 $self->{nc}
6425 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6426 } else {
6427 $self->{set_nc}->($self);
6428 }
6429
6430 redo A;
6431 } elsif ($self->{nc} == 0x003E) { # >
6432 ## XML5: Same as "anything else".
6433 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6434 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6435
6436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6437 $self->{line_prev} = $self->{line};
6438 $self->{column_prev} = $self->{column};
6439 $self->{column}++;
6440 $self->{nc}
6441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6442 } else {
6443 $self->{set_nc}->($self);
6444 }
6445
6446 return ($self->{ct}); # ATTLIST
6447 redo A;
6448 } elsif ($self->{nc} == 0x0028) { # (
6449 ## XML5: Same as "anything else".
6450 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6451 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6452
6453 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6454 $self->{line_prev} = $self->{line};
6455 $self->{column_prev} = $self->{column};
6456 $self->{column}++;
6457 $self->{nc}
6458 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6459 } else {
6460 $self->{set_nc}->($self);
6461 }
6462
6463 redo A;
6464 } elsif ($self->{nc} == -1) {
6465 ## XML5: No parse error.
6466 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6467 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6468
6469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6470 $self->{line_prev} = $self->{line};
6471 $self->{column_prev} = $self->{column};
6472 $self->{column}++;
6473 $self->{nc}
6474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6475 } else {
6476 $self->{set_nc}->($self);
6477 }
6478
6479 return ($self->{ct}); # ATTLIST
6480 redo A;
6481 } else {
6482 ## XML5: Not defined yet.
6483 $self->{ca}->{name} .= chr $self->{nc};
6484 ## Stay in the state.
6485
6486 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6487 $self->{line_prev} = $self->{line};
6488 $self->{column_prev} = $self->{column};
6489 $self->{column}++;
6490 $self->{nc}
6491 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6492 } else {
6493 $self->{set_nc}->($self);
6494 }
6495
6496 redo A;
6497 }
6498 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6499 if ($is_space->{$self->{nc}}) {
6500 ## Stay in the state.
6501
6502 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6503 $self->{line_prev} = $self->{line};
6504 $self->{column_prev} = $self->{column};
6505 $self->{column}++;
6506 $self->{nc}
6507 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6508 } else {
6509 $self->{set_nc}->($self);
6510 }
6511
6512 redo A;
6513 } elsif ($self->{nc} == 0x003E) { # >
6514 ## XML5: Same as "anything else".
6515 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6516 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6517
6518 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6519 $self->{line_prev} = $self->{line};
6520 $self->{column_prev} = $self->{column};
6521 $self->{column}++;
6522 $self->{nc}
6523 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6524 } else {
6525 $self->{set_nc}->($self);
6526 }
6527
6528 return ($self->{ct}); # ATTLIST
6529 redo A;
6530 } elsif ($self->{nc} == 0x0028) { # (
6531 ## XML5: Same as "anything else".
6532 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6533
6534 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6535 $self->{line_prev} = $self->{line};
6536 $self->{column_prev} = $self->{column};
6537 $self->{column}++;
6538 $self->{nc}
6539 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6540 } else {
6541 $self->{set_nc}->($self);
6542 }
6543
6544 redo A;
6545 } elsif ($self->{nc} == -1) {
6546 ## XML5: No parse error.
6547 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6548 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6549
6550 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6551 $self->{line_prev} = $self->{line};
6552 $self->{column_prev} = $self->{column};
6553 $self->{column}++;
6554 $self->{nc}
6555 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6556 } else {
6557 $self->{set_nc}->($self);
6558 }
6559
6560 return ($self->{ct});
6561 redo A;
6562 } else {
6563 ## XML5: Not defined yet.
6564 $self->{ca}->{type} = chr $self->{nc};
6565 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6566
6567 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6568 $self->{line_prev} = $self->{line};
6569 $self->{column_prev} = $self->{column};
6570 $self->{column}++;
6571 $self->{nc}
6572 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6573 } else {
6574 $self->{set_nc}->($self);
6575 }
6576
6577 redo A;
6578 }
6579 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6580 if ($is_space->{$self->{nc}}) {
6581 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6582
6583 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6584 $self->{line_prev} = $self->{line};
6585 $self->{column_prev} = $self->{column};
6586 $self->{column}++;
6587 $self->{nc}
6588 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6589 } else {
6590 $self->{set_nc}->($self);
6591 }
6592
6593 redo A;
6594 } elsif ($self->{nc} == 0x0023) { # #
6595 ## XML5: Same as "anything else".
6596 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6597 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6598
6599 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6600 $self->{line_prev} = $self->{line};
6601 $self->{column_prev} = $self->{column};
6602 $self->{column}++;
6603 $self->{nc}
6604 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6605 } else {
6606 $self->{set_nc}->($self);
6607 }
6608
6609 redo A;
6610 } elsif ($self->{nc} == 0x0022) { # "
6611 ## XML5: Same as "anything else".
6612 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6613 $self->{ca}->{value} = '';
6614 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6615
6616 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6617 $self->{line_prev} = $self->{line};
6618 $self->{column_prev} = $self->{column};
6619 $self->{column}++;
6620 $self->{nc}
6621 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6622 } else {
6623 $self->{set_nc}->($self);
6624 }
6625
6626 redo A;
6627 } elsif ($self->{nc} == 0x0027) { # '
6628 ## XML5: Same as "anything else".
6629 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6630 $self->{ca}->{value} = '';
6631 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6632
6633 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6634 $self->{line_prev} = $self->{line};
6635 $self->{column_prev} = $self->{column};
6636 $self->{column}++;
6637 $self->{nc}
6638 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6639 } else {
6640 $self->{set_nc}->($self);
6641 }
6642
6643 redo A;
6644 } elsif ($self->{nc} == 0x003E) { # >
6645 ## XML5: Same as "anything else".
6646 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6647 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6648
6649 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6650 $self->{line_prev} = $self->{line};
6651 $self->{column_prev} = $self->{column};
6652 $self->{column}++;
6653 $self->{nc}
6654 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6655 } else {
6656 $self->{set_nc}->($self);
6657 }
6658
6659 return ($self->{ct}); # ATTLIST
6660 redo A;
6661 } elsif ($self->{nc} == 0x0028) { # (
6662 ## XML5: Same as "anything else".
6663 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6664 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6665
6666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6667 $self->{line_prev} = $self->{line};
6668 $self->{column_prev} = $self->{column};
6669 $self->{column}++;
6670 $self->{nc}
6671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6672 } else {
6673 $self->{set_nc}->($self);
6674 }
6675
6676 redo A;
6677 } elsif ($self->{nc} == -1) {
6678 ## XML5: No parse error.
6679 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6680 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6681
6682 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6683 $self->{line_prev} = $self->{line};
6684 $self->{column_prev} = $self->{column};
6685 $self->{column}++;
6686 $self->{nc}
6687 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6688 } else {
6689 $self->{set_nc}->($self);
6690 }
6691
6692 return ($self->{ct});
6693 redo A;
6694 } else {
6695 ## XML5: Not defined yet.
6696 $self->{ca}->{type} .= chr $self->{nc};
6697 ## Stay in the state.
6698
6699 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6700 $self->{line_prev} = $self->{line};
6701 $self->{column_prev} = $self->{column};
6702 $self->{column}++;
6703 $self->{nc}
6704 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6705 } else {
6706 $self->{set_nc}->($self);
6707 }
6708
6709 redo A;
6710 }
6711 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6712 if ($is_space->{$self->{nc}}) {
6713 ## Stay in the state.
6714
6715 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6716 $self->{line_prev} = $self->{line};
6717 $self->{column_prev} = $self->{column};
6718 $self->{column}++;
6719 $self->{nc}
6720 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6721 } else {
6722 $self->{set_nc}->($self);
6723 }
6724
6725 redo A;
6726 } elsif ($self->{nc} == 0x0028) { # (
6727 ## XML5: Same as "anything else".
6728 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6729
6730 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6731 $self->{line_prev} = $self->{line};
6732 $self->{column_prev} = $self->{column};
6733 $self->{column}++;
6734 $self->{nc}
6735 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6736 } else {
6737 $self->{set_nc}->($self);
6738 }
6739
6740 redo A;
6741 } elsif ($self->{nc} == 0x0023) { # #
6742 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6743
6744 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6745 $self->{line_prev} = $self->{line};
6746 $self->{column_prev} = $self->{column};
6747 $self->{column}++;
6748 $self->{nc}
6749 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6750 } else {
6751 $self->{set_nc}->($self);
6752 }
6753
6754 redo A;
6755 } elsif ($self->{nc} == 0x0022) { # "
6756 ## XML5: Same as "anything else".
6757 $self->{ca}->{value} = '';
6758 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6759
6760 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6761 $self->{line_prev} = $self->{line};
6762 $self->{column_prev} = $self->{column};
6763 $self->{column}++;
6764 $self->{nc}
6765 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6766 } else {
6767 $self->{set_nc}->($self);
6768 }
6769
6770 redo A;
6771 } elsif ($self->{nc} == 0x0027) { # '
6772 ## XML5: Same as "anything else".
6773 $self->{ca}->{value} = '';
6774 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6775
6776 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6777 $self->{line_prev} = $self->{line};
6778 $self->{column_prev} = $self->{column};
6779 $self->{column}++;
6780 $self->{nc}
6781 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6782 } else {
6783 $self->{set_nc}->($self);
6784 }
6785
6786 redo A;
6787 } elsif ($self->{nc} == 0x003E) { # >
6788 ## XML5: Same as "anything else".
6789 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6790 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6791
6792 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6793 $self->{line_prev} = $self->{line};
6794 $self->{column_prev} = $self->{column};
6795 $self->{column}++;
6796 $self->{nc}
6797 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6798 } else {
6799 $self->{set_nc}->($self);
6800 }
6801
6802 return ($self->{ct}); # ATTLIST
6803 redo A;
6804 } elsif ($self->{nc} == -1) {
6805 ## XML5: No parse error.
6806 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6807 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6808
6809 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6810 $self->{line_prev} = $self->{line};
6811 $self->{column_prev} = $self->{column};
6812 $self->{column}++;
6813 $self->{nc}
6814 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6815 } else {
6816 $self->{set_nc}->($self);
6817 }
6818
6819 return ($self->{ct});
6820 redo A;
6821 } else {
6822 ## XML5: Switch to the "DOCTYPE bogus comment state".
6823 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6824 $self->{ca}->{value} = '';
6825 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6826 ## Reconsume.
6827 redo A;
6828 }
6829 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6830 if ($is_space->{$self->{nc}}) {
6831 ## Stay in the state.
6832
6833 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6834 $self->{line_prev} = $self->{line};
6835 $self->{column_prev} = $self->{column};
6836 $self->{column}++;
6837 $self->{nc}
6838 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6839 } else {
6840 $self->{set_nc}->($self);
6841 }
6842
6843 redo A;
6844 } elsif ($self->{nc} == 0x007C) { # |
6845 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6846 ## Stay in the state.
6847
6848 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6849 $self->{line_prev} = $self->{line};
6850 $self->{column_prev} = $self->{column};
6851 $self->{column}++;
6852 $self->{nc}
6853 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6854 } else {
6855 $self->{set_nc}->($self);
6856 }
6857
6858 redo A;
6859 } elsif ($self->{nc} == 0x0029) { # )
6860 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6861 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6862
6863 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6864 $self->{line_prev} = $self->{line};
6865 $self->{column_prev} = $self->{column};
6866 $self->{column}++;
6867 $self->{nc}
6868 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6869 } else {
6870 $self->{set_nc}->($self);
6871 }
6872
6873 redo A;
6874 } elsif ($self->{nc} == 0x003E) { # >
6875 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6876 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6877
6878 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6879 $self->{line_prev} = $self->{line};
6880 $self->{column_prev} = $self->{column};
6881 $self->{column}++;
6882 $self->{nc}
6883 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6884 } else {
6885 $self->{set_nc}->($self);
6886 }
6887
6888 return ($self->{ct}); # ATTLIST
6889 redo A;
6890 } elsif ($self->{nc} == -1) {
6891 ## XML5: No parse error.
6892 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6893 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6894
6895 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6896 $self->{line_prev} = $self->{line};
6897 $self->{column_prev} = $self->{column};
6898 $self->{column}++;
6899 $self->{nc}
6900 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6901 } else {
6902 $self->{set_nc}->($self);
6903 }
6904
6905 return ($self->{ct});
6906 redo A;
6907 } else {
6908 push @{$self->{ca}->{tokens}}, chr $self->{nc};
6909 $self->{state} = ALLOWED_TOKEN_STATE;
6910
6911 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6912 $self->{line_prev} = $self->{line};
6913 $self->{column_prev} = $self->{column};
6914 $self->{column}++;
6915 $self->{nc}
6916 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6917 } else {
6918 $self->{set_nc}->($self);
6919 }
6920
6921 redo A;
6922 }
6923 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6924 if ($is_space->{$self->{nc}}) {
6925 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6926
6927 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6928 $self->{line_prev} = $self->{line};
6929 $self->{column_prev} = $self->{column};
6930 $self->{column}++;
6931 $self->{nc}
6932 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6933 } else {
6934 $self->{set_nc}->($self);
6935 }
6936
6937 redo A;
6938 } elsif ($self->{nc} == 0x007C) { # |
6939 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6940
6941 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6942 $self->{line_prev} = $self->{line};
6943 $self->{column_prev} = $self->{column};
6944 $self->{column}++;
6945 $self->{nc}
6946 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6947 } else {
6948 $self->{set_nc}->($self);
6949 }
6950
6951 redo A;
6952 } elsif ($self->{nc} == 0x0029) { # )
6953 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6954
6955 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6956 $self->{line_prev} = $self->{line};
6957 $self->{column_prev} = $self->{column};
6958 $self->{column}++;
6959 $self->{nc}
6960 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6961 } else {
6962 $self->{set_nc}->($self);
6963 }
6964
6965 redo A;
6966 } elsif ($self->{nc} == 0x003E) { # >
6967 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6968 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6969
6970 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6971 $self->{line_prev} = $self->{line};
6972 $self->{column_prev} = $self->{column};
6973 $self->{column}++;
6974 $self->{nc}
6975 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6976 } else {
6977 $self->{set_nc}->($self);
6978 }
6979
6980 return ($self->{ct}); # ATTLIST
6981 redo A;
6982 } elsif ($self->{nc} == -1) {
6983 ## XML5: No parse error.
6984 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6985 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6986
6987 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6988 $self->{line_prev} = $self->{line};
6989 $self->{column_prev} = $self->{column};
6990 $self->{column}++;
6991 $self->{nc}
6992 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6993 } else {
6994 $self->{set_nc}->($self);
6995 }
6996
6997 return ($self->{ct});
6998 redo A;
6999 } else {
7000 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7001 ## Stay in the state.
7002
7003 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7004 $self->{line_prev} = $self->{line};
7005 $self->{column_prev} = $self->{column};
7006 $self->{column}++;
7007 $self->{nc}
7008 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7009 } else {
7010 $self->{set_nc}->($self);
7011 }
7012
7013 redo A;
7014 }
7015 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7016 if ($is_space->{$self->{nc}}) {
7017 ## Stay in the state.
7018
7019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7020 $self->{line_prev} = $self->{line};
7021 $self->{column_prev} = $self->{column};
7022 $self->{column}++;
7023 $self->{nc}
7024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7025 } else {
7026 $self->{set_nc}->($self);
7027 }
7028
7029 redo A;
7030 } elsif ($self->{nc} == 0x007C) { # |
7031 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7032
7033 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7034 $self->{line_prev} = $self->{line};
7035 $self->{column_prev} = $self->{column};
7036 $self->{column}++;
7037 $self->{nc}
7038 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7039 } else {
7040 $self->{set_nc}->($self);
7041 }
7042
7043 redo A;
7044 } elsif ($self->{nc} == 0x0029) { # )
7045 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7046
7047 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7048 $self->{line_prev} = $self->{line};
7049 $self->{column_prev} = $self->{column};
7050 $self->{column}++;
7051 $self->{nc}
7052 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7053 } else {
7054 $self->{set_nc}->($self);
7055 }
7056
7057 redo A;
7058 } elsif ($self->{nc} == 0x003E) { # >
7059 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7060 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7061
7062 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7063 $self->{line_prev} = $self->{line};
7064 $self->{column_prev} = $self->{column};
7065 $self->{column}++;
7066 $self->{nc}
7067 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7068 } else {
7069 $self->{set_nc}->($self);
7070 }
7071
7072 return ($self->{ct}); # ATTLIST
7073 redo A;
7074 } elsif ($self->{nc} == -1) {
7075 ## XML5: No parse error.
7076 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7077 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7078
7079 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7080 $self->{line_prev} = $self->{line};
7081 $self->{column_prev} = $self->{column};
7082 $self->{column}++;
7083 $self->{nc}
7084 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7085 } else {
7086 $self->{set_nc}->($self);
7087 }
7088
7089 return ($self->{ct});
7090 redo A;
7091 } else {
7092 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7093 line => $self->{line_prev},
7094 column => $self->{column_prev});
7095 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7096 $self->{state} = ALLOWED_TOKEN_STATE;
7097
7098 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7099 $self->{line_prev} = $self->{line};
7100 $self->{column_prev} = $self->{column};
7101 $self->{column}++;
7102 $self->{nc}
7103 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7104 } else {
7105 $self->{set_nc}->($self);
7106 }
7107
7108 redo A;
7109 }
7110 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7111 if ($is_space->{$self->{nc}}) {
7112 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7113
7114 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7115 $self->{line_prev} = $self->{line};
7116 $self->{column_prev} = $self->{column};
7117 $self->{column}++;
7118 $self->{nc}
7119 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7120 } else {
7121 $self->{set_nc}->($self);
7122 }
7123
7124 redo A;
7125 } elsif ($self->{nc} == 0x0023) { # #
7126 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7127 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7128
7129 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7130 $self->{line_prev} = $self->{line};
7131 $self->{column_prev} = $self->{column};
7132 $self->{column}++;
7133 $self->{nc}
7134 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7135 } else {
7136 $self->{set_nc}->($self);
7137 }
7138
7139 redo A;
7140 } elsif ($self->{nc} == 0x0022) { # "
7141 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7142 $self->{ca}->{value} = '';
7143 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7144
7145 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7146 $self->{line_prev} = $self->{line};
7147 $self->{column_prev} = $self->{column};
7148 $self->{column}++;
7149 $self->{nc}
7150 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7151 } else {
7152 $self->{set_nc}->($self);
7153 }
7154
7155 redo A;
7156 } elsif ($self->{nc} == 0x0027) { # '
7157 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7158 $self->{ca}->{value} = '';
7159 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7160
7161 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7162 $self->{line_prev} = $self->{line};
7163 $self->{column_prev} = $self->{column};
7164 $self->{column}++;
7165 $self->{nc}
7166 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7167 } else {
7168 $self->{set_nc}->($self);
7169 }
7170
7171 redo A;
7172 } elsif ($self->{nc} == 0x003E) { # >
7173 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7174 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7175
7176 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7177 $self->{line_prev} = $self->{line};
7178 $self->{column_prev} = $self->{column};
7179 $self->{column}++;
7180 $self->{nc}
7181 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7182 } else {
7183 $self->{set_nc}->($self);
7184 }
7185
7186 return ($self->{ct}); # ATTLIST
7187 redo A;
7188 } elsif ($self->{nc} == -1) {
7189 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7190 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7191
7192 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7193 $self->{line_prev} = $self->{line};
7194 $self->{column_prev} = $self->{column};
7195 $self->{column}++;
7196 $self->{nc}
7197 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7198 } else {
7199 $self->{set_nc}->($self);
7200 }
7201
7202 return ($self->{ct});
7203 redo A;
7204 } else {
7205 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7206 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7207 ## Reconsume.
7208 redo A;
7209 }
7210 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7211 if ($is_space->{$self->{nc}}) {
7212 ## Stay in the state.
7213
7214 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7215 $self->{line_prev} = $self->{line};
7216 $self->{column_prev} = $self->{column};
7217 $self->{column}++;
7218 $self->{nc}
7219 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7220 } else {
7221 $self->{set_nc}->($self);
7222 }
7223
7224 redo A;
7225 } elsif ($self->{nc} == 0x0023) { # #
7226 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7227
7228 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7229 $self->{line_prev} = $self->{line};
7230 $self->{column_prev} = $self->{column};
7231 $self->{column}++;
7232 $self->{nc}
7233 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7234 } else {
7235 $self->{set_nc}->($self);
7236 }
7237
7238 redo A;
7239 } elsif ($self->{nc} == 0x0022) { # "
7240 $self->{ca}->{value} = '';
7241 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7242
7243 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7244 $self->{line_prev} = $self->{line};
7245 $self->{column_prev} = $self->{column};
7246 $self->{column}++;
7247 $self->{nc}
7248 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7249 } else {
7250 $self->{set_nc}->($self);
7251 }
7252
7253 redo A;
7254 } elsif ($self->{nc} == 0x0027) { # '
7255 $self->{ca}->{value} = '';
7256 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7257
7258 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7259 $self->{line_prev} = $self->{line};
7260 $self->{column_prev} = $self->{column};
7261 $self->{column}++;
7262 $self->{nc}
7263 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7264 } else {
7265 $self->{set_nc}->($self);
7266 }
7267
7268 redo A;
7269 } elsif ($self->{nc} == 0x003E) { # >
7270 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7271 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7272
7273 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7274 $self->{line_prev} = $self->{line};
7275 $self->{column_prev} = $self->{column};
7276 $self->{column}++;
7277 $self->{nc}
7278 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7279 } else {
7280 $self->{set_nc}->($self);
7281 }
7282
7283 return ($self->{ct}); # ATTLIST
7284 redo A;
7285 } elsif ($self->{nc} == -1) {
7286 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7287 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7288
7289 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7290 $self->{line_prev} = $self->{line};
7291 $self->{column_prev} = $self->{column};
7292 $self->{column}++;
7293 $self->{nc}
7294 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7295 } else {
7296 $self->{set_nc}->($self);
7297 }
7298
7299 return ($self->{ct});
7300 redo A;
7301 } else {
7302 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7303 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7304 ## Reconsume.
7305 redo A;
7306 }
7307 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7308 if ($is_space->{$self->{nc}}) {
7309 ## XML5: No parse error.
7310 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7311 $self->{state} = BOGUS_MD_STATE;
7312 ## Reconsume.
7313 redo A;
7314 } elsif ($self->{nc} == 0x0022) { # "
7315 ## XML5: Same as "anything else".
7316 $self->{ca}->{value} = '';
7317 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7318
7319 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7320 $self->{line_prev} = $self->{line};
7321 $self->{column_prev} = $self->{column};
7322 $self->{column}++;
7323 $self->{nc}
7324 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7325 } else {
7326 $self->{set_nc}->($self);
7327 }
7328
7329 redo A;
7330 } elsif ($self->{nc} == 0x0027) { # '
7331 ## XML5: Same as "anything else".
7332 $self->{ca}->{value} = '';
7333 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7334
7335 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7336 $self->{line_prev} = $self->{line};
7337 $self->{column_prev} = $self->{column};
7338 $self->{column}++;
7339 $self->{nc}
7340 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7341 } else {
7342 $self->{set_nc}->($self);
7343 }
7344
7345 redo A;
7346 } elsif ($self->{nc} == 0x003E) { # >
7347 ## XML5: Same as "anything else".
7348 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7349 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7350
7351 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7352 $self->{line_prev} = $self->{line};
7353 $self->{column_prev} = $self->{column};
7354 $self->{column}++;
7355 $self->{nc}
7356 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7357 } else {
7358 $self->{set_nc}->($self);
7359 }
7360
7361 return ($self->{ct}); # ATTLIST
7362 redo A;
7363 } elsif ($self->{nc} == -1) {
7364 ## XML5: No parse error.
7365 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7366 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7367
7368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7369 $self->{line_prev} = $self->{line};
7370 $self->{column_prev} = $self->{column};
7371 $self->{column}++;
7372 $self->{nc}
7373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7374 } else {
7375 $self->{set_nc}->($self);
7376 }
7377
7378 return ($self->{ct});
7379 redo A;
7380 } else {
7381 $self->{ca}->{default} = chr $self->{nc};
7382 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7383
7384 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7385 $self->{line_prev} = $self->{line};
7386 $self->{column_prev} = $self->{column};
7387 $self->{column}++;
7388 $self->{nc}
7389 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7390 } else {
7391 $self->{set_nc}->($self);
7392 }
7393
7394 redo A;
7395 }
7396 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7397 if ($is_space->{$self->{nc}}) {
7398 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7399
7400 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7401 $self->{line_prev} = $self->{line};
7402 $self->{column_prev} = $self->{column};
7403 $self->{column}++;
7404 $self->{nc}
7405 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7406 } else {
7407 $self->{set_nc}->($self);
7408 }
7409
7410 redo A;
7411 } elsif ($self->{nc} == 0x0022) { # "
7412 ## XML5: Same as "anything else".
7413 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7414 $self->{ca}->{value} = '';
7415 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7416
7417 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7418 $self->{line_prev} = $self->{line};
7419 $self->{column_prev} = $self->{column};
7420 $self->{column}++;
7421 $self->{nc}
7422 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7423 } else {
7424 $self->{set_nc}->($self);
7425 }
7426
7427 redo A;
7428 } elsif ($self->{nc} == 0x0027) { # '
7429 ## XML5: Same as "anything else".
7430 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7431 $self->{ca}->{value} = '';
7432 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7433
7434 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7435 $self->{line_prev} = $self->{line};
7436 $self->{column_prev} = $self->{column};
7437 $self->{column}++;
7438 $self->{nc}
7439 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7440 } else {
7441 $self->{set_nc}->($self);
7442 }
7443
7444 redo A;
7445 } elsif ($self->{nc} == 0x003E) { # >
7446 ## XML5: Same as "anything else".
7447 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7448 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7449
7450 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7451 $self->{line_prev} = $self->{line};
7452 $self->{column_prev} = $self->{column};
7453 $self->{column}++;
7454 $self->{nc}
7455 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7456 } else {
7457 $self->{set_nc}->($self);
7458 }
7459
7460 return ($self->{ct}); # ATTLIST
7461 redo A;
7462 } elsif ($self->{nc} == -1) {
7463 ## XML5: No parse error.
7464 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7465 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7466 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7467
7468 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7469 $self->{line_prev} = $self->{line};
7470 $self->{column_prev} = $self->{column};
7471 $self->{column}++;
7472 $self->{nc}
7473 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7474 } else {
7475 $self->{set_nc}->($self);
7476 }
7477
7478 return ($self->{ct});
7479 redo A;
7480 } else {
7481 $self->{ca}->{default} .= chr $self->{nc};
7482 ## Stay in the state.
7483
7484 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7485 $self->{line_prev} = $self->{line};
7486 $self->{column_prev} = $self->{column};
7487 $self->{column}++;
7488 $self->{nc}
7489 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7490 } else {
7491 $self->{set_nc}->($self);
7492 }
7493
7494 redo A;
7495 }
7496 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7497 if ($is_space->{$self->{nc}}) {
7498 ## Stay in the state.
7499
7500 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7501 $self->{line_prev} = $self->{line};
7502 $self->{column_prev} = $self->{column};
7503 $self->{column}++;
7504 $self->{nc}
7505 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7506 } else {
7507 $self->{set_nc}->($self);
7508 }
7509
7510 redo A;
7511 } elsif ($self->{nc} == 0x0022) { # "
7512 $self->{ca}->{value} = '';
7513 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7514
7515 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7516 $self->{line_prev} = $self->{line};
7517 $self->{column_prev} = $self->{column};
7518 $self->{column}++;
7519 $self->{nc}
7520 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7521 } else {
7522 $self->{set_nc}->($self);
7523 }
7524
7525 redo A;
7526 } elsif ($self->{nc} == 0x0027) { # '
7527 $self->{ca}->{value} = '';
7528 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7529
7530 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7531 $self->{line_prev} = $self->{line};
7532 $self->{column_prev} = $self->{column};
7533 $self->{column}++;
7534 $self->{nc}
7535 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7536 } else {
7537 $self->{set_nc}->($self);
7538 }
7539
7540 redo A;
7541 } elsif ($self->{nc} == 0x003E) { # >
7542 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7543 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7544
7545 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7546 $self->{line_prev} = $self->{line};
7547 $self->{column_prev} = $self->{column};
7548 $self->{column}++;
7549 $self->{nc}
7550 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7551 } else {
7552 $self->{set_nc}->($self);
7553 }
7554
7555 return ($self->{ct}); # ATTLIST
7556 redo A;
7557 } elsif ($self->{nc} == -1) {
7558 ## XML5: No parse error.
7559 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7560 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7561 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7562
7563 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7564 $self->{line_prev} = $self->{line};
7565 $self->{column_prev} = $self->{column};
7566 $self->{column}++;
7567 $self->{nc}
7568 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7569 } else {
7570 $self->{set_nc}->($self);
7571 }
7572
7573 return ($self->{ct});
7574 redo A;
7575 } else {
7576 ## XML5: Not defined yet.
7577 if ($self->{ca}->{default} eq 'FIXED') {
7578 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7579 } else {
7580 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7581 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7582 }
7583 ## Reconsume.
7584 redo A;
7585 }
7586 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7587 if ($is_space->{$self->{nc}} or
7588 $self->{nc} == -1 or
7589 $self->{nc} == 0x003E) { # >
7590 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7591 ## Reconsume.
7592 redo A;
7593 } else {
7594 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7595 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7596 ## Reconsume.
7597 redo A;
7598 }
7599 } elsif ($self->{state} == NDATA_STATE) {
7600 ## ASCII case-insensitive
7601 if ($self->{nc} == [
7602 undef,
7603 0x0044, # D
7604 0x0041, # A
7605 0x0054, # T
7606 ]->[length $self->{kwd}] or
7607 $self->{nc} == [
7608 undef,
7609 0x0064, # d
7610 0x0061, # a
7611 0x0074, # t
7612 ]->[length $self->{kwd}]) {
7613
7614 ## Stay in the state.
7615 $self->{kwd} .= chr $self->{nc};
7616
7617 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7618 $self->{line_prev} = $self->{line};
7619 $self->{column_prev} = $self->{column};
7620 $self->{column}++;
7621 $self->{nc}
7622 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7623 } else {
7624 $self->{set_nc}->($self);
7625 }
7626
7627 redo A;
7628 } elsif ((length $self->{kwd}) == 4 and
7629 ($self->{nc} == 0x0041 or # A
7630 $self->{nc} == 0x0061)) { # a
7631 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7632
7633 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7634 text => 'NDATA',
7635 line => $self->{line_prev},
7636 column => $self->{column_prev} - 4);
7637 } else {
7638
7639 }
7640 $self->{state} = AFTER_NDATA_STATE;
7641
7642 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7643 $self->{line_prev} = $self->{line};
7644 $self->{column_prev} = $self->{column};
7645 $self->{column}++;
7646 $self->{nc}
7647 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7648 } else {
7649 $self->{set_nc}->($self);
7650 }
7651
7652 redo A;
7653 } else {
7654 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7655 line => $self->{line_prev},
7656 column => $self->{column_prev} + 1
7657 - length $self->{kwd});
7658
7659 $self->{state} = BOGUS_MD_STATE;
7660 ## Reconsume.
7661 redo A;
7662 }
7663 } elsif ($self->{state} == AFTER_NDATA_STATE) {
7664 if ($is_space->{$self->{nc}}) {
7665 $self->{state} = BEFORE_NOTATION_NAME_STATE;
7666
7667 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7668 $self->{line_prev} = $self->{line};
7669 $self->{column_prev} = $self->{column};
7670 $self->{column}++;
7671 $self->{nc}
7672 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7673 } else {
7674 $self->{set_nc}->($self);
7675 }
7676
7677 redo A;
7678 } elsif ($self->{nc} == 0x003E) { # >
7679 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7680 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7681
7682 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7683 $self->{line_prev} = $self->{line};
7684 $self->{column_prev} = $self->{column};
7685 $self->{column}++;
7686 $self->{nc}
7687 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7688 } else {
7689 $self->{set_nc}->($self);
7690 }
7691
7692 return ($self->{ct}); # ENTITY
7693 redo A;
7694 } elsif ($self->{nc} == -1) {
7695 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7696 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7697
7698 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7699 $self->{line_prev} = $self->{line};
7700 $self->{column_prev} = $self->{column};
7701 $self->{column}++;
7702 $self->{nc}
7703 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7704 } else {
7705 $self->{set_nc}->($self);
7706 }
7707
7708 return ($self->{ct}); # ENTITY
7709 redo A;
7710 } else {
7711 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7712 line => $self->{line_prev},
7713 column => $self->{column_prev} + 1
7714 - length $self->{kwd});
7715 $self->{state} = BOGUS_MD_STATE;
7716 ## Reconsume.
7717 redo A;
7718 }
7719 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7720 if ($is_space->{$self->{nc}}) {
7721 ## Stay in the state.
7722
7723 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7724 $self->{line_prev} = $self->{line};
7725 $self->{column_prev} = $self->{column};
7726 $self->{column}++;
7727 $self->{nc}
7728 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7729 } else {
7730 $self->{set_nc}->($self);
7731 }
7732
7733 redo A;
7734 } elsif ($self->{nc} == 0x003E) { # >
7735 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7736 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7737
7738 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7739 $self->{line_prev} = $self->{line};
7740 $self->{column_prev} = $self->{column};
7741 $self->{column}++;
7742 $self->{nc}
7743 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7744 } else {
7745 $self->{set_nc}->($self);
7746 }
7747
7748 return ($self->{ct}); # ENTITY
7749 redo A;
7750 } elsif ($self->{nc} == -1) {
7751 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7752 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7753
7754 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7755 $self->{line_prev} = $self->{line};
7756 $self->{column_prev} = $self->{column};
7757 $self->{column}++;
7758 $self->{nc}
7759 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7760 } else {
7761 $self->{set_nc}->($self);
7762 }
7763
7764 return ($self->{ct}); # ENTITY
7765 redo A;
7766 } else {
7767 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7768 $self->{state} = NOTATION_NAME_STATE;
7769
7770 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7771 $self->{line_prev} = $self->{line};
7772 $self->{column_prev} = $self->{column};
7773 $self->{column}++;
7774 $self->{nc}
7775 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7776 } else {
7777 $self->{set_nc}->($self);
7778 }
7779
7780 redo A;
7781 }
7782 } elsif ($self->{state} == NOTATION_NAME_STATE) {
7783 if ($is_space->{$self->{nc}}) {
7784 $self->{state} = AFTER_MD_DEF_STATE;
7785
7786 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7787 $self->{line_prev} = $self->{line};
7788 $self->{column_prev} = $self->{column};
7789 $self->{column}++;
7790 $self->{nc}
7791 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7792 } else {
7793 $self->{set_nc}->($self);
7794 }
7795
7796 redo A;
7797 } elsif ($self->{nc} == 0x003E) { # >
7798 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7799
7800 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7801 $self->{line_prev} = $self->{line};
7802 $self->{column_prev} = $self->{column};
7803 $self->{column}++;
7804 $self->{nc}
7805 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7806 } else {
7807 $self->{set_nc}->($self);
7808 }
7809
7810 return ($self->{ct}); # ENTITY
7811 redo A;
7812 } elsif ($self->{nc} == -1) {
7813 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7814 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7815
7816 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7817 $self->{line_prev} = $self->{line};
7818 $self->{column_prev} = $self->{column};
7819 $self->{column}++;
7820 $self->{nc}
7821 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7822 } else {
7823 $self->{set_nc}->($self);
7824 }
7825
7826 return ($self->{ct}); # ENTITY
7827 redo A;
7828 } else {
7829 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7830 ## Stay in the state.
7831
7832 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7833 $self->{line_prev} = $self->{line};
7834 $self->{column_prev} = $self->{column};
7835 $self->{column}++;
7836 $self->{nc}
7837 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7838 } else {
7839 $self->{set_nc}->($self);
7840 }
7841
7842 redo A;
7843 }
7844 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7845 if ($self->{nc} == 0x0022) { # "
7846 $self->{state} = AFTER_MD_DEF_STATE;
7847
7848 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7849 $self->{line_prev} = $self->{line};
7850 $self->{column_prev} = $self->{column};
7851 $self->{column}++;
7852 $self->{nc}
7853 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7854 } else {
7855 $self->{set_nc}->($self);
7856 }
7857
7858 redo A;
7859 } elsif ($self->{nc} == 0x0026) { # &
7860 $self->{prev_state} = $self->{state};
7861 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7862 $self->{entity_add} = 0x0022; # "
7863
7864 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7865 $self->{line_prev} = $self->{line};
7866 $self->{column_prev} = $self->{column};
7867 $self->{column}++;
7868 $self->{nc}
7869 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7870 } else {
7871 $self->{set_nc}->($self);
7872 }
7873
7874 redo A;
7875 ## TODO: %
7876 } elsif ($self->{nc} == -1) {
7877 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7878 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7879 ## Reconsume.
7880 return ($self->{ct}); # ENTITY
7881 redo A;
7882 } else {
7883 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7884
7885 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7886 $self->{line_prev} = $self->{line};
7887 $self->{column_prev} = $self->{column};
7888 $self->{column}++;
7889 $self->{nc}
7890 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7891 } else {
7892 $self->{set_nc}->($self);
7893 }
7894
7895 redo A;
7896 }
7897 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7898 if ($self->{nc} == 0x0027) { # '
7899 $self->{state} = AFTER_MD_DEF_STATE;
7900
7901 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7902 $self->{line_prev} = $self->{line};
7903 $self->{column_prev} = $self->{column};
7904 $self->{column}++;
7905 $self->{nc}
7906 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7907 } else {
7908 $self->{set_nc}->($self);
7909 }
7910
7911 redo A;
7912 } elsif ($self->{nc} == 0x0026) { # &
7913 $self->{prev_state} = $self->{state};
7914 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7915 $self->{entity_add} = 0x0027; # '
7916
7917 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7918 $self->{line_prev} = $self->{line};
7919 $self->{column_prev} = $self->{column};
7920 $self->{column}++;
7921 $self->{nc}
7922 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7923 } else {
7924 $self->{set_nc}->($self);
7925 }
7926
7927 redo A;
7928 ## TODO: %
7929 } elsif ($self->{nc} == -1) {
7930 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7931 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7932 ## Reconsume.
7933 return ($self->{ct}); # ENTITY
7934 redo A;
7935 } else {
7936 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7937
7938 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7939 $self->{line_prev} = $self->{line};
7940 $self->{column_prev} = $self->{column};
7941 $self->{column}++;
7942 $self->{nc}
7943 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7944 } else {
7945 $self->{set_nc}->($self);
7946 }
7947
7948 redo A;
7949 }
7950 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7951 if ($is_space->{$self->{nc}} or
7952 {
7953 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7954 $self->{entity_add} => 1,
7955 }->{$self->{nc}}) {
7956 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7957 line => $self->{line_prev},
7958 column => $self->{column_prev}
7959 + ($self->{nc} == -1 ? 1 : 0));
7960 ## Don't consume
7961 ## Return nothing.
7962 #
7963 } elsif ($self->{nc} == 0x0023) { # #
7964 $self->{ca} = $self->{ct};
7965 $self->{state} = ENTITY_HASH_STATE;
7966 $self->{kwd} = '#';
7967
7968 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7969 $self->{line_prev} = $self->{line};
7970 $self->{column_prev} = $self->{column};
7971 $self->{column}++;
7972 $self->{nc}
7973 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7974 } else {
7975 $self->{set_nc}->($self);
7976 }
7977
7978 redo A;
7979 } else {
7980 #
7981 }
7982
7983 $self->{ct}->{value} .= '&';
7984 $self->{state} = $self->{prev_state};
7985 ## Reconsume.
7986 redo A;
7987 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7988 if ($is_space->{$self->{nc}}) {
7989 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7990
7991 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7992 $self->{line_prev} = $self->{line};
7993 $self->{column_prev} = $self->{column};
7994 $self->{column}++;
7995 $self->{nc}
7996 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7997 } else {
7998 $self->{set_nc}->($self);
7999 }
8000
8001 redo A;
8002 } elsif ($self->{nc} == 0x0028) { # (
8003 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8004 $self->{ct}->{content} = ['('];
8005 $self->{group_depth} = 1;
8006
8007 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8008 $self->{line_prev} = $self->{line};
8009 $self->{column_prev} = $self->{column};
8010 $self->{column}++;
8011 $self->{nc}
8012 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8013 } else {
8014 $self->{set_nc}->($self);
8015 }
8016
8017 redo A;
8018 } elsif ($self->{nc} == 0x003E) { # >
8019 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8020 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8021
8022 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8023 $self->{line_prev} = $self->{line};
8024 $self->{column_prev} = $self->{column};
8025 $self->{column}++;
8026 $self->{nc}
8027 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8028 } else {
8029 $self->{set_nc}->($self);
8030 }
8031
8032 return ($self->{ct}); # ELEMENT
8033 redo A;
8034 } elsif ($self->{nc} == -1) {
8035 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8036 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8037
8038 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8039 $self->{line_prev} = $self->{line};
8040 $self->{column_prev} = $self->{column};
8041 $self->{column}++;
8042 $self->{nc}
8043 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8044 } else {
8045 $self->{set_nc}->($self);
8046 }
8047
8048 return ($self->{ct}); # ELEMENT
8049 redo A;
8050 } else {
8051 $self->{ct}->{content} = [chr $self->{nc}];
8052 $self->{state} = CONTENT_KEYWORD_STATE;
8053
8054 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8055 $self->{line_prev} = $self->{line};
8056 $self->{column_prev} = $self->{column};
8057 $self->{column}++;
8058 $self->{nc}
8059 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8060 } else {
8061 $self->{set_nc}->($self);
8062 }
8063
8064 redo A;
8065 }
8066 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8067 if ($is_space->{$self->{nc}}) {
8068 $self->{state} = AFTER_MD_DEF_STATE;
8069
8070 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8071 $self->{line_prev} = $self->{line};
8072 $self->{column_prev} = $self->{column};
8073 $self->{column}++;
8074 $self->{nc}
8075 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8076 } else {
8077 $self->{set_nc}->($self);
8078 }
8079
8080 redo A;
8081 } elsif ($self->{nc} == 0x003E) { # >
8082 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8083
8084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8085 $self->{line_prev} = $self->{line};
8086 $self->{column_prev} = $self->{column};
8087 $self->{column}++;
8088 $self->{nc}
8089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8090 } else {
8091 $self->{set_nc}->($self);
8092 }
8093
8094 return ($self->{ct}); # ELEMENT
8095 redo A;
8096 } elsif ($self->{nc} == -1) {
8097 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8098 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8099
8100 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8101 $self->{line_prev} = $self->{line};
8102 $self->{column_prev} = $self->{column};
8103 $self->{column}++;
8104 $self->{nc}
8105 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8106 } else {
8107 $self->{set_nc}->($self);
8108 }
8109
8110 return ($self->{ct}); # ELEMENT
8111 redo A;
8112 } else {
8113 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8114 ## Stay in the state.
8115
8116 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8117 $self->{line_prev} = $self->{line};
8118 $self->{column_prev} = $self->{column};
8119 $self->{column}++;
8120 $self->{nc}
8121 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8122 } else {
8123 $self->{set_nc}->($self);
8124 }
8125
8126 redo A;
8127 }
8128 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8129 if ($is_space->{$self->{nc}}) {
8130 ## Stay in the state.
8131
8132 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8133 $self->{line_prev} = $self->{line};
8134 $self->{column_prev} = $self->{column};
8135 $self->{column}++;
8136 $self->{nc}
8137 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8138 } else {
8139 $self->{set_nc}->($self);
8140 }
8141
8142 redo A;
8143 } elsif ($self->{nc} == 0x0028) { # (
8144 $self->{group_depth}++;
8145 push @{$self->{ct}->{content}}, chr $self->{nc};
8146 ## Stay in the state.
8147
8148 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8149 $self->{line_prev} = $self->{line};
8150 $self->{column_prev} = $self->{column};
8151 $self->{column}++;
8152 $self->{nc}
8153 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8154 } else {
8155 $self->{set_nc}->($self);
8156 }
8157
8158 redo A;
8159 } elsif ($self->{nc} == 0x007C or # |
8160 $self->{nc} == 0x002C) { # ,
8161 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8162 ## Stay in the state.
8163
8164 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8165 $self->{line_prev} = $self->{line};
8166 $self->{column_prev} = $self->{column};
8167 $self->{column}++;
8168 $self->{nc}
8169 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8170 } else {
8171 $self->{set_nc}->($self);
8172 }
8173
8174 redo A;
8175 } elsif ($self->{nc} == 0x0029) { # )
8176 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8177 push @{$self->{ct}->{content}}, chr $self->{nc};
8178 $self->{group_depth}--;
8179 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8180
8181 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8182 $self->{line_prev} = $self->{line};
8183 $self->{column_prev} = $self->{column};
8184 $self->{column}++;
8185 $self->{nc}
8186 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8187 } else {
8188 $self->{set_nc}->($self);
8189 }
8190
8191 redo A;
8192 } elsif ($self->{nc} == 0x003E) { # >
8193 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8194 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8195 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8196
8197 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8198 $self->{line_prev} = $self->{line};
8199 $self->{column_prev} = $self->{column};
8200 $self->{column}++;
8201 $self->{nc}
8202 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8203 } else {
8204 $self->{set_nc}->($self);
8205 }
8206
8207 return ($self->{ct}); # ELEMENT
8208 redo A;
8209 } elsif ($self->{nc} == -1) {
8210 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8211 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8212 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8213
8214 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8215 $self->{line_prev} = $self->{line};
8216 $self->{column_prev} = $self->{column};
8217 $self->{column}++;
8218 $self->{nc}
8219 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8220 } else {
8221 $self->{set_nc}->($self);
8222 }
8223
8224 return ($self->{ct}); # ELEMENT
8225 redo A;
8226 } else {
8227 push @{$self->{ct}->{content}}, chr $self->{nc};
8228 $self->{state} = CM_ELEMENT_NAME_STATE;
8229
8230 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8231 $self->{line_prev} = $self->{line};
8232 $self->{column_prev} = $self->{column};
8233 $self->{column}++;
8234 $self->{nc}
8235 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8236 } else {
8237 $self->{set_nc}->($self);
8238 }
8239
8240 redo A;
8241 }
8242 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8243 if ($is_space->{$self->{nc}}) {
8244 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8245
8246 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8247 $self->{line_prev} = $self->{line};
8248 $self->{column_prev} = $self->{column};
8249 $self->{column}++;
8250 $self->{nc}
8251 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8252 } else {
8253 $self->{set_nc}->($self);
8254 }
8255
8256 redo A;
8257 } elsif ($self->{nc} == 0x002A or # *
8258 $self->{nc} == 0x002B or # +
8259 $self->{nc} == 0x003F) { # ?
8260 push @{$self->{ct}->{content}}, chr $self->{nc};
8261 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8262
8263 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8264 $self->{line_prev} = $self->{line};
8265 $self->{column_prev} = $self->{column};
8266 $self->{column}++;
8267 $self->{nc}
8268 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8269 } else {
8270 $self->{set_nc}->($self);
8271 }
8272
8273 redo A;
8274 } elsif ($self->{nc} == 0x007C or # |
8275 $self->{nc} == 0x002C) { # ,
8276 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8277 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8278
8279 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8280 $self->{line_prev} = $self->{line};
8281 $self->{column_prev} = $self->{column};
8282 $self->{column}++;
8283 $self->{nc}
8284 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8285 } else {
8286 $self->{set_nc}->($self);
8287 }
8288
8289 redo A;
8290 } elsif ($self->{nc} == 0x0029) { # )
8291 $self->{group_depth}--;
8292 push @{$self->{ct}->{content}}, chr $self->{nc};
8293 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8294
8295 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8296 $self->{line_prev} = $self->{line};
8297 $self->{column_prev} = $self->{column};
8298 $self->{column}++;
8299 $self->{nc}
8300 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8301 } else {
8302 $self->{set_nc}->($self);
8303 }
8304
8305 redo A;
8306 } elsif ($self->{nc} == 0x003E) { # >
8307 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8308 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8309 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8310
8311 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8312 $self->{line_prev} = $self->{line};
8313 $self->{column_prev} = $self->{column};
8314 $self->{column}++;
8315 $self->{nc}
8316 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8317 } else {
8318 $self->{set_nc}->($self);
8319 }
8320
8321 return ($self->{ct}); # ELEMENT
8322 redo A;
8323 } elsif ($self->{nc} == -1) {
8324 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8325 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8326 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8327
8328 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8329 $self->{line_prev} = $self->{line};
8330 $self->{column_prev} = $self->{column};
8331 $self->{column}++;
8332 $self->{nc}
8333 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8334 } else {
8335 $self->{set_nc}->($self);
8336 }
8337
8338 return ($self->{ct}); # ELEMENT
8339 redo A;
8340 } else {
8341 $self->{ct}->{content}->[-1] .= chr $self->{nc};
8342 ## Stay in the state.
8343
8344 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8345 $self->{line_prev} = $self->{line};
8346 $self->{column_prev} = $self->{column};
8347 $self->{column}++;
8348 $self->{nc}
8349 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8350 } else {
8351 $self->{set_nc}->($self);
8352 }
8353
8354 redo A;
8355 }
8356 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8357 if ($is_space->{$self->{nc}}) {
8358 ## Stay in the state.
8359
8360 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8361 $self->{line_prev} = $self->{line};
8362 $self->{column_prev} = $self->{column};
8363 $self->{column}++;
8364 $self->{nc}
8365 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8366 } else {
8367 $self->{set_nc}->($self);
8368 }
8369
8370 redo A;
8371 } elsif ($self->{nc} == 0x007C or # |
8372 $self->{nc} == 0x002C) { # ,
8373 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8374 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8375
8376 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8377 $self->{line_prev} = $self->{line};
8378 $self->{column_prev} = $self->{column};
8379 $self->{column}++;
8380 $self->{nc}
8381 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8382 } else {
8383 $self->{set_nc}->($self);
8384 }
8385
8386 redo A;
8387 } elsif ($self->{nc} == 0x0029) { # )
8388 $self->{group_depth}--;
8389 push @{$self->{ct}->{content}}, chr $self->{nc};
8390 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8391
8392 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8393 $self->{line_prev} = $self->{line};
8394 $self->{column_prev} = $self->{column};
8395 $self->{column}++;
8396 $self->{nc}
8397 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8398 } else {
8399 $self->{set_nc}->($self);
8400 }
8401
8402 redo A;
8403 } elsif ($self->{nc} == 0x003E) { # >
8404 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8405 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8406 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8407
8408 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8409 $self->{line_prev} = $self->{line};
8410 $self->{column_prev} = $self->{column};
8411 $self->{column}++;
8412 $self->{nc}
8413 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8414 } else {
8415 $self->{set_nc}->($self);
8416 }
8417
8418 return ($self->{ct}); # ELEMENT
8419 redo A;
8420 } elsif ($self->{nc} == -1) {
8421 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8422 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8423 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8424
8425 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8426 $self->{line_prev} = $self->{line};
8427 $self->{column_prev} = $self->{column};
8428 $self->{column}++;
8429 $self->{nc}
8430 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8431 } else {
8432 $self->{set_nc}->($self);
8433 }
8434
8435 return ($self->{ct}); # ELEMENT
8436 redo A;
8437 } else {
8438 $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8439 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8440 $self->{state} = BOGUS_MD_STATE;
8441
8442 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8443 $self->{line_prev} = $self->{line};
8444 $self->{column_prev} = $self->{column};
8445 $self->{column}++;
8446 $self->{nc}
8447 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8448 } else {
8449 $self->{set_nc}->($self);
8450 }
8451
8452 redo A;
8453 }
8454 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8455 if ($is_space->{$self->{nc}}) {
8456 if ($self->{group_depth}) {
8457 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8458 } else {
8459 $self->{state} = AFTER_MD_DEF_STATE;
8460 }
8461
8462 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8463 $self->{line_prev} = $self->{line};
8464 $self->{column_prev} = $self->{column};
8465 $self->{column}++;
8466 $self->{nc}
8467 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8468 } else {
8469 $self->{set_nc}->($self);
8470 }
8471
8472 redo A;
8473 } elsif ($self->{nc} == 0x002A or # *
8474 $self->{nc} == 0x002B or # +
8475 $self->{nc} == 0x003F) { # ?
8476 push @{$self->{ct}->{content}}, chr $self->{nc};
8477 if ($self->{group_depth}) {
8478 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8479 } else {
8480 $self->{state} = AFTER_MD_DEF_STATE;
8481 }
8482
8483 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8484 $self->{line_prev} = $self->{line};
8485 $self->{column_prev} = $self->{column};
8486 $self->{column}++;
8487 $self->{nc}
8488 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8489 } else {
8490 $self->{set_nc}->($self);
8491 }
8492
8493 redo A;
8494 } elsif ($self->{nc} == 0x0029) { # )
8495 if ($self->{group_depth}) {
8496 $self->{group_depth}--;
8497 push @{$self->{ct}->{content}}, chr $self->{nc};
8498 ## Stay in the state.
8499
8500 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8501 $self->{line_prev} = $self->{line};
8502 $self->{column_prev} = $self->{column};
8503 $self->{column}++;
8504 $self->{nc}
8505 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8506 } else {
8507 $self->{set_nc}->($self);
8508 }
8509
8510 redo A;
8511 } else {
8512 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8513 $self->{state} = BOGUS_MD_STATE;
8514 ## Reconsume.
8515 redo A;
8516 }
8517 } elsif ($self->{nc} == 0x003E) { # >
8518 if ($self->{group_depth}) {
8519 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8520 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8521 }
8522 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8523
8524 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8525 $self->{line_prev} = $self->{line};
8526 $self->{column_prev} = $self->{column};
8527 $self->{column}++;
8528 $self->{nc}
8529 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8530 } else {
8531 $self->{set_nc}->($self);
8532 }
8533
8534 return ($self->{ct}); # ELEMENT
8535 redo A;
8536 } elsif ($self->{nc} == -1) {
8537 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8538 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8539 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8540
8541 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8542 $self->{line_prev} = $self->{line};
8543 $self->{column_prev} = $self->{column};
8544 $self->{column}++;
8545 $self->{nc}
8546 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8547 } else {
8548 $self->{set_nc}->($self);
8549 }
8550
8551 return ($self->{ct}); # ELEMENT
8552 redo A;
8553 } else {
8554 if ($self->{group_depth}) {
8555 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8556 } else {
8557 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8558 $self->{state} = BOGUS_MD_STATE;
8559 }
8560 ## Reconsume.
8561 redo A;
8562 }
8563 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8564 if ($is_space->{$self->{nc}}) {
8565 ## Stay in the state.
8566
8567 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8568 $self->{line_prev} = $self->{line};
8569 $self->{column_prev} = $self->{column};
8570 $self->{column}++;
8571 $self->{nc}
8572 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8573 } else {
8574 $self->{set_nc}->($self);
8575 }
8576
8577 redo A;
8578 } elsif ($self->{nc} == 0x003E) { # >
8579 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8580
8581 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8582 $self->{line_prev} = $self->{line};
8583 $self->{column_prev} = $self->{column};
8584 $self->{column}++;
8585 $self->{nc}
8586 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8587 } else {
8588 $self->{set_nc}->($self);
8589 }
8590
8591 return ($self->{ct}); # ENTITY/ELEMENT
8592 redo A;
8593 } elsif ($self->{nc} == -1) {
8594 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8595 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8596
8597 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8598 $self->{line_prev} = $self->{line};
8599 $self->{column_prev} = $self->{column};
8600 $self->{column}++;
8601 $self->{nc}
8602 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8603 } else {
8604 $self->{set_nc}->($self);
8605 }
8606
8607 return ($self->{ct}); # ENTITY/ELEMENT
8608 redo A;
8609 } else {
8610 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8611 $self->{state} = BOGUS_MD_STATE;
8612 ## Reconsume.
8613 redo A;
8614 }
8615 } elsif ($self->{state} == BOGUS_MD_STATE) {
8616 if ($self->{nc} == 0x003E) { # >
8617 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8618
8619 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8620 $self->{line_prev} = $self->{line};
8621 $self->{column_prev} = $self->{column};
8622 $self->{column}++;
8623 $self->{nc}
8624 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8625 } else {
8626 $self->{set_nc}->($self);
8627 }
8628
8629 return ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8630 redo A;
8631 } elsif ($self->{nc} == -1) {
8632 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8633 ## Reconsume.
8634 return ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8635 redo A;
8636 } else {
8637 ## Stay in the state.
8638
8639 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8640 $self->{line_prev} = $self->{line};
8641 $self->{column_prev} = $self->{column};
8642 $self->{column}++;
8643 $self->{nc}
8644 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8645 } else {
8646 $self->{set_nc}->($self);
8647 }
8648
8649 redo A;
8650 }
8651 } else {
8652 die "$0: $self->{state}: Unknown state";
8653 }
8654 } # A
8655
8656 die "$0: _get_next_token: unexpected case";
8657 } # _get_next_token
8658
8659 1;
8660 ## $Date: 2008/10/19 14:05:20 $
8661

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24