/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.24 - (show annotations) (download)
Sun Oct 19 14:05:20 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.23: +10 -2 lines
++ whatpm/t/xml/ChangeLog	19 Oct 2008 14:05:17 -0000
	* attlist-1.dat, eldecls-1.dat, entities-1.dat, entities-2.dat,
	notations-1.dat, pis-2.dat: Unexpanded parameter entity tests are
	added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 14:03:50 -0000
	* Tokenizer.pm.src: Set the "stop_processing" flag true when a
	parameter entity occurs in a standalone="no" document.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 14:04:25 -0000
	* Parser.pm.src: Don't process ATTLIST_TOKEN and ENTITY_TOKEN if
	the "stop_processing" flag is set.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.23 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
## Exporter setup.  Runs at compile time so that the token type
## constants can be imported by code that is compiled later in this
## same file.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## Names of all token type constants.  Each of them can be imported
  ## individually, or all at once through the |:token| tag.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_type_names;
  our %EXPORT_TAGS = (token => [@token_type_names]);
}
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49
## Values of the |type| field of a token.  Types marked "XML only"
## are, according to the original notes, used for XML input only.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} flag set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78
## Individual bits of a content model value.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, expressed as combinations of the bits above.  They
## have to be declared in a separate statement because the bit
## constants must already exist when these expressions are compiled.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
87
88 ## Tokenizer states
89
## Values of |$self->{state}|.  States annotated with a spec state
## name are implementation-specific subdivisions of that spec state.
use constant {
  DATA_STATE                                    => 0,
  ## 1 is unassigned (|ENTITY_DATA_STATE| is disabled).
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  ## 12 is unassigned (|ENTITY_IN_ATTRIBUTE_VALUE_STATE| is disabled).
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_SECTION_STATE                           => 35,
  MD_HYPHEN_STATE              => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE             => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE               => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE     => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE     => 41, # "CDATA section state" in the spec
  PUBLIC_STATE                 => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE                 => 43, # "after DOCTYPE name state" in the spec
  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE                 => 44,
  ENTITY_HASH_STATE            => 45,
  NCR_NUM_STATE                => 46,
  HEXREF_X_STATE               => 47,
  HEXREF_HEX_STATE             => 48,
  ENTITY_NAME_STATE            => 49,
  PCDATA_STATE                 => 50, # "data state" in the spec
};
144
145 ## XML-only states
## Tokenizer states that, per the section heading, are XML-only.
## Their values continue the numbering of the states defined before
## this table.
use constant {
  PI_STATE                                           => 51,
  PI_TARGET_STATE                                    => 52,
  PI_TARGET_AFTER_STATE                              => 53,
  PI_DATA_STATE                                      => 54,
  PI_AFTER_STATE                                     => 55,
  PI_DATA_AFTER_STATE                                => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE                      => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE                => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE          => 59,
  DOCTYPE_TAG_STATE                                  => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE              => 61,
  MD_ATTLIST_STATE                                   => 62,
  MD_E_STATE                                         => 63,
  MD_ELEMENT_STATE                                   => 64,
  MD_ENTITY_STATE                                    => 65,
  MD_NOTATION_STATE                                  => 66,
  DOCTYPE_MD_STATE                                   => 67,
  BEFORE_MD_NAME_STATE                               => 68,
  MD_NAME_STATE                                      => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE              => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE                   => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE               => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE         => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE               => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE         => 75,
  BEFORE_ALLOWED_TOKEN_STATE                         => 76,
  ALLOWED_TOKEN_STATE                                => 77,
  AFTER_ALLOWED_TOKEN_STATE                          => 78,
  AFTER_ALLOWED_TOKENS_STATE                         => 79,
  BEFORE_ATTR_DEFAULT_STATE                          => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE        => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE  => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE              => 84,
  BEFORE_NDATA_STATE                                 => 85,
  NDATA_STATE                                        => 86,
  AFTER_NDATA_STATE                                  => 87,
  BEFORE_NOTATION_NAME_STATE                         => 88,
  NOTATION_NAME_STATE                                => 89,
  DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE           => 90,
  DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE           => 91,
  ENTITY_VALUE_ENTITY_STATE                          => 92,
  AFTER_ELEMENT_NAME_STATE                           => 93,
  BEFORE_ELEMENT_CONTENT_STATE                       => 94,
  CONTENT_KEYWORD_STATE                              => 95,
  AFTER_CM_GROUP_OPEN_STATE                          => 96,
  CM_ELEMENT_NAME_STATE                              => 97,
  AFTER_CM_ELEMENT_NAME_STATE                        => 98,
  AFTER_CM_GROUP_CLOSE_STATE                         => 99,
  AFTER_MD_DEF_STATE                                 => 100,
  BOGUS_MD_STATE                                     => 101,
};
197
198 ## Tree constructor state constants (see Whatpm::HTML for the full
199 ## list and descriptions)
200
## The two constants below happen to share the same numeric value.
use constant {
  IN_FOREIGN_CONTENT_IM => 0b100000000000,
  FOREIGN_EL            => 0b1_00000000000,
};
203
204 ## Character reference mappings
205
## Replacement table keyed by code point; the value is the code point
## that replaces it.  Code points that have no entry are not listed
## here at all.
my $charref_map = do {
  ## Replacements for 0x80..0x9F, listed in code point order.
  my @c1_replacements = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );

  ## Code points that are replaced by U+FFFD.
  my @to_replacement_char = (
    0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
    0xD800 .. 0xDFFF,
    0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
    ## The last two code points of each of the planes 0x00..0x10.
    (map {
      my $plane_base = $_ << 16;
      ($plane_base | 0xFFFE, $plane_base | 0xFFFF);
    } 0x00 .. 0x10),
  );

  my %map = (0x0D => 0x000A);
  @map{0x80 .. 0x9F} = @c1_replacements;
  @map{@to_replacement_char} = (0xFFFD) x @to_replacement_char;
  \%map;
}; # $charref_map
249
250 ## Implementations MUST act as if they used the state machine in the spec.
251
## Resets the tokenizer-related fields of |$self| and obtains the
## first input character.
##
## Fields that are read here but have to be provided beforehand
## (according to the original notes, by the |new| constructor):
##   |level|, |set_nc|, |parse_error|, and |is_xml| (if XML).
## Only |set_nc| is actually used by this method: it is invoked once,
## with |$self| as its sole argument.
##
## Fields that are deliberately not touched, because they are
## initialized when first used: |kwd|, |entity__value|,
## |entity__match|, |prev_state|, |next_nc|, and |escape|.
##
## The (implicit) return value is the new, empty |token| array
## reference, since that assignment is the last statement.
sub _initialize_tokenizer ($) {
  my $self = shift;

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character

  ## The character buffer has just been emptied, so there is never a
  ## buffered character to consume at this point; the first input
  ## character always has to be supplied by the |set_nc| callback.
  $self->{set_nc}->($self);

  $self->{token} = []; # queue of pushed-back tokens
} # _initialize_tokenizer
290
291 ## A token has:
292 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 ## ->{name} (DOCTYPE_TOKEN)
295 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 ## ->{target} (PI_TOKEN)
297 ## ->{pubid} (DOCTYPE_TOKEN)
298 ## ->{sysid} (DOCTYPE_TOKEN)
299 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301 ## ->{name}
302 ## ->{value}
303 ## ->{has_reference} == 1 or 0
304 ## ->{index}: Index of the attribute in a tag.
305 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309
310 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311 ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
312 ## while the token is pushed back to the stack.
313
314 ## Emitted token MUST immediately be handled by the tree construction state.
315
316 ## Before each step, UA MAY check to see if either one of the scripts in
317 ## "list of scripts that will execute as soon as possible" or the first
318 ## script in the "list of scripts that will execute asynchronously",
319 ## has completed loading. If one has, then it MUST be executed
320 ## and removed from the list.
321
322 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323 ## (This requirement was dropped from HTML5 spec, unfortunately.)
324
## Set of code points that are treated as space characters; the keys
## are code points and every value is 1.
##
## U+000B LINE TABULATION (VT) and U+000D CARRIAGE RETURN (CR) are
## intentionally not members of the set.
my $is_space = {
  map { $_ => 1 } (
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    0x0020, # SPACE (SP)
  )
};
333
334 sub _get_next_token ($) {
335 my $self = shift;
336
337 if ($self->{self_closing}) {
338 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339 ## NOTE: The |self_closing| flag is only set by start tag token.
340 ## In addition, when a start tag token is emitted, it is always set to
341 ## |ct|.
342 delete $self->{self_closing};
343 }
344
345 if (@{$self->{token}}) {
346 $self->{self_closing} = $self->{token}->[0]->{self_closing};
347 return shift @{$self->{token}};
348 }
349
350 A: {
351 if ($self->{state} == PCDATA_STATE) {
352 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353
354 if ($self->{nc} == 0x0026) { # &
355
356 ## NOTE: In the spec, the tokenizer is switched to the
357 ## "entity data state". In this implementation, the tokenizer
358 ## is switched to the |ENTITY_STATE|, which is an implementation
359 ## of the "consume a character reference" algorithm.
360 $self->{entity_add} = -1;
361 $self->{prev_state} = DATA_STATE;
362 $self->{state} = ENTITY_STATE;
363
364 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365 $self->{line_prev} = $self->{line};
366 $self->{column_prev} = $self->{column};
367 $self->{column}++;
368 $self->{nc}
369 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370 } else {
371 $self->{set_nc}->($self);
372 }
373
374 redo A;
375 } elsif ($self->{nc} == 0x003C) { # <
376
377 $self->{state} = TAG_OPEN_STATE;
378
379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380 $self->{line_prev} = $self->{line};
381 $self->{column_prev} = $self->{column};
382 $self->{column}++;
383 $self->{nc}
384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385 } else {
386 $self->{set_nc}->($self);
387 }
388
389 redo A;
390 } elsif ($self->{nc} == -1) {
391
392 return ({type => END_OF_FILE_TOKEN,
393 line => $self->{line}, column => $self->{column}});
394 last A; ## TODO: ok?
395 } else {
396
397 #
398 }
399
400 # Anything else
401 my $token = {type => CHARACTER_TOKEN,
402 data => chr $self->{nc},
403 line => $self->{line}, column => $self->{column},
404 };
405 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406
407 ## Stay in the state.
408
409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410 $self->{line_prev} = $self->{line};
411 $self->{column_prev} = $self->{column};
412 $self->{column}++;
413 $self->{nc}
414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415 } else {
416 $self->{set_nc}->($self);
417 }
418
419 return ($token);
420 redo A;
421 } elsif ($self->{state} == DATA_STATE) {
422 $self->{s_kwd} = '' unless defined $self->{s_kwd};
423 if ($self->{nc} == 0x0026) { # &
424 $self->{s_kwd} = '';
425 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426 not $self->{escape}) {
427
428 ## NOTE: In the spec, the tokenizer is switched to the
429 ## "entity data state". In this implementation, the tokenizer
430 ## is switched to the |ENTITY_STATE|, which is an implementation
431 ## of the "consume a character reference" algorithm.
432 $self->{entity_add} = -1;
433 $self->{prev_state} = DATA_STATE;
434 $self->{state} = ENTITY_STATE;
435
436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437 $self->{line_prev} = $self->{line};
438 $self->{column_prev} = $self->{column};
439 $self->{column}++;
440 $self->{nc}
441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442 } else {
443 $self->{set_nc}->($self);
444 }
445
446 redo A;
447 } else {
448
449 #
450 }
451 } elsif ($self->{nc} == 0x002D) { # -
452 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 if ($self->{s_kwd} eq '<!-') {
454
455 $self->{escape} = 1; # unless $self->{escape};
456 $self->{s_kwd} = '--';
457 #
458 } elsif ($self->{s_kwd} eq '-') {
459
460 $self->{s_kwd} = '--';
461 #
462 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463
464 $self->{s_kwd} .= '-';
465 #
466 } else {
467
468 $self->{s_kwd} = '-';
469 #
470 }
471 }
472
473 #
474 } elsif ($self->{nc} == 0x0021) { # !
475 if (length $self->{s_kwd}) {
476
477 $self->{s_kwd} .= '!';
478 #
479 } else {
480
481 #$self->{s_kwd} = '';
482 #
483 }
484 #
485 } elsif ($self->{nc} == 0x003C) { # <
486 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488 not $self->{escape})) {
489
490 $self->{state} = TAG_OPEN_STATE;
491
492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493 $self->{line_prev} = $self->{line};
494 $self->{column_prev} = $self->{column};
495 $self->{column}++;
496 $self->{nc}
497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498 } else {
499 $self->{set_nc}->($self);
500 }
501
502 redo A;
503 } else {
504
505 $self->{s_kwd} = '';
506 #
507 }
508 } elsif ($self->{nc} == 0x003E) { # >
509 if ($self->{escape} and
510 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511 if ($self->{s_kwd} eq '--') {
512
513 delete $self->{escape};
514 #
515 } else {
516
517 #
518 }
519 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520
521 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522 line => $self->{line_prev},
523 column => $self->{column_prev} - 1);
524 #
525 } else {
526
527 #
528 }
529
530 $self->{s_kwd} = '';
531 #
532 } elsif ($self->{nc} == 0x005D) { # ]
533 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534
535 $self->{s_kwd} .= ']';
536 } elsif ($self->{s_kwd} eq ']]') {
537
538 #
539 } else {
540
541 $self->{s_kwd} = '';
542 }
543 #
544 } elsif ($self->{nc} == -1) {
545
546 $self->{s_kwd} = '';
547 return ({type => END_OF_FILE_TOKEN,
548 line => $self->{line}, column => $self->{column}});
549 last A; ## TODO: ok?
550 } else {
551
552 $self->{s_kwd} = '';
553 #
554 }
555
556 # Anything else
557 my $token = {type => CHARACTER_TOKEN,
558 data => chr $self->{nc},
559 line => $self->{line}, column => $self->{column},
560 };
561 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 length $token->{data})) {
563 $self->{s_kwd} = '';
564 }
565
566 ## Stay in the data state.
567 if (not $self->{is_xml} and
568 $self->{content_model} == PCDATA_CONTENT_MODEL) {
569
570 $self->{state} = PCDATA_STATE;
571 } else {
572
573 ## Stay in the state.
574 }
575
576 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577 $self->{line_prev} = $self->{line};
578 $self->{column_prev} = $self->{column};
579 $self->{column}++;
580 $self->{nc}
581 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582 } else {
583 $self->{set_nc}->($self);
584 }
585
586 return ($token);
587 redo A;
588 } elsif ($self->{state} == TAG_OPEN_STATE) {
589 ## XML5: "tag state".
590
591 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592 if ($self->{nc} == 0x002F) { # /
593
594
595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596 $self->{line_prev} = $self->{line};
597 $self->{column_prev} = $self->{column};
598 $self->{column}++;
599 $self->{nc}
600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601 } else {
602 $self->{set_nc}->($self);
603 }
604
605 $self->{state} = CLOSE_TAG_OPEN_STATE;
606 redo A;
607 } elsif ($self->{nc} == 0x0021) { # !
608
609 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 #
611 } else {
612
613 $self->{s_kwd} = '';
614 #
615 }
616
617 ## reconsume
618 $self->{state} = DATA_STATE;
619 return ({type => CHARACTER_TOKEN, data => '<',
620 line => $self->{line_prev},
621 column => $self->{column_prev},
622 });
623 redo A;
624 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625 if ($self->{nc} == 0x0021) { # !
626
627 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628
629 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630 $self->{line_prev} = $self->{line};
631 $self->{column_prev} = $self->{column};
632 $self->{column}++;
633 $self->{nc}
634 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635 } else {
636 $self->{set_nc}->($self);
637 }
638
639 redo A;
640 } elsif ($self->{nc} == 0x002F) { # /
641
642 $self->{state} = CLOSE_TAG_OPEN_STATE;
643
644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645 $self->{line_prev} = $self->{line};
646 $self->{column_prev} = $self->{column};
647 $self->{column}++;
648 $self->{nc}
649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650 } else {
651 $self->{set_nc}->($self);
652 }
653
654 redo A;
655 } elsif (0x0041 <= $self->{nc} and
656 $self->{nc} <= 0x005A) { # A..Z
657
658 $self->{ct}
659 = {type => START_TAG_TOKEN,
660 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 line => $self->{line_prev},
662 column => $self->{column_prev}};
663 $self->{state} = TAG_NAME_STATE;
664
665 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666 $self->{line_prev} = $self->{line};
667 $self->{column_prev} = $self->{column};
668 $self->{column}++;
669 $self->{nc}
670 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671 } else {
672 $self->{set_nc}->($self);
673 }
674
675 redo A;
676 } elsif (0x0061 <= $self->{nc} and
677 $self->{nc} <= 0x007A) { # a..z
678
679 $self->{ct} = {type => START_TAG_TOKEN,
680 tag_name => chr ($self->{nc}),
681 line => $self->{line_prev},
682 column => $self->{column_prev}};
683 $self->{state} = TAG_NAME_STATE;
684
685 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686 $self->{line_prev} = $self->{line};
687 $self->{column_prev} = $self->{column};
688 $self->{column}++;
689 $self->{nc}
690 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691 } else {
692 $self->{set_nc}->($self);
693 }
694
695 redo A;
696 } elsif ($self->{nc} == 0x003E) { # >
697
698 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699 line => $self->{line_prev},
700 column => $self->{column_prev});
701 $self->{state} = DATA_STATE;
702 $self->{s_kwd} = '';
703
704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705 $self->{line_prev} = $self->{line};
706 $self->{column_prev} = $self->{column};
707 $self->{column}++;
708 $self->{nc}
709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710 } else {
711 $self->{set_nc}->($self);
712 }
713
714
715 return ({type => CHARACTER_TOKEN, data => '<>',
716 line => $self->{line_prev},
717 column => $self->{column_prev},
718 });
719
720 redo A;
721 } elsif ($self->{nc} == 0x003F) { # ?
722 if ($self->{is_xml}) {
723
724 $self->{state} = PI_STATE;
725
726 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727 $self->{line_prev} = $self->{line};
728 $self->{column_prev} = $self->{column};
729 $self->{column}++;
730 $self->{nc}
731 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732 } else {
733 $self->{set_nc}->($self);
734 }
735
736 redo A;
737 } else {
738
739 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740 line => $self->{line_prev},
741 column => $self->{column_prev});
742 $self->{state} = BOGUS_COMMENT_STATE;
743 $self->{ct} = {type => COMMENT_TOKEN, data => '',
744 line => $self->{line_prev},
745 column => $self->{column_prev},
746 };
747 ## $self->{nc} is intentionally left as is
748 redo A;
749 }
750 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751
752 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753 line => $self->{line_prev},
754 column => $self->{column_prev});
755 $self->{state} = DATA_STATE;
756 $self->{s_kwd} = '';
757 ## reconsume
758
759 return ({type => CHARACTER_TOKEN, data => '<',
760 line => $self->{line_prev},
761 column => $self->{column_prev},
762 });
763
764 redo A;
765 } else {
766 ## XML5: "<:" is a parse error.
767
768 $self->{ct} = {type => START_TAG_TOKEN,
769 tag_name => chr ($self->{nc}),
770 line => $self->{line_prev},
771 column => $self->{column_prev}};
772 $self->{state} = TAG_NAME_STATE;
773
774 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775 $self->{line_prev} = $self->{line};
776 $self->{column_prev} = $self->{column};
777 $self->{column}++;
778 $self->{nc}
779 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780 } else {
781 $self->{set_nc}->($self);
782 }
783
784 redo A;
785 }
786 } else {
787 die "$0: $self->{content_model} in tag open";
788 }
789 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790 ## NOTE: The "close tag open state" in the spec is implemented as
791 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792
793 ## XML5: "end tag state".
794
795 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797 if (defined $self->{last_stag_name}) {
798 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 $self->{kwd} = '';
800 ## Reconsume.
801 redo A;
802 } else {
803 ## No start tag token has ever been emitted
804 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805
806 $self->{state} = DATA_STATE;
807 $self->{s_kwd} = '';
808 ## Reconsume.
809 return ({type => CHARACTER_TOKEN, data => '</',
810 line => $l, column => $c,
811 });
812 redo A;
813 }
814 }
815
816 if (0x0041 <= $self->{nc} and
817 $self->{nc} <= 0x005A) { # A..Z
818
819 $self->{ct}
820 = {type => END_TAG_TOKEN,
821 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 line => $l, column => $c};
823 $self->{state} = TAG_NAME_STATE;
824
825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826 $self->{line_prev} = $self->{line};
827 $self->{column_prev} = $self->{column};
828 $self->{column}++;
829 $self->{nc}
830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831 } else {
832 $self->{set_nc}->($self);
833 }
834
835 redo A;
836 } elsif (0x0061 <= $self->{nc} and
837 $self->{nc} <= 0x007A) { # a..z
838
839 $self->{ct} = {type => END_TAG_TOKEN,
840 tag_name => chr ($self->{nc}),
841 line => $l, column => $c};
842 $self->{state} = TAG_NAME_STATE;
843
844 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845 $self->{line_prev} = $self->{line};
846 $self->{column_prev} = $self->{column};
847 $self->{column}++;
848 $self->{nc}
849 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850 } else {
851 $self->{set_nc}->($self);
852 }
853
854 redo A;
855 } elsif ($self->{nc} == 0x003E) { # >
856 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857 line => $self->{line_prev}, ## "<" in "</>"
858 column => $self->{column_prev} - 1);
859 $self->{state} = DATA_STATE;
860 $self->{s_kwd} = '';
861 if ($self->{is_xml}) {
862
863 ## XML5: No parse error.
864
865 ## NOTE: This parser raises a parse error, since it supports
866 ## XML1, not XML5.
867
868 ## NOTE: A short end tag token.
869 my $ct = {type => END_TAG_TOKEN,
870 tag_name => '',
871 line => $self->{line_prev},
872 column => $self->{column_prev} - 1,
873 };
874
875 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876 $self->{line_prev} = $self->{line};
877 $self->{column_prev} = $self->{column};
878 $self->{column}++;
879 $self->{nc}
880 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881 } else {
882 $self->{set_nc}->($self);
883 }
884
885 return ($ct);
886 } else {
887
888
889 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890 $self->{line_prev} = $self->{line};
891 $self->{column_prev} = $self->{column};
892 $self->{column}++;
893 $self->{nc}
894 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895 } else {
896 $self->{set_nc}->($self);
897 }
898
899 }
900 redo A;
901 } elsif ($self->{nc} == -1) {
902
903 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 $self->{s_kwd} = '';
905 $self->{state} = DATA_STATE;
906 # reconsume
907
908 return ({type => CHARACTER_TOKEN, data => '</',
909 line => $l, column => $c,
910 });
911
912 redo A;
913 } elsif (not $self->{is_xml} or
914 $is_space->{$self->{nc}}) {
915
916 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917 line => $self->{line_prev}, # "<" of "</"
918 column => $self->{column_prev} - 1);
919 $self->{state} = BOGUS_COMMENT_STATE;
920 $self->{ct} = {type => COMMENT_TOKEN, data => '',
921 line => $self->{line_prev}, # "<" of "</"
922 column => $self->{column_prev} - 1,
923 };
924 ## NOTE: $self->{nc} is intentionally left as is.
925 ## Although the "anything else" case of the spec not explicitly
926 ## states that the next input character is to be reconsumed,
927 ## it will be included to the |data| of the comment token
928 ## generated from the bogus end tag, as defined in the
929 ## "bogus comment state" entry.
930 redo A;
931 } else {
932 ## XML5: "</:" is a parse error.
933
934 $self->{ct} = {type => END_TAG_TOKEN,
935 tag_name => chr ($self->{nc}),
936 line => $l, column => $c};
937 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938
939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940 $self->{line_prev} = $self->{line};
941 $self->{column_prev} = $self->{column};
942 $self->{column}++;
943 $self->{nc}
944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945 } else {
946 $self->{set_nc}->($self);
947 }
948
949 redo A;
950 }
951 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 if (length $ch) {
954 my $CH = $ch;
955 $ch =~ tr/a-z/A-Z/;
956 my $nch = chr $self->{nc};
957 if ($nch eq $ch or $nch eq $CH) {
958
959 ## Stay in the state.
960 $self->{kwd} .= $nch;
961
962 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963 $self->{line_prev} = $self->{line};
964 $self->{column_prev} = $self->{column};
965 $self->{column}++;
966 $self->{nc}
967 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968 } else {
969 $self->{set_nc}->($self);
970 }
971
972 redo A;
973 } else {
974
975 $self->{state} = DATA_STATE;
976 $self->{s_kwd} = '';
977 ## Reconsume.
978 return ({type => CHARACTER_TOKEN,
979 data => '</' . $self->{kwd},
980 line => $self->{line_prev},
981 column => $self->{column_prev} - 1 - length $self->{kwd},
982 });
983 redo A;
984 }
985 } else { # after "<{tag-name}"
986 unless ($is_space->{$self->{nc}} or
987 {
988 0x003E => 1, # >
989 0x002F => 1, # /
990 -1 => 1, # EOF
991 }->{$self->{nc}}) {
992
993 ## Reconsume.
994 $self->{state} = DATA_STATE;
995 $self->{s_kwd} = '';
996 return ({type => CHARACTER_TOKEN,
997 data => '</' . $self->{kwd},
998 line => $self->{line_prev},
999 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 });
1001 redo A;
1002 } else {
1003
1004 $self->{ct}
1005 = {type => END_TAG_TOKEN,
1006 tag_name => $self->{last_stag_name},
1007 line => $self->{line_prev},
1008 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 $self->{state} = TAG_NAME_STATE;
1010 ## Reconsume.
1011 redo A;
1012 }
1013 }
1014 } elsif ($self->{state} == TAG_NAME_STATE) {
1015 if ($is_space->{$self->{nc}}) {
1016
1017 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018
1019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020 $self->{line_prev} = $self->{line};
1021 $self->{column_prev} = $self->{column};
1022 $self->{column}++;
1023 $self->{nc}
1024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025 } else {
1026 $self->{set_nc}->($self);
1027 }
1028
1029 redo A;
1030 } elsif ($self->{nc} == 0x003E) { # >
1031 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032
1033 $self->{last_stag_name} = $self->{ct}->{tag_name};
1034 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036 #if ($self->{ct}->{attributes}) {
1037 # ## NOTE: This should never be reached.
1038 # !!! cp (36);
1039 # !!! parse-error (type => 'end tag attribute');
1040 #} else {
1041
1042 #}
1043 } else {
1044 die "$0: $self->{ct}->{type}: Unknown token type";
1045 }
1046 $self->{state} = DATA_STATE;
1047 $self->{s_kwd} = '';
1048
1049 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050 $self->{line_prev} = $self->{line};
1051 $self->{column_prev} = $self->{column};
1052 $self->{column}++;
1053 $self->{nc}
1054 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055 } else {
1056 $self->{set_nc}->($self);
1057 }
1058
1059
1060 return ($self->{ct}); # start tag or end tag
1061
1062 redo A;
1063 } elsif (0x0041 <= $self->{nc} and
1064 $self->{nc} <= 0x005A) { # A..Z
1065
1066 $self->{ct}->{tag_name}
1067 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 # start tag or end tag
1069 ## Stay in this state
1070
1071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072 $self->{line_prev} = $self->{line};
1073 $self->{column_prev} = $self->{column};
1074 $self->{column}++;
1075 $self->{nc}
1076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077 } else {
1078 $self->{set_nc}->($self);
1079 }
1080
1081 redo A;
1082 } elsif ($self->{nc} == -1) {
1083 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085
1086 $self->{last_stag_name} = $self->{ct}->{tag_name};
1087 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089 #if ($self->{ct}->{attributes}) {
1090 # ## NOTE: This state should never be reached.
1091 # !!! cp (40);
1092 # !!! parse-error (type => 'end tag attribute');
1093 #} else {
1094
1095 #}
1096 } else {
1097 die "$0: $self->{ct}->{type}: Unknown token type";
1098 }
1099 $self->{state} = DATA_STATE;
1100 $self->{s_kwd} = '';
1101 # reconsume
1102
1103 return ($self->{ct}); # start tag or end tag
1104
1105 redo A;
1106 } elsif ($self->{nc} == 0x002F) { # /
1107
1108 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109
1110 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111 $self->{line_prev} = $self->{line};
1112 $self->{column_prev} = $self->{column};
1113 $self->{column}++;
1114 $self->{nc}
1115 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116 } else {
1117 $self->{set_nc}->($self);
1118 }
1119
1120 redo A;
1121 } else {
1122
1123 $self->{ct}->{tag_name} .= chr $self->{nc};
1124 # start tag or end tag
1125 ## Stay in the state
1126
1127 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128 $self->{line_prev} = $self->{line};
1129 $self->{column_prev} = $self->{column};
1130 $self->{column}++;
1131 $self->{nc}
1132 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133 } else {
1134 $self->{set_nc}->($self);
1135 }
1136
1137 redo A;
1138 }
1139 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 ## XML5: "Tag attribute name before state".
1141
1142 if ($is_space->{$self->{nc}}) {
1143
1144 ## Stay in the state
1145
1146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147 $self->{line_prev} = $self->{line};
1148 $self->{column_prev} = $self->{column};
1149 $self->{column}++;
1150 $self->{nc}
1151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152 } else {
1153 $self->{set_nc}->($self);
1154 }
1155
1156 redo A;
1157 } elsif ($self->{nc} == 0x003E) { # >
1158 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159
1160 $self->{last_stag_name} = $self->{ct}->{tag_name};
1161 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163 if ($self->{ct}->{attributes}) {
1164
1165 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166 } else {
1167
1168 }
1169 } else {
1170 die "$0: $self->{ct}->{type}: Unknown token type";
1171 }
1172 $self->{state} = DATA_STATE;
1173 $self->{s_kwd} = '';
1174
1175 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176 $self->{line_prev} = $self->{line};
1177 $self->{column_prev} = $self->{column};
1178 $self->{column}++;
1179 $self->{nc}
1180 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181 } else {
1182 $self->{set_nc}->($self);
1183 }
1184
1185
1186 return ($self->{ct}); # start tag or end tag
1187
1188 redo A;
1189 } elsif (0x0041 <= $self->{nc} and
1190 $self->{nc} <= 0x005A) { # A..Z
1191
1192 $self->{ca}
1193 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 value => '',
1195 line => $self->{line}, column => $self->{column}};
1196 $self->{state} = ATTRIBUTE_NAME_STATE;
1197
1198 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199 $self->{line_prev} = $self->{line};
1200 $self->{column_prev} = $self->{column};
1201 $self->{column}++;
1202 $self->{nc}
1203 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204 } else {
1205 $self->{set_nc}->($self);
1206 }
1207
1208 redo A;
1209 } elsif ($self->{nc} == 0x002F) { # /
1210
1211 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212
1213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214 $self->{line_prev} = $self->{line};
1215 $self->{column_prev} = $self->{column};
1216 $self->{column}++;
1217 $self->{nc}
1218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219 } else {
1220 $self->{set_nc}->($self);
1221 }
1222
1223 redo A;
1224 } elsif ($self->{nc} == -1) {
1225 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227
1228 $self->{last_stag_name} = $self->{ct}->{tag_name};
1229 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231 if ($self->{ct}->{attributes}) {
1232
1233 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234 } else {
1235
1236 }
1237 } else {
1238 die "$0: $self->{ct}->{type}: Unknown token type";
1239 }
1240 $self->{state} = DATA_STATE;
1241 $self->{s_kwd} = '';
1242 # reconsume
1243
1244 return ($self->{ct}); # start tag or end tag
1245
1246 redo A;
1247 } else {
1248 if ({
1249 0x0022 => 1, # "
1250 0x0027 => 1, # '
1251 0x003D => 1, # =
1252 }->{$self->{nc}}) {
1253
1254 ## XML5: Not a parse error.
1255 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256 } else {
1257
1258 ## XML5: ":" raises a parse error and is ignored.
1259 }
1260 $self->{ca}
1261 = {name => chr ($self->{nc}),
1262 value => '',
1263 line => $self->{line}, column => $self->{column}};
1264 $self->{state} = ATTRIBUTE_NAME_STATE;
1265
1266 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267 $self->{line_prev} = $self->{line};
1268 $self->{column_prev} = $self->{column};
1269 $self->{column}++;
1270 $self->{nc}
1271 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272 } else {
1273 $self->{set_nc}->($self);
1274 }
1275
1276 redo A;
1277 }
1278 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 ## XML5: "Tag attribute name state".
1280
1281 my $before_leave = sub { ## Closure over $self: files the attribute being built ($self->{ca}) on the current token ($self->{ct}). Takes no arguments; every caller in this state invokes it before changing $self->{state} or returning the token.
1282 if (exists $self->{ct}->{attributes} # start tag or end tag
1283 ->{$self->{ca}->{name}}) { # MUST
1284 ## An attribute with this name is already stored on the token: report it, using the line/column recorded in $self->{ca} when the attribute was started, and leave the stored attribute untouched.
1285 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1286 ## Discard $self->{ca} # MUST
1287 } else {
1288 ## First attribute with this name: store the same hash reference under its name, so later writes through $self->{ca} remain visible via the token.
1289 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290 = $self->{ca};
1291 $self->{ca}->{index} = ++$self->{ct}->{last_index}; ## per-token counter, incremented once for each attribute actually stored
1292 }
1293 }; # $before_leave
1294
1295 if ($is_space->{$self->{nc}}) {
1296
1297 $before_leave->();
1298 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299
1300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301 $self->{line_prev} = $self->{line};
1302 $self->{column_prev} = $self->{column};
1303 $self->{column}++;
1304 $self->{nc}
1305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306 } else {
1307 $self->{set_nc}->($self);
1308 }
1309
1310 redo A;
1311 } elsif ($self->{nc} == 0x003D) { # =
1312
1313 $before_leave->();
1314 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315
1316 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317 $self->{line_prev} = $self->{line};
1318 $self->{column_prev} = $self->{column};
1319 $self->{column}++;
1320 $self->{nc}
1321 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322 } else {
1323 $self->{set_nc}->($self);
1324 }
1325
1326 redo A;
1327 } elsif ($self->{nc} == 0x003E) { # >
1328 if ($self->{is_xml}) {
1329
1330 ## XML5: Not a parse error.
1331 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332 } else {
1333
1334 }
1335
1336 $before_leave->();
1337 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338
1339 $self->{last_stag_name} = $self->{ct}->{tag_name};
1340 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341
1342 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343 if ($self->{ct}->{attributes}) {
1344 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345 }
1346 } else {
1347 die "$0: $self->{ct}->{type}: Unknown token type";
1348 }
1349 $self->{state} = DATA_STATE;
1350 $self->{s_kwd} = '';
1351
1352 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353 $self->{line_prev} = $self->{line};
1354 $self->{column_prev} = $self->{column};
1355 $self->{column}++;
1356 $self->{nc}
1357 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358 } else {
1359 $self->{set_nc}->($self);
1360 }
1361
1362
1363 return ($self->{ct}); # start tag or end tag
1364
1365 redo A;
1366 } elsif (0x0041 <= $self->{nc} and
1367 $self->{nc} <= 0x005A) { # A..Z
1368
1369 $self->{ca}->{name}
1370 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 ## Stay in the state
1372
1373 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374 $self->{line_prev} = $self->{line};
1375 $self->{column_prev} = $self->{column};
1376 $self->{column}++;
1377 $self->{nc}
1378 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379 } else {
1380 $self->{set_nc}->($self);
1381 }
1382
1383 redo A;
1384 } elsif ($self->{nc} == 0x002F) { # /
1385 if ($self->{is_xml}) {
1386
1387 ## XML5: Not a parse error.
1388 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389 } else {
1390
1391 }
1392
1393 $before_leave->();
1394 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395
1396 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397 $self->{line_prev} = $self->{line};
1398 $self->{column_prev} = $self->{column};
1399 $self->{column}++;
1400 $self->{nc}
1401 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402 } else {
1403 $self->{set_nc}->($self);
1404 }
1405
1406 redo A;
1407 } elsif ($self->{nc} == -1) {
1408 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409 $before_leave->();
1410 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411
1412 $self->{last_stag_name} = $self->{ct}->{tag_name};
1413 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415 if ($self->{ct}->{attributes}) {
1416
1417 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418 } else {
1419 ## NOTE: This state should never be reached.
1420
1421 }
1422 } else {
1423 die "$0: $self->{ct}->{type}: Unknown token type";
1424 }
1425 $self->{state} = DATA_STATE;
1426 $self->{s_kwd} = '';
1427 # reconsume
1428
1429 return ($self->{ct}); # start tag or end tag
1430
1431 redo A;
1432 } else {
1433 if ($self->{nc} == 0x0022 or # "
1434 $self->{nc} == 0x0027) { # '
1435
1436 ## XML5: Not a parse error.
1437 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438 } else {
1439
1440 }
1441 $self->{ca}->{name} .= chr ($self->{nc});
1442 ## Stay in the state
1443
1444 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445 $self->{line_prev} = $self->{line};
1446 $self->{column_prev} = $self->{column};
1447 $self->{column}++;
1448 $self->{nc}
1449 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450 } else {
1451 $self->{set_nc}->($self);
1452 }
1453
1454 redo A;
1455 }
1456 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 ## XML5: "Tag attribute name after state".
1458
1459 if ($is_space->{$self->{nc}}) {
1460
1461 ## Stay in the state
1462
1463 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464 $self->{line_prev} = $self->{line};
1465 $self->{column_prev} = $self->{column};
1466 $self->{column}++;
1467 $self->{nc}
1468 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469 } else {
1470 $self->{set_nc}->($self);
1471 }
1472
1473 redo A;
1474 } elsif ($self->{nc} == 0x003D) { # =
1475
1476 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477
1478 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479 $self->{line_prev} = $self->{line};
1480 $self->{column_prev} = $self->{column};
1481 $self->{column}++;
1482 $self->{nc}
1483 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484 } else {
1485 $self->{set_nc}->($self);
1486 }
1487
1488 redo A;
1489 } elsif ($self->{nc} == 0x003E) { # >
1490 if ($self->{is_xml}) {
1491
1492 ## XML5: Not a parse error.
1493 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494 } else {
1495
1496 }
1497
1498 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499
1500 $self->{last_stag_name} = $self->{ct}->{tag_name};
1501 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503 if ($self->{ct}->{attributes}) {
1504
1505 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506 } else {
1507 ## NOTE: This state should never be reached.
1508
1509 }
1510 } else {
1511 die "$0: $self->{ct}->{type}: Unknown token type";
1512 }
1513 $self->{state} = DATA_STATE;
1514 $self->{s_kwd} = '';
1515
1516 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517 $self->{line_prev} = $self->{line};
1518 $self->{column_prev} = $self->{column};
1519 $self->{column}++;
1520 $self->{nc}
1521 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522 } else {
1523 $self->{set_nc}->($self);
1524 }
1525
1526
1527 return ($self->{ct}); # start tag or end tag
1528
1529 redo A;
1530 } elsif (0x0041 <= $self->{nc} and
1531 $self->{nc} <= 0x005A) { # A..Z
1532
1533 $self->{ca}
1534 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 value => '',
1536 line => $self->{line}, column => $self->{column}};
1537 $self->{state} = ATTRIBUTE_NAME_STATE;
1538
1539 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540 $self->{line_prev} = $self->{line};
1541 $self->{column_prev} = $self->{column};
1542 $self->{column}++;
1543 $self->{nc}
1544 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545 } else {
1546 $self->{set_nc}->($self);
1547 }
1548
1549 redo A;
1550 } elsif ($self->{nc} == 0x002F) { # /
1551 if ($self->{is_xml}) {
1552
1553 ## XML5: Not a parse error.
1554 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555 } else {
1556
1557 }
1558
1559 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560
1561 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562 $self->{line_prev} = $self->{line};
1563 $self->{column_prev} = $self->{column};
1564 $self->{column}++;
1565 $self->{nc}
1566 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567 } else {
1568 $self->{set_nc}->($self);
1569 }
1570
1571 redo A;
1572 } elsif ($self->{nc} == -1) {
1573 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575
1576 $self->{last_stag_name} = $self->{ct}->{tag_name};
1577 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579 if ($self->{ct}->{attributes}) {
1580
1581 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582 } else {
1583 ## NOTE: This state should never be reached.
1584
1585 }
1586 } else {
1587 die "$0: $self->{ct}->{type}: Unknown token type";
1588 }
1589 $self->{s_kwd} = '';
1590 $self->{state} = DATA_STATE;
1591 # reconsume
1592
1593 return ($self->{ct}); # start tag or end tag
1594
1595 redo A;
1596 } else {
1597 if ($self->{is_xml}) {
1598
1599 ## XML5: Not a parse error.
1600 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601 } else {
1602
1603 }
1604
1605 if ($self->{nc} == 0x0022 or # "
1606 $self->{nc} == 0x0027) { # '
1607
1608 ## XML5: Not a parse error.
1609 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610 } else {
1611
1612 }
1613 $self->{ca}
1614 = {name => chr ($self->{nc}),
1615 value => '',
1616 line => $self->{line}, column => $self->{column}};
1617 $self->{state} = ATTRIBUTE_NAME_STATE;
1618
1619 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620 $self->{line_prev} = $self->{line};
1621 $self->{column_prev} = $self->{column};
1622 $self->{column}++;
1623 $self->{nc}
1624 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625 } else {
1626 $self->{set_nc}->($self);
1627 }
1628
1629 redo A;
1630 }
1631 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 ## XML5: "Tag attribute value before state".
1633
1634 if ($is_space->{$self->{nc}}) {
1635
1636 ## Stay in the state
1637
1638 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639 $self->{line_prev} = $self->{line};
1640 $self->{column_prev} = $self->{column};
1641 $self->{column}++;
1642 $self->{nc}
1643 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644 } else {
1645 $self->{set_nc}->($self);
1646 }
1647
1648 redo A;
1649 } elsif ($self->{nc} == 0x0022) { # "
1650
1651 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652
1653 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654 $self->{line_prev} = $self->{line};
1655 $self->{column_prev} = $self->{column};
1656 $self->{column}++;
1657 $self->{nc}
1658 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659 } else {
1660 $self->{set_nc}->($self);
1661 }
1662
1663 redo A;
1664 } elsif ($self->{nc} == 0x0026) { # &
1665
1666 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667 ## reconsume
1668 redo A;
1669 } elsif ($self->{nc} == 0x0027) { # '
1670
1671 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672
1673 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674 $self->{line_prev} = $self->{line};
1675 $self->{column_prev} = $self->{column};
1676 $self->{column}++;
1677 $self->{nc}
1678 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679 } else {
1680 $self->{set_nc}->($self);
1681 }
1682
1683 redo A;
1684 } elsif ($self->{nc} == 0x003E) { # >
1685 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687
1688 $self->{last_stag_name} = $self->{ct}->{tag_name};
1689 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691 if ($self->{ct}->{attributes}) {
1692
1693 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694 } else {
1695 ## NOTE: This state should never be reached.
1696
1697 }
1698 } else {
1699 die "$0: $self->{ct}->{type}: Unknown token type";
1700 }
1701 $self->{state} = DATA_STATE;
1702 $self->{s_kwd} = '';
1703
1704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705 $self->{line_prev} = $self->{line};
1706 $self->{column_prev} = $self->{column};
1707 $self->{column}++;
1708 $self->{nc}
1709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710 } else {
1711 $self->{set_nc}->($self);
1712 }
1713
1714
1715 return ($self->{ct}); # start tag or end tag
1716
1717 redo A;
1718 } elsif ($self->{nc} == -1) {
1719 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721
1722 $self->{last_stag_name} = $self->{ct}->{tag_name};
1723 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725 if ($self->{ct}->{attributes}) {
1726
1727 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728 } else {
1729 ## NOTE: This state should never be reached.
1730
1731 }
1732 } else {
1733 die "$0: $self->{ct}->{type}: Unknown token type";
1734 }
1735 $self->{state} = DATA_STATE;
1736 $self->{s_kwd} = '';
1737 ## reconsume
1738
1739 return ($self->{ct}); # start tag or end tag
1740
1741 redo A;
1742 } else {
1743 if ($self->{nc} == 0x003D) { # =
1744
1745 ## XML5: Not a parse error.
1746 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 } elsif ($self->{is_xml}) {
1748
1749 ## XML5: No parse error.
1750 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 } else {
1752
1753 }
1754 $self->{ca}->{value} .= chr ($self->{nc});
1755 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756
1757 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758 $self->{line_prev} = $self->{line};
1759 $self->{column_prev} = $self->{column};
1760 $self->{column}++;
1761 $self->{nc}
1762 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763 } else {
1764 $self->{set_nc}->($self);
1765 }
1766
1767 redo A;
1768 }
1769 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771 ## ATTLIST attribute value double quoted state".
1772
1773 if ($self->{nc} == 0x0022) { # "
1774 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775
1776 ## XML5: "DOCTYPE ATTLIST name after state".
1777 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779 } else {
1780
1781 ## XML5: "Tag attribute name before state".
1782 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783 }
1784
1785 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786 $self->{line_prev} = $self->{line};
1787 $self->{column_prev} = $self->{column};
1788 $self->{column}++;
1789 $self->{nc}
1790 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791 } else {
1792 $self->{set_nc}->($self);
1793 }
1794
1795 redo A;
1796 } elsif ($self->{nc} == 0x0026) { # &
1797
1798 ## XML5: Not defined yet.
1799
1800 ## NOTE: In the spec, the tokenizer is switched to the
1801 ## "entity in attribute value state". In this implementation, the
1802 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803 ## implementation of the "consume a character reference" algorithm.
1804 $self->{prev_state} = $self->{state};
1805 $self->{entity_add} = 0x0022; # "
1806 $self->{state} = ENTITY_STATE;
1807
1808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809 $self->{line_prev} = $self->{line};
1810 $self->{column_prev} = $self->{column};
1811 $self->{column}++;
1812 $self->{nc}
1813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814 } else {
1815 $self->{set_nc}->($self);
1816 }
1817
1818 redo A;
1819 } elsif ($self->{nc} == -1) {
1820 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1821 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1822
1823 $self->{last_stag_name} = $self->{ct}->{tag_name};
1824
1825 $self->{state} = DATA_STATE;
1826 $self->{s_kwd} = '';
1827 ## reconsume
1828 return ($self->{ct}); # start tag
1829 redo A;
1830 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1831 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1832 if ($self->{ct}->{attributes}) {
1833
1834 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1835 } else {
1836 ## NOTE: This state should never be reached.
1837
1838 }
1839
1840 $self->{state} = DATA_STATE;
1841 $self->{s_kwd} = '';
1842 ## reconsume
1843 return ($self->{ct}); # end tag
1844 redo A;
1845 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1846 ## XML5: No parse error above; not defined yet.
1847 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1848 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1849 ## Reconsume.
1850 return ($self->{ct}); # ATTLIST
1851 redo A;
1852 } else {
1853 die "$0: $self->{ct}->{type}: Unknown token type";
1854 }
1855 } else {
1856 ## XML5 [ATTLIST]: Not defined yet.
1857 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1858
1859 ## XML5: Not a parse error.
1860 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1861 } else {
1862
1863 }
1864 $self->{ca}->{value} .= chr ($self->{nc});
1865 $self->{read_until}->($self->{ca}->{value},
1866 q["&<],
1867 length $self->{ca}->{value});
1868
1869 ## Stay in the state
1870
1871 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1872 $self->{line_prev} = $self->{line};
1873 $self->{column_prev} = $self->{column};
1874 $self->{column}++;
1875 $self->{nc}
1876 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1877 } else {
1878 $self->{set_nc}->($self);
1879 }
1880
1881 redo A;
1882 }
1883 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1884 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1885 ## ATTLIST attribute value single quoted state".
1886
1887 if ($self->{nc} == 0x0027) { # '
1888 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1889
1890 ## XML5: "DOCTYPE ATTLIST name after state".
1891 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1892 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1893 } else {
1894
1895 ## XML5: "Before attribute name state" (sic).
1896 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1897 }
1898
1899 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1900 $self->{line_prev} = $self->{line};
1901 $self->{column_prev} = $self->{column};
1902 $self->{column}++;
1903 $self->{nc}
1904 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1905 } else {
1906 $self->{set_nc}->($self);
1907 }
1908
1909 redo A;
1910 } elsif ($self->{nc} == 0x0026) { # &
1911
1912 ## XML5: Not defined yet.
1913
1914 ## NOTE: In the spec, the tokenizer is switched to the
1915 ## "entity in attribute value state". In this implementation, the
1916 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1917 ## implementation of the "consume a character reference" algorithm.
1918 $self->{entity_add} = 0x0027; # '
1919 $self->{prev_state} = $self->{state};
1920 $self->{state} = ENTITY_STATE;
1921
1922 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1923 $self->{line_prev} = $self->{line};
1924 $self->{column_prev} = $self->{column};
1925 $self->{column}++;
1926 $self->{nc}
1927 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1928 } else {
1929 $self->{set_nc}->($self);
1930 }
1931
1932 redo A;
1933 } elsif ($self->{nc} == -1) {
1934 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1935 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1936
1937 $self->{last_stag_name} = $self->{ct}->{tag_name};
1938
1939 $self->{state} = DATA_STATE;
1940 $self->{s_kwd} = '';
1941 ## reconsume
1942 return ($self->{ct}); # start tag
1943 redo A;
1944 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1945 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1946 if ($self->{ct}->{attributes}) {
1947
1948 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1949 } else {
1950 ## NOTE: This state should never be reached.
1951
1952 }
1953
1954 $self->{state} = DATA_STATE;
1955 $self->{s_kwd} = '';
1956 ## reconsume
1957 return ($self->{ct}); # end tag
1958 redo A;
1959 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1960 ## XML5: No parse error above; not defined yet.
1961 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1962 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1963 ## Reconsume.
1964 return ($self->{ct}); # ATTLIST
1965 redo A;
1966 } else {
1967 die "$0: $self->{ct}->{type}: Unknown token type";
1968 }
1969 } else {
1970 ## XML5 [ATTLIST]: Not defined yet.
1971 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1972
1973 ## XML5: Not a parse error.
1974 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1975 } else {
1976
1977 }
1978 $self->{ca}->{value} .= chr ($self->{nc});
1979 $self->{read_until}->($self->{ca}->{value},
1980 q['&<],
1981 length $self->{ca}->{value});
1982
1983 ## Stay in the state
1984
1985 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1986 $self->{line_prev} = $self->{line};
1987 $self->{column_prev} = $self->{column};
1988 $self->{column}++;
1989 $self->{nc}
1990 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1991 } else {
1992 $self->{set_nc}->($self);
1993 }
1994
1995 redo A;
1996 }
1997 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1998 ## XML5: "Tag attribute value unquoted state".
1999
2000 if ($is_space->{$self->{nc}}) {
2001 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2002
2003 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2004 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2005 } else {
2006
2007 ## XML5: "Tag attribute name before state".
2008 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2009 }
2010
2011 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2012 $self->{line_prev} = $self->{line};
2013 $self->{column_prev} = $self->{column};
2014 $self->{column}++;
2015 $self->{nc}
2016 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2017 } else {
2018 $self->{set_nc}->($self);
2019 }
2020
2021 redo A;
2022 } elsif ($self->{nc} == 0x0026) { # &
2023
2024
2025 ## XML5: Not defined yet.
2026
2027 ## NOTE: In the spec, the tokenizer is switched to the
2028 ## "entity in attribute value state". In this implementation, the
2029 ## tokenizer is switched to the |ENTITY_STATE|, which is an
2030 ## implementation of the "consume a character reference" algorithm.
2031 $self->{entity_add} = -1;
2032 $self->{prev_state} = $self->{state};
2033 $self->{state} = ENTITY_STATE;
2034
2035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2036 $self->{line_prev} = $self->{line};
2037 $self->{column_prev} = $self->{column};
2038 $self->{column}++;
2039 $self->{nc}
2040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2041 } else {
2042 $self->{set_nc}->($self);
2043 }
2044
2045 redo A;
2046 } elsif ($self->{nc} == 0x003E) { # >
2047 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2048
2049 $self->{last_stag_name} = $self->{ct}->{tag_name};
2050
2051 $self->{state} = DATA_STATE;
2052 $self->{s_kwd} = '';
2053
2054 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055 $self->{line_prev} = $self->{line};
2056 $self->{column_prev} = $self->{column};
2057 $self->{column}++;
2058 $self->{nc}
2059 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2060 } else {
2061 $self->{set_nc}->($self);
2062 }
2063
2064 return ($self->{ct}); # start tag
2065 redo A;
2066 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2067 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2068 if ($self->{ct}->{attributes}) {
2069
2070 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2071 } else {
2072 ## NOTE: This state should never be reached.
2073
2074 }
2075
2076 $self->{state} = DATA_STATE;
2077 $self->{s_kwd} = '';
2078
2079 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2080 $self->{line_prev} = $self->{line};
2081 $self->{column_prev} = $self->{column};
2082 $self->{column}++;
2083 $self->{nc}
2084 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2085 } else {
2086 $self->{set_nc}->($self);
2087 }
2088
2089 return ($self->{ct}); # end tag
2090 redo A;
2091 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2092 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2093 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2094
2095 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2096 $self->{line_prev} = $self->{line};
2097 $self->{column_prev} = $self->{column};
2098 $self->{column}++;
2099 $self->{nc}
2100 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2101 } else {
2102 $self->{set_nc}->($self);
2103 }
2104
2105 return ($self->{ct}); # ATTLIST
2106 redo A;
2107 } else {
2108 die "$0: $self->{ct}->{type}: Unknown token type";
2109 }
2110 } elsif ($self->{nc} == -1) {
2111 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2112
2113 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2114 $self->{last_stag_name} = $self->{ct}->{tag_name};
2115
2116 $self->{state} = DATA_STATE;
2117 $self->{s_kwd} = '';
2118 ## reconsume
2119 return ($self->{ct}); # start tag
2120 redo A;
2121 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2122 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2123 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2124 if ($self->{ct}->{attributes}) {
2125
2126 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2127 } else {
2128 ## NOTE: This state should never be reached.
2129
2130 }
2131
2132 $self->{state} = DATA_STATE;
2133 $self->{s_kwd} = '';
2134 ## reconsume
2135 return ($self->{ct}); # end tag
2136 redo A;
2137 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2138 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2139 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2140 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2141 ## Reconsume.
2142 return ($self->{ct}); # ATTLIST
2143 redo A;
2144 } else {
2145 die "$0: $self->{ct}->{type}: Unknown token type";
2146 }
2147 } else {
2148 if ({
2149 0x0022 => 1, # "
2150 0x0027 => 1, # '
2151 0x003D => 1, # =
2152 }->{$self->{nc}}) {
2153
2154 ## XML5: Not a parse error.
2155 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2156 } else {
2157
2158 }
2159 $self->{ca}->{value} .= chr ($self->{nc});
2160 $self->{read_until}->($self->{ca}->{value},
2161 q["'=& >],
2162 length $self->{ca}->{value});
2163
2164 ## Stay in the state
2165
2166 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2167 $self->{line_prev} = $self->{line};
2168 $self->{column_prev} = $self->{column};
2169 $self->{column}++;
2170 $self->{nc}
2171 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2172 } else {
2173 $self->{set_nc}->($self);
2174 }
2175
2176 redo A;
2177 }
2178 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2179 if ($is_space->{$self->{nc}}) {
2180
2181 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2182
2183 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2184 $self->{line_prev} = $self->{line};
2185 $self->{column_prev} = $self->{column};
2186 $self->{column}++;
2187 $self->{nc}
2188 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2189 } else {
2190 $self->{set_nc}->($self);
2191 }
2192
2193 redo A;
2194 } elsif ($self->{nc} == 0x003E) { # >
2195 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2196
2197 $self->{last_stag_name} = $self->{ct}->{tag_name};
2198 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2199 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2200 if ($self->{ct}->{attributes}) {
2201
2202 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2203 } else {
2204 ## NOTE: This state should never be reached.
2205
2206 }
2207 } else {
2208 die "$0: $self->{ct}->{type}: Unknown token type";
2209 }
2210 $self->{state} = DATA_STATE;
2211 $self->{s_kwd} = '';
2212
2213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2214 $self->{line_prev} = $self->{line};
2215 $self->{column_prev} = $self->{column};
2216 $self->{column}++;
2217 $self->{nc}
2218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2219 } else {
2220 $self->{set_nc}->($self);
2221 }
2222
2223
2224 return ($self->{ct}); # start tag or end tag
2225
2226 redo A;
2227 } elsif ($self->{nc} == 0x002F) { # /
2228
2229 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2230
2231 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2232 $self->{line_prev} = $self->{line};
2233 $self->{column_prev} = $self->{column};
2234 $self->{column}++;
2235 $self->{nc}
2236 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2237 } else {
2238 $self->{set_nc}->($self);
2239 }
2240
2241 redo A;
2242 } elsif ($self->{nc} == -1) {
2243 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2244 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2245
2246 $self->{last_stag_name} = $self->{ct}->{tag_name};
2247 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2248 if ($self->{ct}->{attributes}) {
2249
2250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2251 } else {
2252 ## NOTE: This state should never be reached.
2253
2254 }
2255 } else {
2256 die "$0: $self->{ct}->{type}: Unknown token type";
2257 }
2258 $self->{state} = DATA_STATE;
2259 $self->{s_kwd} = '';
2260 ## Reconsume.
2261 return ($self->{ct}); # start tag or end tag
2262 redo A;
2263 } else {
2264
2265 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2266 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2267 ## reconsume
2268 redo A;
2269 }
2270 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2271 ## XML5: "Empty tag state".
2272
2273 if ($self->{nc} == 0x003E) { # >
2274 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2275
2276 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2277 ## TODO: Different type than slash in start tag
2278 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2279 if ($self->{ct}->{attributes}) {
2280
2281 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2282 } else {
2283
2284 }
2285 ## TODO: Test |<title></title/>|
2286 } else {
2287
2288 $self->{self_closing} = 1;
2289 }
2290
2291 $self->{state} = DATA_STATE;
2292 $self->{s_kwd} = '';
2293
2294 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2295 $self->{line_prev} = $self->{line};
2296 $self->{column_prev} = $self->{column};
2297 $self->{column}++;
2298 $self->{nc}
2299 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2300 } else {
2301 $self->{set_nc}->($self);
2302 }
2303
2304
2305 return ($self->{ct}); # start tag or end tag
2306
2307 redo A;
2308 } elsif ($self->{nc} == -1) {
2309 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2310 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2311
2312 $self->{last_stag_name} = $self->{ct}->{tag_name};
2313 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2314 if ($self->{ct}->{attributes}) {
2315
2316 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2317 } else {
2318 ## NOTE: This state should never be reached.
2319
2320 }
2321 } else {
2322 die "$0: $self->{ct}->{type}: Unknown token type";
2323 }
2324 ## XML5: "Tag attribute name before state".
2325 $self->{state} = DATA_STATE;
2326 $self->{s_kwd} = '';
2327 ## Reconsume.
2328 return ($self->{ct}); # start tag or end tag
2329 redo A;
2330 } else {
2331
2332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2333 ## TODO: This error type is wrong.
2334 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2335 ## Reconsume.
2336 redo A;
2337 }
2338 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2339 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2340
2341 ## NOTE: Unlike spec's "bogus comment state", this implementation
2342 ## consumes characters on a one-by-one basis.
2343
2344 if ($self->{nc} == 0x003E) { # >
2345 if ($self->{in_subset}) {
2346
2347 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2348 } else {
2349
2350 $self->{state} = DATA_STATE;
2351 $self->{s_kwd} = '';
2352 }
2353
2354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2355 $self->{line_prev} = $self->{line};
2356 $self->{column_prev} = $self->{column};
2357 $self->{column}++;
2358 $self->{nc}
2359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2360 } else {
2361 $self->{set_nc}->($self);
2362 }
2363
2364
2365 return ($self->{ct}); # comment
2366 redo A;
2367 } elsif ($self->{nc} == -1) {
2368 if ($self->{in_subset}) {
2369
2370 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2371 } else {
2372
2373 $self->{state} = DATA_STATE;
2374 $self->{s_kwd} = '';
2375 }
2376 ## reconsume
2377
2378 return ($self->{ct}); # comment
2379 redo A;
2380 } else {
2381
2382 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2383 $self->{read_until}->($self->{ct}->{data},
2384 q[>],
2385 length $self->{ct}->{data});
2386
2387 ## Stay in the state.
2388
2389 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390 $self->{line_prev} = $self->{line};
2391 $self->{column_prev} = $self->{column};
2392 $self->{column}++;
2393 $self->{nc}
2394 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395 } else {
2396 $self->{set_nc}->($self);
2397 }
2398
2399 redo A;
2400 }
2401 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2402 ## XML5: "Markup declaration state".
2403
2404 if ($self->{nc} == 0x002D) { # -
2405
2406 $self->{state} = MD_HYPHEN_STATE;
2407
2408 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2409 $self->{line_prev} = $self->{line};
2410 $self->{column_prev} = $self->{column};
2411 $self->{column}++;
2412 $self->{nc}
2413 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2414 } else {
2415 $self->{set_nc}->($self);
2416 }
2417
2418 redo A;
2419 } elsif ($self->{nc} == 0x0044 or # D
2420 $self->{nc} == 0x0064) { # d
2421 ## ASCII case-insensitive.
2422
2423 $self->{state} = MD_DOCTYPE_STATE;
2424 $self->{kwd} = chr $self->{nc};
2425
2426 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2427 $self->{line_prev} = $self->{line};
2428 $self->{column_prev} = $self->{column};
2429 $self->{column}++;
2430 $self->{nc}
2431 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2432 } else {
2433 $self->{set_nc}->($self);
2434 }
2435
2436 redo A;
2437 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2438 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2439 $self->{is_xml}) and
2440 $self->{nc} == 0x005B) { # [
2441
2442 $self->{state} = MD_CDATA_STATE;
2443 $self->{kwd} = '[';
2444
2445 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2446 $self->{line_prev} = $self->{line};
2447 $self->{column_prev} = $self->{column};
2448 $self->{column}++;
2449 $self->{nc}
2450 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2451 } else {
2452 $self->{set_nc}->($self);
2453 }
2454
2455 redo A;
2456 } else {
2457
2458 }
2459
2460 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2461 line => $self->{line_prev},
2462 column => $self->{column_prev} - 1);
2463 ## Reconsume.
2464 $self->{state} = BOGUS_COMMENT_STATE;
2465 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2466 line => $self->{line_prev},
2467 column => $self->{column_prev} - 1,
2468 };
2469 redo A;
2470 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2471 if ($self->{nc} == 0x002D) { # -
2472
2473 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2474 line => $self->{line_prev},
2475 column => $self->{column_prev} - 2,
2476 };
2477 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2478
2479 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480 $self->{line_prev} = $self->{line};
2481 $self->{column_prev} = $self->{column};
2482 $self->{column}++;
2483 $self->{nc}
2484 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485 } else {
2486 $self->{set_nc}->($self);
2487 }
2488
2489 redo A;
2490 } else {
2491
2492 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2493 line => $self->{line_prev},
2494 column => $self->{column_prev} - 2);
2495 $self->{state} = BOGUS_COMMENT_STATE;
2496 ## Reconsume.
2497 $self->{ct} = {type => COMMENT_TOKEN,
2498 data => '-',
2499 line => $self->{line_prev},
2500 column => $self->{column_prev} - 2,
2501 };
2502 redo A;
2503 }
2504 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2505 ## ASCII case-insensitive.
2506 if ($self->{nc} == [
2507 undef,
2508 0x004F, # O
2509 0x0043, # C
2510 0x0054, # T
2511 0x0059, # Y
2512 0x0050, # P
2513 ]->[length $self->{kwd}] or
2514 $self->{nc} == [
2515 undef,
2516 0x006F, # o
2517 0x0063, # c
2518 0x0074, # t
2519 0x0079, # y
2520 0x0070, # p
2521 ]->[length $self->{kwd}]) {
2522
2523 ## Stay in the state.
2524 $self->{kwd} .= chr $self->{nc};
2525
2526 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2527 $self->{line_prev} = $self->{line};
2528 $self->{column_prev} = $self->{column};
2529 $self->{column}++;
2530 $self->{nc}
2531 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2532 } else {
2533 $self->{set_nc}->($self);
2534 }
2535
2536 redo A;
2537 } elsif ((length $self->{kwd}) == 6 and
2538 ($self->{nc} == 0x0045 or # E
2539 $self->{nc} == 0x0065)) { # e
2540 if ($self->{is_xml} and
2541 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2542
2543 ## XML5: case-sensitive.
2544 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2545 text => 'DOCTYPE',
2546 line => $self->{line_prev},
2547 column => $self->{column_prev} - 5);
2548 } else {
2549
2550 }
2551 $self->{state} = DOCTYPE_STATE;
2552 $self->{ct} = {type => DOCTYPE_TOKEN,
2553 quirks => 1,
2554 line => $self->{line_prev},
2555 column => $self->{column_prev} - 7,
2556 };
2557
2558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559 $self->{line_prev} = $self->{line};
2560 $self->{column_prev} = $self->{column};
2561 $self->{column}++;
2562 $self->{nc}
2563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564 } else {
2565 $self->{set_nc}->($self);
2566 }
2567
2568 redo A;
2569 } else {
2570
2571 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2572 line => $self->{line_prev},
2573 column => $self->{column_prev} - 1 - length $self->{kwd});
2574 $self->{state} = BOGUS_COMMENT_STATE;
2575 ## Reconsume.
2576 $self->{ct} = {type => COMMENT_TOKEN,
2577 data => $self->{kwd},
2578 line => $self->{line_prev},
2579 column => $self->{column_prev} - 1 - length $self->{kwd},
2580 };
2581 redo A;
2582 }
2583 } elsif ($self->{state} == MD_CDATA_STATE) {
2584 if ($self->{nc} == {
2585 '[' => 0x0043, # C
2586 '[C' => 0x0044, # D
2587 '[CD' => 0x0041, # A
2588 '[CDA' => 0x0054, # T
2589 '[CDAT' => 0x0041, # A
2590 }->{$self->{kwd}}) {
2591
2592 ## Stay in the state.
2593 $self->{kwd} .= chr $self->{nc};
2594
2595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2596 $self->{line_prev} = $self->{line};
2597 $self->{column_prev} = $self->{column};
2598 $self->{column}++;
2599 $self->{nc}
2600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2601 } else {
2602 $self->{set_nc}->($self);
2603 }
2604
2605 redo A;
2606 } elsif ($self->{kwd} eq '[CDATA' and
2607 $self->{nc} == 0x005B) { # [
2608 if ($self->{is_xml} and
2609 not $self->{tainted} and
2610 @{$self->{open_elements} or []} == 0) {
2611
2612 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2613 line => $self->{line_prev},
2614 column => $self->{column_prev} - 7);
2615 $self->{tainted} = 1;
2616 } else {
2617
2618 }
2619
2620 $self->{ct} = {type => CHARACTER_TOKEN,
2621 data => '',
2622 line => $self->{line_prev},
2623 column => $self->{column_prev} - 7};
2624 $self->{state} = CDATA_SECTION_STATE;
2625
2626 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2627 $self->{line_prev} = $self->{line};
2628 $self->{column_prev} = $self->{column};
2629 $self->{column}++;
2630 $self->{nc}
2631 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2632 } else {
2633 $self->{set_nc}->($self);
2634 }
2635
2636 redo A;
2637 } else {
2638
2639 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2640 line => $self->{line_prev},
2641 column => $self->{column_prev} - 1 - length $self->{kwd});
2642 $self->{state} = BOGUS_COMMENT_STATE;
2643 ## Reconsume.
2644 $self->{ct} = {type => COMMENT_TOKEN,
2645 data => $self->{kwd},
2646 line => $self->{line_prev},
2647 column => $self->{column_prev} - 1 - length $self->{kwd},
2648 };
2649 redo A;
2650 }
2651 } elsif ($self->{state} == COMMENT_START_STATE) {
2652 if ($self->{nc} == 0x002D) { # -
2653
2654 $self->{state} = COMMENT_START_DASH_STATE;
2655
2656 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2657 $self->{line_prev} = $self->{line};
2658 $self->{column_prev} = $self->{column};
2659 $self->{column}++;
2660 $self->{nc}
2661 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2662 } else {
2663 $self->{set_nc}->($self);
2664 }
2665
2666 redo A;
2667 } elsif ($self->{nc} == 0x003E) { # >
2668 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2669 if ($self->{in_subset}) {
2670
2671 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2672 } else {
2673
2674 $self->{state} = DATA_STATE;
2675 $self->{s_kwd} = '';
2676 }
2677
2678 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2679 $self->{line_prev} = $self->{line};
2680 $self->{column_prev} = $self->{column};
2681 $self->{column}++;
2682 $self->{nc}
2683 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2684 } else {
2685 $self->{set_nc}->($self);
2686 }
2687
2688
2689 return ($self->{ct}); # comment
2690
2691 redo A;
2692 } elsif ($self->{nc} == -1) {
2693 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2694 if ($self->{in_subset}) {
2695
2696 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2697 } else {
2698
2699 $self->{state} = DATA_STATE;
2700 $self->{s_kwd} = '';
2701 }
2702 ## reconsume
2703
2704 return ($self->{ct}); # comment
2705
2706 redo A;
2707 } else {
2708
2709 $self->{ct}->{data} # comment
2710 .= chr ($self->{nc});
2711 $self->{state} = COMMENT_STATE;
2712
2713 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714 $self->{line_prev} = $self->{line};
2715 $self->{column_prev} = $self->{column};
2716 $self->{column}++;
2717 $self->{nc}
2718 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2719 } else {
2720 $self->{set_nc}->($self);
2721 }
2722
2723 redo A;
2724 }
2725 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2726 if ($self->{nc} == 0x002D) { # -
2727
2728 $self->{state} = COMMENT_END_STATE;
2729
2730 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2731 $self->{line_prev} = $self->{line};
2732 $self->{column_prev} = $self->{column};
2733 $self->{column}++;
2734 $self->{nc}
2735 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2736 } else {
2737 $self->{set_nc}->($self);
2738 }
2739
2740 redo A;
2741 } elsif ($self->{nc} == 0x003E) { # >
2742 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2743 if ($self->{in_subset}) {
2744
2745 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2746 } else {
2747
2748 $self->{state} = DATA_STATE;
2749 $self->{s_kwd} = '';
2750 }
2751
2752 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2753 $self->{line_prev} = $self->{line};
2754 $self->{column_prev} = $self->{column};
2755 $self->{column}++;
2756 $self->{nc}
2757 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2758 } else {
2759 $self->{set_nc}->($self);
2760 }
2761
2762
2763 return ($self->{ct}); # comment
2764
2765 redo A;
2766 } elsif ($self->{nc} == -1) {
2767 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2768 if ($self->{in_subset}) {
2769
2770 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2771 } else {
2772
2773 $self->{state} = DATA_STATE;
2774 $self->{s_kwd} = '';
2775 }
2776 ## reconsume
2777
2778 return ($self->{ct}); # comment
2779
2780 redo A;
2781 } else {
2782
2783 $self->{ct}->{data} # comment
2784 .= '-' . chr ($self->{nc});
2785 $self->{state} = COMMENT_STATE;
2786
2787 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788 $self->{line_prev} = $self->{line};
2789 $self->{column_prev} = $self->{column};
2790 $self->{column}++;
2791 $self->{nc}
2792 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793 } else {
2794 $self->{set_nc}->($self);
2795 }
2796
2797 redo A;
2798 }
2799 } elsif ($self->{state} == COMMENT_STATE) {
2800 ## XML5: "Comment state" and "DOCTYPE comment state".
2801
2802 if ($self->{nc} == 0x002D) { # -
2803
2804 $self->{state} = COMMENT_END_DASH_STATE;
2805
2806 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2807 $self->{line_prev} = $self->{line};
2808 $self->{column_prev} = $self->{column};
2809 $self->{column}++;
2810 $self->{nc}
2811 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2812 } else {
2813 $self->{set_nc}->($self);
2814 }
2815
2816 redo A;
2817 } elsif ($self->{nc} == -1) {
2818 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2819 if ($self->{in_subset}) {
2820
2821 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822 } else {
2823
2824 $self->{state} = DATA_STATE;
2825 $self->{s_kwd} = '';
2826 }
2827 ## reconsume
2828
2829 return ($self->{ct}); # comment
2830
2831 redo A;
2832 } else {
2833
2834 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2835 $self->{read_until}->($self->{ct}->{data},
2836 q[-],
2837 length $self->{ct}->{data});
2838
2839 ## Stay in the state
2840
2841 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2842 $self->{line_prev} = $self->{line};
2843 $self->{column_prev} = $self->{column};
2844 $self->{column}++;
2845 $self->{nc}
2846 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2847 } else {
2848 $self->{set_nc}->($self);
2849 }
2850
2851 redo A;
2852 }
2853 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2854 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2855
2856 if ($self->{nc} == 0x002D) { # -
2857
2858 $self->{state} = COMMENT_END_STATE;
2859
2860 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2861 $self->{line_prev} = $self->{line};
2862 $self->{column_prev} = $self->{column};
2863 $self->{column}++;
2864 $self->{nc}
2865 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2866 } else {
2867 $self->{set_nc}->($self);
2868 }
2869
2870 redo A;
2871 } elsif ($self->{nc} == -1) {
2872 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2873 if ($self->{in_subset}) {
2874
2875 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2876 } else {
2877
2878 $self->{state} = DATA_STATE;
2879 $self->{s_kwd} = '';
2880 }
2881 ## reconsume
2882
2883 return ($self->{ct}); # comment
2884
2885 redo A;
2886 } else {
2887
2888 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2889 $self->{state} = COMMENT_STATE;
2890
2891 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2892 $self->{line_prev} = $self->{line};
2893 $self->{column_prev} = $self->{column};
2894 $self->{column}++;
2895 $self->{nc}
2896 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2897 } else {
2898 $self->{set_nc}->($self);
2899 }
2900
2901 redo A;
2902 }
2903 } elsif ($self->{state} == COMMENT_END_STATE) {
2904 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2905
2906 if ($self->{nc} == 0x003E) { # >
2907 if ($self->{in_subset}) {
2908
2909 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910 } else {
2911
2912 $self->{state} = DATA_STATE;
2913 $self->{s_kwd} = '';
2914 }
2915
2916 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2917 $self->{line_prev} = $self->{line};
2918 $self->{column_prev} = $self->{column};
2919 $self->{column}++;
2920 $self->{nc}
2921 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2922 } else {
2923 $self->{set_nc}->($self);
2924 }
2925
2926
2927 return ($self->{ct}); # comment
2928
2929 redo A;
2930 } elsif ($self->{nc} == 0x002D) { # -
2931
2932 ## XML5: Not a parse error.
2933 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2934 line => $self->{line_prev},
2935 column => $self->{column_prev});
2936 $self->{ct}->{data} .= '-'; # comment
2937 ## Stay in the state
2938
2939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2940 $self->{line_prev} = $self->{line};
2941 $self->{column_prev} = $self->{column};
2942 $self->{column}++;
2943 $self->{nc}
2944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2945 } else {
2946 $self->{set_nc}->($self);
2947 }
2948
2949 redo A;
2950 } elsif ($self->{nc} == -1) {
2951 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2952 if ($self->{in_subset}) {
2953
2954 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955 } else {
2956
2957 $self->{state} = DATA_STATE;
2958 $self->{s_kwd} = '';
2959 }
2960 ## reconsume
2961
2962 return ($self->{ct}); # comment
2963
2964 redo A;
2965 } else {
2966
2967 ## XML5: Not a parse error.
2968 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969 line => $self->{line_prev},
2970 column => $self->{column_prev});
2971 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2972 $self->{state} = COMMENT_STATE;
2973
2974 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975 $self->{line_prev} = $self->{line};
2976 $self->{column_prev} = $self->{column};
2977 $self->{column}++;
2978 $self->{nc}
2979 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980 } else {
2981 $self->{set_nc}->($self);
2982 }
2983
2984 redo A;
2985 }
2986 } elsif ($self->{state} == DOCTYPE_STATE) {
2987 if ($is_space->{$self->{nc}}) {
2988
2989 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2990
2991 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992 $self->{line_prev} = $self->{line};
2993 $self->{column_prev} = $self->{column};
2994 $self->{column}++;
2995 $self->{nc}
2996 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997 } else {
2998 $self->{set_nc}->($self);
2999 }
3000
3001 redo A;
3002 } else {
3003
3004 ## XML5: Unless EOF, switch to the bogus comment state.
3005 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3006 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3007 ## reconsume
3008 redo A;
3009 }
3010 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3011 ## XML5: "DOCTYPE root name before state".
3012
3013 if ($is_space->{$self->{nc}}) {
3014
3015 ## Stay in the state
3016
3017 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3018 $self->{line_prev} = $self->{line};
3019 $self->{column_prev} = $self->{column};
3020 $self->{column}++;
3021 $self->{nc}
3022 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3023 } else {
3024 $self->{set_nc}->($self);
3025 }
3026
3027 redo A;
3028 } elsif ($self->{nc} == 0x003E) { # >
3029
3030 ## XML5: No parse error.
3031 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3032 $self->{state} = DATA_STATE;
3033 $self->{s_kwd} = '';
3034
3035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3036 $self->{line_prev} = $self->{line};
3037 $self->{column_prev} = $self->{column};
3038 $self->{column}++;
3039 $self->{nc}
3040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3041 } else {
3042 $self->{set_nc}->($self);
3043 }
3044
3045
3046 return ($self->{ct}); # DOCTYPE (quirks)
3047
3048 redo A;
3049 } elsif ($self->{nc} == -1) {
3050
3051 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3052 $self->{state} = DATA_STATE;
3053 $self->{s_kwd} = '';
3054 ## reconsume
3055
3056 return ($self->{ct}); # DOCTYPE (quirks)
3057
3058 redo A;
3059 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3060
3061 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3062 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3063 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3064 $self->{in_subset} = 1;
3065
3066 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3067 $self->{line_prev} = $self->{line};
3068 $self->{column_prev} = $self->{column};
3069 $self->{column}++;
3070 $self->{nc}
3071 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3072 } else {
3073 $self->{set_nc}->($self);
3074 }
3075
3076 return ($self->{ct}); # DOCTYPE
3077 redo A;
3078 } else {
3079
3080 $self->{ct}->{name} = chr $self->{nc};
3081 delete $self->{ct}->{quirks};
3082 $self->{state} = DOCTYPE_NAME_STATE;
3083
3084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085 $self->{line_prev} = $self->{line};
3086 $self->{column_prev} = $self->{column};
3087 $self->{column}++;
3088 $self->{nc}
3089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3090 } else {
3091 $self->{set_nc}->($self);
3092 }
3093
3094 redo A;
3095 }
3096 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3097 ## XML5: "DOCTYPE root name state".
3098
3099 ## ISSUE: Redundant "First," in the spec.
3100
3101 if ($is_space->{$self->{nc}}) {
3102
3103 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3104
3105 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106 $self->{line_prev} = $self->{line};
3107 $self->{column_prev} = $self->{column};
3108 $self->{column}++;
3109 $self->{nc}
3110 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111 } else {
3112 $self->{set_nc}->($self);
3113 }
3114
3115 redo A;
3116 } elsif ($self->{nc} == 0x003E) { # >
3117
3118 $self->{state} = DATA_STATE;
3119 $self->{s_kwd} = '';
3120
3121 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122 $self->{line_prev} = $self->{line};
3123 $self->{column_prev} = $self->{column};
3124 $self->{column}++;
3125 $self->{nc}
3126 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127 } else {
3128 $self->{set_nc}->($self);
3129 }
3130
3131
3132 return ($self->{ct}); # DOCTYPE
3133
3134 redo A;
3135 } elsif ($self->{nc} == -1) {
3136
3137 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3138 $self->{state} = DATA_STATE;
3139 $self->{s_kwd} = '';
3140 ## reconsume
3141
3142 $self->{ct}->{quirks} = 1;
3143 return ($self->{ct}); # DOCTYPE
3144
3145 redo A;
3146 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3147
3148 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3149 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3150 $self->{in_subset} = 1;
3151
3152 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3153 $self->{line_prev} = $self->{line};
3154 $self->{column_prev} = $self->{column};
3155 $self->{column}++;
3156 $self->{nc}
3157 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3158 } else {
3159 $self->{set_nc}->($self);
3160 }
3161
3162 return ($self->{ct}); # DOCTYPE
3163 redo A;
3164 } else {
3165
3166 $self->{ct}->{name}
3167 .= chr ($self->{nc}); # DOCTYPE
3168 ## Stay in the state
3169
3170 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3171 $self->{line_prev} = $self->{line};
3172 $self->{column_prev} = $self->{column};
3173 $self->{column}++;
3174 $self->{nc}
3175 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3176 } else {
3177 $self->{set_nc}->($self);
3178 }
3179
3180 redo A;
3181 }
3182 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3183 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3184 ## state", but implemented differently.
3185
3186 if ($is_space->{$self->{nc}}) {
3187
3188 ## Stay in the state
3189
3190 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3191 $self->{line_prev} = $self->{line};
3192 $self->{column_prev} = $self->{column};
3193 $self->{column}++;
3194 $self->{nc}
3195 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3196 } else {
3197 $self->{set_nc}->($self);
3198 }
3199
3200 redo A;
3201 } elsif ($self->{nc} == 0x003E) { # >
3202 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3203
3204 $self->{state} = DATA_STATE;
3205 $self->{s_kwd} = '';
3206 } else {
3207
3208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3209 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3210 }
3211
3212
3213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3214 $self->{line_prev} = $self->{line};
3215 $self->{column_prev} = $self->{column};
3216 $self->{column}++;
3217 $self->{nc}
3218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3219 } else {
3220 $self->{set_nc}->($self);
3221 }
3222
3223 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3224 redo A;
3225 } elsif ($self->{nc} == -1) {
3226 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3227
3228 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3229 $self->{state} = DATA_STATE;
3230 $self->{s_kwd} = '';
3231 $self->{ct}->{quirks} = 1;
3232 } else {
3233
3234 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3235 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3236 }
3237
3238 ## Reconsume.
3239 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3240 redo A;
3241 } elsif ($self->{nc} == 0x0050 or # P
3242 $self->{nc} == 0x0070) { # p
3243
3244 $self->{state} = PUBLIC_STATE;
3245 $self->{kwd} = chr $self->{nc};
3246
3247 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3248 $self->{line_prev} = $self->{line};
3249 $self->{column_prev} = $self->{column};
3250 $self->{column}++;
3251 $self->{nc}
3252 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3253 } else {
3254 $self->{set_nc}->($self);
3255 }
3256
3257 redo A;
3258 } elsif ($self->{nc} == 0x0053 or # S
3259 $self->{nc} == 0x0073) { # s
3260
3261 $self->{state} = SYSTEM_STATE;
3262 $self->{kwd} = chr $self->{nc};
3263
3264 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265 $self->{line_prev} = $self->{line};
3266 $self->{column_prev} = $self->{column};
3267 $self->{column}++;
3268 $self->{nc}
3269 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270 } else {
3271 $self->{set_nc}->($self);
3272 }
3273
3274 redo A;
3275 } elsif ($self->{nc} == 0x0022 and # "
3276 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3277 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3278
3279 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3280 $self->{ct}->{value} = ''; # ENTITY
3281
3282 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283 $self->{line_prev} = $self->{line};
3284 $self->{column_prev} = $self->{column};
3285 $self->{column}++;
3286 $self->{nc}
3287 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288 } else {
3289 $self->{set_nc}->($self);
3290 }
3291
3292 redo A;
3293 } elsif ($self->{nc} == 0x0027 and # '
3294 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3295 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3296
3297 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3298 $self->{ct}->{value} = ''; # ENTITY
3299
3300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301 $self->{line_prev} = $self->{line};
3302 $self->{column_prev} = $self->{column};
3303 $self->{column}++;
3304 $self->{nc}
3305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306 } else {
3307 $self->{set_nc}->($self);
3308 }
3309
3310 redo A;
3311 } elsif ($self->{is_xml} and
3312 $self->{ct}->{type} == DOCTYPE_TOKEN and
3313 $self->{nc} == 0x005B) { # [
3314
3315 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3316 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3317 $self->{in_subset} = 1;
3318
3319 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3320 $self->{line_prev} = $self->{line};
3321 $self->{column_prev} = $self->{column};
3322 $self->{column}++;
3323 $self->{nc}
3324 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3325 } else {
3326 $self->{set_nc}->($self);
3327 }
3328
3329 return ($self->{ct}); # DOCTYPE
3330 redo A;
3331 } else {
3332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3333
3334 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3335
3336 $self->{ct}->{quirks} = 1;
3337 $self->{state} = BOGUS_DOCTYPE_STATE;
3338 } else {
3339
3340 $self->{state} = BOGUS_MD_STATE;
3341 }
3342
3343
3344 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3345 $self->{line_prev} = $self->{line};
3346 $self->{column_prev} = $self->{column};
3347 $self->{column}++;
3348 $self->{nc}
3349 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3350 } else {
3351 $self->{set_nc}->($self);
3352 }
3353
3354 redo A;
3355 }
3356 } elsif ($self->{state} == PUBLIC_STATE) {
3357 ## ASCII case-insensitive
3358 if ($self->{nc} == [
3359 undef,
3360 0x0055, # U
3361 0x0042, # B
3362 0x004C, # L
3363 0x0049, # I
3364 ]->[length $self->{kwd}] or
3365 $self->{nc} == [
3366 undef,
3367 0x0075, # u
3368 0x0062, # b
3369 0x006C, # l
3370 0x0069, # i
3371 ]->[length $self->{kwd}]) {
3372
3373 ## Stay in the state.
3374 $self->{kwd} .= chr $self->{nc};
3375
3376 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3377 $self->{line_prev} = $self->{line};
3378 $self->{column_prev} = $self->{column};
3379 $self->{column}++;
3380 $self->{nc}
3381 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3382 } else {
3383 $self->{set_nc}->($self);
3384 }
3385
3386 redo A;
3387 } elsif ((length $self->{kwd}) == 5 and
3388 ($self->{nc} == 0x0043 or # C
3389 $self->{nc} == 0x0063)) { # c
3390 if ($self->{is_xml} and
3391 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3392
3393 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3394 text => 'PUBLIC',
3395 line => $self->{line_prev},
3396 column => $self->{column_prev} - 4);
3397 } else {
3398
3399 }
3400 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3401
3402 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403 $self->{line_prev} = $self->{line};
3404 $self->{column_prev} = $self->{column};
3405 $self->{column}++;
3406 $self->{nc}
3407 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408 } else {
3409 $self->{set_nc}->($self);
3410 }
3411
3412 redo A;
3413 } else {
3414 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3415 line => $self->{line_prev},
3416 column => $self->{column_prev} + 1 - length $self->{kwd});
3417 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3418
3419 $self->{ct}->{quirks} = 1;
3420 $self->{state} = BOGUS_DOCTYPE_STATE;
3421 } else {
3422
3423 $self->{state} = BOGUS_MD_STATE;
3424 }
3425 ## Reconsume.
3426 redo A;
3427 }
3428 } elsif ($self->{state} == SYSTEM_STATE) {
3429 ## ASCII case-insensitive
3430 if ($self->{nc} == [
3431 undef,
3432 0x0059, # Y
3433 0x0053, # S
3434 0x0054, # T
3435 0x0045, # E
3436 ]->[length $self->{kwd}] or
3437 $self->{nc} == [
3438 undef,
3439 0x0079, # y
3440 0x0073, # s
3441 0x0074, # t
3442 0x0065, # e
3443 ]->[length $self->{kwd}]) {
3444
3445 ## Stay in the state.
3446 $self->{kwd} .= chr $self->{nc};
3447
3448 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3449 $self->{line_prev} = $self->{line};
3450 $self->{column_prev} = $self->{column};
3451 $self->{column}++;
3452 $self->{nc}
3453 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3454 } else {
3455 $self->{set_nc}->($self);
3456 }
3457
3458 redo A;
3459 } elsif ((length $self->{kwd}) == 5 and
3460 ($self->{nc} == 0x004D or # M
3461 $self->{nc} == 0x006D)) { # m
3462 if ($self->{is_xml} and
3463 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3464
3465 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3466 text => 'SYSTEM',
3467 line => $self->{line_prev},
3468 column => $self->{column_prev} - 4);
3469 } else {
3470
3471 }
3472 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3473
3474 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3475 $self->{line_prev} = $self->{line};
3476 $self->{column_prev} = $self->{column};
3477 $self->{column}++;
3478 $self->{nc}
3479 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3480 } else {
3481 $self->{set_nc}->($self);
3482 }
3483
3484 redo A;
3485 } else {
3486 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3487 line => $self->{line_prev},
3488 column => $self->{column_prev} + 1 - length $self->{kwd});
3489 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3490
3491 $self->{ct}->{quirks} = 1;
3492 $self->{state} = BOGUS_DOCTYPE_STATE;
3493 } else {
3494
3495 $self->{state} = BOGUS_MD_STATE;
3496 }
3497 ## Reconsume.
3498 redo A;
3499 }
3500 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3501 if ($is_space->{$self->{nc}}) {
3502
3503 ## Stay in the state
3504
3505 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3506 $self->{line_prev} = $self->{line};
3507 $self->{column_prev} = $self->{column};
3508 $self->{column}++;
3509 $self->{nc}
3510 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3511 } else {
3512 $self->{set_nc}->($self);
3513 }
3514
3515 redo A;
3516 } elsif ($self->{nc} eq 0x0022) { # "
3517
3518 $self->{ct}->{pubid} = ''; # DOCTYPE/ENTITY/NOTATION
3519 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3520
3521 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3522 $self->{line_prev} = $self->{line};
3523 $self->{column_prev} = $self->{column};
3524 $self->{column}++;
3525 $self->{nc}
3526 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3527 } else {
3528 $self->{set_nc}->($self);
3529 }
3530
3531 redo A;
3532 } elsif ($self->{nc} eq 0x0027) { # '
3533
3534 $self->{ct}->{pubid} = ''; # DOCTYPE/ENTITY/NOTATION
3535 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3536
3537 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3538 $self->{line_prev} = $self->{line};
3539 $self->{column_prev} = $self->{column};
3540 $self->{column}++;
3541 $self->{nc}
3542 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3543 } else {
3544 $self->{set_nc}->($self);
3545 }
3546
3547 redo A;
3548 } elsif ($self->{nc} eq 0x003E) { # >
3549 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550
3551 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3552
3553 $self->{state} = DATA_STATE;
3554 $self->{s_kwd} = '';
3555 $self->{ct}->{quirks} = 1;
3556 } else {
3557
3558 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3559 }
3560
3561
3562 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563 $self->{line_prev} = $self->{line};
3564 $self->{column_prev} = $self->{column};
3565 $self->{column}++;
3566 $self->{nc}
3567 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568 } else {
3569 $self->{set_nc}->($self);
3570 }
3571
3572 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3573 redo A;
3574 } elsif ($self->{nc} == -1) {
3575 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3576
3577 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3578 $self->{state} = DATA_STATE;
3579 $self->{s_kwd} = '';
3580 $self->{ct}->{quirks} = 1;
3581 } else {
3582
3583 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3584 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3585 }
3586
3587 ## reconsume
3588 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3589 redo A;
3590 } elsif ($self->{is_xml} and
3591 $self->{ct}->{type} == DOCTYPE_TOKEN and
3592 $self->{nc} == 0x005B) { # [
3593
3594 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3595 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3596 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3597 $self->{in_subset} = 1;
3598
3599 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3600 $self->{line_prev} = $self->{line};
3601 $self->{column_prev} = $self->{column};
3602 $self->{column}++;
3603 $self->{nc}
3604 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3605 } else {
3606 $self->{set_nc}->($self);
3607 }
3608
3609 return ($self->{ct}); # DOCTYPE
3610 redo A;
3611 } else {
3612 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3613
3614 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3615
3616 $self->{ct}->{quirks} = 1;
3617 $self->{state} = BOGUS_DOCTYPE_STATE;
3618 } else {
3619
3620 $self->{state} = BOGUS_MD_STATE;
3621 }
3622
3623
3624 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3625 $self->{line_prev} = $self->{line};
3626 $self->{column_prev} = $self->{column};
3627 $self->{column}++;
3628 $self->{nc}
3629 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3630 } else {
3631 $self->{set_nc}->($self);
3632 }
3633
3634 redo A;
3635 }
3636 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3637 if ($self->{nc} == 0x0022) { # "
3638
3639 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3640
3641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3642 $self->{line_prev} = $self->{line};
3643 $self->{column_prev} = $self->{column};
3644 $self->{column}++;
3645 $self->{nc}
3646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3647 } else {
3648 $self->{set_nc}->($self);
3649 }
3650
3651 redo A;
3652 } elsif ($self->{nc} == 0x003E) { # >
3653 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3654
3655 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3656
3657 $self->{state} = DATA_STATE;
3658 $self->{s_kwd} = '';
3659 $self->{ct}->{quirks} = 1;
3660 } else {
3661
3662 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3663 }
3664
3665
3666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3667 $self->{line_prev} = $self->{line};
3668 $self->{column_prev} = $self->{column};
3669 $self->{column}++;
3670 $self->{nc}
3671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3672 } else {
3673 $self->{set_nc}->($self);
3674 }
3675
3676 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3677 redo A;
3678 } elsif ($self->{nc} == -1) {
3679 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3680
3681 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3682
3683 $self->{state} = DATA_STATE;
3684 $self->{s_kwd} = '';
3685 $self->{ct}->{quirks} = 1;
3686 } else {
3687
3688 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3689 }
3690
3691 ## Reconsume.
3692 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3693 redo A;
3694 } else {
3695
3696 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3697 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3698 length $self->{ct}->{pubid});
3699
3700 ## Stay in the state
3701
3702 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3703 $self->{line_prev} = $self->{line};
3704 $self->{column_prev} = $self->{column};
3705 $self->{column}++;
3706 $self->{nc}
3707 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3708 } else {
3709 $self->{set_nc}->($self);
3710 }
3711
3712 redo A;
3713 }
3714 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3715 if ($self->{nc} == 0x0027) { # '
3716
3717 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3718
3719 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3720 $self->{line_prev} = $self->{line};
3721 $self->{column_prev} = $self->{column};
3722 $self->{column}++;
3723 $self->{nc}
3724 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3725 } else {
3726 $self->{set_nc}->($self);
3727 }
3728
3729 redo A;
3730 } elsif ($self->{nc} == 0x003E) { # >
3731 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3732
3733 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3734
3735 $self->{state} = DATA_STATE;
3736 $self->{s_kwd} = '';
3737 $self->{ct}->{quirks} = 1;
3738 } else {
3739
3740 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3741 }
3742
3743
3744 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745 $self->{line_prev} = $self->{line};
3746 $self->{column_prev} = $self->{column};
3747 $self->{column}++;
3748 $self->{nc}
3749 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750 } else {
3751 $self->{set_nc}->($self);
3752 }
3753
3754 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3755 redo A;
3756 } elsif ($self->{nc} == -1) {
3757 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3758
3759 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3760
3761 $self->{state} = DATA_STATE;
3762 $self->{s_kwd} = '';
3763 $self->{ct}->{quirks} = 1;
3764 } else {
3765
3766 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3767 }
3768
3769 ## reconsume
3770 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3771 redo A;
3772 } else {
3773
3774 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3775 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3776 length $self->{ct}->{pubid});
3777
3778 ## Stay in the state
3779
3780 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3781 $self->{line_prev} = $self->{line};
3782 $self->{column_prev} = $self->{column};
3783 $self->{column}++;
3784 $self->{nc}
3785 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3786 } else {
3787 $self->{set_nc}->($self);
3788 }
3789
3790 redo A;
3791 }
3792 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3793 if ($is_space->{$self->{nc}}) {
3794
3795 ## Stay in the state
3796
3797 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3798 $self->{line_prev} = $self->{line};
3799 $self->{column_prev} = $self->{column};
3800 $self->{column}++;
3801 $self->{nc}
3802 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3803 } else {
3804 $self->{set_nc}->($self);
3805 }
3806
3807 redo A;
3808 } elsif ($self->{nc} == 0x0022) { # "
3809
3810 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3811 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3812
3813 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3814 $self->{line_prev} = $self->{line};
3815 $self->{column_prev} = $self->{column};
3816 $self->{column}++;
3817 $self->{nc}
3818 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3819 } else {
3820 $self->{set_nc}->($self);
3821 }
3822
3823 redo A;
3824 } elsif ($self->{nc} == 0x0027) { # '
3825
3826 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3827 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3828
3829 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3830 $self->{line_prev} = $self->{line};
3831 $self->{column_prev} = $self->{column};
3832 $self->{column}++;
3833 $self->{nc}
3834 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3835 } else {
3836 $self->{set_nc}->($self);
3837 }
3838
3839 redo A;
3840 } elsif ($self->{nc} == 0x003E) { # >
3841 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3842 if ($self->{is_xml}) {
3843
3844 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3845 } else {
3846
3847 }
3848 $self->{state} = DATA_STATE;
3849 $self->{s_kwd} = '';
3850 } else {
3851 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3852
3853 } else {
3854
3855 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3856 }
3857 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3858 }
3859
3860
3861 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3862 $self->{line_prev} = $self->{line};
3863 $self->{column_prev} = $self->{column};
3864 $self->{column}++;
3865 $self->{nc}
3866 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3867 } else {
3868 $self->{set_nc}->($self);
3869 }
3870
3871 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3872 redo A;
3873 } elsif ($self->{nc} == -1) {
3874 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3875
3876 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3877
3878 $self->{state} = DATA_STATE;
3879 $self->{s_kwd} = '';
3880 $self->{ct}->{quirks} = 1;
3881 } else {
3882 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3883 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3884 }
3885
3886 ## reconsume
3887 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3888 redo A;
3889 } elsif ($self->{is_xml} and
3890 $self->{ct}->{type} == DOCTYPE_TOKEN and
3891 $self->{nc} == 0x005B) { # [
3892
3893 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3894 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3895 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3896 $self->{in_subset} = 1;
3897
3898 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3899 $self->{line_prev} = $self->{line};
3900 $self->{column_prev} = $self->{column};
3901 $self->{column}++;
3902 $self->{nc}
3903 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3904 } else {
3905 $self->{set_nc}->($self);
3906 }
3907
3908 return ($self->{ct}); # DOCTYPE
3909 redo A;
3910 } else {
3911 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3912
3913 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3914
3915 $self->{ct}->{quirks} = 1;
3916 $self->{state} = BOGUS_DOCTYPE_STATE;
3917 } else {
3918
3919 $self->{state} = BOGUS_MD_STATE;
3920 }
3921
3922
3923 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3924 $self->{line_prev} = $self->{line};
3925 $self->{column_prev} = $self->{column};
3926 $self->{column}++;
3927 $self->{nc}
3928 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3929 } else {
3930 $self->{set_nc}->($self);
3931 }
3932
3933 redo A;
3934 }
3935 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3936 if ($is_space->{$self->{nc}}) {
3937
3938 ## Stay in the state
3939
3940 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3941 $self->{line_prev} = $self->{line};
3942 $self->{column_prev} = $self->{column};
3943 $self->{column}++;
3944 $self->{nc}
3945 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3946 } else {
3947 $self->{set_nc}->($self);
3948 }
3949
3950 redo A;
3951 } elsif ($self->{nc} == 0x0022) { # "
3952
3953 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3954 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3955
3956 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3957 $self->{line_prev} = $self->{line};
3958 $self->{column_prev} = $self->{column};
3959 $self->{column}++;
3960 $self->{nc}
3961 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3962 } else {
3963 $self->{set_nc}->($self);
3964 }
3965
3966 redo A;
3967 } elsif ($self->{nc} == 0x0027) { # '
3968
3969 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3970 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3971
3972 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3973 $self->{line_prev} = $self->{line};
3974 $self->{column_prev} = $self->{column};
3975 $self->{column}++;
3976 $self->{nc}
3977 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3978 } else {
3979 $self->{set_nc}->($self);
3980 }
3981
3982 redo A;
3983 } elsif ($self->{nc} == 0x003E) { # >
3984 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3985
3986 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3987 $self->{line_prev} = $self->{line};
3988 $self->{column_prev} = $self->{column};
3989 $self->{column}++;
3990 $self->{nc}
3991 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3992 } else {
3993 $self->{set_nc}->($self);
3994 }
3995
3996
3997 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3998
3999 $self->{state} = DATA_STATE;
4000 $self->{s_kwd} = '';
4001 $self->{ct}->{quirks} = 1;
4002 } else {
4003
4004 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4005 }
4006
4007 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4008 redo A;
4009 } elsif ($self->{nc} == -1) {
4010 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4011
4012 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4013 $self->{state} = DATA_STATE;
4014 $self->{s_kwd} = '';
4015 $self->{ct}->{quirks} = 1;
4016 } else {
4017
4018 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4019 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4020 }
4021
4022 ## reconsume
4023 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4024 redo A;
4025 } elsif ($self->{is_xml} and
4026 $self->{ct}->{type} == DOCTYPE_TOKEN and
4027 $self->{nc} == 0x005B) { # [
4028
4029 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4030
4031 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4032 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4033 $self->{in_subset} = 1;
4034
4035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036 $self->{line_prev} = $self->{line};
4037 $self->{column_prev} = $self->{column};
4038 $self->{column}++;
4039 $self->{nc}
4040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041 } else {
4042 $self->{set_nc}->($self);
4043 }
4044
4045 return ($self->{ct}); # DOCTYPE
4046 redo A;
4047 } else {
4048 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4049
4050 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4051
4052 $self->{ct}->{quirks} = 1;
4053 $self->{state} = BOGUS_DOCTYPE_STATE;
4054 } else {
4055
4056 $self->{state} = BOGUS_MD_STATE;
4057 }
4058
4059
4060 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4061 $self->{line_prev} = $self->{line};
4062 $self->{column_prev} = $self->{column};
4063 $self->{column}++;
4064 $self->{nc}
4065 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4066 } else {
4067 $self->{set_nc}->($self);
4068 }
4069
4070 redo A;
4071 }
4072 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4073 if ($self->{nc} == 0x0022) { # "
4074
4075 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4076
4077 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4078 $self->{line_prev} = $self->{line};
4079 $self->{column_prev} = $self->{column};
4080 $self->{column}++;
4081 $self->{nc}
4082 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4083 } else {
4084 $self->{set_nc}->($self);
4085 }
4086
4087 redo A;
4088 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4089 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4090
4091 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4092
4093 $self->{state} = DATA_STATE;
4094 $self->{s_kwd} = '';
4095 $self->{ct}->{quirks} = 1;
4096 } else {
4097
4098 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4099 }
4100
4101
4102 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4103 $self->{line_prev} = $self->{line};
4104 $self->{column_prev} = $self->{column};
4105 $self->{column}++;
4106 $self->{nc}
4107 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4108 } else {
4109 $self->{set_nc}->($self);
4110 }
4111
4112 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4113 redo A;
4114 } elsif ($self->{nc} == -1) {
4115 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4116
4117 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4118
4119 $self->{state} = DATA_STATE;
4120 $self->{s_kwd} = '';
4121 $self->{ct}->{quirks} = 1;
4122 } else {
4123
4124 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4125 }
4126
4127 ## reconsume
4128 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4129 redo A;
4130 } else {
4131
4132 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4133 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4134 length $self->{ct}->{sysid});
4135
4136 ## Stay in the state
4137
4138 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4139 $self->{line_prev} = $self->{line};
4140 $self->{column_prev} = $self->{column};
4141 $self->{column}++;
4142 $self->{nc}
4143 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4144 } else {
4145 $self->{set_nc}->($self);
4146 }
4147
4148 redo A;
4149 }
4150 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4151 if ($self->{nc} == 0x0027) { # '
4152
4153 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4154
4155 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4156 $self->{line_prev} = $self->{line};
4157 $self->{column_prev} = $self->{column};
4158 $self->{column}++;
4159 $self->{nc}
4160 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4161 } else {
4162 $self->{set_nc}->($self);
4163 }
4164
4165 redo A;
4166 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4167
4168 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4169
4170 $self->{state} = DATA_STATE;
4171 $self->{s_kwd} = '';
4172
4173 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4174 $self->{line_prev} = $self->{line};
4175 $self->{column_prev} = $self->{column};
4176 $self->{column}++;
4177 $self->{nc}
4178 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4179 } else {
4180 $self->{set_nc}->($self);
4181 }
4182
4183
4184 $self->{ct}->{quirks} = 1;
4185 return ($self->{ct}); # DOCTYPE
4186
4187 redo A;
4188 } elsif ($self->{nc} == -1) {
4189 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4190
4191 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4192
4193 $self->{state} = DATA_STATE;
4194 $self->{s_kwd} = '';
4195 $self->{ct}->{quirks} = 1;
4196 } else {
4197
4198 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4199 }
4200
4201 ## reconsume
4202 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4203 redo A;
4204 } else {
4205
4206 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4207 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4208 length $self->{ct}->{sysid});
4209
4210 ## Stay in the state
4211
4212 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4213 $self->{line_prev} = $self->{line};
4214 $self->{column_prev} = $self->{column};
4215 $self->{column}++;
4216 $self->{nc}
4217 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4218 } else {
4219 $self->{set_nc}->($self);
4220 }
4221
4222 redo A;
4223 }
4224 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4225 if ($is_space->{$self->{nc}}) {
4226 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4227
4228 $self->{state} = BEFORE_NDATA_STATE;
4229 } else {
4230
4231 ## Stay in the state
4232 }
4233
4234 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235 $self->{line_prev} = $self->{line};
4236 $self->{column_prev} = $self->{column};
4237 $self->{column}++;
4238 $self->{nc}
4239 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4240 } else {
4241 $self->{set_nc}->($self);
4242 }
4243
4244 redo A;
4245 } elsif ($self->{nc} == 0x003E) { # >
4246 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4247
4248 $self->{state} = DATA_STATE;
4249 $self->{s_kwd} = '';
4250 } else {
4251
4252 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253 }
4254
4255
4256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257 $self->{line_prev} = $self->{line};
4258 $self->{column_prev} = $self->{column};
4259 $self->{column}++;
4260 $self->{nc}
4261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262 } else {
4263 $self->{set_nc}->($self);
4264 }
4265
4266 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4267 redo A;
4268 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4269 ($self->{nc} == 0x004E or # N
4270 $self->{nc} == 0x006E)) { # n
4271
4272 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4273 $self->{state} = NDATA_STATE;
4274 $self->{kwd} = chr $self->{nc};
4275
4276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4277 $self->{line_prev} = $self->{line};
4278 $self->{column_prev} = $self->{column};
4279 $self->{column}++;
4280 $self->{nc}
4281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4282 } else {
4283 $self->{set_nc}->($self);
4284 }
4285
4286 redo A;
4287 } elsif ($self->{nc} == -1) {
4288 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4289
4290 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4291 $self->{state} = DATA_STATE;
4292 $self->{s_kwd} = '';
4293 $self->{ct}->{quirks} = 1;
4294 } else {
4295
4296 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4297 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4298 }
4299
4300 ## reconsume
4301 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4302 redo A;
4303 } elsif ($self->{is_xml} and
4304 $self->{ct}->{type} == DOCTYPE_TOKEN and
4305 $self->{nc} == 0x005B) { # [
4306
4307 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4308 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4309 $self->{in_subset} = 1;
4310
4311 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4312 $self->{line_prev} = $self->{line};
4313 $self->{column_prev} = $self->{column};
4314 $self->{column}++;
4315 $self->{nc}
4316 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4317 } else {
4318 $self->{set_nc}->($self);
4319 }
4320
4321 return ($self->{ct}); # DOCTYPE
4322 redo A;
4323 } else {
4324 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4325
4326 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4327
4328 #$self->{ct}->{quirks} = 1;
4329 $self->{state} = BOGUS_DOCTYPE_STATE;
4330 } else {
4331
4332 $self->{state} = BOGUS_MD_STATE;
4333 }
4334
4335
4336 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4337 $self->{line_prev} = $self->{line};
4338 $self->{column_prev} = $self->{column};
4339 $self->{column}++;
4340 $self->{nc}
4341 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4342 } else {
4343 $self->{set_nc}->($self);
4344 }
4345
4346 redo A;
4347 }
4348 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4349 if ($is_space->{$self->{nc}}) {
4350
4351 ## Stay in the state.
4352
4353 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4354 $self->{line_prev} = $self->{line};
4355 $self->{column_prev} = $self->{column};
4356 $self->{column}++;
4357 $self->{nc}
4358 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4359 } else {
4360 $self->{set_nc}->($self);
4361 }
4362
4363 redo A;
4364 } elsif ($self->{nc} == 0x003E) { # >
4365
4366 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367
4368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4369 $self->{line_prev} = $self->{line};
4370 $self->{column_prev} = $self->{column};
4371 $self->{column}++;
4372 $self->{nc}
4373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4374 } else {
4375 $self->{set_nc}->($self);
4376 }
4377
4378 return ($self->{ct}); # ENTITY
4379 redo A;
4380 } elsif ($self->{nc} == 0x004E or # N
4381 $self->{nc} == 0x006E) { # n
4382
4383 $self->{state} = NDATA_STATE;
4384 $self->{kwd} = chr $self->{nc};
4385
4386 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4387 $self->{line_prev} = $self->{line};
4388 $self->{column_prev} = $self->{column};
4389 $self->{column}++;
4390 $self->{nc}
4391 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4392 } else {
4393 $self->{set_nc}->($self);
4394 }
4395
4396 redo A;
4397 } elsif ($self->{nc} == -1) {
4398
4399 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4400 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4401 ## reconsume
4402 return ($self->{ct}); # ENTITY
4403 redo A;
4404 } else {
4405
4406 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4407 $self->{state} = BOGUS_MD_STATE;
4408
4409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410 $self->{line_prev} = $self->{line};
4411 $self->{column_prev} = $self->{column};
4412 $self->{column}++;
4413 $self->{nc}
4414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415 } else {
4416 $self->{set_nc}->($self);
4417 }
4418
4419 redo A;
4420 }
4421 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4422 if ($self->{nc} == 0x003E) { # >
4423
4424 $self->{state} = DATA_STATE;
4425 $self->{s_kwd} = '';
4426
4427 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4428 $self->{line_prev} = $self->{line};
4429 $self->{column_prev} = $self->{column};
4430 $self->{column}++;
4431 $self->{nc}
4432 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4433 } else {
4434 $self->{set_nc}->($self);
4435 }
4436
4437
4438 return ($self->{ct}); # DOCTYPE
4439
4440 redo A;
4441 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4442
4443 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4444 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4445 $self->{in_subset} = 1;
4446
4447 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4448 $self->{line_prev} = $self->{line};
4449 $self->{column_prev} = $self->{column};
4450 $self->{column}++;
4451 $self->{nc}
4452 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4453 } else {
4454 $self->{set_nc}->($self);
4455 }
4456
4457 return ($self->{ct}); # DOCTYPE
4458 redo A;
4459 } elsif ($self->{nc} == -1) {
4460
4461 $self->{state} = DATA_STATE;
4462 $self->{s_kwd} = '';
4463 ## reconsume
4464
4465 return ($self->{ct}); # DOCTYPE
4466
4467 redo A;
4468 } else {
4469
4470 my $s = '';
4471 $self->{read_until}->($s, q{>[}, 0);
4472
4473 ## Stay in the state
4474
4475 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4476 $self->{line_prev} = $self->{line};
4477 $self->{column_prev} = $self->{column};
4478 $self->{column}++;
4479 $self->{nc}
4480 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4481 } else {
4482 $self->{set_nc}->($self);
4483 }
4484
4485 redo A;
4486 }
4487 } elsif ($self->{state} == CDATA_SECTION_STATE) {
4488 ## NOTE: The "CDATA section state" in the spec is jointly implemented
4489 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4490 ## and |CDATA_SECTION_MSE2_STATE|.
4491
4492 ## XML5: "CDATA state".
4493
4494 if ($self->{nc} == 0x005D) { # ]
4495
4496 $self->{state} = CDATA_SECTION_MSE1_STATE;
4497
4498 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499 $self->{line_prev} = $self->{line};
4500 $self->{column_prev} = $self->{column};
4501 $self->{column}++;
4502 $self->{nc}
4503 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504 } else {
4505 $self->{set_nc}->($self);
4506 }
4507
4508 redo A;
4509 } elsif ($self->{nc} == -1) {
4510 if ($self->{is_xml}) {
4511
4512 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4513 } else {
4514
4515 }
4516
4517 $self->{state} = DATA_STATE;
4518 $self->{s_kwd} = '';
4519 ## Reconsume.
4520 if (length $self->{ct}->{data}) { # character
4521
4522 return ($self->{ct}); # character
4523 } else {
4524
4525 ## No token to emit. $self->{ct} is discarded.
4526 }
4527 redo A;
4528 } else {
4529
4530 $self->{ct}->{data} .= chr $self->{nc};
4531 $self->{read_until}->($self->{ct}->{data},
4532 q<]>,
4533 length $self->{ct}->{data});
4534
4535 ## Stay in the state.
4536
4537 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4538 $self->{line_prev} = $self->{line};
4539 $self->{column_prev} = $self->{column};
4540 $self->{column}++;
4541 $self->{nc}
4542 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4543 } else {
4544 $self->{set_nc}->($self);
4545 }
4546
4547 redo A;
4548 }
4549
4550 ## ISSUE: "text tokens" in spec.
4551 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4552 ## XML5: "CDATA bracket state".
4553
4554 if ($self->{nc} == 0x005D) { # ]
4555
4556 $self->{state} = CDATA_SECTION_MSE2_STATE;
4557
4558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4559 $self->{line_prev} = $self->{line};
4560 $self->{column_prev} = $self->{column};
4561 $self->{column}++;
4562 $self->{nc}
4563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4564 } else {
4565 $self->{set_nc}->($self);
4566 }
4567
4568 redo A;
4569 } else {
4570
4571 ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
4572 $self->{ct}->{data} .= ']';
4573 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4574 ## Reconsume.
4575 redo A;
4576 }
4577 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4578 ## XML5: "CDATA end state".
4579
4580 if ($self->{nc} == 0x003E) { # >
4581 $self->{state} = DATA_STATE;
4582 $self->{s_kwd} = '';
4583
4584 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4585 $self->{line_prev} = $self->{line};
4586 $self->{column_prev} = $self->{column};
4587 $self->{column}++;
4588 $self->{nc}
4589 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4590 } else {
4591 $self->{set_nc}->($self);
4592 }
4593
4594 if (length $self->{ct}->{data}) { # character
4595
4596 return ($self->{ct}); # character
4597 } else {
4598
4599 ## No token to emit. $self->{ct} is discarded.
4600 }
4601 redo A;
4602 } elsif ($self->{nc} == 0x005D) { # ]
4603 # character
4604 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4605 ## Stay in the state.
4606
4607 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4608 $self->{line_prev} = $self->{line};
4609 $self->{column_prev} = $self->{column};
4610 $self->{column}++;
4611 $self->{nc}
4612 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4613 } else {
4614 $self->{set_nc}->($self);
4615 }
4616
4617 redo A;
4618 } else {
4619
4620 $self->{ct}->{data} .= ']]'; # character
4621 $self->{state} = CDATA_SECTION_STATE;
4622 ## Reconsume. ## XML5: Emit.
4623 redo A;
4624 }
4625 } elsif ($self->{state} == ENTITY_STATE) {
4626 if ($is_space->{$self->{nc}} or
4627 {
4628 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &, EOF
4629 $self->{entity_add} => 1,
4630 }->{$self->{nc}}) {
4631 if ($self->{is_xml}) {
4632
4633 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4634 line => $self->{line_prev},
4635 column => $self->{column_prev}
4636 + ($self->{nc} == -1 ? 1 : 0));
4637 } else {
4638
4639 ## No error
4640 }
4641 ## Don't consume
4642 ## Return nothing.
4643 #
4644 } elsif ($self->{nc} == 0x0023) { # #
4645
4646 $self->{state} = ENTITY_HASH_STATE;
4647 $self->{kwd} = '#';
4648
4649 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4650 $self->{line_prev} = $self->{line};
4651 $self->{column_prev} = $self->{column};
4652 $self->{column}++;
4653 $self->{nc}
4654 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4655 } else {
4656 $self->{set_nc}->($self);
4657 }
4658
4659 redo A;
4660 } elsif ($self->{is_xml} or
4661 (0x0041 <= $self->{nc} and
4662 $self->{nc} <= 0x005A) or # A..Z
4663 (0x0061 <= $self->{nc} and
4664 $self->{nc} <= 0x007A)) { # a..z
4665
4666 require Whatpm::_NamedEntityList;
4667 $self->{state} = ENTITY_NAME_STATE;
4668 $self->{kwd} = chr $self->{nc};
4669 $self->{entity__value} = $self->{kwd};
4670 $self->{entity__match} = 0;
4671
4672 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4673 $self->{line_prev} = $self->{line};
4674 $self->{column_prev} = $self->{column};
4675 $self->{column}++;
4676 $self->{nc}
4677 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4678 } else {
4679 $self->{set_nc}->($self);
4680 }
4681
4682 redo A;
4683 } else {
4684
4685 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4686 ## Return nothing.
4687 #
4688 }
4689
4690 ## NOTE: No character is consumed by the "consume a character
4691 ## reference" algorithm. In other words, there is an "&" character
4692 ## that does not introduce a character reference, which would be
4693 ## appended to the parent element or the attribute value in a later
4694 ## step of the tokenizer.
4695
4696 if ($self->{prev_state} == DATA_STATE) {
4697
4698 $self->{state} = $self->{prev_state};
4699 $self->{s_kwd} = '';
4700 ## Reconsume.
4701 return ({type => CHARACTER_TOKEN, data => '&',
4702 line => $self->{line_prev},
4703 column => $self->{column_prev},
4704 });
4705 redo A;
4706 } else {
4707
4708 $self->{ca}->{value} .= '&';
4709 $self->{state} = $self->{prev_state};
4710 $self->{s_kwd} = '';
4711 ## Reconsume.
4712 redo A;
4713 }
4714 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4715 if ($self->{nc} == 0x0078) { # x
4716
4717 $self->{state} = HEXREF_X_STATE;
4718 $self->{kwd} .= chr $self->{nc};
4719
4720 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4721 $self->{line_prev} = $self->{line};
4722 $self->{column_prev} = $self->{column};
4723 $self->{column}++;
4724 $self->{nc}
4725 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4726 } else {
4727 $self->{set_nc}->($self);
4728 }
4729
4730 redo A;
4731 } elsif ($self->{nc} == 0x0058) { # X
4732
4733 if ($self->{is_xml}) {
4734 $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4735 }
4736 $self->{state} = HEXREF_X_STATE;
4737 $self->{kwd} .= chr $self->{nc};
4738
4739 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4740 $self->{line_prev} = $self->{line};
4741 $self->{column_prev} = $self->{column};
4742 $self->{column}++;
4743 $self->{nc}
4744 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4745 } else {
4746 $self->{set_nc}->($self);
4747 }
4748
4749 redo A;
4750 } elsif (0x0030 <= $self->{nc} and
4751 $self->{nc} <= 0x0039) { # 0..9
4752
4753 $self->{state} = NCR_NUM_STATE;
4754 $self->{kwd} = $self->{nc} - 0x0030;
4755
4756 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4757 $self->{line_prev} = $self->{line};
4758 $self->{column_prev} = $self->{column};
4759 $self->{column}++;
4760 $self->{nc}
4761 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4762 } else {
4763 $self->{set_nc}->($self);
4764 }
4765
4766 redo A;
4767 } else {
4768 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4769 line => $self->{line_prev},
4770 column => $self->{column_prev} - 1);
4771
4772 ## NOTE: According to the spec algorithm, nothing is returned,
4773 ## and then "&#" is appended to the parent element or the attribute
4774 ## value in the later processing.
4775
4776 if ($self->{prev_state} == DATA_STATE) {
4777
4778 $self->{state} = $self->{prev_state};
4779 $self->{s_kwd} = '';
4780 ## Reconsume.
4781 return ({type => CHARACTER_TOKEN,
4782 data => '&#',
4783 line => $self->{line_prev},
4784 column => $self->{column_prev} - 1,
4785 });
4786 redo A;
4787 } else {
4788
4789 $self->{ca}->{value} .= '&#';
4790 $self->{state} = $self->{prev_state};
4791 $self->{s_kwd} = '';
4792 ## Reconsume.
4793 redo A;
4794 }
4795 }
4796 } elsif ($self->{state} == NCR_NUM_STATE) {
4797 if (0x0030 <= $self->{nc} and
4798 $self->{nc} <= 0x0039) { # 0..9
4799
4800 $self->{kwd} *= 10;
4801 $self->{kwd} += $self->{nc} - 0x0030;
4802
4803 ## Stay in the state.
4804
4805 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4806 $self->{line_prev} = $self->{line};
4807 $self->{column_prev} = $self->{column};
4808 $self->{column}++;
4809 $self->{nc}
4810 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4811 } else {
4812 $self->{set_nc}->($self);
4813 }
4814
4815 redo A;
4816 } elsif ($self->{nc} == 0x003B) { # ;
4817
4818
4819 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4820 $self->{line_prev} = $self->{line};
4821 $self->{column_prev} = $self->{column};
4822 $self->{column}++;
4823 $self->{nc}
4824 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4825 } else {
4826 $self->{set_nc}->($self);
4827 }
4828
4829 #
4830 } else {
4831
4832 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4833 ## Reconsume.
4834 #
4835 }
4836
4837 my $code = $self->{kwd};
4838 my $l = $self->{line_prev};
4839 my $c = $self->{column_prev};
4840 if ($charref_map->{$code}) {
4841
4842 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4843 text => (sprintf 'U+%04X', $code),
4844 line => $l, column => $c);
4845 $code = $charref_map->{$code};
4846 } elsif ($code > 0x10FFFF) {
4847
4848 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4849 text => (sprintf 'U-%08X', $code),
4850 line => $l, column => $c);
4851 $code = 0xFFFD;
4852 }
4853
4854 if ($self->{prev_state} == DATA_STATE) {
4855
4856 $self->{state} = $self->{prev_state};
4857 $self->{s_kwd} = '';
4858 ## Reconsume.
4859 return ({type => CHARACTER_TOKEN, data => chr $code,
4860 has_reference => 1,
4861 line => $l, column => $c,
4862 });
4863 redo A;
4864 } else {
4865
4866 $self->{ca}->{value} .= chr $code;
4867 $self->{ca}->{has_reference} = 1;
4868 $self->{state} = $self->{prev_state};
4869 $self->{s_kwd} = '';
4870 ## Reconsume.
4871 redo A;
4872 }
4873 } elsif ($self->{state} == HEXREF_X_STATE) {
4874 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4875 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4876 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4877 # 0..9, A..F, a..f
4878
4879 $self->{state} = HEXREF_HEX_STATE;
4880 $self->{kwd} = 0;
4881 ## Reconsume.
4882 redo A;
4883 } else {
4884 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4885 line => $self->{line_prev},
4886 column => $self->{column_prev} - 2);
4887
4888 ## NOTE: According to the spec algorithm, nothing is returned,
4889 ## and then "&#" followed by "X" or "x" is appended to the parent
4890 ## element or the attribute value in the later processing.
4891
4892 if ($self->{prev_state} == DATA_STATE) {
4893
4894 $self->{state} = $self->{prev_state};
4895 $self->{s_kwd} = '';
4896 ## Reconsume.
4897 return ({type => CHARACTER_TOKEN,
4898 data => '&' . $self->{kwd},
4899 line => $self->{line_prev},
4900 column => $self->{column_prev} - length $self->{kwd},
4901 });
4902 redo A;
4903 } else {
4904
4905 $self->{ca}->{value} .= '&' . $self->{kwd};
4906 $self->{state} = $self->{prev_state};
4907 $self->{s_kwd} = '';
4908 ## Reconsume.
4909 redo A;
4910 }
4911 }
4912 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4913 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4914 # 0..9
4915
4916 $self->{kwd} *= 0x10;
4917 $self->{kwd} += $self->{nc} - 0x0030;
4918 ## Stay in the state.
4919
4920 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4921 $self->{line_prev} = $self->{line};
4922 $self->{column_prev} = $self->{column};
4923 $self->{column}++;
4924 $self->{nc}
4925 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4926 } else {
4927 $self->{set_nc}->($self);
4928 }
4929
4930 redo A;
4931 } elsif (0x0061 <= $self->{nc} and
4932 $self->{nc} <= 0x0066) { # a..f
4933
4934 $self->{kwd} *= 0x10;
4935 $self->{kwd} += $self->{nc} - 0x0060 + 9;
4936 ## Stay in the state.
4937
4938 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4939 $self->{line_prev} = $self->{line};
4940 $self->{column_prev} = $self->{column};
4941 $self->{column}++;
4942 $self->{nc}
4943 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4944 } else {
4945 $self->{set_nc}->($self);
4946 }
4947
4948 redo A;
4949 } elsif (0x0041 <= $self->{nc} and
4950 $self->{nc} <= 0x0046) { # A..F
4951
4952 $self->{kwd} *= 0x10;
4953 $self->{kwd} += $self->{nc} - 0x0040 + 9;
4954 ## Stay in the state.
4955
4956 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4957 $self->{line_prev} = $self->{line};
4958 $self->{column_prev} = $self->{column};
4959 $self->{column}++;
4960 $self->{nc}
4961 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4962 } else {
4963 $self->{set_nc}->($self);
4964 }
4965
4966 redo A;
4967 } elsif ($self->{nc} == 0x003B) { # ;
4968
4969
4970 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4971 $self->{line_prev} = $self->{line};
4972 $self->{column_prev} = $self->{column};
4973 $self->{column}++;
4974 $self->{nc}
4975 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4976 } else {
4977 $self->{set_nc}->($self);
4978 }
4979
4980 #
4981 } else {
4982
4983 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4984 line => $self->{line},
4985 column => $self->{column});
4986 ## Reconsume.
4987 #
4988 }
4989
4990 my $code = $self->{kwd};
4991 my $l = $self->{line_prev};
4992 my $c = $self->{column_prev};
4993 if ($charref_map->{$code}) {
4994
4995 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4996 text => (sprintf 'U+%04X', $code),
4997 line => $l, column => $c);
4998 $code = $charref_map->{$code};
4999 } elsif ($code > 0x10FFFF) {
5000
5001 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5002 text => (sprintf 'U-%08X', $code),
5003 line => $l, column => $c);
5004 $code = 0xFFFD;
5005 }
5006
5007 if ($self->{prev_state} == DATA_STATE) {
5008
5009 $self->{state} = $self->{prev_state};
5010 $self->{s_kwd} = '';
5011 ## Reconsume.
5012 return ({type => CHARACTER_TOKEN, data => chr $code,
5013 has_reference => 1,
5014 line => $l, column => $c,
5015 });
5016 redo A;
5017 } else {
5018
5019 $self->{ca}->{value} .= chr $code;
5020 $self->{ca}->{has_reference} = 1;
5021 $self->{state} = $self->{prev_state};
5022 $self->{s_kwd} = '';
5023 ## Reconsume.
5024 redo A;
5025 }
5026 } elsif ($self->{state} == ENTITY_NAME_STATE) {
5027 if ((0x0041 <= $self->{nc} and # A
5028 $self->{nc} <= 0x005A) or # Z
5029 (0x0061 <= $self->{nc} and # a
5030 $self->{nc} <= 0x007A) or # z
5031 (0x0030 <= $self->{nc} and # 0
5032 $self->{nc} <= 0x0039) or # 9
5033 $self->{nc} == 0x003B or # ;
5034 ($self->{is_xml} and
5035 not ($is_space->{$self->{nc}} or
5036 {
5037 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &, EOF
5038 $self->{entity_add} => 1,
5039 }->{$self->{nc}}))) {
5040 our $EntityChar;
5041 $self->{kwd} .= chr $self->{nc};
5042 if (defined $EntityChar->{$self->{kwd}} or
5043 $self->{ge}->{$self->{kwd}}) {
5044 if ($self->{nc} == 0x003B) { # ;
5045 if (defined $self->{ge}->{$self->{kwd}}) {
5046 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5047
5048 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5049 } else {
5050 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5051
5052 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5053 value => $self->{kwd});
5054 } else {
5055
5056 }
5057 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5058 }
5059 } else {
5060 if ($self->{is_xml}) {
5061
5062 $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5063 value => $self->{kwd},
5064 level => {
5065 'amp;' => $self->{level}->{warn},
5066 'quot;' => $self->{level}->{warn},
5067 'lt;' => $self->{level}->{warn},
5068 'gt;' => $self->{level}->{warn},
5069 'apos;' => $self->{level}->{warn},
5070 }->{$self->{kwd}} ||
5071 $self->{level}->{must});
5072 } else {
5073
5074 }
5075 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5076 }
5077 $self->{entity__match} = 1;
5078
5079 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5080 $self->{line_prev} = $self->{line};
5081 $self->{column_prev} = $self->{column};
5082 $self->{column}++;
5083 $self->{nc}
5084 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5085 } else {
5086 $self->{set_nc}->($self);
5087 }
5088
5089 #
5090 } else {
5091
5092 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5093 $self->{entity__match} = -1;
5094 ## Stay in the state.
5095
5096 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5097 $self->{line_prev} = $self->{line};
5098 $self->{column_prev} = $self->{column};
5099 $self->{column}++;
5100 $self->{nc}
5101 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5102 } else {
5103 $self->{set_nc}->($self);
5104 }
5105
5106 redo A;
5107 }
5108 } else {
5109
5110 $self->{entity__value} .= chr $self->{nc};
5111 $self->{entity__match} *= 2;
5112 ## Stay in the state.
5113
5114 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5115 $self->{line_prev} = $self->{line};
5116 $self->{column_prev} = $self->{column};
5117 $self->{column}++;
5118 $self->{nc}
5119 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5120 } else {
5121 $self->{set_nc}->($self);
5122 }
5123
5124 redo A;
5125 }
5126 }
5127
5128 my $data;
5129 my $has_ref;
5130 if ($self->{entity__match} > 0) {
5131
5132 $data = $self->{entity__value};
5133 $has_ref = 1;
5134 #
5135 } elsif ($self->{entity__match} < 0) {
5136 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5137 if ($self->{prev_state} != DATA_STATE and # in attribute
5138 $self->{entity__match} < -1) {
5139
5140 $data = '&' . $self->{kwd};
5141 #
5142 } else {
5143
5144 $data = $self->{entity__value};
5145 $has_ref = 1;
5146 #
5147 }
5148 } else {
5149
5150 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5151 line => $self->{line_prev},
5152 column => $self->{column_prev} - length $self->{kwd});
5153 $data = '&' . $self->{kwd};
5154 #
5155 }
5156
5157 ## NOTE: In these cases, when a character reference is found,
5158 ## it is consumed and a character token is returned, or, otherwise,
5159 ## nothing is consumed and returned, according to the spec algorithm.
5160 ## In this implementation, anything that has been examined by the
5161 ## tokenizer is appended to the parent element or the attribute value
5162 ## as string, either literal string when no character reference or
5163 ## entity-replaced string otherwise, in this stage, since any characters
5164 ## that would not be consumed are appended in the data state or in an
5165 ## appropriate attribute value state anyway.
5166
5167 if ($self->{prev_state} == DATA_STATE) {
5168
5169 $self->{state} = $self->{prev_state};
5170 $self->{s_kwd} = '';
5171 ## Reconsume.
5172 return ({type => CHARACTER_TOKEN,
5173 data => $data,
5174 has_reference => $has_ref,
5175 line => $self->{line_prev},
5176 column => $self->{column_prev} + 1 - length $self->{kwd},
5177 });
5178 redo A;
5179 } else {
5180
5181 $self->{ca}->{value} .= $data;
5182 $self->{ca}->{has_reference} = 1 if $has_ref;
5183 $self->{state} = $self->{prev_state};
5184 $self->{s_kwd} = '';
5185 ## Reconsume.
5186 redo A;
5187 }
5188
5189 ## XML-only states
5190
5191 } elsif ($self->{state} == PI_STATE) {
5192 ## XML5: "Pi state" and "DOCTYPE pi state".
5193
5194 if ($is_space->{$self->{nc}} or
5195 $self->{nc} == 0x003F or # ?
5196 $self->{nc} == -1) {
5197 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5198 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5199 ## "DOCTYPE pi state": Parse error, switch to the "data
5200 ## state".
5201 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5202 line => $self->{line_prev},
5203 column => $self->{column_prev}
5204 - 1 * ($self->{nc} != -1));
5205 $self->{state} = BOGUS_COMMENT_STATE;
5206 ## Reconsume.
5207 $self->{ct} = {type => COMMENT_TOKEN,
5208 data => '?',
5209 line => $self->{line_prev},
5210 column => $self->{column_prev}
5211 - 1 * ($self->{nc} != -1),
5212 };
5213 redo A;
5214 } else {
5215 ## XML5: "DOCTYPE pi state": Stay in the state.
5216 $self->{ct} = {type => PI_TOKEN,
5217 target => chr $self->{nc},
5218 data => '',
5219 line => $self->{line_prev},
5220 column => $self->{column_prev} - 1,
5221 };
5222 $self->{state} = PI_TARGET_STATE;
5223
5224 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5225 $self->{line_prev} = $self->{line};
5226 $self->{column_prev} = $self->{column};
5227 $self->{column}++;
5228 $self->{nc}
5229 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5230 } else {
5231 $self->{set_nc}->($self);
5232 }
5233
5234 redo A;
5235 }
5236 } elsif ($self->{state} == PI_TARGET_STATE) { ## Appends each character to the PI target until a space, '?' or nc == -1 is seen.
5237 if ($is_space->{$self->{nc}}) {
5238 $self->{state} = PI_TARGET_AFTER_STATE;
5239 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
5240 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5241 $self->{line_prev} = $self->{line};
5242 $self->{column_prev} = $self->{column};
5243 $self->{column}++;
5244 $self->{nc}
5245 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5246 } else {
5247 $self->{set_nc}->($self);
5248 }
5249
5250 redo A;
5251 } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker -- TODO confirm
5252 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5253 if ($self->{in_subset}) {
5254 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5255 } else {
5256 $self->{state} = DATA_STATE;
5257 $self->{s_kwd} = '';
5258 }
5259 ## Reconsume.
5260 return ($self->{ct}); # pi
5261 redo A; ## Not reached: follows the return above.
5262 } elsif ($self->{nc} == 0x003F) { # ?
5263 $self->{state} = PI_AFTER_STATE;
5264 ## Advance to the next input character (same idiom as above).
5265 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5266 $self->{line_prev} = $self->{line};
5267 $self->{column_prev} = $self->{column};
5268 $self->{column}++;
5269 $self->{nc}
5270 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5271 } else {
5272 $self->{set_nc}->($self);
5273 }
5274
5275 redo A;
5276 } else {
5277 ## XML5: typo ("tag name" -> "target")
5278 $self->{ct}->{target} .= chr $self->{nc}; # pi
5279 ## Stay in this state and advance to the next input character.
5280 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5281 $self->{line_prev} = $self->{line};
5282 $self->{column_prev} = $self->{column};
5283 $self->{column}++;
5284 $self->{nc}
5285 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5286 } else {
5287 $self->{set_nc}->($self);
5288 }
5289
5290 redo A;
5291 }
5292 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) { ## Skips space characters after the PI target, then hands over to PI_DATA_STATE.
5293 if ($is_space->{$self->{nc}}) {
5294 ## Stay in the state.
5295 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
5296 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5297 $self->{line_prev} = $self->{line};
5298 $self->{column_prev} = $self->{column};
5299 $self->{column}++;
5300 $self->{nc}
5301 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5302 } else {
5303 $self->{set_nc}->($self);
5304 }
5305
5306 redo A;
5307 } else {
5308 $self->{state} = PI_DATA_STATE; ## The current character is not consumed here.
5309 ## Reprocess.
5310 redo A;
5311 }
5312 } elsif ($self->{state} == PI_DATA_STATE) { ## Accumulates PI data in $self->{ct}->{data}; a '?' may start the closing "?>".
5313 if ($self->{nc} == 0x003F) { # ?
5314 $self->{state} = PI_DATA_AFTER_STATE;
5315 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
5316 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5317 $self->{line_prev} = $self->{line};
5318 $self->{column_prev} = $self->{column};
5319 $self->{column}++;
5320 $self->{nc}
5321 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5322 } else {
5323 $self->{set_nc}->($self);
5324 }
5325
5326 redo A;
5327 } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker -- TODO confirm
5328 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5329 if ($self->{in_subset}) {
5330 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5331 } else {
5332 $self->{state} = DATA_STATE;
5333 $self->{s_kwd} = '';
5334 }
5335 ## Reprocess.
5336 return ($self->{ct}); # pi
5337 redo A; ## Not reached: follows the return above.
5338 } else {
5339 $self->{ct}->{data} .= chr $self->{nc}; # pi
5340 $self->{read_until}->($self->{ct}->{data}, q[?], ## NOTE(review): presumably bulk-appends input up to the next '?' -- confirm against read_until
5341 length $self->{ct}->{data});
5342 ## Stay in the state.
5343 ## Advance to the next input character (same idiom as above).
5344 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5345 $self->{line_prev} = $self->{line};
5346 $self->{column_prev} = $self->{column};
5347 $self->{column}++;
5348 $self->{nc}
5349 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5350 } else {
5351 $self->{set_nc}->($self);
5352 }
5353
5354 ## Reprocess.
5355 redo A;
5356 }
5357 } elsif ($self->{state} == PI_AFTER_STATE) { ## Entered from PI_TARGET_STATE when '?' directly follows the target.
5358 ## XML5: Part of "Pi after state".
5359
5360 if ($self->{nc} == 0x003E) { # >
5361 if ($self->{in_subset}) {
5362 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5363 } else {
5364 $self->{state} = DATA_STATE;
5365 $self->{s_kwd} = '';
5366 }
5367 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
5368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5369 $self->{line_prev} = $self->{line};
5370 $self->{column_prev} = $self->{column};
5371 $self->{column}++;
5372 $self->{nc}
5373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5374 } else {
5375 $self->{set_nc}->($self);
5376 }
5377 ## The PI is complete: return the current token.
5378 return ($self->{ct}); # pi
5379 redo A; ## Not reached: follows the return above.
5380 } elsif ($self->{nc} == 0x003F) { # ?
5381 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5382 line => $self->{line_prev},
5383 column => $self->{column_prev}); ## XML5: no error
5384 $self->{ct}->{data} .= '?'; ## The earlier '?' becomes data; the current '?' may still close the PI.
5385 $self->{state} = PI_DATA_AFTER_STATE;
5386 ## Advance to the next input character (same idiom as above).
5387 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5388 $self->{line_prev} = $self->{line};
5389 $self->{column_prev} = $self->{column};
5390 $self->{column}++;
5391 $self->{nc}
5392 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5393 } else {
5394 $self->{set_nc}->($self);
5395 }
5396
5397 redo A;
5398 } else {
5399 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5400 line => $self->{line_prev},
5401 column => $self->{column_prev}
5402 + 1 * ($self->{nc} == -1)); ## XML5: no error
5403 $self->{ct}->{data} .= '?'; ## XML5: not appended
5404 $self->{state} = PI_DATA_STATE;
5405 ## Reprocess.
5406 redo A;
5407 }
5408 } elsif ($self->{state} == PI_DATA_AFTER_STATE) { ## A '?' has been seen inside PI data; '>' now closes the PI.
5409 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5410
5411 if ($self->{nc} == 0x003E) { # >
5412 if ($self->{in_subset}) {
5413 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5414 } else {
5415 $self->{state} = DATA_STATE;
5416 $self->{s_kwd} = '';
5417 }
5418 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
5419 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5420 $self->{line_prev} = $self->{line};
5421 $self->{column_prev} = $self->{column};
5422 $self->{column}++;
5423 $self->{nc}
5424 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5425 } else {
5426 $self->{set_nc}->($self);
5427 }
5428 ## The PI is complete: return the current token.
5429 return ($self->{ct}); # pi
5430 redo A; ## Not reached: follows the return above.
5431 } elsif ($self->{nc} == 0x003F) { # ?
5432 $self->{ct}->{data} .= '?'; ## The previous '?' was data after all; the current one stays pending.
5433 ## Stay in the state.
5434 ## Advance to the next input character (same idiom as above).
5435 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5436 $self->{line_prev} = $self->{line};
5437 $self->{column_prev} = $self->{column};
5438 $self->{column}++;
5439 $self->{nc}
5440 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5441 } else {
5442 $self->{set_nc}->($self);
5443 }
5444
5445 redo A;
5446 } else {
5447 $self->{ct}->{data} .= '?'; ## XML5: not appended
5448 $self->{state} = PI_DATA_STATE;
5449 ## Reprocess.
5450 redo A;
5451 }
5452
5453 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) { ## Dispatches on '<', '%', ']', space and nc == -1 inside the internal subset.
5454 if ($self->{nc} == 0x003C) { # <
5455 $self->{state} = DOCTYPE_TAG_STATE;
5456 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
5457 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5458 $self->{line_prev} = $self->{line};
5459 $self->{column_prev} = $self->{column};
5460 $self->{column}++;
5461 $self->{nc}
5462 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5463 } else {
5464 $self->{set_nc}->($self);
5465 }
5466
5467 redo A;
5468 } elsif ($self->{nc} == 0x0025) { # %
5469 ## XML5: Not defined yet.
5470
5471 ## TODO:
5472 ## Set stop_processing once, and only when the document's xml_standalone is false.
5473 if (not $self->{stop_processing} and
5474 not $self->{document}->xml_standalone) {
5475 $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5476 level => $self->{level}->{info});
5477 $self->{stop_processing} = 1;
5478 }
5479 ## NOTE(review): 'level' is passed twice above (must, then info); if the handler loads its arguments into a hash the later value wins -- confirm info is intended.
5480
5481 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5482 $self->{line_prev} = $self->{line};
5483 $self->{column_prev} = $self->{column};
5484 $self->{column}++;
5485 $self->{nc}
5486 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5487 } else {
5488 $self->{set_nc}->($self);
5489 }
5490
5491 redo A;
5492 } elsif ($self->{nc} == 0x005D) { # ]
5493 delete $self->{in_subset}; ## Leaving the internal subset.
5494 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5495
5496 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5497 $self->{line_prev} = $self->{line};
5498 $self->{column_prev} = $self->{column};
5499 $self->{column}++;
5500 $self->{nc}
5501 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5502 } else {
5503 $self->{set_nc}->($self);
5504 }
5505
5506 redo A;
5507 } elsif ($is_space->{$self->{nc}}) {
5508 ## Stay in the state.
5509
5510 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5511 $self->{line_prev} = $self->{line};
5512 $self->{column_prev} = $self->{column};
5513 $self->{column}++;
5514 $self->{nc}
5515 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5516 } else {
5517 $self->{set_nc}->($self);
5518 }
5519
5520 redo A;
5521 } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker -- TODO confirm
5522 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5523 delete $self->{in_subset};
5524 $self->{state} = DATA_STATE;
5525 $self->{s_kwd} = '';
5526 ## Reconsume.
5527 return ({type => END_OF_DOCTYPE_TOKEN});
5528 redo A; ## Not reached: follows the return above.
5529 } else {
5530 unless ($self->{internal_subset_tainted}) { ## Report stray text only once per internal subset.
5531 ## XML5: No parse error.
5532 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5533 $self->{internal_subset_tainted} = 1;
5534 }
5535 ## Stay in the state.
5536
5537 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5538 $self->{line_prev} = $self->{line};
5539 $self->{column_prev} = $self->{column};
5540 $self->{column}++;
5541 $self->{nc}
5542 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5543 } else {
5544 $self->{set_nc}->($self);
5545 }
5546
5547 redo A;
5548 }
5549 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { ## Entered after ']' closes the internal subset; expects '>'.
5550 if ($self->{nc} == 0x003E) { # >
5551 $self->{state} = DATA_STATE;
5552 $self->{s_kwd} = '';
5553 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
5554 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5555 $self->{line_prev} = $self->{line};
5556 $self->{column_prev} = $self->{column};
5557 $self->{column}++;
5558 $self->{nc}
5559 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5560 } else {
5561 $self->{set_nc}->($self);
5562 }
5563
5564 return ({type => END_OF_DOCTYPE_TOKEN});
5565 redo A; ## Not reached: follows the return above.
5566 } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker -- TODO confirm
5567 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5568 $self->{state} = DATA_STATE;
5569 $self->{s_kwd} = '';
5570 ## Reconsume.
5571 return ({type => END_OF_DOCTYPE_TOKEN});
5572 redo A; ## Not reached: follows the return above.
5573 } else {
5574 ## XML5: No parse error and stay in the state.
5575 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5576 ## Switch to the bogus state, which reports no further errors before '>'.
5577 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5578
5579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5580 $self->{line_prev} = $self->{line};
5581 $self->{column_prev} = $self->{column};
5582 $self->{column}++;
5583 $self->{nc}
5584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5585 } else {
5586 $self->{set_nc}->($self);
5587 }
5588
5589 redo A;
5590 }
5591 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { ## Discards characters until '>' or nc == -1; no parse errors are raised here.
5592 if ($self->{nc} == 0x003E) { # >
5593 $self->{state} = DATA_STATE;
5594 $self->{s_kwd} = '';
5595 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
5596 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5597 $self->{line_prev} = $self->{line};
5598 $self->{column_prev} = $self->{column};
5599 $self->{column}++;
5600 $self->{nc}
5601 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5602 } else {
5603 $self->{set_nc}->($self);
5604 }
5605
5606 return ({type => END_OF_DOCTYPE_TOKEN});
5607 redo A; ## Not reached: follows the return above.
5608 } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker -- TODO confirm
5609 $self->{state} = DATA_STATE;
5610 $self->{s_kwd} = '';
5611 ## Reconsume.
5612 return ({type => END_OF_DOCTYPE_TOKEN});
5613 redo A; ## Not reached: follows the return above.
5614 } else {
5615 ## Stay in the state.
5616
5617 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5618 $self->{line_prev} = $self->{line};
5619 $self->{column_prev} = $self->{column};
5620 $self->{column}++;
5621 $self->{nc}
5622 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5623 } else {
5624 $self->{set_nc}->($self);
5625 }
5626
5627 redo A;
5628 }
5629 } elsif ($self->{state} == DOCTYPE_TAG_STATE) { ## Entered after '<' in the internal subset; '!' starts a declaration, '?' starts a PI.
5630 if ($self->{nc} == 0x0021) { # !
5631 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5632 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
5633 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5634 $self->{line_prev} = $self->{line};
5635 $self->{column_prev} = $self->{column};
5636 $self->{column}++;
5637 $self->{nc}
5638 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5639 } else {
5640 $self->{set_nc}->($self);
5641 }
5642
5643 redo A;
5644 } elsif ($self->{nc} == 0x003F) { # ?
5645 $self->{state} = PI_STATE;
5646
5647 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5648 $self->{line_prev} = $self->{line};
5649 $self->{column_prev} = $self->{column};
5650 $self->{column}++;
5651 $self->{nc}
5652 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5653 } else {
5654 $self->{set_nc}->($self);
5655 }
5656
5657 redo A;
5658 } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker -- TODO confirm
5659 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5660 $self->{state} = DATA_STATE;
5661 $self->{s_kwd} = '';
5662 ## Reconsume.
5663 redo A;
5664 } else {
5665 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5666 line => $self->{line_prev},
5667 column => $self->{column_prev});
5668 $self->{state} = BOGUS_COMMENT_STATE;
5669 $self->{ct} = {type => COMMENT_TOKEN,
5670 data => '',
5671 }; ## NOTE: Will be discarded.
5672 ## Unlike the nc == -1 arm above, this arm consumes the current character.
5673 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5674 $self->{line_prev} = $self->{line};
5675 $self->{column_prev} = $self->{column};
5676 $self->{column}++;
5677 $self->{nc}
5678 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5679 } else {
5680 $self->{set_nc}->($self);
5681 }
5682
5683 redo A;
5684 }
5685 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) { ## Entered after "<!"; picks the keyword state from the first letter (either case).
5686 ## XML5: "DOCTYPE markup declaration state".
5687
5688 if ($self->{nc} == 0x002D) { # -
5689 $self->{state} = MD_HYPHEN_STATE;
5690 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
5691 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5692 $self->{line_prev} = $self->{line};
5693 $self->{column_prev} = $self->{column};
5694 $self->{column}++;
5695 $self->{nc}
5696 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5697 } else {
5698 $self->{set_nc}->($self);
5699 }
5700
5701 redo A;
5702 } elsif ($self->{nc} == 0x0045 or # E
5703 $self->{nc} == 0x0065) { # e
5704 $self->{state} = MD_E_STATE;
5705 $self->{kwd} = chr $self->{nc}; ## kwd records the keyword letters exactly as typed.
5706
5707 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5708 $self->{line_prev} = $self->{line};
5709 $self->{column_prev} = $self->{column};
5710 $self->{column}++;
5711 $self->{nc}
5712 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5713 } else {
5714 $self->{set_nc}->($self);
5715 }
5716
5717 redo A;
5718 } elsif ($self->{nc} == 0x0041 or # A
5719 $self->{nc} == 0x0061) { # a
5720 $self->{state} = MD_ATTLIST_STATE;
5721 $self->{kwd} = chr $self->{nc};
5722
5723 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5724 $self->{line_prev} = $self->{line};
5725 $self->{column_prev} = $self->{column};
5726 $self->{column}++;
5727 $self->{nc}
5728 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5729 } else {
5730 $self->{set_nc}->($self);
5731 }
5732
5733 redo A;
5734 } elsif ($self->{nc} == 0x004E or # N
5735 $self->{nc} == 0x006E) { # n
5736 $self->{state} = MD_NOTATION_STATE;
5737 $self->{kwd} = chr $self->{nc};
5738
5739 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5740 $self->{line_prev} = $self->{line};
5741 $self->{column_prev} = $self->{column};
5742 $self->{column}++;
5743 $self->{nc}
5744 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5745 } else {
5746 $self->{set_nc}->($self);
5747 }
5748
5749 redo A;
5750 } else {
5751 # Nothing matched: fall through to the bogus-comment handling below.
5752 }
5753
5754 ## XML5: No parse error.
5755 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5756 line => $self->{line_prev},
5757 column => $self->{column_prev} - 1);
5758 ## Reconsume.
5759 $self->{state} = BOGUS_COMMENT_STATE;
5760 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5761 redo A;
5762 } elsif ($self->{state} == MD_E_STATE) { ## Entered after "<!E"; the second letter selects ENTITY (N) or ELEMENT (L), in either case.
5763 if ($self->{nc} == 0x004E or # N
5764 $self->{nc} == 0x006E) { # n
5765 $self->{state} = MD_ENTITY_STATE;
5766 $self->{kwd} .= chr $self->{nc};
5767 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
5768 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5769 $self->{line_prev} = $self->{line};
5770 $self->{column_prev} = $self->{column};
5771 $self->{column}++;
5772 $self->{nc}
5773 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5774 } else {
5775 $self->{set_nc}->($self);
5776 }
5777
5778 redo A;
5779 } elsif ($self->{nc} == 0x004C or # L
5780 $self->{nc} == 0x006C) { # l
5781 ## XML5: <!ELEMENT> not supported.
5782 $self->{state} = MD_ELEMENT_STATE;
5783 $self->{kwd} .= chr $self->{nc};
5784
5785 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5786 $self->{line_prev} = $self->{line};
5787 $self->{column_prev} = $self->{column};
5788 $self->{column}++;
5789 $self->{nc}
5790 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5791 } else {
5792 $self->{set_nc}->($self);
5793 }
5794
5795 redo A;
5796 } else {
5797 ## XML5: No parse error.
5798 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5799 line => $self->{line_prev},
5800 column => $self->{column_prev} - 2
5801 + 1 * ($self->{nc} == -1)); ## The reported column is shifted by one when nc is -1.
5802 ## Reconsume.
5803 $self->{state} = BOGUS_COMMENT_STATE;
5804 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5805 redo A;
5806 }
5807 } elsif ($self->{state} == MD_ENTITY_STATE) { ## Matches the rest of "ENTITY" in either case; the tables are indexed by the number of letters already in kwd.
5808 if ($self->{nc} == [
5809 undef,
5810 undef,
5811 0x0054, # T
5812 0x0049, # I
5813 0x0054, # T
5814 ]->[length $self->{kwd}] or
5815 $self->{nc} == [
5816 undef,
5817 undef,
5818 0x0074, # t
5819 0x0069, # i
5820 0x0074, # t
5821 ]->[length $self->{kwd}]) {
5822 ## Stay in the state.
5823 $self->{kwd} .= chr $self->{nc};
5824 ## NOTE(review): an index past the end of either table yields undef, which == treats as 0 -- confirm nc can never be 0 here.
5825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5826 $self->{line_prev} = $self->{line};
5827 $self->{column_prev} = $self->{column};
5828 $self->{column}++;
5829 $self->{nc}
5830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5831 } else {
5832 $self->{set_nc}->($self);
5833 }
5834
5835 redo A;
5836 } elsif ((length $self->{kwd}) == 5 and
5837 ($self->{nc} == 0x0059 or # Y
5838 $self->{nc} == 0x0079)) { # y
5839 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) { ## Any letter not in upper case is reported.
5840 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5841 text => 'ENTITY',
5842 line => $self->{line_prev},
5843 column => $self->{column_prev} - 4);
5844 }
5845 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', ## Starts as a general entity token.
5846 line => $self->{line_prev},
5847 column => $self->{column_prev} - 6};
5848 $self->{state} = DOCTYPE_MD_STATE;
5849 ## Advance to the next input character (buffer first, else the set_nc callback).
5850 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5851 $self->{line_prev} = $self->{line};
5852 $self->{column_prev} = $self->{column};
5853 $self->{column}++;
5854 $self->{nc}
5855 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5856 } else {
5857 $self->{set_nc}->($self);
5858 }
5859
5860 redo A;
5861 } else {
5862 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5863 line => $self->{line_prev},
5864 column => $self->{column_prev} - 1
5865 - (length $self->{kwd})
5866 + 1 * ($self->{nc} == -1));
5867 $self->{state} = BOGUS_COMMENT_STATE;
5868 ## Reconsume.
5869 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5870 redo A;
5871 }
5872 } elsif ($self->{state} == MD_ELEMENT_STATE) { ## Matches the rest of "ELEMENT" in either case; the tables are indexed by the number of letters already in kwd.
5873 if ($self->{nc} == [
5874 undef,
5875 undef,
5876 0x0045, # E
5877 0x004D, # M
5878 0x0045, # E
5879 0x004E, # N
5880 ]->[length $self->{kwd}] or
5881 $self->{nc} == [
5882 undef,
5883 undef,
5884 0x0065, # e
5885 0x006D, # m
5886 0x0065, # e
5887 0x006E, # n
5888 ]->[length $self->{kwd}]) {
5889 ## Stay in the state.
5890 $self->{kwd} .= chr $self->{nc};
5891 ## NOTE(review): an index past the end of either table yields undef, which == treats as 0 -- confirm nc can never be 0 here.
5892 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5893 $self->{line_prev} = $self->{line};
5894 $self->{column_prev} = $self->{column};
5895 $self->{column}++;
5896 $self->{nc}
5897 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5898 } else {
5899 $self->{set_nc}->($self);
5900 }
5901
5902 redo A;
5903 } elsif ((length $self->{kwd}) == 6 and
5904 ($self->{nc} == 0x0054 or # T
5905 $self->{nc} == 0x0074)) { # t
5906 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) { ## Any letter not in upper case is reported.
5907 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5908 text => 'ELEMENT',
5909 line => $self->{line_prev},
5910 column => $self->{column_prev} - 5);
5911 }
5912 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5913 line => $self->{line_prev},
5914 column => $self->{column_prev} - 7};
5915 $self->{state} = DOCTYPE_MD_STATE;
5916 ## Advance to the next input character (buffer first, else the set_nc callback).
5917 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5918 $self->{line_prev} = $self->{line};
5919 $self->{column_prev} = $self->{column};
5920 $self->{column}++;
5921 $self->{nc}
5922 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5923 } else {
5924 $self->{set_nc}->($self);
5925 }
5926
5927 redo A;
5928 } else {
5929 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5930 line => $self->{line_prev},
5931 column => $self->{column_prev} - 1
5932 - (length $self->{kwd})
5933 + 1 * ($self->{nc} == -1));
5934 $self->{state} = BOGUS_COMMENT_STATE;
5935 ## Reconsume.
5936 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5937 redo A;
5938 }
5939 } elsif ($self->{state} == MD_ATTLIST_STATE) { ## Matches the rest of "ATTLIST" in either case; the tables are indexed by the number of letters already in kwd.
5940 if ($self->{nc} == [
5941 undef,
5942 0x0054, # T
5943 0x0054, # T
5944 0x004C, # L
5945 0x0049, # I
5946 0x0053, # S
5947 ]->[length $self->{kwd}] or
5948 $self->{nc} == [
5949 undef,
5950 0x0074, # t
5951 0x0074, # t
5952 0x006C, # l
5953 0x0069, # i
5954 0x0073, # s
5955 ]->[length $self->{kwd}]) {
5956 ## Stay in the state.
5957 $self->{kwd} .= chr $self->{nc};
5958 ## NOTE(review): an index past the end of either table yields undef, which == treats as 0 -- confirm nc can never be 0 here.
5959 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5960 $self->{line_prev} = $self->{line};
5961 $self->{column_prev} = $self->{column};
5962 $self->{column}++;
5963 $self->{nc}
5964 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5965 } else {
5966 $self->{set_nc}->($self);
5967 }
5968
5969 redo A;
5970 } elsif ((length $self->{kwd}) == 6 and
5971 ($self->{nc} == 0x0054 or # T
5972 $self->{nc} == 0x0074)) { # t
5973 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) { ## Any letter not in upper case is reported.
5974 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5975 text => 'ATTLIST',
5976 line => $self->{line_prev},
5977 column => $self->{column_prev} - 5);
5978 }
5979 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5980 attrdefs => [],
5981 line => $self->{line_prev},
5982 column => $self->{column_prev} - 7};
5983 $self->{state} = DOCTYPE_MD_STATE;
5984 ## Advance to the next input character (buffer first, else the set_nc callback).
5985 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5986 $self->{line_prev} = $self->{line};
5987 $self->{column_prev} = $self->{column};
5988 $self->{column}++;
5989 $self->{nc}
5990 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5991 } else {
5992 $self->{set_nc}->($self);
5993 }
5994
5995 redo A;
5996 } else {
5997 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5998 line => $self->{line_prev},
5999 column => $self->{column_prev} - 1
6000 - (length $self->{kwd})
6001 + 1 * ($self->{nc} == -1));
6002 $self->{state} = BOGUS_COMMENT_STATE;
6003 ## Reconsume.
6004 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6005 redo A;
6006 }
6007 } elsif ($self->{state} == MD_NOTATION_STATE) { ## Matches the rest of "NOTATION" in either case; the tables are indexed by the number of letters already in kwd.
6008 if ($self->{nc} == [
6009 undef,
6010 0x004F, # O
6011 0x0054, # T
6012 0x0041, # A
6013 0x0054, # T
6014 0x0049, # I
6015 0x004F, # O
6016 ]->[length $self->{kwd}] or
6017 $self->{nc} == [
6018 undef,
6019 0x006F, # o
6020 0x0074, # t
6021 0x0061, # a
6022 0x0074, # t
6023 0x0069, # i
6024 0x006F, # o
6025 ]->[length $self->{kwd}]) {
6026 ## Stay in the state.
6027 $self->{kwd} .= chr $self->{nc};
6028 ## NOTE(review): an index past the end of either table yields undef, which == treats as 0 -- confirm nc can never be 0 here.
6029 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6030 $self->{line_prev} = $self->{line};
6031 $self->{column_prev} = $self->{column};
6032 $self->{column}++;
6033 $self->{nc}
6034 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6035 } else {
6036 $self->{set_nc}->($self);
6037 }
6038
6039 redo A;
6040 } elsif ((length $self->{kwd}) == 7 and
6041 ($self->{nc} == 0x004E or # N
6042 $self->{nc} == 0x006E)) { # n
6043 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) { ## Any letter not in upper case is reported.
6044 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6045 text => 'NOTATION',
6046 line => $self->{line_prev},
6047 column => $self->{column_prev} - 6);
6048 }
6049 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6050 line => $self->{line_prev},
6051 column => $self->{column_prev} - 8};
6052 $self->{state} = DOCTYPE_MD_STATE;
6053 ## Advance to the next input character (buffer first, else the set_nc callback).
6054 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6055 $self->{line_prev} = $self->{line};
6056 $self->{column_prev} = $self->{column};
6057 $self->{column}++;
6058 $self->{nc}
6059 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6060 } else {
6061 $self->{set_nc}->($self);
6062 }
6063
6064 redo A;
6065 } else {
6066 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6067 line => $self->{line_prev},
6068 column => $self->{column_prev} - 1
6069 - (length $self->{kwd})
6070 + 1 * ($self->{nc} == -1));
6071 $self->{state} = BOGUS_COMMENT_STATE;
6072 ## Reconsume.
6073 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6074 redo A;
6075 }
6076 } elsif ($self->{state} == DOCTYPE_MD_STATE) { ## Entered right after a complete declaration keyword; a space is expected before the name.
6077 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6078 ## "DOCTYPE NOTATION state".
6079
6080 if ($is_space->{$self->{nc}}) {
6081 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6082 $self->{state} = BEFORE_MD_NAME_STATE;
6083 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
6084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6085 $self->{line_prev} = $self->{line};
6086 $self->{column_prev} = $self->{column};
6087 $self->{column}++;
6088 $self->{nc}
6089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6090 } else {
6091 $self->{set_nc}->($self);
6092 }
6093
6094 redo A;
6095 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6096 $self->{nc} == 0x0025) { # %
6097 ## XML5: Switch to the "DOCTYPE bogus comment state".
6098 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6099 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6100 ## The '%' is consumed even though the space before it was missing.
6101 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6102 $self->{line_prev} = $self->{line};
6103 $self->{column_prev} = $self->{column};
6104 $self->{column}++;
6105 $self->{nc}
6106 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6107 } else {
6108 $self->{set_nc}->($self);
6109 }
6110
6111 redo A;
6112 } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker -- TODO confirm
6113 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6114 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6115 ## Reconsume.
6116 redo A;
6117 } elsif ($self->{nc} == 0x003E) { # >
6118 ## XML5: Switch to the "DOCTYPE bogus comment state".
6119 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6120 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6121 ## No token is returned in this arm.
6122 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6123 $self->{line_prev} = $self->{line};
6124 $self->{column_prev} = $self->{column};
6125 $self->{column}++;
6126 $self->{nc}
6127 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6128 } else {
6129 $self->{set_nc}->($self);
6130 }
6131
6132 redo A;
6133 } else {
6134 ## XML5: Switch to the "DOCTYPE bogus comment state".
6135 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6136 $self->{state} = BEFORE_MD_NAME_STATE; ## The current character is reprocessed in the new state.
6137 redo A;
6138 }
6139 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) { ## Skips spaces before the declaration name; the first other character starts the name.
6140 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6141 ## before state", "DOCTYPE ATTLIST name before state".
6142
6143 if ($is_space->{$self->{nc}}) {
6144 ## Stay in the state.
6145 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
6146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6147 $self->{line_prev} = $self->{line};
6148 $self->{column_prev} = $self->{column};
6149 $self->{column}++;
6150 $self->{nc}
6151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6152 } else {
6153 $self->{set_nc}->($self);
6154 }
6155
6156 redo A;
6157 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6158 $self->{nc} == 0x0025) { # %
6159 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6160 ## '%' is only special while the token is still a general entity token.
6161 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6162 $self->{line_prev} = $self->{line};
6163 $self->{column_prev} = $self->{column};
6164 $self->{column}++;
6165 $self->{nc}
6166 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6167 } else {
6168 $self->{set_nc}->($self);
6169 }
6170
6171 redo A;
6172 } elsif ($self->{nc} == 0x003E) { # >
6173 ## XML5: Same as "Anything else".
6174 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6175 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6176 ## No token is returned in this arm.
6177 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6178 $self->{line_prev} = $self->{line};
6179 $self->{column_prev} = $self->{column};
6180 $self->{column}++;
6181 $self->{nc}
6182 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6183 } else {
6184 $self->{set_nc}->($self);
6185 }
6186
6187 redo A;
6188 } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker -- TODO confirm
6189 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6190 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6191 ## Reconsume.
6192 redo A;
6193 } else {
6194 ## XML5: [ATTLIST] Not defined yet.
6195 $self->{ct}->{name} .= chr $self->{nc}; ## First character of the name.
6196 $self->{state} = MD_NAME_STATE;
6197
6198 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6199 $self->{line_prev} = $self->{line};
6200 $self->{column_prev} = $self->{column};
6201 $self->{column}++;
6202 $self->{nc}
6203 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6204 } else {
6205 $self->{set_nc}->($self);
6206 }
6207
6208 redo A;
6209 }
6210 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) { ## Entered after '%' in an ENTITY declaration; a following space makes it a parameter entity.
6211 if ($is_space->{$self->{nc}}) {
6212 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6213 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN; ## Retypes the token created as GENERAL_ENTITY_TOKEN.
6214 $self->{state} = BEFORE_MD_NAME_STATE;
6215 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
6216 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6217 $self->{line_prev} = $self->{line};
6218 $self->{column_prev} = $self->{column};
6219 $self->{column}++;
6220 $self->{nc}
6221 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6222 } else {
6223 $self->{set_nc}->($self);
6224 }
6225
6226 redo A;
6227 } elsif ($self->{nc} == 0x003E) { # >
6228 ## XML5: Same as "Anything else".
6229 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6230 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6231 ## No token is returned in this arm.
6232 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6233 $self->{line_prev} = $self->{line};
6234 $self->{column_prev} = $self->{column};
6235 $self->{column}++;
6236 $self->{nc}
6237 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6238 } else {
6239 $self->{set_nc}->($self);
6240 }
6241
6242 redo A;
6243 } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker -- TODO confirm
6244 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6245 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6246 ## Reconsume.
6247 redo A;
6248 } else {
6249 ## XML5: No parse error.
6250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6251 $self->{state} = BOGUS_COMMENT_STATE;
6252 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6253 ## Reconsume.
6254 redo A;
6255 }
6256 } elsif ($self->{state} == MD_NAME_STATE) { ## Accumulates the declaration name in $self->{ct}->{name} until a space, '>' or nc == -1.
6257 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6258
6259 if ($is_space->{$self->{nc}}) {
6260 if ($self->{ct}->{type} == ATTLIST_TOKEN) { ## The follow-up state depends on the token type.
6261 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6262 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6263 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6264 } else { # ENTITY/NOTATION
6265 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6266 }
6267 ## Advance: take the next char from char_buffer if any is left, otherwise call the set_nc callback.
6268 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6269 $self->{line_prev} = $self->{line};
6270 $self->{column_prev} = $self->{column};
6271 $self->{column}++;
6272 $self->{nc}
6273 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6274 } else {
6275 $self->{set_nc}->($self);
6276 }
6277
6278 redo A;
6279 } elsif ($self->{nc} == 0x003E) { # >
6280 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6281 # ATTLIST with a name only: no error is reported.
6282 } else {
6283 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6284 }
6285 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6286
6287 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6288 $self->{line_prev} = $self->{line};
6289 $self->{column_prev} = $self->{column};
6290 $self->{column}++;
6291 $self->{nc}
6292 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6293 } else {
6294 $self->{set_nc}->($self);
6295 }
6296
6297 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6298 redo A; ## Not reached: follows the return above.
6299 } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker -- TODO confirm
6300 ## XML5: [ATTLIST] No parse error.
6301 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6302 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6303 ## Reconsume.
6304 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6305 redo A; ## Not reached: follows the return above.
6306 } else {
6307 ## XML5: [ATTLIST] Not defined yet.
6308 $self->{ct}->{name} .= chr $self->{nc};
6309 ## Stay in the state.
6310
6311 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6312 $self->{line_prev} = $self->{line};
6313 $self->{column_prev} = $self->{column};
6314 $self->{column}++;
6315 $self->{nc}
6316 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6317 } else {
6318 $self->{set_nc}->($self);
6319 }
6320
6321 redo A;
6322 }
6323 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6324 if ($is_space->{$self->{nc}}) {
6325 ## Stay in the state.
6326
6327 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6328 $self->{line_prev} = $self->{line};
6329 $self->{column_prev} = $self->{column};
6330 $self->{column}++;
6331 $self->{nc}
6332 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6333 } else {
6334 $self->{set_nc}->($self);
6335 }
6336
6337 redo A;
6338 } elsif ($self->{nc} == 0x003E) { # >
6339 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6340
6341 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6342 $self->{line_prev} = $self->{line};
6343 $self->{column_prev} = $self->{column};
6344 $self->{column}++;
6345 $self->{nc}
6346 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6347 } else {
6348 $self->{set_nc}->($self);
6349 }
6350
6351 return ($self->{ct}); # ATTLIST
6352 redo A;
6353 } elsif ($self->{nc} == -1) {
6354 ## XML5: No parse error.
6355 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6356 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6357 return ($self->{ct});
6358 redo A;
6359 } else {
6360 ## XML5: Not defined yet.
6361 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6362 tokens => [],
6363 line => $self->{line}, column => $self->{column}};
6364 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6365
6366 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6367 $self->{line_prev} = $self->{line};
6368 $self->{column_prev} = $self->{column};
6369 $self->{column}++;
6370 $self->{nc}
6371 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6372 } else {
6373 $self->{set_nc}->($self);
6374 }
6375
6376 redo A;
6377 }
6378 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6379 if ($is_space->{$self->{nc}}) {
6380 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6381
6382 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6383 $self->{line_prev} = $self->{line};
6384 $self->{column_prev} = $self->{column};
6385 $self->{column}++;
6386 $self->{nc}
6387 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6388 } else {
6389 $self->{set_nc}->($self);
6390 }
6391
6392 redo A;
6393 } elsif ($self->{nc} == 0x003E) { # >
6394 ## XML5: Same as "anything else".
6395 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6396 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6397
6398 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6399 $self->{line_prev} = $self->{line};
6400 $self->{column_prev} = $self->{column};
6401 $self->{column}++;
6402 $self->{nc}
6403 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6404 } else {
6405 $self->{set_nc}->($self);
6406 }
6407
6408 return ($self->{ct}); # ATTLIST
6409 redo A;
6410 } elsif ($self->{nc} == 0x0028) { # (
6411 ## XML5: Same as "anything else".
6412 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6413 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6414
6415 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6416 $self->{line_prev} = $self->{line};
6417 $self->{column_prev} = $self->{column};
6418 $self->{column}++;
6419 $self->{nc}
6420 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6421 } else {
6422 $self->{set_nc}->($self);
6423 }
6424
6425 redo A;
6426 } elsif ($self->{nc} == -1) {
6427 ## XML5: No parse error.
6428 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6429 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6430
6431 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6432 $self->{line_prev} = $self->{line};
6433 $self->{column_prev} = $self->{column};
6434 $self->{column}++;
6435 $self->{nc}
6436 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6437 } else {
6438 $self->{set_nc}->($self);
6439 }
6440
6441 return ($self->{ct}); # ATTLIST
6442 redo A;
6443 } else {
6444 ## XML5: Not defined yet.
6445 $self->{ca}->{name} .= chr $self->{nc};
6446 ## Stay in the state.
6447
6448 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6449 $self->{line_prev} = $self->{line};
6450 $self->{column_prev} = $self->{column};
6451 $self->{column}++;
6452 $self->{nc}
6453 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6454 } else {
6455 $self->{set_nc}->($self);
6456 }
6457
6458 redo A;
6459 }
6460 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6461 if ($is_space->{$self->{nc}}) {
6462 ## Stay in the state.
6463
6464 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6465 $self->{line_prev} = $self->{line};
6466 $self->{column_prev} = $self->{column};
6467 $self->{column}++;
6468 $self->{nc}
6469 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6470 } else {
6471 $self->{set_nc}->($self);
6472 }
6473
6474 redo A;
6475 } elsif ($self->{nc} == 0x003E) { # >
6476 ## XML5: Same as "anything else".
6477 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6478 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6479
6480 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6481 $self->{line_prev} = $self->{line};
6482 $self->{column_prev} = $self->{column};
6483 $self->{column}++;
6484 $self->{nc}
6485 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6486 } else {
6487 $self->{set_nc}->($self);
6488 }
6489
6490 return ($self->{ct}); # ATTLIST
6491 redo A;
6492 } elsif ($self->{nc} == 0x0028) { # (
6493 ## XML5: Same as "anything else".
6494 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6495
6496 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6497 $self->{line_prev} = $self->{line};
6498 $self->{column_prev} = $self->{column};
6499 $self->{column}++;
6500 $self->{nc}
6501 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6502 } else {
6503 $self->{set_nc}->($self);
6504 }
6505
6506 redo A;
6507 } elsif ($self->{nc} == -1) {
6508 ## XML5: No parse error.
6509 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6510 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6511
6512 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6513 $self->{line_prev} = $self->{line};
6514 $self->{column_prev} = $self->{column};
6515 $self->{column}++;
6516 $self->{nc}
6517 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6518 } else {
6519 $self->{set_nc}->($self);
6520 }
6521
6522 return ($self->{ct});
6523 redo A;
6524 } else {
6525 ## XML5: Not defined yet.
6526 $self->{ca}->{type} = chr $self->{nc};
6527 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6528
6529 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6530 $self->{line_prev} = $self->{line};
6531 $self->{column_prev} = $self->{column};
6532 $self->{column}++;
6533 $self->{nc}
6534 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6535 } else {
6536 $self->{set_nc}->($self);
6537 }
6538
6539 redo A;
6540 }
6541 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6542 if ($is_space->{$self->{nc}}) {
6543 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6544
6545 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6546 $self->{line_prev} = $self->{line};
6547 $self->{column_prev} = $self->{column};
6548 $self->{column}++;
6549 $self->{nc}
6550 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6551 } else {
6552 $self->{set_nc}->($self);
6553 }
6554
6555 redo A;
6556 } elsif ($self->{nc} == 0x0023) { # #
6557 ## XML5: Same as "anything else".
6558 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6559 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6560
6561 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6562 $self->{line_prev} = $self->{line};
6563 $self->{column_prev} = $self->{column};
6564 $self->{column}++;
6565 $self->{nc}
6566 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6567 } else {
6568 $self->{set_nc}->($self);
6569 }
6570
6571 redo A;
6572 } elsif ($self->{nc} == 0x0022) { # "
6573 ## XML5: Same as "anything else".
6574 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6575 $self->{ca}->{value} = '';
6576 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6577
6578 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6579 $self->{line_prev} = $self->{line};
6580 $self->{column_prev} = $self->{column};
6581 $self->{column}++;
6582 $self->{nc}
6583 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6584 } else {
6585 $self->{set_nc}->($self);
6586 }
6587
6588 redo A;
6589 } elsif ($self->{nc} == 0x0027) { # '
6590 ## XML5: Same as "anything else".
6591 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6592 $self->{ca}->{value} = '';
6593 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6594
6595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6596 $self->{line_prev} = $self->{line};
6597 $self->{column_prev} = $self->{column};
6598 $self->{column}++;
6599 $self->{nc}
6600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6601 } else {
6602 $self->{set_nc}->($self);
6603 }
6604
6605 redo A;
6606 } elsif ($self->{nc} == 0x003E) { # >
6607 ## XML5: Same as "anything else".
6608 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6609 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6610
6611 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6612 $self->{line_prev} = $self->{line};
6613 $self->{column_prev} = $self->{column};
6614 $self->{column}++;
6615 $self->{nc}
6616 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6617 } else {
6618 $self->{set_nc}->($self);
6619 }
6620
6621 return ($self->{ct}); # ATTLIST
6622 redo A;
6623 } elsif ($self->{nc} == 0x0028) { # (
6624 ## XML5: Same as "anything else".
6625 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6626 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6627
6628 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6629 $self->{line_prev} = $self->{line};
6630 $self->{column_prev} = $self->{column};
6631 $self->{column}++;
6632 $self->{nc}
6633 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6634 } else {
6635 $self->{set_nc}->($self);
6636 }
6637
6638 redo A;
6639 } elsif ($self->{nc} == -1) {
6640 ## XML5: No parse error.
6641 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6642 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6643
6644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6645 $self->{line_prev} = $self->{line};
6646 $self->{column_prev} = $self->{column};
6647 $self->{column}++;
6648 $self->{nc}
6649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6650 } else {
6651 $self->{set_nc}->($self);
6652 }
6653
6654 return ($self->{ct});
6655 redo A;
6656 } else {
6657 ## XML5: Not defined yet.
6658 $self->{ca}->{type} .= chr $self->{nc};
6659 ## Stay in the state.
6660
6661 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6662 $self->{line_prev} = $self->{line};
6663 $self->{column_prev} = $self->{column};
6664 $self->{column}++;
6665 $self->{nc}
6666 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6667 } else {
6668 $self->{set_nc}->($self);
6669 }
6670
6671 redo A;
6672 }
6673 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6674 if ($is_space->{$self->{nc}}) {
6675 ## Stay in the state.
6676
6677 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6678 $self->{line_prev} = $self->{line};
6679 $self->{column_prev} = $self->{column};
6680 $self->{column}++;
6681 $self->{nc}
6682 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6683 } else {
6684 $self->{set_nc}->($self);
6685 }
6686
6687 redo A;
6688 } elsif ($self->{nc} == 0x0028) { # (
6689 ## XML5: Same as "anything else".
6690 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6691
6692 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6693 $self->{line_prev} = $self->{line};
6694 $self->{column_prev} = $self->{column};
6695 $self->{column}++;
6696 $self->{nc}
6697 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6698 } else {
6699 $self->{set_nc}->($self);
6700 }
6701
6702 redo A;
6703 } elsif ($self->{nc} == 0x0023) { # #
6704 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6705
6706 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6707 $self->{line_prev} = $self->{line};
6708 $self->{column_prev} = $self->{column};
6709 $self->{column}++;
6710 $self->{nc}
6711 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6712 } else {
6713 $self->{set_nc}->($self);
6714 }
6715
6716 redo A;
6717 } elsif ($self->{nc} == 0x0022) { # "
6718 ## XML5: Same as "anything else".
6719 $self->{ca}->{value} = '';
6720 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6721
6722 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6723 $self->{line_prev} = $self->{line};
6724 $self->{column_prev} = $self->{column};
6725 $self->{column}++;
6726 $self->{nc}
6727 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6728 } else {
6729 $self->{set_nc}->($self);
6730 }
6731
6732 redo A;
6733 } elsif ($self->{nc} == 0x0027) { # '
6734 ## XML5: Same as "anything else".
6735 $self->{ca}->{value} = '';
6736 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6737
6738 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6739 $self->{line_prev} = $self->{line};
6740 $self->{column_prev} = $self->{column};
6741 $self->{column}++;
6742 $self->{nc}
6743 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6744 } else {
6745 $self->{set_nc}->($self);
6746 }
6747
6748 redo A;
6749 } elsif ($self->{nc} == 0x003E) { # >
6750 ## XML5: Same as "anything else".
6751 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6752 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6753
6754 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6755 $self->{line_prev} = $self->{line};
6756 $self->{column_prev} = $self->{column};
6757 $self->{column}++;
6758 $self->{nc}
6759 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6760 } else {
6761 $self->{set_nc}->($self);
6762 }
6763
6764 return ($self->{ct}); # ATTLIST
6765 redo A;
6766 } elsif ($self->{nc} == -1) {
6767 ## XML5: No parse error.
6768 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6769 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6770
6771 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6772 $self->{line_prev} = $self->{line};
6773 $self->{column_prev} = $self->{column};
6774 $self->{column}++;
6775 $self->{nc}
6776 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6777 } else {
6778 $self->{set_nc}->($self);
6779 }
6780
6781 return ($self->{ct});
6782 redo A;
6783 } else {
6784 ## XML5: Switch to the "DOCTYPE bogus comment state".
6785 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6786 $self->{ca}->{value} = '';
6787 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6788 ## Reconsume.
6789 redo A;
6790 }
6791 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6792 if ($is_space->{$self->{nc}}) {
6793 ## Stay in the state.
6794
6795 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6796 $self->{line_prev} = $self->{line};
6797 $self->{column_prev} = $self->{column};
6798 $self->{column}++;
6799 $self->{nc}
6800 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6801 } else {
6802 $self->{set_nc}->($self);
6803 }
6804
6805 redo A;
6806 } elsif ($self->{nc} == 0x007C) { # |
6807 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6808 ## Stay in the state.
6809
6810 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6811 $self->{line_prev} = $self->{line};
6812 $self->{column_prev} = $self->{column};
6813 $self->{column}++;
6814 $self->{nc}
6815 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6816 } else {
6817 $self->{set_nc}->($self);
6818 }
6819
6820 redo A;
6821 } elsif ($self->{nc} == 0x0029) { # )
6822 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6823 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6824
6825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6826 $self->{line_prev} = $self->{line};
6827 $self->{column_prev} = $self->{column};
6828 $self->{column}++;
6829 $self->{nc}
6830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6831 } else {
6832 $self->{set_nc}->($self);
6833 }
6834
6835 redo A;
6836 } elsif ($self->{nc} == 0x003E) { # >
6837 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6838 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6839
6840 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6841 $self->{line_prev} = $self->{line};
6842 $self->{column_prev} = $self->{column};
6843 $self->{column}++;
6844 $self->{nc}
6845 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6846 } else {
6847 $self->{set_nc}->($self);
6848 }
6849
6850 return ($self->{ct}); # ATTLIST
6851 redo A;
6852 } elsif ($self->{nc} == -1) {
6853 ## XML5: No parse error.
6854 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6855 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6856
6857 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6858 $self->{line_prev} = $self->{line};
6859 $self->{column_prev} = $self->{column};
6860 $self->{column}++;
6861 $self->{nc}
6862 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6863 } else {
6864 $self->{set_nc}->($self);
6865 }
6866
6867 return ($self->{ct});
6868 redo A;
6869 } else {
6870 push @{$self->{ca}->{tokens}}, chr $self->{nc};
6871 $self->{state} = ALLOWED_TOKEN_STATE;
6872
6873 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6874 $self->{line_prev} = $self->{line};
6875 $self->{column_prev} = $self->{column};
6876 $self->{column}++;
6877 $self->{nc}
6878 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6879 } else {
6880 $self->{set_nc}->($self);
6881 }
6882
6883 redo A;
6884 }
6885 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6886 if ($is_space->{$self->{nc}}) {
6887 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6888
6889 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6890 $self->{line_prev} = $self->{line};
6891 $self->{column_prev} = $self->{column};
6892 $self->{column}++;
6893 $self->{nc}
6894 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6895 } else {
6896 $self->{set_nc}->($self);
6897 }
6898
6899 redo A;
6900 } elsif ($self->{nc} == 0x007C) { # |
6901 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6902
6903 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6904 $self->{line_prev} = $self->{line};
6905 $self->{column_prev} = $self->{column};
6906 $self->{column}++;
6907 $self->{nc}
6908 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6909 } else {
6910 $self->{set_nc}->($self);
6911 }
6912
6913 redo A;
6914 } elsif ($self->{nc} == 0x0029) { # )
6915 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6916
6917 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6918 $self->{line_prev} = $self->{line};
6919 $self->{column_prev} = $self->{column};
6920 $self->{column}++;
6921 $self->{nc}
6922 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6923 } else {
6924 $self->{set_nc}->($self);
6925 }
6926
6927 redo A;
6928 } elsif ($self->{nc} == 0x003E) { # >
6929 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6930 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6931
6932 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6933 $self->{line_prev} = $self->{line};
6934 $self->{column_prev} = $self->{column};
6935 $self->{column}++;
6936 $self->{nc}
6937 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6938 } else {
6939 $self->{set_nc}->($self);
6940 }
6941
6942 return ($self->{ct}); # ATTLIST
6943 redo A;
6944 } elsif ($self->{nc} == -1) {
6945 ## XML5: No parse error.
6946 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6947 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6948
6949 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6950 $self->{line_prev} = $self->{line};
6951 $self->{column_prev} = $self->{column};
6952 $self->{column}++;
6953 $self->{nc}
6954 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6955 } else {
6956 $self->{set_nc}->($self);
6957 }
6958
6959 return ($self->{ct});
6960 redo A;
6961 } else {
6962 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6963 ## Stay in the state.
6964
6965 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6966 $self->{line_prev} = $self->{line};
6967 $self->{column_prev} = $self->{column};
6968 $self->{column}++;
6969 $self->{nc}
6970 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6971 } else {
6972 $self->{set_nc}->($self);
6973 }
6974
6975 redo A;
6976 }
6977 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6978 if ($is_space->{$self->{nc}}) {
6979 ## Stay in the state.
6980
6981 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6982 $self->{line_prev} = $self->{line};
6983 $self->{column_prev} = $self->{column};
6984 $self->{column}++;
6985 $self->{nc}
6986 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6987 } else {
6988 $self->{set_nc}->($self);
6989 }
6990
6991 redo A;
6992 } elsif ($self->{nc} == 0x007C) { # |
6993 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6994
6995 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6996 $self->{line_prev} = $self->{line};
6997 $self->{column_prev} = $self->{column};
6998 $self->{column}++;
6999 $self->{nc}
7000 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7001 } else {
7002 $self->{set_nc}->($self);
7003 }
7004
7005 redo A;
7006 } elsif ($self->{nc} == 0x0029) { # )
7007 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7008
7009 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7010 $self->{line_prev} = $self->{line};
7011 $self->{column_prev} = $self->{column};
7012 $self->{column}++;
7013 $self->{nc}
7014 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7015 } else {
7016 $self->{set_nc}->($self);
7017 }
7018
7019 redo A;
7020 } elsif ($self->{nc} == 0x003E) { # >
7021 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7022 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7023
7024 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7025 $self->{line_prev} = $self->{line};
7026 $self->{column_prev} = $self->{column};
7027 $self->{column}++;
7028 $self->{nc}
7029 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7030 } else {
7031 $self->{set_nc}->($self);
7032 }
7033
7034 return ($self->{ct}); # ATTLIST
7035 redo A;
7036 } elsif ($self->{nc} == -1) {
7037 ## XML5: No parse error.
7038 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7039 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7040
7041 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7042 $self->{line_prev} = $self->{line};
7043 $self->{column_prev} = $self->{column};
7044 $self->{column}++;
7045 $self->{nc}
7046 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7047 } else {
7048 $self->{set_nc}->($self);
7049 }
7050
7051 return ($self->{ct});
7052 redo A;
7053 } else {
7054 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7055 line => $self->{line_prev},
7056 column => $self->{column_prev});
7057 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7058 $self->{state} = ALLOWED_TOKEN_STATE;
7059
7060 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7061 $self->{line_prev} = $self->{line};
7062 $self->{column_prev} = $self->{column};
7063 $self->{column}++;
7064 $self->{nc}
7065 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7066 } else {
7067 $self->{set_nc}->($self);
7068 }
7069
7070 redo A;
7071 }
7072 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7073 if ($is_space->{$self->{nc}}) {
7074 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7075
7076 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7077 $self->{line_prev} = $self->{line};
7078 $self->{column_prev} = $self->{column};
7079 $self->{column}++;
7080 $self->{nc}
7081 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7082 } else {
7083 $self->{set_nc}->($self);
7084 }
7085
7086 redo A;
7087 } elsif ($self->{nc} == 0x0023) { # #
7088 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7089 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7090
7091 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7092 $self->{line_prev} = $self->{line};
7093 $self->{column_prev} = $self->{column};
7094 $self->{column}++;
7095 $self->{nc}
7096 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7097 } else {
7098 $self->{set_nc}->($self);
7099 }
7100
7101 redo A;
7102 } elsif ($self->{nc} == 0x0022) { # "
7103 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7104 $self->{ca}->{value} = '';
7105 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7106
7107 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7108 $self->{line_prev} = $self->{line};
7109 $self->{column_prev} = $self->{column};
7110 $self->{column}++;
7111 $self->{nc}
7112 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7113 } else {
7114 $self->{set_nc}->($self);
7115 }
7116
7117 redo A;
7118 } elsif ($self->{nc} == 0x0027) { # '
7119 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7120 $self->{ca}->{value} = '';
7121 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7122
7123 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7124 $self->{line_prev} = $self->{line};
7125 $self->{column_prev} = $self->{column};
7126 $self->{column}++;
7127 $self->{nc}
7128 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7129 } else {
7130 $self->{set_nc}->($self);
7131 }
7132
7133 redo A;
7134 } elsif ($self->{nc} == 0x003E) { # >
7135 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7136 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7137
7138 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7139 $self->{line_prev} = $self->{line};
7140 $self->{column_prev} = $self->{column};
7141 $self->{column}++;
7142 $self->{nc}
7143 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7144 } else {
7145 $self->{set_nc}->($self);
7146 }
7147
7148 return ($self->{ct}); # ATTLIST
7149 redo A;
7150 } elsif ($self->{nc} == -1) {
7151 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7152 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7153
7154 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7155 $self->{line_prev} = $self->{line};
7156 $self->{column_prev} = $self->{column};
7157 $self->{column}++;
7158 $self->{nc}
7159 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7160 } else {
7161 $self->{set_nc}->($self);
7162 }
7163
7164 return ($self->{ct});
7165 redo A;
7166 } else {
7167 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7168 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7169 ## Reconsume.
7170 redo A;
7171 }
7172 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7173 if ($is_space->{$self->{nc}}) {
7174 ## Stay in the state.
7175
7176 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7177 $self->{line_prev} = $self->{line};
7178 $self->{column_prev} = $self->{column};
7179 $self->{column}++;
7180 $self->{nc}
7181 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7182 } else {
7183 $self->{set_nc}->($self);
7184 }
7185
7186 redo A;
7187 } elsif ($self->{nc} == 0x0023) { # #
7188 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7189
7190 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7191 $self->{line_prev} = $self->{line};
7192 $self->{column_prev} = $self->{column};
7193 $self->{column}++;
7194 $self->{nc}
7195 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7196 } else {
7197 $self->{set_nc}->($self);
7198 }
7199
7200 redo A;
7201 } elsif ($self->{nc} == 0x0022) { # "
7202 $self->{ca}->{value} = '';
7203 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7204
7205 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7206 $self->{line_prev} = $self->{line};
7207 $self->{column_prev} = $self->{column};
7208 $self->{column}++;
7209 $self->{nc}
7210 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7211 } else {
7212 $self->{set_nc}->($self);
7213 }
7214
7215 redo A;
7216 } elsif ($self->{nc} == 0x0027) { # '
7217 $self->{ca}->{value} = '';
7218 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7219
7220 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7221 $self->{line_prev} = $self->{line};
7222 $self->{column_prev} = $self->{column};
7223 $self->{column}++;
7224 $self->{nc}
7225 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7226 } else {
7227 $self->{set_nc}->($self);
7228 }
7229
7230 redo A;
7231 } elsif ($self->{nc} == 0x003E) { # >
7232 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7233 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7234
7235 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7236 $self->{line_prev} = $self->{line};
7237 $self->{column_prev} = $self->{column};
7238 $self->{column}++;
7239 $self->{nc}
7240 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7241 } else {
7242 $self->{set_nc}->($self);
7243 }
7244
7245 return ($self->{ct}); # ATTLIST
7246 redo A;
7247 } elsif ($self->{nc} == -1) {
7248 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7249 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7250
7251 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7252 $self->{line_prev} = $self->{line};
7253 $self->{column_prev} = $self->{column};
7254 $self->{column}++;
7255 $self->{nc}
7256 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7257 } else {
7258 $self->{set_nc}->($self);
7259 }
7260
7261 return ($self->{ct});
7262 redo A;
7263 } else {
7264 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7265 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7266 ## Reconsume.
7267 redo A;
7268 }
7269 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7270 if ($is_space->{$self->{nc}}) {
7271 ## XML5: No parse error.
7272 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7273 $self->{state} = BOGUS_MD_STATE;
7274 ## Reconsume.
7275 redo A;
7276 } elsif ($self->{nc} == 0x0022) { # "
7277 ## XML5: Same as "anything else".
7278 $self->{ca}->{value} = '';
7279 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7280
7281 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7282 $self->{line_prev} = $self->{line};
7283 $self->{column_prev} = $self->{column};
7284 $self->{column}++;
7285 $self->{nc}
7286 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7287 } else {
7288 $self->{set_nc}->($self);
7289 }
7290
7291 redo A;
7292 } elsif ($self->{nc} == 0x0027) { # '
7293 ## XML5: Same as "anything else".
7294 $self->{ca}->{value} = '';
7295 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7296
7297 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7298 $self->{line_prev} = $self->{line};
7299 $self->{column_prev} = $self->{column};
7300 $self->{column}++;
7301 $self->{nc}
7302 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7303 } else {
7304 $self->{set_nc}->($self);
7305 }
7306
7307 redo A;
7308 } elsif ($self->{nc} == 0x003E) { # >
7309 ## XML5: Same as "anything else".
7310 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7311 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7312
7313 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7314 $self->{line_prev} = $self->{line};
7315 $self->{column_prev} = $self->{column};
7316 $self->{column}++;
7317 $self->{nc}
7318 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7319 } else {
7320 $self->{set_nc}->($self);
7321 }
7322
7323 return ($self->{ct}); # ATTLIST
7324 redo A;
7325 } elsif ($self->{nc} == -1) {
7326 ## XML5: No parse error.
7327 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7328 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7329
7330 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7331 $self->{line_prev} = $self->{line};
7332 $self->{column_prev} = $self->{column};
7333 $self->{column}++;
7334 $self->{nc}
7335 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7336 } else {
7337 $self->{set_nc}->($self);
7338 }
7339
7340 return ($self->{ct});
7341 redo A;
7342 } else {
7343 $self->{ca}->{default} = chr $self->{nc};
7344 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7345
7346 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7347 $self->{line_prev} = $self->{line};
7348 $self->{column_prev} = $self->{column};
7349 $self->{column}++;
7350 $self->{nc}
7351 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7352 } else {
7353 $self->{set_nc}->($self);
7354 }
7355
7356 redo A;
7357 }
7358 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7359 if ($is_space->{$self->{nc}}) {
7360 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7361
7362 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7363 $self->{line_prev} = $self->{line};
7364 $self->{column_prev} = $self->{column};
7365 $self->{column}++;
7366 $self->{nc}
7367 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7368 } else {
7369 $self->{set_nc}->($self);
7370 }
7371
7372 redo A;
7373 } elsif ($self->{nc} == 0x0022) { # "
7374 ## XML5: Same as "anything else".
7375 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7376 $self->{ca}->{value} = '';
7377 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7378
7379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7380 $self->{line_prev} = $self->{line};
7381 $self->{column_prev} = $self->{column};
7382 $self->{column}++;
7383 $self->{nc}
7384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7385 } else {
7386 $self->{set_nc}->($self);
7387 }
7388
7389 redo A;
7390 } elsif ($self->{nc} == 0x0027) { # '
7391 ## XML5: Same as "anything else".
7392 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7393 $self->{ca}->{value} = '';
7394 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7395
7396 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7397 $self->{line_prev} = $self->{line};
7398 $self->{column_prev} = $self->{column};
7399 $self->{column}++;
7400 $self->{nc}
7401 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7402 } else {
7403 $self->{set_nc}->($self);
7404 }
7405
7406 redo A;
7407 } elsif ($self->{nc} == 0x003E) { # >
7408 ## XML5: Same as "anything else".
7409 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7410 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7411
7412 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7413 $self->{line_prev} = $self->{line};
7414 $self->{column_prev} = $self->{column};
7415 $self->{column}++;
7416 $self->{nc}
7417 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7418 } else {
7419 $self->{set_nc}->($self);
7420 }
7421
7422 return ($self->{ct}); # ATTLIST
7423 redo A;
7424 } elsif ($self->{nc} == -1) {
7425 ## XML5: No parse error.
7426 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7427 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7428 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7429
7430 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7431 $self->{line_prev} = $self->{line};
7432 $self->{column_prev} = $self->{column};
7433 $self->{column}++;
7434 $self->{nc}
7435 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7436 } else {
7437 $self->{set_nc}->($self);
7438 }
7439
7440 return ($self->{ct});
7441 redo A;
7442 } else {
7443 $self->{ca}->{default} .= chr $self->{nc};
7444 ## Stay in the state.
7445
7446 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7447 $self->{line_prev} = $self->{line};
7448 $self->{column_prev} = $self->{column};
7449 $self->{column}++;
7450 $self->{nc}
7451 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7452 } else {
7453 $self->{set_nc}->($self);
7454 }
7455
7456 redo A;
7457 }
7458 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7459 if ($is_space->{$self->{nc}}) {
7460 ## Stay in the state.
7461
7462 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7463 $self->{line_prev} = $self->{line};
7464 $self->{column_prev} = $self->{column};
7465 $self->{column}++;
7466 $self->{nc}
7467 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7468 } else {
7469 $self->{set_nc}->($self);
7470 }
7471
7472 redo A;
7473 } elsif ($self->{nc} == 0x0022) { # "
7474 $self->{ca}->{value} = '';
7475 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7476
7477 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7478 $self->{line_prev} = $self->{line};
7479 $self->{column_prev} = $self->{column};
7480 $self->{column}++;
7481 $self->{nc}
7482 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7483 } else {
7484 $self->{set_nc}->($self);
7485 }
7486
7487 redo A;
7488 } elsif ($self->{nc} == 0x0027) { # '
7489 $self->{ca}->{value} = '';
7490 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7491
7492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7493 $self->{line_prev} = $self->{line};
7494 $self->{column_prev} = $self->{column};
7495 $self->{column}++;
7496 $self->{nc}
7497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7498 } else {
7499 $self->{set_nc}->($self);
7500 }
7501
7502 redo A;
7503 } elsif ($self->{nc} == 0x003E) { # >
7504 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7505 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7506
7507 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7508 $self->{line_prev} = $self->{line};
7509 $self->{column_prev} = $self->{column};
7510 $self->{column}++;
7511 $self->{nc}
7512 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7513 } else {
7514 $self->{set_nc}->($self);
7515 }
7516
7517 return ($self->{ct}); # ATTLIST
7518 redo A;
7519 } elsif ($self->{nc} == -1) {
7520 ## XML5: No parse error.
7521 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7522 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7523 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7524
7525 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7526 $self->{line_prev} = $self->{line};
7527 $self->{column_prev} = $self->{column};
7528 $self->{column}++;
7529 $self->{nc}
7530 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7531 } else {
7532 $self->{set_nc}->($self);
7533 }
7534
7535 return ($self->{ct});
7536 redo A;
7537 } else {
7538 ## XML5: Not defined yet.
7539 if ($self->{ca}->{default} eq 'FIXED') {
7540 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7541 } else {
7542 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7543 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7544 }
7545 ## Reconsume.
7546 redo A;
7547 }
7548 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7549 if ($is_space->{$self->{nc}} or
7550 $self->{nc} == -1 or
7551 $self->{nc} == 0x003E) { # >
7552 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7553 ## Reconsume.
7554 redo A;
7555 } else {
7556 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7557 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7558 ## Reconsume.
7559 redo A;
7560 }
7561 } elsif ($self->{state} == NDATA_STATE) {
7562 ## ASCII case-insensitive
7563 if ($self->{nc} == [
7564 undef,
7565 0x0044, # D
7566 0x0041, # A
7567 0x0054, # T
7568 ]->[length $self->{kwd}] or
7569 $self->{nc} == [
7570 undef,
7571 0x0064, # d
7572 0x0061, # a
7573 0x0074, # t
7574 ]->[length $self->{kwd}]) {
7575
7576 ## Stay in the state.
7577 $self->{kwd} .= chr $self->{nc};
7578
7579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7580 $self->{line_prev} = $self->{line};
7581 $self->{column_prev} = $self->{column};
7582 $self->{column}++;
7583 $self->{nc}
7584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7585 } else {
7586 $self->{set_nc}->($self);
7587 }
7588
7589 redo A;
7590 } elsif ((length $self->{kwd}) == 4 and
7591 ($self->{nc} == 0x0041 or # A
7592 $self->{nc} == 0x0061)) { # a
7593 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7594
7595 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7596 text => 'NDATA',
7597 line => $self->{line_prev},
7598 column => $self->{column_prev} - 4);
7599 } else {
7600
7601 }
7602 $self->{state} = AFTER_NDATA_STATE;
7603
7604 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7605 $self->{line_prev} = $self->{line};
7606 $self->{column_prev} = $self->{column};
7607 $self->{column}++;
7608 $self->{nc}
7609 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7610 } else {
7611 $self->{set_nc}->($self);
7612 }
7613
7614 redo A;
7615 } else {
7616 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7617 line => $self->{line_prev},
7618 column => $self->{column_prev} + 1
7619 - length $self->{kwd});
7620
7621 $self->{state} = BOGUS_MD_STATE;
7622 ## Reconsume.
7623 redo A;
7624 }
7625 } elsif ($self->{state} == AFTER_NDATA_STATE) {
7626 if ($is_space->{$self->{nc}}) {
7627 $self->{state} = BEFORE_NOTATION_NAME_STATE;
7628
7629 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7630 $self->{line_prev} = $self->{line};
7631 $self->{column_prev} = $self->{column};
7632 $self->{column}++;
7633 $self->{nc}
7634 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7635 } else {
7636 $self->{set_nc}->($self);
7637 }
7638
7639 redo A;
7640 } elsif ($self->{nc} == 0x003E) { # >
7641 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7642 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7643
7644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7645 $self->{line_prev} = $self->{line};
7646 $self->{column_prev} = $self->{column};
7647 $self->{column}++;
7648 $self->{nc}
7649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7650 } else {
7651 $self->{set_nc}->($self);
7652 }
7653
7654 return ($self->{ct}); # ENTITY
7655 redo A;
7656 } elsif ($self->{nc} == -1) {
7657 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7658 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7659
7660 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7661 $self->{line_prev} = $self->{line};
7662 $self->{column_prev} = $self->{column};
7663 $self->{column}++;
7664 $self->{nc}
7665 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7666 } else {
7667 $self->{set_nc}->($self);
7668 }
7669
7670 return ($self->{ct}); # ENTITY
7671 redo A;
7672 } else {
7673 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7674 line => $self->{line_prev},
7675 column => $self->{column_prev} + 1
7676 - length $self->{kwd});
7677 $self->{state} = BOGUS_MD_STATE;
7678 ## Reconsume.
7679 redo A;
7680 }
7681 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7682 if ($is_space->{$self->{nc}}) {
7683 ## Stay in the state.
7684
7685 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7686 $self->{line_prev} = $self->{line};
7687 $self->{column_prev} = $self->{column};
7688 $self->{column}++;
7689 $self->{nc}
7690 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7691 } else {
7692 $self->{set_nc}->($self);
7693 }
7694
7695 redo A;
7696 } elsif ($self->{nc} == 0x003E) { # >
7697 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7698 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7699
7700 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7701 $self->{line_prev} = $self->{line};
7702 $self->{column_prev} = $self->{column};
7703 $self->{column}++;
7704 $self->{nc}
7705 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7706 } else {
7707 $self->{set_nc}->($self);
7708 }
7709
7710 return ($self->{ct}); # ENTITY
7711 redo A;
7712 } elsif ($self->{nc} == -1) {
7713 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7714 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7715
7716 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7717 $self->{line_prev} = $self->{line};
7718 $self->{column_prev} = $self->{column};
7719 $self->{column}++;
7720 $self->{nc}
7721 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7722 } else {
7723 $self->{set_nc}->($self);
7724 }
7725
7726 return ($self->{ct}); # ENTITY
7727 redo A;
7728 } else {
7729 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7730 $self->{state} = NOTATION_NAME_STATE;
7731
7732 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7733 $self->{line_prev} = $self->{line};
7734 $self->{column_prev} = $self->{column};
7735 $self->{column}++;
7736 $self->{nc}
7737 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7738 } else {
7739 $self->{set_nc}->($self);
7740 }
7741
7742 redo A;
7743 }
7744 } elsif ($self->{state} == NOTATION_NAME_STATE) {
7745 if ($is_space->{$self->{nc}}) {
7746 $self->{state} = AFTER_MD_DEF_STATE;
7747
7748 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7749 $self->{line_prev} = $self->{line};
7750 $self->{column_prev} = $self->{column};
7751 $self->{column}++;
7752 $self->{nc}
7753 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7754 } else {
7755 $self->{set_nc}->($self);
7756 }
7757
7758 redo A;
7759 } elsif ($self->{nc} == 0x003E) { # >
7760 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7761
7762 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7763 $self->{line_prev} = $self->{line};
7764 $self->{column_prev} = $self->{column};
7765 $self->{column}++;
7766 $self->{nc}
7767 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7768 } else {
7769 $self->{set_nc}->($self);
7770 }
7771
7772 return ($self->{ct}); # ENTITY
7773 redo A;
7774 } elsif ($self->{nc} == -1) {
7775 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7776 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7777
7778 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7779 $self->{line_prev} = $self->{line};
7780 $self->{column_prev} = $self->{column};
7781 $self->{column}++;
7782 $self->{nc}
7783 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7784 } else {
7785 $self->{set_nc}->($self);
7786 }
7787
7788 return ($self->{ct}); # ENTITY
7789 redo A;
7790 } else {
7791 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7792 ## Stay in the state.
7793
7794 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7795 $self->{line_prev} = $self->{line};
7796 $self->{column_prev} = $self->{column};
7797 $self->{column}++;
7798 $self->{nc}
7799 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7800 } else {
7801 $self->{set_nc}->($self);
7802 }
7803
7804 redo A;
7805 }
7806 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7807 if ($self->{nc} == 0x0022) { # "
7808 $self->{state} = AFTER_MD_DEF_STATE;
7809
7810 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7811 $self->{line_prev} = $self->{line};
7812 $self->{column_prev} = $self->{column};
7813 $self->{column}++;
7814 $self->{nc}
7815 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7816 } else {
7817 $self->{set_nc}->($self);
7818 }
7819
7820 redo A;
7821 } elsif ($self->{nc} == 0x0026) { # &
7822 $self->{prev_state} = $self->{state};
7823 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7824 $self->{entity_add} = 0x0022; # "
7825
7826 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7827 $self->{line_prev} = $self->{line};
7828 $self->{column_prev} = $self->{column};
7829 $self->{column}++;
7830 $self->{nc}
7831 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7832 } else {
7833 $self->{set_nc}->($self);
7834 }
7835
7836 redo A;
7837 ## TODO: %
7838 } elsif ($self->{nc} == -1) {
7839 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7840 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7841 ## Reconsume.
7842 return ($self->{ct}); # ENTITY
7843 redo A;
7844 } else {
7845 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7846
7847 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7848 $self->{line_prev} = $self->{line};
7849 $self->{column_prev} = $self->{column};
7850 $self->{column}++;
7851 $self->{nc}
7852 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7853 } else {
7854 $self->{set_nc}->($self);
7855 }
7856
7857 redo A;
7858 }
7859 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7860 if ($self->{nc} == 0x0027) { # '
7861 $self->{state} = AFTER_MD_DEF_STATE;
7862
7863 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7864 $self->{line_prev} = $self->{line};
7865 $self->{column_prev} = $self->{column};
7866 $self->{column}++;
7867 $self->{nc}
7868 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7869 } else {
7870 $self->{set_nc}->($self);
7871 }
7872
7873 redo A;
7874 } elsif ($self->{nc} == 0x0026) { # &
7875 $self->{prev_state} = $self->{state};
7876 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7877 $self->{entity_add} = 0x0027; # '
7878
7879 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7880 $self->{line_prev} = $self->{line};
7881 $self->{column_prev} = $self->{column};
7882 $self->{column}++;
7883 $self->{nc}
7884 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7885 } else {
7886 $self->{set_nc}->($self);
7887 }
7888
7889 redo A;
7890 ## TODO: %
7891 } elsif ($self->{nc} == -1) {
7892 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7893 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7894 ## Reconsume.
7895 return ($self->{ct}); # ENTITY
7896 redo A;
7897 } else {
7898 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7899
7900 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7901 $self->{line_prev} = $self->{line};
7902 $self->{column_prev} = $self->{column};
7903 $self->{column}++;
7904 $self->{nc}
7905 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7906 } else {
7907 $self->{set_nc}->($self);
7908 }
7909
7910 redo A;
7911 }
7912 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7913 if ($is_space->{$self->{nc}} or
7914 {
7915 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7916 $self->{entity_add} => 1,
7917 }->{$self->{nc}}) {
7918 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7919 line => $self->{line_prev},
7920 column => $self->{column_prev}
7921 + ($self->{nc} == -1 ? 1 : 0));
7922 ## Don't consume
7923 ## Return nothing.
7924 #
7925 } elsif ($self->{nc} == 0x0023) { # #
7926 $self->{ca} = $self->{ct};
7927 $self->{state} = ENTITY_HASH_STATE;
7928 $self->{kwd} = '#';
7929
7930 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7931 $self->{line_prev} = $self->{line};
7932 $self->{column_prev} = $self->{column};
7933 $self->{column}++;
7934 $self->{nc}
7935 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7936 } else {
7937 $self->{set_nc}->($self);
7938 }
7939
7940 redo A;
7941 } else {
7942 #
7943 }
7944
7945 $self->{ct}->{value} .= '&';
7946 $self->{state} = $self->{prev_state};
7947 ## Reconsume.
7948 redo A;
7949 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7950 if ($is_space->{$self->{nc}}) {
7951 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7952
7953 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7954 $self->{line_prev} = $self->{line};
7955 $self->{column_prev} = $self->{column};
7956 $self->{column}++;
7957 $self->{nc}
7958 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7959 } else {
7960 $self->{set_nc}->($self);
7961 }
7962
7963 redo A;
7964 } elsif ($self->{nc} == 0x0028) { # (
7965 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
7966 $self->{ct}->{content} = ['('];
7967 $self->{group_depth} = 1;
7968
7969 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7970 $self->{line_prev} = $self->{line};
7971 $self->{column_prev} = $self->{column};
7972 $self->{column}++;
7973 $self->{nc}
7974 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7975 } else {
7976 $self->{set_nc}->($self);
7977 }
7978
7979 redo A;
7980 } elsif ($self->{nc} == 0x003E) { # >
7981 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
7982 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7983
7984 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7985 $self->{line_prev} = $self->{line};
7986 $self->{column_prev} = $self->{column};
7987 $self->{column}++;
7988 $self->{nc}
7989 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7990 } else {
7991 $self->{set_nc}->($self);
7992 }
7993
7994 return ($self->{ct}); # ELEMENT
7995 redo A;
7996 } elsif ($self->{nc} == -1) {
7997 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7998 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7999
8000 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8001 $self->{line_prev} = $self->{line};
8002 $self->{column_prev} = $self->{column};
8003 $self->{column}++;
8004 $self->{nc}
8005 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8006 } else {
8007 $self->{set_nc}->($self);
8008 }
8009
8010 return ($self->{ct}); # ELEMENT
8011 redo A;
8012 } else {
8013 $self->{ct}->{content} = [chr $self->{nc}];
8014 $self->{state} = CONTENT_KEYWORD_STATE;
8015
8016 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8017 $self->{line_prev} = $self->{line};
8018 $self->{column_prev} = $self->{column};
8019 $self->{column}++;
8020 $self->{nc}
8021 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8022 } else {
8023 $self->{set_nc}->($self);
8024 }
8025
8026 redo A;
8027 }
8028 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8029 if ($is_space->{$self->{nc}}) {
8030 $self->{state} = AFTER_MD_DEF_STATE;
8031
8032 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8033 $self->{line_prev} = $self->{line};
8034 $self->{column_prev} = $self->{column};
8035 $self->{column}++;
8036 $self->{nc}
8037 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8038 } else {
8039 $self->{set_nc}->($self);
8040 }
8041
8042 redo A;
8043 } elsif ($self->{nc} == 0x003E) { # >
8044 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8045
8046 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8047 $self->{line_prev} = $self->{line};
8048 $self->{column_prev} = $self->{column};
8049 $self->{column}++;
8050 $self->{nc}
8051 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8052 } else {
8053 $self->{set_nc}->($self);
8054 }
8055
8056 return ($self->{ct}); # ELEMENT
8057 redo A;
8058 } elsif ($self->{nc} == -1) {
8059 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8060 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8061
8062 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8063 $self->{line_prev} = $self->{line};
8064 $self->{column_prev} = $self->{column};
8065 $self->{column}++;
8066 $self->{nc}
8067 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8068 } else {
8069 $self->{set_nc}->($self);
8070 }
8071
8072 return ($self->{ct}); # ELEMENT
8073 redo A;
8074 } else {
8075 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8076 ## Stay in the state.
8077
8078 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8079 $self->{line_prev} = $self->{line};
8080 $self->{column_prev} = $self->{column};
8081 $self->{column}++;
8082 $self->{nc}
8083 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8084 } else {
8085 $self->{set_nc}->($self);
8086 }
8087
8088 redo A;
8089 }
8090 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8091 if ($is_space->{$self->{nc}}) {
8092 ## Stay in the state.
8093
8094 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8095 $self->{line_prev} = $self->{line};
8096 $self->{column_prev} = $self->{column};
8097 $self->{column}++;
8098 $self->{nc}
8099 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8100 } else {
8101 $self->{set_nc}->($self);
8102 }
8103
8104 redo A;
8105 } elsif ($self->{nc} == 0x0028) { # (
8106 $self->{group_depth}++;
8107 push @{$self->{ct}->{content}}, chr $self->{nc};
8108 ## Stay in the state.
8109
8110 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8111 $self->{line_prev} = $self->{line};
8112 $self->{column_prev} = $self->{column};
8113 $self->{column}++;
8114 $self->{nc}
8115 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8116 } else {
8117 $self->{set_nc}->($self);
8118 }
8119
8120 redo A;
8121 } elsif ($self->{nc} == 0x007C or # |
8122 $self->{nc} == 0x002C) { # ,
8123 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8124 ## Stay in the state.
8125
8126 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8127 $self->{line_prev} = $self->{line};
8128 $self->{column_prev} = $self->{column};
8129 $self->{column}++;
8130 $self->{nc}
8131 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8132 } else {
8133 $self->{set_nc}->($self);
8134 }
8135
8136 redo A;
8137 } elsif ($self->{nc} == 0x0029) { # )
8138 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8139 push @{$self->{ct}->{content}}, chr $self->{nc};
8140 $self->{group_depth}--;
8141 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8142
8143 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8144 $self->{line_prev} = $self->{line};
8145 $self->{column_prev} = $self->{column};
8146 $self->{column}++;
8147 $self->{nc}
8148 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8149 } else {
8150 $self->{set_nc}->($self);
8151 }
8152
8153 redo A;
8154 } elsif ($self->{nc} == 0x003E) { # >
8155 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8156 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8157 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8158
8159 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8160 $self->{line_prev} = $self->{line};
8161 $self->{column_prev} = $self->{column};
8162 $self->{column}++;
8163 $self->{nc}
8164 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8165 } else {
8166 $self->{set_nc}->($self);
8167 }
8168
8169 return ($self->{ct}); # ELEMENT
8170 redo A;
8171 } elsif ($self->{nc} == -1) {
8172 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8173 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8174 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8175
8176 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8177 $self->{line_prev} = $self->{line};
8178 $self->{column_prev} = $self->{column};
8179 $self->{column}++;
8180 $self->{nc}
8181 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8182 } else {
8183 $self->{set_nc}->($self);
8184 }
8185
8186 return ($self->{ct}); # ELEMENT
8187 redo A;
8188 } else {
8189 push @{$self->{ct}->{content}}, chr $self->{nc};
8190 $self->{state} = CM_ELEMENT_NAME_STATE;
8191
8192 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8193 $self->{line_prev} = $self->{line};
8194 $self->{column_prev} = $self->{column};
8195 $self->{column}++;
8196 $self->{nc}
8197 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8198 } else {
8199 $self->{set_nc}->($self);
8200 }
8201
8202 redo A;
8203 }
8204 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8205 if ($is_space->{$self->{nc}}) {
8206 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8207
8208 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8209 $self->{line_prev} = $self->{line};
8210 $self->{column_prev} = $self->{column};
8211 $self->{column}++;
8212 $self->{nc}
8213 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8214 } else {
8215 $self->{set_nc}->($self);
8216 }
8217
8218 redo A;
8219 } elsif ($self->{nc} == 0x002A or # *
8220 $self->{nc} == 0x002B or # +
8221 $self->{nc} == 0x003F) { # ?
8222 push @{$self->{ct}->{content}}, chr $self->{nc};
8223 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8224
8225 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8226 $self->{line_prev} = $self->{line};
8227 $self->{column_prev} = $self->{column};
8228 $self->{column}++;
8229 $self->{nc}
8230 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8231 } else {
8232 $self->{set_nc}->($self);
8233 }
8234
8235 redo A;
8236 } elsif ($self->{nc} == 0x007C or # |
8237 $self->{nc} == 0x002C) { # ,
8238 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8239 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8240
8241 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8242 $self->{line_prev} = $self->{line};
8243 $self->{column_prev} = $self->{column};
8244 $self->{column}++;
8245 $self->{nc}
8246 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8247 } else {
8248 $self->{set_nc}->($self);
8249 }
8250
8251 redo A;
8252 } elsif ($self->{nc} == 0x0029) { # )
8253 $self->{group_depth}--;
8254 push @{$self->{ct}->{content}}, chr $self->{nc};
8255 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8256
8257 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8258 $self->{line_prev} = $self->{line};
8259 $self->{column_prev} = $self->{column};
8260 $self->{column}++;
8261 $self->{nc}
8262 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8263 } else {
8264 $self->{set_nc}->($self);
8265 }
8266
8267 redo A;
8268 } elsif ($self->{nc} == 0x003E) { # >
8269 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8270 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8271 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8272
8273 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8274 $self->{line_prev} = $self->{line};
8275 $self->{column_prev} = $self->{column};
8276 $self->{column}++;
8277 $self->{nc}
8278 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8279 } else {
8280 $self->{set_nc}->($self);
8281 }
8282
8283 return ($self->{ct}); # ELEMENT
8284 redo A;
8285 } elsif ($self->{nc} == -1) {
8286 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8287 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8288 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8289
8290 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8291 $self->{line_prev} = $self->{line};
8292 $self->{column_prev} = $self->{column};
8293 $self->{column}++;
8294 $self->{nc}
8295 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8296 } else {
8297 $self->{set_nc}->($self);
8298 }
8299
8300 return ($self->{ct}); # ELEMENT
8301 redo A;
8302 } else {
8303 $self->{ct}->{content}->[-1] .= chr $self->{nc};
8304 ## Stay in the state.
8305
8306 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8307 $self->{line_prev} = $self->{line};
8308 $self->{column_prev} = $self->{column};
8309 $self->{column}++;
8310 $self->{nc}
8311 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8312 } else {
8313 $self->{set_nc}->($self);
8314 }
8315
8316 redo A;
8317 }
8318 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8319 if ($is_space->{$self->{nc}}) {
8320 ## Stay in the state.
8321
8322 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8323 $self->{line_prev} = $self->{line};
8324 $self->{column_prev} = $self->{column};
8325 $self->{column}++;
8326 $self->{nc}
8327 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8328 } else {
8329 $self->{set_nc}->($self);
8330 }
8331
8332 redo A;
8333 } elsif ($self->{nc} == 0x007C or # |
8334 $self->{nc} == 0x002C) { # ,
8335 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8336 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8337
8338 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8339 $self->{line_prev} = $self->{line};
8340 $self->{column_prev} = $self->{column};
8341 $self->{column}++;
8342 $self->{nc}
8343 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8344 } else {
8345 $self->{set_nc}->($self);
8346 }
8347
8348 redo A;
8349 } elsif ($self->{nc} == 0x0029) { # )
8350 $self->{group_depth}--;
8351 push @{$self->{ct}->{content}}, chr $self->{nc};
8352 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8353
8354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8355 $self->{line_prev} = $self->{line};
8356 $self->{column_prev} = $self->{column};
8357 $self->{column}++;
8358 $self->{nc}
8359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8360 } else {
8361 $self->{set_nc}->($self);
8362 }
8363
8364 redo A;
8365 } elsif ($self->{nc} == 0x003E) { # >
8366 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8367 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8368 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8369
8370 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8371 $self->{line_prev} = $self->{line};
8372 $self->{column_prev} = $self->{column};
8373 $self->{column}++;
8374 $self->{nc}
8375 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8376 } else {
8377 $self->{set_nc}->($self);
8378 }
8379
8380 return ($self->{ct}); # ELEMENT
8381 redo A;
8382 } elsif ($self->{nc} == -1) {
8383 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8384 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8385 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8386
8387 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8388 $self->{line_prev} = $self->{line};
8389 $self->{column_prev} = $self->{column};
8390 $self->{column}++;
8391 $self->{nc}
8392 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8393 } else {
8394 $self->{set_nc}->($self);
8395 }
8396
8397 return ($self->{ct}); # ELEMENT
8398 redo A;
8399 } else {
8400 $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8401 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8402 $self->{state} = BOGUS_MD_STATE;
8403
8404 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8405 $self->{line_prev} = $self->{line};
8406 $self->{column_prev} = $self->{column};
8407 $self->{column}++;
8408 $self->{nc}
8409 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8410 } else {
8411 $self->{set_nc}->($self);
8412 }
8413
8414 redo A;
8415 }
8416 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8417 if ($is_space->{$self->{nc}}) {
8418 if ($self->{group_depth}) {
8419 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8420 } else {
8421 $self->{state} = AFTER_MD_DEF_STATE;
8422 }
8423
8424 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8425 $self->{line_prev} = $self->{line};
8426 $self->{column_prev} = $self->{column};
8427 $self->{column}++;
8428 $self->{nc}
8429 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8430 } else {
8431 $self->{set_nc}->($self);
8432 }
8433
8434 redo A;
8435 } elsif ($self->{nc} == 0x002A or # *
8436 $self->{nc} == 0x002B or # +
8437 $self->{nc} == 0x003F) { # ?
8438 push @{$self->{ct}->{content}}, chr $self->{nc};
8439 if ($self->{group_depth}) {
8440 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8441 } else {
8442 $self->{state} = AFTER_MD_DEF_STATE;
8443 }
8444
8445 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8446 $self->{line_prev} = $self->{line};
8447 $self->{column_prev} = $self->{column};
8448 $self->{column}++;
8449 $self->{nc}
8450 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8451 } else {
8452 $self->{set_nc}->($self);
8453 }
8454
8455 redo A;
8456 } elsif ($self->{nc} == 0x0029) { # )
8457 if ($self->{group_depth}) {
8458 $self->{group_depth}--;
8459 push @{$self->{ct}->{content}}, chr $self->{nc};
8460 ## Stay in the state.
8461
8462 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8463 $self->{line_prev} = $self->{line};
8464 $self->{column_prev} = $self->{column};
8465 $self->{column}++;
8466 $self->{nc}
8467 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8468 } else {
8469 $self->{set_nc}->($self);
8470 }
8471
8472 redo A;
8473 } else {
8474 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8475 $self->{state} = BOGUS_MD_STATE;
8476 ## Reconsume.
8477 redo A;
8478 }
8479 } elsif ($self->{nc} == 0x003E) { # >
8480 if ($self->{group_depth}) {
8481 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8482 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8483 }
8484 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8485
8486 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8487 $self->{line_prev} = $self->{line};
8488 $self->{column_prev} = $self->{column};
8489 $self->{column}++;
8490 $self->{nc}
8491 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8492 } else {
8493 $self->{set_nc}->($self);
8494 }
8495
8496 return ($self->{ct}); # ELEMENT
8497 redo A;
8498 } elsif ($self->{nc} == -1) {
8499 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8500 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8501 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8502
8503 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8504 $self->{line_prev} = $self->{line};
8505 $self->{column_prev} = $self->{column};
8506 $self->{column}++;
8507 $self->{nc}
8508 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8509 } else {
8510 $self->{set_nc}->($self);
8511 }
8512
8513 return ($self->{ct}); # ELEMENT
8514 redo A;
8515 } else {
8516 if ($self->{group_depth}) {
8517 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8518 } else {
8519 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8520 $self->{state} = BOGUS_MD_STATE;
8521 }
8522 ## Reconsume.
8523 redo A;
8524 }
8525 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8526 if ($is_space->{$self->{nc}}) {
8527 ## Stay in the state.
8528
8529 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8530 $self->{line_prev} = $self->{line};
8531 $self->{column_prev} = $self->{column};
8532 $self->{column}++;
8533 $self->{nc}
8534 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8535 } else {
8536 $self->{set_nc}->($self);
8537 }
8538
8539 redo A;
8540 } elsif ($self->{nc} == 0x003E) { # >
8541 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8542
8543 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8544 $self->{line_prev} = $self->{line};
8545 $self->{column_prev} = $self->{column};
8546 $self->{column}++;
8547 $self->{nc}
8548 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8549 } else {
8550 $self->{set_nc}->($self);
8551 }
8552
8553 return ($self->{ct}); # ENTITY/ELEMENT
8554 redo A;
8555 } elsif ($self->{nc} == -1) {
8556 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8557 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8558
8559 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8560 $self->{line_prev} = $self->{line};
8561 $self->{column_prev} = $self->{column};
8562 $self->{column}++;
8563 $self->{nc}
8564 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8565 } else {
8566 $self->{set_nc}->($self);
8567 }
8568
8569 return ($self->{ct}); # ENTITY/ELEMENT
8570 redo A;
8571 } else {
8572 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8573 $self->{state} = BOGUS_MD_STATE;
8574 ## Reconsume.
8575 redo A;
8576 }
8577 } elsif ($self->{state} == BOGUS_MD_STATE) {
8578 if ($self->{nc} == 0x003E) { # >
8579 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8580
8581 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8582 $self->{line_prev} = $self->{line};
8583 $self->{column_prev} = $self->{column};
8584 $self->{column}++;
8585 $self->{nc}
8586 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8587 } else {
8588 $self->{set_nc}->($self);
8589 }
8590
8591 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8592 redo A;
8593 } elsif ($self->{nc} == -1) {
8594 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8595 ## Reconsume.
8596 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8597 redo A;
8598 } else {
8599 ## Stay in the state.
8600
8601 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8602 $self->{line_prev} = $self->{line};
8603 $self->{column_prev} = $self->{column};
8604 $self->{column}++;
8605 $self->{nc}
8606 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8607 } else {
8608 $self->{set_nc}->($self);
8609 }
8610
8611 redo A;
8612 }
8613 } else {
8614 die "$0: $self->{state}: Unknown state";
8615 }
8616 } # A
8617
8618 die "$0: _get_next_token: unexpected case";
8619 } # _get_next_token
8620
8621 1; ## A module file must end with a true value so that |require|/|use| succeeds.
8622 ## $Date: 2008/10/19 13:43:55 $
8623

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24