/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.23 - (show annotations) (download)
Sun Oct 19 13:43:55 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.22: +5 -5 lines
++ whatpm/t/xml/ChangeLog	19 Oct 2008 13:43:45 -0000
	* attlists-1.dat: Test results updated.  New tests on empty
	attlist declaration and duplications are added.

	* doctypes-2.dat: Test results updated.

	* eldecls-1.dat, entities-2.dat, notations-1.dat: New tests on
	duplications are added.

	* entities-1.dat: New tests on duplications and predefined
	entities are added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 13:40:35 -0000
	* Tokenizer.pm.src: Column number counting fixed.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 13:41:50 -0000
	* Parser.pm.src: Raise a parse error or warning for
	declaration/definition duplications.  Raise a warning for an empty
	attlist declaration.  Raise an error for an ill-declared predefined
	entity.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.22 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
## Exporter setup.  Every token type constant can be imported on
## request, either by name or all at once through the |:token| tag.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## The one list of token type constant names; both the by-name
  ## export list and the |:token| tag are built from it.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_type_names;
  our %EXPORT_TAGS = (token => [@token_type_names]);
}
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49
## Token types used by both HTML and XML tokenization.
sub DOCTYPE_TOKEN     () { 1 } ## XML5: No DOCTYPE token.
sub COMMENT_TOKEN     () { 2 }
sub START_TAG_TOKEN   () { 3 }
sub END_TAG_TOKEN     () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN   () { 6 }

## NOTE: For internal processing.
sub ABORT_TOKEN () { 8 }

## NOTE: The remaining token types are XML only.
sub PI_TOKEN               () { 7 }
sub END_OF_DOCTYPE_TOKEN   () { 9 }
sub ATTLIST_TOKEN          () { 10 }
sub ELEMENT_TOKEN          () { 11 }
sub GENERAL_ENTITY_TOKEN   () { 12 }
sub PARAMETER_ENTITY_TOKEN () { 13 }
sub NOTATION_TOKEN         () { 14 }
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} flag set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78
## Individual content model flag bits.
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, each expressed as a combination of the flag bits
## above (PLAINTEXT has none of them set).
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL    () { CM_FULL_MARKUP | CM_ENTITY }
87
88 ## Tokenizer states
89
## Each tokenizer state is a distinct small integer.  The values 1 and
## 12 are not assigned; their definitions are kept below, commented
## out.

## Data and tag states
sub DATA_STATE () { 0 }
#sub ENTITY_DATA_STATE () { 1 }
sub TAG_OPEN_STATE () { 2 }
sub CLOSE_TAG_OPEN_STATE () { 3 }
sub TAG_NAME_STATE () { 4 }

## Attribute states
sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
sub ATTRIBUTE_NAME_STATE () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }

## Markup declaration and comment states
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub COMMENT_START_STATE () { 14 }
sub COMMENT_START_DASH_STATE () { 15 }
sub COMMENT_STATE () { 16 }
sub COMMENT_END_STATE () { 17 }
sub COMMENT_END_DASH_STATE () { 18 }
sub BOGUS_COMMENT_STATE () { 19 }

## DOCTYPE states
sub DOCTYPE_STATE () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
sub DOCTYPE_NAME_STATE () { 22 }
sub AFTER_DOCTYPE_NAME_STATE () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
sub BOGUS_DOCTYPE_STATE () { 32 }

sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
sub SELF_CLOSING_START_TAG_STATE () { 34 }
sub CDATA_SECTION_STATE () { 35 }

## States that split a single state of the spec into several; the
## trailing comment names the corresponding state in the spec.
sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec

## NOTE: "Entity data state", "entity in attribute value state", and
## "consume a character reference" algorithm are jointly implemented
## using the following six states:
sub ENTITY_STATE () { 44 }
sub ENTITY_HASH_STATE () { 45 }
sub NCR_NUM_STATE () { 46 }
sub HEXREF_X_STATE () { 47 }
sub HEXREF_HEX_STATE () { 48 }
sub ENTITY_NAME_STATE () { 49 }

sub PCDATA_STATE () { 50 } # "data state" in the spec
144
145 ## XML-only states
## Processing instruction states
sub PI_STATE () { 51 }
sub PI_TARGET_STATE () { 52 }
sub PI_TARGET_AFTER_STATE () { 53 }
sub PI_DATA_STATE () { 54 }
sub PI_AFTER_STATE () { 55 }
sub PI_DATA_AFTER_STATE () { 56 }

## DOCTYPE internal subset states
sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
sub DOCTYPE_TAG_STATE () { 60 }
sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }

## Markup declaration keyword and name states
sub MD_ATTLIST_STATE () { 62 }
sub MD_E_STATE () { 63 }
sub MD_ELEMENT_STATE () { 64 }
sub MD_ENTITY_STATE () { 65 }
sub MD_NOTATION_STATE () { 66 }
sub DOCTYPE_MD_STATE () { 67 }
sub BEFORE_MD_NAME_STATE () { 68 }
sub MD_NAME_STATE () { 69 }
sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }

## ATTLIST states
sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
sub ALLOWED_TOKEN_STATE () { 77 }
sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }

## NDATA, notation name and entity value states
sub BEFORE_NDATA_STATE () { 85 }
sub NDATA_STATE () { 86 }
sub AFTER_NDATA_STATE () { 87 }
sub BEFORE_NOTATION_NAME_STATE () { 88 }
sub NOTATION_NAME_STATE () { 89 }
sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
sub ENTITY_VALUE_ENTITY_STATE () { 92 }

## Element declaration states
sub AFTER_ELEMENT_NAME_STATE () { 93 }
sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
sub CONTENT_KEYWORD_STATE () { 95 }
sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
sub CM_ELEMENT_NAME_STATE () { 97 }
sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }

sub AFTER_MD_DEF_STATE () { 100 }
sub BOGUS_MD_STATE () { 101 }
197
198 ## Tree constructor state constants (see Whatpm::HTML for the full
199 ## list and descriptions)
200
## Both constants are the single bit 11 (0b100000000000, i.e. 2048).
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
sub FOREIGN_EL            () { 1 << 11 }
203
204 ## Character reference mappings
205
## Table keyed by a code point number; the value is the code point
## number it is mapped to.

## U+000D is mapped to U+000A.
my $charref_map = {0x0D => 0x000A};

## 0x80..0x9F, in ascending order of the key.
@{$charref_map}{0x80 .. 0x9F} = (
  0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
  0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
);

## Everything listed below is mapped to U+FFFD.  The last item
## generates U+xFFFE and U+xFFFF for each of the planes 0x00..0x10.
$charref_map->{$_} = 0xFFFD
    for 0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
        0xD800 .. 0xDFFF,
        0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
        map { my $plane_base = $_ << 16;
              ($plane_base | 0xFFFE, $plane_base | 0xFFFF) } 0x00 .. 0x10;
249
250 ## Implementations MUST act as if they used the state machine in the spec.
251
## Resets the tokenizer-related fields of |$self| to their initial
## values and then obtains the first input character by invoking the
## |$self->{set_nc}| callback with |$self| as its only argument.
##
## NOTE: Fields set by |new| constructor (not touched here):
##   $self->{level}
##   $self->{set_nc}
##   $self->{parse_error}
##   $self->{is_xml} (if XML)
##
## The value of the last statement (the new, empty token queue) is
## what the sub evaluates to, as before.
sub _initialize_tokenizer ($) {
  my $self = shift;

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied (position 0, length 0),
  ## so there is never a buffered character to take here; the first
  ## input character always has to come from the |set_nc| callback.
  $self->{set_nc}->($self);

  $self->{token} = []; # queue of tokens waiting to be returned
  # $self->{escape}
} # _initialize_tokenizer
290
291 ## A token has:
292 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 ## ->{name} (DOCTYPE_TOKEN)
295 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 ## ->{target} (PI_TOKEN)
297 ## ->{pubid} (DOCTYPE_TOKEN)
298 ## ->{sysid} (DOCTYPE_TOKEN)
299 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301 ## ->{name}
302 ## ->{value}
303 ## ->{has_reference} == 1 or 0
304 ## ->{index}: Index of the attribute in a tag.
305 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309
310 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311 ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
312 ## while the token is pushed back to the stack.
313
314 ## Emitted token MUST immediately be handled by the tree construction state.
315
316 ## Before each step, UA MAY check to see if either one of the scripts in
317 ## "list of scripts that will execute as soon as possible" or the first
318 ## script in the "list of scripts that will execute asynchronously",
319 ## has completed loading. If one has, then it MUST be executed
320 ## and removed from the list.
321
322 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323 ## (This requirement was dropped from HTML5 spec, unfortunately.)
324
## Lookup table: a code point number is a key (with a true value) if
## and only if it is treated as a space character.
my $is_space = {
  map { $_ => 1 }
    0x0009,  # CHARACTER TABULATION (HT)
    0x000A,  # LINE FEED (LF)
    #0x000B, # LINE TABULATION (VT) -- not a space character
    0x000C,  # FORM FEED (FF) ## XML5: Not a space character.
    #0x000D, # CARRIAGE RETURN (CR) -- entry disabled
    0x0020,  # SPACE (SP)
};
333
334 sub _get_next_token ($) {
335 my $self = shift;
336
337 if ($self->{self_closing}) {
338 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339 ## NOTE: The |self_closing| flag is only set by start tag token.
340 ## In addition, when a start tag token is emitted, it is always set to
341 ## |ct|.
342 delete $self->{self_closing};
343 }
344
345 if (@{$self->{token}}) {
346 $self->{self_closing} = $self->{token}->[0]->{self_closing};
347 return shift @{$self->{token}};
348 }
349
350 A: {
351 if ($self->{state} == PCDATA_STATE) {
352 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353
354 if ($self->{nc} == 0x0026) { # &
355
356 ## NOTE: In the spec, the tokenizer is switched to the
357 ## "entity data state". In this implementation, the tokenizer
358 ## is switched to the |ENTITY_STATE|, which is an implementation
359 ## of the "consume a character reference" algorithm.
360 $self->{entity_add} = -1;
361 $self->{prev_state} = DATA_STATE;
362 $self->{state} = ENTITY_STATE;
363
364 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365 $self->{line_prev} = $self->{line};
366 $self->{column_prev} = $self->{column};
367 $self->{column}++;
368 $self->{nc}
369 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370 } else {
371 $self->{set_nc}->($self);
372 }
373
374 redo A;
375 } elsif ($self->{nc} == 0x003C) { # <
376
377 $self->{state} = TAG_OPEN_STATE;
378
379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380 $self->{line_prev} = $self->{line};
381 $self->{column_prev} = $self->{column};
382 $self->{column}++;
383 $self->{nc}
384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385 } else {
386 $self->{set_nc}->($self);
387 }
388
389 redo A;
390 } elsif ($self->{nc} == -1) {
391
392 return ({type => END_OF_FILE_TOKEN,
393 line => $self->{line}, column => $self->{column}});
394 last A; ## TODO: ok?
395 } else {
396
397 #
398 }
399
400 # Anything else
401 my $token = {type => CHARACTER_TOKEN,
402 data => chr $self->{nc},
403 line => $self->{line}, column => $self->{column},
404 };
405 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406
407 ## Stay in the state.
408
409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410 $self->{line_prev} = $self->{line};
411 $self->{column_prev} = $self->{column};
412 $self->{column}++;
413 $self->{nc}
414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415 } else {
416 $self->{set_nc}->($self);
417 }
418
419 return ($token);
420 redo A;
421 } elsif ($self->{state} == DATA_STATE) {
422 $self->{s_kwd} = '' unless defined $self->{s_kwd};
423 if ($self->{nc} == 0x0026) { # &
424 $self->{s_kwd} = '';
425 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426 not $self->{escape}) {
427
428 ## NOTE: In the spec, the tokenizer is switched to the
429 ## "entity data state". In this implementation, the tokenizer
430 ## is switched to the |ENTITY_STATE|, which is an implementation
431 ## of the "consume a character reference" algorithm.
432 $self->{entity_add} = -1;
433 $self->{prev_state} = DATA_STATE;
434 $self->{state} = ENTITY_STATE;
435
436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437 $self->{line_prev} = $self->{line};
438 $self->{column_prev} = $self->{column};
439 $self->{column}++;
440 $self->{nc}
441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442 } else {
443 $self->{set_nc}->($self);
444 }
445
446 redo A;
447 } else {
448
449 #
450 }
451 } elsif ($self->{nc} == 0x002D) { # -
452 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 if ($self->{s_kwd} eq '<!-') {
454
455 $self->{escape} = 1; # unless $self->{escape};
456 $self->{s_kwd} = '--';
457 #
458 } elsif ($self->{s_kwd} eq '-') {
459
460 $self->{s_kwd} = '--';
461 #
462 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463
464 $self->{s_kwd} .= '-';
465 #
466 } else {
467
468 $self->{s_kwd} = '-';
469 #
470 }
471 }
472
473 #
474 } elsif ($self->{nc} == 0x0021) { # !
475 if (length $self->{s_kwd}) {
476
477 $self->{s_kwd} .= '!';
478 #
479 } else {
480
481 #$self->{s_kwd} = '';
482 #
483 }
484 #
485 } elsif ($self->{nc} == 0x003C) { # <
486 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488 not $self->{escape})) {
489
490 $self->{state} = TAG_OPEN_STATE;
491
492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493 $self->{line_prev} = $self->{line};
494 $self->{column_prev} = $self->{column};
495 $self->{column}++;
496 $self->{nc}
497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498 } else {
499 $self->{set_nc}->($self);
500 }
501
502 redo A;
503 } else {
504
505 $self->{s_kwd} = '';
506 #
507 }
508 } elsif ($self->{nc} == 0x003E) { # >
509 if ($self->{escape} and
510 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511 if ($self->{s_kwd} eq '--') {
512
513 delete $self->{escape};
514 #
515 } else {
516
517 #
518 }
519 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520
521 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522 line => $self->{line_prev},
523 column => $self->{column_prev} - 1);
524 #
525 } else {
526
527 #
528 }
529
530 $self->{s_kwd} = '';
531 #
532 } elsif ($self->{nc} == 0x005D) { # ]
533 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534
535 $self->{s_kwd} .= ']';
536 } elsif ($self->{s_kwd} eq ']]') {
537
538 #
539 } else {
540
541 $self->{s_kwd} = '';
542 }
543 #
544 } elsif ($self->{nc} == -1) {
545
546 $self->{s_kwd} = '';
547 return ({type => END_OF_FILE_TOKEN,
548 line => $self->{line}, column => $self->{column}});
549 last A; ## TODO: ok?
550 } else {
551
552 $self->{s_kwd} = '';
553 #
554 }
555
556 # Anything else
557 my $token = {type => CHARACTER_TOKEN,
558 data => chr $self->{nc},
559 line => $self->{line}, column => $self->{column},
560 };
561 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 length $token->{data})) {
563 $self->{s_kwd} = '';
564 }
565
566 ## Stay in the data state.
567 if (not $self->{is_xml} and
568 $self->{content_model} == PCDATA_CONTENT_MODEL) {
569
570 $self->{state} = PCDATA_STATE;
571 } else {
572
573 ## Stay in the state.
574 }
575
576 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577 $self->{line_prev} = $self->{line};
578 $self->{column_prev} = $self->{column};
579 $self->{column}++;
580 $self->{nc}
581 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582 } else {
583 $self->{set_nc}->($self);
584 }
585
586 return ($token);
587 redo A;
588 } elsif ($self->{state} == TAG_OPEN_STATE) {
589 ## XML5: "tag state".
590
591 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592 if ($self->{nc} == 0x002F) { # /
593
594
595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596 $self->{line_prev} = $self->{line};
597 $self->{column_prev} = $self->{column};
598 $self->{column}++;
599 $self->{nc}
600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601 } else {
602 $self->{set_nc}->($self);
603 }
604
605 $self->{state} = CLOSE_TAG_OPEN_STATE;
606 redo A;
607 } elsif ($self->{nc} == 0x0021) { # !
608
609 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 #
611 } else {
612
613 $self->{s_kwd} = '';
614 #
615 }
616
617 ## reconsume
618 $self->{state} = DATA_STATE;
619 return ({type => CHARACTER_TOKEN, data => '<',
620 line => $self->{line_prev},
621 column => $self->{column_prev},
622 });
623 redo A;
624 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625 if ($self->{nc} == 0x0021) { # !
626
627 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628
629 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630 $self->{line_prev} = $self->{line};
631 $self->{column_prev} = $self->{column};
632 $self->{column}++;
633 $self->{nc}
634 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635 } else {
636 $self->{set_nc}->($self);
637 }
638
639 redo A;
640 } elsif ($self->{nc} == 0x002F) { # /
641
642 $self->{state} = CLOSE_TAG_OPEN_STATE;
643
644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645 $self->{line_prev} = $self->{line};
646 $self->{column_prev} = $self->{column};
647 $self->{column}++;
648 $self->{nc}
649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650 } else {
651 $self->{set_nc}->($self);
652 }
653
654 redo A;
655 } elsif (0x0041 <= $self->{nc} and
656 $self->{nc} <= 0x005A) { # A..Z
657
658 $self->{ct}
659 = {type => START_TAG_TOKEN,
660 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 line => $self->{line_prev},
662 column => $self->{column_prev}};
663 $self->{state} = TAG_NAME_STATE;
664
665 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666 $self->{line_prev} = $self->{line};
667 $self->{column_prev} = $self->{column};
668 $self->{column}++;
669 $self->{nc}
670 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671 } else {
672 $self->{set_nc}->($self);
673 }
674
675 redo A;
676 } elsif (0x0061 <= $self->{nc} and
677 $self->{nc} <= 0x007A) { # a..z
678
679 $self->{ct} = {type => START_TAG_TOKEN,
680 tag_name => chr ($self->{nc}),
681 line => $self->{line_prev},
682 column => $self->{column_prev}};
683 $self->{state} = TAG_NAME_STATE;
684
685 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686 $self->{line_prev} = $self->{line};
687 $self->{column_prev} = $self->{column};
688 $self->{column}++;
689 $self->{nc}
690 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691 } else {
692 $self->{set_nc}->($self);
693 }
694
695 redo A;
696 } elsif ($self->{nc} == 0x003E) { # >
697
698 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699 line => $self->{line_prev},
700 column => $self->{column_prev});
701 $self->{state} = DATA_STATE;
702 $self->{s_kwd} = '';
703
704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705 $self->{line_prev} = $self->{line};
706 $self->{column_prev} = $self->{column};
707 $self->{column}++;
708 $self->{nc}
709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710 } else {
711 $self->{set_nc}->($self);
712 }
713
714
715 return ({type => CHARACTER_TOKEN, data => '<>',
716 line => $self->{line_prev},
717 column => $self->{column_prev},
718 });
719
720 redo A;
721 } elsif ($self->{nc} == 0x003F) { # ?
722 if ($self->{is_xml}) {
723
724 $self->{state} = PI_STATE;
725
726 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727 $self->{line_prev} = $self->{line};
728 $self->{column_prev} = $self->{column};
729 $self->{column}++;
730 $self->{nc}
731 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732 } else {
733 $self->{set_nc}->($self);
734 }
735
736 redo A;
737 } else {
738
739 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740 line => $self->{line_prev},
741 column => $self->{column_prev});
742 $self->{state} = BOGUS_COMMENT_STATE;
743 $self->{ct} = {type => COMMENT_TOKEN, data => '',
744 line => $self->{line_prev},
745 column => $self->{column_prev},
746 };
747 ## $self->{nc} is intentionally left as is
748 redo A;
749 }
750 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751
752 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753 line => $self->{line_prev},
754 column => $self->{column_prev});
755 $self->{state} = DATA_STATE;
756 $self->{s_kwd} = '';
757 ## reconsume
758
759 return ({type => CHARACTER_TOKEN, data => '<',
760 line => $self->{line_prev},
761 column => $self->{column_prev},
762 });
763
764 redo A;
765 } else {
766 ## XML5: "<:" is a parse error.
767
768 $self->{ct} = {type => START_TAG_TOKEN,
769 tag_name => chr ($self->{nc}),
770 line => $self->{line_prev},
771 column => $self->{column_prev}};
772 $self->{state} = TAG_NAME_STATE;
773
774 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775 $self->{line_prev} = $self->{line};
776 $self->{column_prev} = $self->{column};
777 $self->{column}++;
778 $self->{nc}
779 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780 } else {
781 $self->{set_nc}->($self);
782 }
783
784 redo A;
785 }
786 } else {
787 die "$0: $self->{content_model} in tag open";
788 }
789 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790 ## NOTE: The "close tag open state" in the spec is implemented as
791 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792
793 ## XML5: "end tag state".
794
795 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797 if (defined $self->{last_stag_name}) {
798 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 $self->{kwd} = '';
800 ## Reconsume.
801 redo A;
802 } else {
803 ## No start tag token has ever been emitted
804 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805
806 $self->{state} = DATA_STATE;
807 $self->{s_kwd} = '';
808 ## Reconsume.
809 return ({type => CHARACTER_TOKEN, data => '</',
810 line => $l, column => $c,
811 });
812 redo A;
813 }
814 }
815
816 if (0x0041 <= $self->{nc} and
817 $self->{nc} <= 0x005A) { # A..Z
818
819 $self->{ct}
820 = {type => END_TAG_TOKEN,
821 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 line => $l, column => $c};
823 $self->{state} = TAG_NAME_STATE;
824
825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826 $self->{line_prev} = $self->{line};
827 $self->{column_prev} = $self->{column};
828 $self->{column}++;
829 $self->{nc}
830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831 } else {
832 $self->{set_nc}->($self);
833 }
834
835 redo A;
836 } elsif (0x0061 <= $self->{nc} and
837 $self->{nc} <= 0x007A) { # a..z
838
839 $self->{ct} = {type => END_TAG_TOKEN,
840 tag_name => chr ($self->{nc}),
841 line => $l, column => $c};
842 $self->{state} = TAG_NAME_STATE;
843
844 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845 $self->{line_prev} = $self->{line};
846 $self->{column_prev} = $self->{column};
847 $self->{column}++;
848 $self->{nc}
849 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850 } else {
851 $self->{set_nc}->($self);
852 }
853
854 redo A;
855 } elsif ($self->{nc} == 0x003E) { # >
856 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857 line => $self->{line_prev}, ## "<" in "</>"
858 column => $self->{column_prev} - 1);
859 $self->{state} = DATA_STATE;
860 $self->{s_kwd} = '';
861 if ($self->{is_xml}) {
862
863 ## XML5: No parse error.
864
865 ## NOTE: This parser raises a parse error, since it supports
866 ## XML1, not XML5.
867
868 ## NOTE: A short end tag token.
869 my $ct = {type => END_TAG_TOKEN,
870 tag_name => '',
871 line => $self->{line_prev},
872 column => $self->{column_prev} - 1,
873 };
874
875 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876 $self->{line_prev} = $self->{line};
877 $self->{column_prev} = $self->{column};
878 $self->{column}++;
879 $self->{nc}
880 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881 } else {
882 $self->{set_nc}->($self);
883 }
884
885 return ($ct);
886 } else {
887
888
889 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890 $self->{line_prev} = $self->{line};
891 $self->{column_prev} = $self->{column};
892 $self->{column}++;
893 $self->{nc}
894 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895 } else {
896 $self->{set_nc}->($self);
897 }
898
899 }
900 redo A;
901 } elsif ($self->{nc} == -1) {
902
903 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 $self->{s_kwd} = '';
905 $self->{state} = DATA_STATE;
906 # reconsume
907
908 return ({type => CHARACTER_TOKEN, data => '</',
909 line => $l, column => $c,
910 });
911
912 redo A;
913 } elsif (not $self->{is_xml} or
914 $is_space->{$self->{nc}}) {
915
916 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917 line => $self->{line_prev}, # "<" of "</"
918 column => $self->{column_prev} - 1);
919 $self->{state} = BOGUS_COMMENT_STATE;
920 $self->{ct} = {type => COMMENT_TOKEN, data => '',
921 line => $self->{line_prev}, # "<" of "</"
922 column => $self->{column_prev} - 1,
923 };
924 ## NOTE: $self->{nc} is intentionally left as is.
925 ## Although the "anything else" case of the spec not explicitly
926 ## states that the next input character is to be reconsumed,
927 ## it will be included to the |data| of the comment token
928 ## generated from the bogus end tag, as defined in the
929 ## "bogus comment state" entry.
930 redo A;
931 } else {
932 ## XML5: "</:" is a parse error.
933
934 $self->{ct} = {type => END_TAG_TOKEN,
935 tag_name => chr ($self->{nc}),
936 line => $l, column => $c};
937 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938
939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940 $self->{line_prev} = $self->{line};
941 $self->{column_prev} = $self->{column};
942 $self->{column}++;
943 $self->{nc}
944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945 } else {
946 $self->{set_nc}->($self);
947 }
948
949 redo A;
950 }
951 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 if (length $ch) {
954 my $CH = $ch;
955 $ch =~ tr/a-z/A-Z/;
956 my $nch = chr $self->{nc};
957 if ($nch eq $ch or $nch eq $CH) {
958
959 ## Stay in the state.
960 $self->{kwd} .= $nch;
961
962 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963 $self->{line_prev} = $self->{line};
964 $self->{column_prev} = $self->{column};
965 $self->{column}++;
966 $self->{nc}
967 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968 } else {
969 $self->{set_nc}->($self);
970 }
971
972 redo A;
973 } else {
974
975 $self->{state} = DATA_STATE;
976 $self->{s_kwd} = '';
977 ## Reconsume.
978 return ({type => CHARACTER_TOKEN,
979 data => '</' . $self->{kwd},
980 line => $self->{line_prev},
981 column => $self->{column_prev} - 1 - length $self->{kwd},
982 });
983 redo A;
984 }
985 } else { # after "<{tag-name}"
986 unless ($is_space->{$self->{nc}} or
987 {
988 0x003E => 1, # >
989 0x002F => 1, # /
990 -1 => 1, # EOF
991 }->{$self->{nc}}) {
992
993 ## Reconsume.
994 $self->{state} = DATA_STATE;
995 $self->{s_kwd} = '';
996 return ({type => CHARACTER_TOKEN,
997 data => '</' . $self->{kwd},
998 line => $self->{line_prev},
999 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 });
1001 redo A;
1002 } else {
1003
1004 $self->{ct}
1005 = {type => END_TAG_TOKEN,
1006 tag_name => $self->{last_stag_name},
1007 line => $self->{line_prev},
1008 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 $self->{state} = TAG_NAME_STATE;
1010 ## Reconsume.
1011 redo A;
1012 }
1013 }
1014 } elsif ($self->{state} == TAG_NAME_STATE) {
1015 if ($is_space->{$self->{nc}}) {
1016
1017 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018
1019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020 $self->{line_prev} = $self->{line};
1021 $self->{column_prev} = $self->{column};
1022 $self->{column}++;
1023 $self->{nc}
1024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025 } else {
1026 $self->{set_nc}->($self);
1027 }
1028
1029 redo A;
1030 } elsif ($self->{nc} == 0x003E) { # >
1031 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032
1033 $self->{last_stag_name} = $self->{ct}->{tag_name};
1034 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036 #if ($self->{ct}->{attributes}) {
1037 # ## NOTE: This should never be reached.
1038 # !!! cp (36);
1039 # !!! parse-error (type => 'end tag attribute');
1040 #} else {
1041
1042 #}
1043 } else {
1044 die "$0: $self->{ct}->{type}: Unknown token type";
1045 }
1046 $self->{state} = DATA_STATE;
1047 $self->{s_kwd} = '';
1048
1049 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050 $self->{line_prev} = $self->{line};
1051 $self->{column_prev} = $self->{column};
1052 $self->{column}++;
1053 $self->{nc}
1054 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055 } else {
1056 $self->{set_nc}->($self);
1057 }
1058
1059
1060 return ($self->{ct}); # start tag or end tag
1061
1062 redo A;
1063 } elsif (0x0041 <= $self->{nc} and
1064 $self->{nc} <= 0x005A) { # A..Z
1065
1066 $self->{ct}->{tag_name}
1067 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 # start tag or end tag
1069 ## Stay in this state
1070
1071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072 $self->{line_prev} = $self->{line};
1073 $self->{column_prev} = $self->{column};
1074 $self->{column}++;
1075 $self->{nc}
1076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077 } else {
1078 $self->{set_nc}->($self);
1079 }
1080
1081 redo A;
1082 } elsif ($self->{nc} == -1) {
1083 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085
1086 $self->{last_stag_name} = $self->{ct}->{tag_name};
1087 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089 #if ($self->{ct}->{attributes}) {
1090 # ## NOTE: This state should never be reached.
1091 # !!! cp (40);
1092 # !!! parse-error (type => 'end tag attribute');
1093 #} else {
1094
1095 #}
1096 } else {
1097 die "$0: $self->{ct}->{type}: Unknown token type";
1098 }
1099 $self->{state} = DATA_STATE;
1100 $self->{s_kwd} = '';
1101 # reconsume
1102
1103 return ($self->{ct}); # start tag or end tag
1104
1105 redo A;
1106 } elsif ($self->{nc} == 0x002F) { # /
1107
1108 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109
1110 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111 $self->{line_prev} = $self->{line};
1112 $self->{column_prev} = $self->{column};
1113 $self->{column}++;
1114 $self->{nc}
1115 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116 } else {
1117 $self->{set_nc}->($self);
1118 }
1119
1120 redo A;
1121 } else {
1122
1123 $self->{ct}->{tag_name} .= chr $self->{nc};
1124 # start tag or end tag
1125 ## Stay in the state
1126
1127 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128 $self->{line_prev} = $self->{line};
1129 $self->{column_prev} = $self->{column};
1130 $self->{column}++;
1131 $self->{nc}
1132 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133 } else {
1134 $self->{set_nc}->($self);
1135 }
1136
1137 redo A;
1138 }
1139 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 ## XML5: "Tag attribute name before state".
1141
1142 if ($is_space->{$self->{nc}}) {
1143
1144 ## Stay in the state
1145
1146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147 $self->{line_prev} = $self->{line};
1148 $self->{column_prev} = $self->{column};
1149 $self->{column}++;
1150 $self->{nc}
1151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152 } else {
1153 $self->{set_nc}->($self);
1154 }
1155
1156 redo A;
1157 } elsif ($self->{nc} == 0x003E) { # >
1158 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159
1160 $self->{last_stag_name} = $self->{ct}->{tag_name};
1161 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163 if ($self->{ct}->{attributes}) {
1164
1165 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166 } else {
1167
1168 }
1169 } else {
1170 die "$0: $self->{ct}->{type}: Unknown token type";
1171 }
1172 $self->{state} = DATA_STATE;
1173 $self->{s_kwd} = '';
1174
1175 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176 $self->{line_prev} = $self->{line};
1177 $self->{column_prev} = $self->{column};
1178 $self->{column}++;
1179 $self->{nc}
1180 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181 } else {
1182 $self->{set_nc}->($self);
1183 }
1184
1185
1186 return ($self->{ct}); # start tag or end tag
1187
1188 redo A;
1189 } elsif (0x0041 <= $self->{nc} and
1190 $self->{nc} <= 0x005A) { # A..Z
1191
1192 $self->{ca}
1193 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 value => '',
1195 line => $self->{line}, column => $self->{column}};
1196 $self->{state} = ATTRIBUTE_NAME_STATE;
1197
1198 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199 $self->{line_prev} = $self->{line};
1200 $self->{column_prev} = $self->{column};
1201 $self->{column}++;
1202 $self->{nc}
1203 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204 } else {
1205 $self->{set_nc}->($self);
1206 }
1207
1208 redo A;
1209 } elsif ($self->{nc} == 0x002F) { # /
1210
1211 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212
1213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214 $self->{line_prev} = $self->{line};
1215 $self->{column_prev} = $self->{column};
1216 $self->{column}++;
1217 $self->{nc}
1218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219 } else {
1220 $self->{set_nc}->($self);
1221 }
1222
1223 redo A;
1224 } elsif ($self->{nc} == -1) {
1225 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227
1228 $self->{last_stag_name} = $self->{ct}->{tag_name};
1229 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231 if ($self->{ct}->{attributes}) {
1232
1233 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234 } else {
1235
1236 }
1237 } else {
1238 die "$0: $self->{ct}->{type}: Unknown token type";
1239 }
1240 $self->{state} = DATA_STATE;
1241 $self->{s_kwd} = '';
1242 # reconsume
1243
1244 return ($self->{ct}); # start tag or end tag
1245
1246 redo A;
1247 } else {
1248 if ({
1249 0x0022 => 1, # "
1250 0x0027 => 1, # '
1251 0x003D => 1, # =
1252 }->{$self->{nc}}) {
1253
1254 ## XML5: Not a parse error.
1255 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256 } else {
1257
1258 ## XML5: ":" raises a parse error and is ignored.
1259 }
1260 $self->{ca}
1261 = {name => chr ($self->{nc}),
1262 value => '',
1263 line => $self->{line}, column => $self->{column}};
1264 $self->{state} = ATTRIBUTE_NAME_STATE;
1265
1266 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267 $self->{line_prev} = $self->{line};
1268 $self->{column_prev} = $self->{column};
1269 $self->{column}++;
1270 $self->{nc}
1271 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272 } else {
1273 $self->{set_nc}->($self);
1274 }
1275
1276 redo A;
1277 }
1278 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 ## XML5: "Tag attribute name state".
1280
## Closure invoked by the branches below each time the tokenizer leaves
## the attribute name state (on white space, "=", ">", "/" and EOF).
## It files the attribute being built ($self->{ca}) under the current
## tag token ($self->{ct}), keyed by attribute name.
1281 my $before_leave = sub {
## NOTE: |exists| on the nested key autovivifies the intermediate
## hash, so $self->{ct}->{attributes} always exists once this closure
## has run.
1282 if (exists $self->{ct}->{attributes} # start tag or end tag
1283 ->{$self->{ca}->{name}}) { # MUST
1284 ## Name already present on this token: report the error at the position recorded in $self->{ca}.
1285 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1286 ## Discard $self->{ca} # MUST
## (Nothing is stored, so the earlier attribute of that name is kept.)
1287 } else {
1288 ## First occurrence of the name: store it on the token.
1289 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290 = $self->{ca};
## Record the order of appearance by pre-incrementing the token's
## |last_index| counter.
1291 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292 }
1293 }; # $before_leave
1294
1295 if ($is_space->{$self->{nc}}) {
1296
1297 $before_leave->();
1298 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299
1300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301 $self->{line_prev} = $self->{line};
1302 $self->{column_prev} = $self->{column};
1303 $self->{column}++;
1304 $self->{nc}
1305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306 } else {
1307 $self->{set_nc}->($self);
1308 }
1309
1310 redo A;
1311 } elsif ($self->{nc} == 0x003D) { # =
1312
1313 $before_leave->();
1314 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315
1316 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317 $self->{line_prev} = $self->{line};
1318 $self->{column_prev} = $self->{column};
1319 $self->{column}++;
1320 $self->{nc}
1321 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322 } else {
1323 $self->{set_nc}->($self);
1324 }
1325
1326 redo A;
1327 } elsif ($self->{nc} == 0x003E) { # >
1328 if ($self->{is_xml}) {
1329
1330 ## XML5: Not a parse error.
1331 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332 } else {
1333
1334 }
1335
1336 $before_leave->();
1337 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338
1339 $self->{last_stag_name} = $self->{ct}->{tag_name};
1340 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341
1342 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343 if ($self->{ct}->{attributes}) {
1344 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345 }
1346 } else {
1347 die "$0: $self->{ct}->{type}: Unknown token type";
1348 }
1349 $self->{state} = DATA_STATE;
1350 $self->{s_kwd} = '';
1351
1352 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353 $self->{line_prev} = $self->{line};
1354 $self->{column_prev} = $self->{column};
1355 $self->{column}++;
1356 $self->{nc}
1357 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358 } else {
1359 $self->{set_nc}->($self);
1360 }
1361
1362
1363 return ($self->{ct}); # start tag or end tag
1364
1365 redo A;
1366 } elsif (0x0041 <= $self->{nc} and
1367 $self->{nc} <= 0x005A) { # A..Z
1368
1369 $self->{ca}->{name}
1370 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 ## Stay in the state
1372
1373 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374 $self->{line_prev} = $self->{line};
1375 $self->{column_prev} = $self->{column};
1376 $self->{column}++;
1377 $self->{nc}
1378 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379 } else {
1380 $self->{set_nc}->($self);
1381 }
1382
1383 redo A;
1384 } elsif ($self->{nc} == 0x002F) { # /
1385 if ($self->{is_xml}) {
1386
1387 ## XML5: Not a parse error.
1388 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389 } else {
1390
1391 }
1392
1393 $before_leave->();
1394 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395
1396 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397 $self->{line_prev} = $self->{line};
1398 $self->{column_prev} = $self->{column};
1399 $self->{column}++;
1400 $self->{nc}
1401 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402 } else {
1403 $self->{set_nc}->($self);
1404 }
1405
1406 redo A;
1407 } elsif ($self->{nc} == -1) {
1408 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409 $before_leave->();
1410 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411
1412 $self->{last_stag_name} = $self->{ct}->{tag_name};
1413 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415 if ($self->{ct}->{attributes}) {
1416
1417 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418 } else {
1419 ## NOTE: This state should never be reached.
1420
1421 }
1422 } else {
1423 die "$0: $self->{ct}->{type}: Unknown token type";
1424 }
1425 $self->{state} = DATA_STATE;
1426 $self->{s_kwd} = '';
1427 # reconsume
1428
1429 return ($self->{ct}); # start tag or end tag
1430
1431 redo A;
1432 } else {
1433 if ($self->{nc} == 0x0022 or # "
1434 $self->{nc} == 0x0027) { # '
1435
1436 ## XML5: Not a parse error.
1437 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438 } else {
1439
1440 }
1441 $self->{ca}->{name} .= chr ($self->{nc});
1442 ## Stay in the state
1443
1444 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445 $self->{line_prev} = $self->{line};
1446 $self->{column_prev} = $self->{column};
1447 $self->{column}++;
1448 $self->{nc}
1449 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450 } else {
1451 $self->{set_nc}->($self);
1452 }
1453
1454 redo A;
1455 }
1456 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 ## XML5: "Tag attribute name after state".
1458
1459 if ($is_space->{$self->{nc}}) {
1460
1461 ## Stay in the state
1462
1463 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464 $self->{line_prev} = $self->{line};
1465 $self->{column_prev} = $self->{column};
1466 $self->{column}++;
1467 $self->{nc}
1468 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469 } else {
1470 $self->{set_nc}->($self);
1471 }
1472
1473 redo A;
1474 } elsif ($self->{nc} == 0x003D) { # =
1475
1476 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477
1478 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479 $self->{line_prev} = $self->{line};
1480 $self->{column_prev} = $self->{column};
1481 $self->{column}++;
1482 $self->{nc}
1483 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484 } else {
1485 $self->{set_nc}->($self);
1486 }
1487
1488 redo A;
1489 } elsif ($self->{nc} == 0x003E) { # >
1490 if ($self->{is_xml}) {
1491
1492 ## XML5: Not a parse error.
1493 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494 } else {
1495
1496 }
1497
1498 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499
1500 $self->{last_stag_name} = $self->{ct}->{tag_name};
1501 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503 if ($self->{ct}->{attributes}) {
1504
1505 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506 } else {
1507 ## NOTE: This state should never be reached.
1508
1509 }
1510 } else {
1511 die "$0: $self->{ct}->{type}: Unknown token type";
1512 }
1513 $self->{state} = DATA_STATE;
1514 $self->{s_kwd} = '';
1515
1516 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517 $self->{line_prev} = $self->{line};
1518 $self->{column_prev} = $self->{column};
1519 $self->{column}++;
1520 $self->{nc}
1521 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522 } else {
1523 $self->{set_nc}->($self);
1524 }
1525
1526
1527 return ($self->{ct}); # start tag or end tag
1528
1529 redo A;
1530 } elsif (0x0041 <= $self->{nc} and
1531 $self->{nc} <= 0x005A) { # A..Z
1532
1533 $self->{ca}
1534 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 value => '',
1536 line => $self->{line}, column => $self->{column}};
1537 $self->{state} = ATTRIBUTE_NAME_STATE;
1538
1539 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540 $self->{line_prev} = $self->{line};
1541 $self->{column_prev} = $self->{column};
1542 $self->{column}++;
1543 $self->{nc}
1544 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545 } else {
1546 $self->{set_nc}->($self);
1547 }
1548
1549 redo A;
1550 } elsif ($self->{nc} == 0x002F) { # /
1551 if ($self->{is_xml}) {
1552
1553 ## XML5: Not a parse error.
1554 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555 } else {
1556
1557 }
1558
1559 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560
1561 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562 $self->{line_prev} = $self->{line};
1563 $self->{column_prev} = $self->{column};
1564 $self->{column}++;
1565 $self->{nc}
1566 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567 } else {
1568 $self->{set_nc}->($self);
1569 }
1570
1571 redo A;
1572 } elsif ($self->{nc} == -1) {
1573 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575
1576 $self->{last_stag_name} = $self->{ct}->{tag_name};
1577 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579 if ($self->{ct}->{attributes}) {
1580
1581 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582 } else {
1583 ## NOTE: This state should never be reached.
1584
1585 }
1586 } else {
1587 die "$0: $self->{ct}->{type}: Unknown token type";
1588 }
1589 $self->{s_kwd} = '';
1590 $self->{state} = DATA_STATE;
1591 # reconsume
1592
1593 return ($self->{ct}); # start tag or end tag
1594
1595 redo A;
1596 } else {
1597 if ($self->{is_xml}) {
1598
1599 ## XML5: Not a parse error.
1600 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601 } else {
1602
1603 }
1604
1605 if ($self->{nc} == 0x0022 or # "
1606 $self->{nc} == 0x0027) { # '
1607
1608 ## XML5: Not a parse error.
1609 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610 } else {
1611
1612 }
1613 $self->{ca}
1614 = {name => chr ($self->{nc}),
1615 value => '',
1616 line => $self->{line}, column => $self->{column}};
1617 $self->{state} = ATTRIBUTE_NAME_STATE;
1618
1619 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620 $self->{line_prev} = $self->{line};
1621 $self->{column_prev} = $self->{column};
1622 $self->{column}++;
1623 $self->{nc}
1624 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625 } else {
1626 $self->{set_nc}->($self);
1627 }
1628
1629 redo A;
1630 }
1631 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 ## XML5: "Tag attribute value before state".
1633
1634 if ($is_space->{$self->{nc}}) {
1635
1636 ## Stay in the state
1637
1638 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639 $self->{line_prev} = $self->{line};
1640 $self->{column_prev} = $self->{column};
1641 $self->{column}++;
1642 $self->{nc}
1643 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644 } else {
1645 $self->{set_nc}->($self);
1646 }
1647
1648 redo A;
1649 } elsif ($self->{nc} == 0x0022) { # "
1650
1651 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652
1653 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654 $self->{line_prev} = $self->{line};
1655 $self->{column_prev} = $self->{column};
1656 $self->{column}++;
1657 $self->{nc}
1658 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659 } else {
1660 $self->{set_nc}->($self);
1661 }
1662
1663 redo A;
1664 } elsif ($self->{nc} == 0x0026) { # &
1665
1666 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667 ## reconsume
1668 redo A;
1669 } elsif ($self->{nc} == 0x0027) { # '
1670
1671 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672
1673 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674 $self->{line_prev} = $self->{line};
1675 $self->{column_prev} = $self->{column};
1676 $self->{column}++;
1677 $self->{nc}
1678 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679 } else {
1680 $self->{set_nc}->($self);
1681 }
1682
1683 redo A;
1684 } elsif ($self->{nc} == 0x003E) { # >
1685 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687
1688 $self->{last_stag_name} = $self->{ct}->{tag_name};
1689 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691 if ($self->{ct}->{attributes}) {
1692
1693 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694 } else {
1695 ## NOTE: This state should never be reached.
1696
1697 }
1698 } else {
1699 die "$0: $self->{ct}->{type}: Unknown token type";
1700 }
1701 $self->{state} = DATA_STATE;
1702 $self->{s_kwd} = '';
1703
1704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705 $self->{line_prev} = $self->{line};
1706 $self->{column_prev} = $self->{column};
1707 $self->{column}++;
1708 $self->{nc}
1709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710 } else {
1711 $self->{set_nc}->($self);
1712 }
1713
1714
1715 return ($self->{ct}); # start tag or end tag
1716
1717 redo A;
1718 } elsif ($self->{nc} == -1) {
1719 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721
1722 $self->{last_stag_name} = $self->{ct}->{tag_name};
1723 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725 if ($self->{ct}->{attributes}) {
1726
1727 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728 } else {
1729 ## NOTE: This state should never be reached.
1730
1731 }
1732 } else {
1733 die "$0: $self->{ct}->{type}: Unknown token type";
1734 }
1735 $self->{state} = DATA_STATE;
1736 $self->{s_kwd} = '';
1737 ## reconsume
1738
1739 return ($self->{ct}); # start tag or end tag
1740
1741 redo A;
1742 } else {
1743 if ($self->{nc} == 0x003D) { # =
1744
1745 ## XML5: Not a parse error.
1746 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 } elsif ($self->{is_xml}) {
1748
1749 ## XML5: No parse error.
1750 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 } else {
1752
1753 }
1754 $self->{ca}->{value} .= chr ($self->{nc});
1755 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756
1757 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758 $self->{line_prev} = $self->{line};
1759 $self->{column_prev} = $self->{column};
1760 $self->{column}++;
1761 $self->{nc}
1762 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763 } else {
1764 $self->{set_nc}->($self);
1765 }
1766
1767 redo A;
1768 }
1769 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771 ## ATTLIST attribute value double quoted state".
1772
1773 if ($self->{nc} == 0x0022) { # "
1774 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775
1776 ## XML5: "DOCTYPE ATTLIST name after state".
1777 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779 } else {
1780
1781 ## XML5: "Tag attribute name before state".
1782 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783 }
1784
1785 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786 $self->{line_prev} = $self->{line};
1787 $self->{column_prev} = $self->{column};
1788 $self->{column}++;
1789 $self->{nc}
1790 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791 } else {
1792 $self->{set_nc}->($self);
1793 }
1794
1795 redo A;
1796 } elsif ($self->{nc} == 0x0026) { # &
1797
1798 ## XML5: Not defined yet.
1799
1800 ## NOTE: In the spec, the tokenizer is switched to the
1801 ## "entity in attribute value state". In this implementation, the
1802 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803 ## implementation of the "consume a character reference" algorithm.
1804 $self->{prev_state} = $self->{state};
1805 $self->{entity_add} = 0x0022; # "
1806 $self->{state} = ENTITY_STATE;
1807
1808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809 $self->{line_prev} = $self->{line};
1810 $self->{column_prev} = $self->{column};
1811 $self->{column}++;
1812 $self->{nc}
1813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814 } else {
1815 $self->{set_nc}->($self);
1816 }
1817
1818 redo A;
1819 } elsif ($self->{nc} == -1) {
1820 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1821 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1822
1823 $self->{last_stag_name} = $self->{ct}->{tag_name};
1824
1825 $self->{state} = DATA_STATE;
1826 $self->{s_kwd} = '';
1827 ## reconsume
1828 return ($self->{ct}); # start tag
1829 redo A;
1830 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1831 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1832 if ($self->{ct}->{attributes}) {
1833
1834 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1835 } else {
1836 ## NOTE: This state should never be reached.
1837
1838 }
1839
1840 $self->{state} = DATA_STATE;
1841 $self->{s_kwd} = '';
1842 ## reconsume
1843 return ($self->{ct}); # end tag
1844 redo A;
1845 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1846 ## XML5: No parse error above; not defined yet.
1847 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1848 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1849 ## Reconsume.
1850 return ($self->{ct}); # ATTLIST
1851 redo A;
1852 } else {
1853 die "$0: $self->{ct}->{type}: Unknown token type";
1854 }
1855 } else {
1856 ## XML5 [ATTLIST]: Not defined yet.
1857 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1858
1859 ## XML5: Not a parse error.
1860 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1861 } else {
1862
1863 }
1864 $self->{ca}->{value} .= chr ($self->{nc});
1865 $self->{read_until}->($self->{ca}->{value},
1866 q["&<],
1867 length $self->{ca}->{value});
1868
1869 ## Stay in the state
1870
1871 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1872 $self->{line_prev} = $self->{line};
1873 $self->{column_prev} = $self->{column};
1874 $self->{column}++;
1875 $self->{nc}
1876 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1877 } else {
1878 $self->{set_nc}->($self);
1879 }
1880
1881 redo A;
1882 }
1883 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1884 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1885 ## ATTLIST attribute value single quoted state".
1886
1887 if ($self->{nc} == 0x0027) { # '
1888 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1889
1890 ## XML5: "DOCTYPE ATTLIST name after state".
1891 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1892 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1893 } else {
1894
1895 ## XML5: "Before attribute name state" (sic).
1896 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1897 }
1898
1899 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1900 $self->{line_prev} = $self->{line};
1901 $self->{column_prev} = $self->{column};
1902 $self->{column}++;
1903 $self->{nc}
1904 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1905 } else {
1906 $self->{set_nc}->($self);
1907 }
1908
1909 redo A;
1910 } elsif ($self->{nc} == 0x0026) { # &
1911
1912 ## XML5: Not defined yet.
1913
1914 ## NOTE: In the spec, the tokenizer is switched to the
1915 ## "entity in attribute value state". In this implementation, the
1916 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1917 ## implementation of the "consume a character reference" algorithm.
1918 $self->{entity_add} = 0x0027; # '
1919 $self->{prev_state} = $self->{state};
1920 $self->{state} = ENTITY_STATE;
1921
1922 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1923 $self->{line_prev} = $self->{line};
1924 $self->{column_prev} = $self->{column};
1925 $self->{column}++;
1926 $self->{nc}
1927 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1928 } else {
1929 $self->{set_nc}->($self);
1930 }
1931
1932 redo A;
1933 } elsif ($self->{nc} == -1) {
1934 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1935 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1936
1937 $self->{last_stag_name} = $self->{ct}->{tag_name};
1938
1939 $self->{state} = DATA_STATE;
1940 $self->{s_kwd} = '';
1941 ## reconsume
1942 return ($self->{ct}); # start tag
1943 redo A;
1944 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1945 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1946 if ($self->{ct}->{attributes}) {
1947
1948 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1949 } else {
1950 ## NOTE: This state should never be reached.
1951
1952 }
1953
1954 $self->{state} = DATA_STATE;
1955 $self->{s_kwd} = '';
1956 ## reconsume
1957 return ($self->{ct}); # end tag
1958 redo A;
1959 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1960 ## XML5: No parse error above; not defined yet.
1961 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1962 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1963 ## Reconsume.
1964 return ($self->{ct}); # ATTLIST
1965 redo A;
1966 } else {
1967 die "$0: $self->{ct}->{type}: Unknown token type";
1968 }
1969 } else {
1970 ## XML5 [ATTLIST]: Not defined yet.
1971 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1972
1973 ## XML5: Not a parse error.
1974 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1975 } else {
1976
1977 }
1978 $self->{ca}->{value} .= chr ($self->{nc});
1979 $self->{read_until}->($self->{ca}->{value},
1980 q['&<],
1981 length $self->{ca}->{value});
1982
1983 ## Stay in the state
1984
1985 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1986 $self->{line_prev} = $self->{line};
1987 $self->{column_prev} = $self->{column};
1988 $self->{column}++;
1989 $self->{nc}
1990 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1991 } else {
1992 $self->{set_nc}->($self);
1993 }
1994
1995 redo A;
1996 }
1997 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1998 ## XML5: "Tag attribute value unquoted state".
1999
2000 if ($is_space->{$self->{nc}}) {
2001 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2002
2003 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2004 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2005 } else {
2006
2007 ## XML5: "Tag attribute name before state".
2008 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2009 }
2010
2011 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2012 $self->{line_prev} = $self->{line};
2013 $self->{column_prev} = $self->{column};
2014 $self->{column}++;
2015 $self->{nc}
2016 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2017 } else {
2018 $self->{set_nc}->($self);
2019 }
2020
2021 redo A;
2022 } elsif ($self->{nc} == 0x0026) { # &
2023
2024
2025 ## XML5: Not defined yet.
2026
2027 ## NOTE: In the spec, the tokenizer is switched to the
2028 ## "entity in attribute value state". In this implementation, the
2029 ## tokenizer is switched to the |ENTITY_STATE|, which is an
2030 ## implementation of the "consume a character reference" algorithm.
2031 $self->{entity_add} = -1;
2032 $self->{prev_state} = $self->{state};
2033 $self->{state} = ENTITY_STATE;
2034
2035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2036 $self->{line_prev} = $self->{line};
2037 $self->{column_prev} = $self->{column};
2038 $self->{column}++;
2039 $self->{nc}
2040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2041 } else {
2042 $self->{set_nc}->($self);
2043 }
2044
2045 redo A;
2046 } elsif ($self->{nc} == 0x003E) { # >
2047 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2048
2049 $self->{last_stag_name} = $self->{ct}->{tag_name};
2050
2051 $self->{state} = DATA_STATE;
2052 $self->{s_kwd} = '';
2053
2054 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055 $self->{line_prev} = $self->{line};
2056 $self->{column_prev} = $self->{column};
2057 $self->{column}++;
2058 $self->{nc}
2059 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2060 } else {
2061 $self->{set_nc}->($self);
2062 }
2063
2064 return ($self->{ct}); # start tag
2065 redo A;
2066 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2067 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2068 if ($self->{ct}->{attributes}) {
2069
2070 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2071 } else {
2072 ## NOTE: This state should never be reached.
2073
2074 }
2075
2076 $self->{state} = DATA_STATE;
2077 $self->{s_kwd} = '';
2078
2079 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2080 $self->{line_prev} = $self->{line};
2081 $self->{column_prev} = $self->{column};
2082 $self->{column}++;
2083 $self->{nc}
2084 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2085 } else {
2086 $self->{set_nc}->($self);
2087 }
2088
2089 return ($self->{ct}); # end tag
2090 redo A;
2091 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2092 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2093 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2094
2095 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2096 $self->{line_prev} = $self->{line};
2097 $self->{column_prev} = $self->{column};
2098 $self->{column}++;
2099 $self->{nc}
2100 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2101 } else {
2102 $self->{set_nc}->($self);
2103 }
2104
2105 return ($self->{ct}); # ATTLIST
2106 redo A;
2107 } else {
2108 die "$0: $self->{ct}->{type}: Unknown token type";
2109 }
2110 } elsif ($self->{nc} == -1) {
2111 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2112
2113 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2114 $self->{last_stag_name} = $self->{ct}->{tag_name};
2115
2116 $self->{state} = DATA_STATE;
2117 $self->{s_kwd} = '';
2118 ## reconsume
2119 return ($self->{ct}); # start tag
2120 redo A;
2121 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2122 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2123 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2124 if ($self->{ct}->{attributes}) {
2125
2126 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2127 } else {
2128 ## NOTE: This state should never be reached.
2129
2130 }
2131
2132 $self->{state} = DATA_STATE;
2133 $self->{s_kwd} = '';
2134 ## reconsume
2135 return ($self->{ct}); # end tag
2136 redo A;
2137 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2138 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2139 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2140 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2141 ## Reconsume.
2142 return ($self->{ct}); # ATTLIST
2143 redo A;
2144 } else {
2145 die "$0: $self->{ct}->{type}: Unknown token type";
2146 }
2147 } else {
2148 if ({
2149 0x0022 => 1, # "
2150 0x0027 => 1, # '
2151 0x003D => 1, # =
2152 }->{$self->{nc}}) {
2153
2154 ## XML5: Not a parse error.
2155 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2156 } else {
2157
2158 }
2159 $self->{ca}->{value} .= chr ($self->{nc});
2160 $self->{read_until}->($self->{ca}->{value},
2161 q["'=& >],
2162 length $self->{ca}->{value});
2163
2164 ## Stay in the state
2165
2166 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2167 $self->{line_prev} = $self->{line};
2168 $self->{column_prev} = $self->{column};
2169 $self->{column}++;
2170 $self->{nc}
2171 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2172 } else {
2173 $self->{set_nc}->($self);
2174 }
2175
2176 redo A;
2177 }
2178 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2179 if ($is_space->{$self->{nc}}) {
2180
2181 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2182
2183 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2184 $self->{line_prev} = $self->{line};
2185 $self->{column_prev} = $self->{column};
2186 $self->{column}++;
2187 $self->{nc}
2188 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2189 } else {
2190 $self->{set_nc}->($self);
2191 }
2192
2193 redo A;
2194 } elsif ($self->{nc} == 0x003E) { # >
2195 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2196
2197 $self->{last_stag_name} = $self->{ct}->{tag_name};
2198 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2199 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2200 if ($self->{ct}->{attributes}) {
2201
2202 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2203 } else {
2204 ## NOTE: This state should never be reached.
2205
2206 }
2207 } else {
2208 die "$0: $self->{ct}->{type}: Unknown token type";
2209 }
2210 $self->{state} = DATA_STATE;
2211 $self->{s_kwd} = '';
2212
2213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2214 $self->{line_prev} = $self->{line};
2215 $self->{column_prev} = $self->{column};
2216 $self->{column}++;
2217 $self->{nc}
2218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2219 } else {
2220 $self->{set_nc}->($self);
2221 }
2222
2223
2224 return ($self->{ct}); # start tag or end tag
2225
2226 redo A;
2227 } elsif ($self->{nc} == 0x002F) { # /
2228
2229 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2230
2231 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2232 $self->{line_prev} = $self->{line};
2233 $self->{column_prev} = $self->{column};
2234 $self->{column}++;
2235 $self->{nc}
2236 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2237 } else {
2238 $self->{set_nc}->($self);
2239 }
2240
2241 redo A;
2242 } elsif ($self->{nc} == -1) {
2243 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2244 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2245
2246 $self->{last_stag_name} = $self->{ct}->{tag_name};
2247 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2248 if ($self->{ct}->{attributes}) {
2249
2250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2251 } else {
2252 ## NOTE: This state should never be reached.
2253
2254 }
2255 } else {
2256 die "$0: $self->{ct}->{type}: Unknown token type";
2257 }
2258 $self->{state} = DATA_STATE;
2259 $self->{s_kwd} = '';
2260 ## Reconsume.
2261 return ($self->{ct}); # start tag or end tag
2262 redo A;
2263 } else {
2264
2265 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2266 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2267 ## reconsume
2268 redo A;
2269 }
2270 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2271 ## XML5: "Empty tag state".
2272
2273 if ($self->{nc} == 0x003E) { # >
2274 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2275
2276 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2277 ## TODO: Different type than slash in start tag
2278 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2279 if ($self->{ct}->{attributes}) {
2280
2281 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2282 } else {
2283
2284 }
2285 ## TODO: Test |<title></title/>|
2286 } else {
2287
2288 $self->{self_closing} = 1;
2289 }
2290
2291 $self->{state} = DATA_STATE;
2292 $self->{s_kwd} = '';
2293
2294 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2295 $self->{line_prev} = $self->{line};
2296 $self->{column_prev} = $self->{column};
2297 $self->{column}++;
2298 $self->{nc}
2299 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2300 } else {
2301 $self->{set_nc}->($self);
2302 }
2303
2304
2305 return ($self->{ct}); # start tag or end tag
2306
2307 redo A;
2308 } elsif ($self->{nc} == -1) {
2309 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2310 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2311
2312 $self->{last_stag_name} = $self->{ct}->{tag_name};
2313 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2314 if ($self->{ct}->{attributes}) {
2315
2316 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2317 } else {
2318 ## NOTE: This state should never be reached.
2319
2320 }
2321 } else {
2322 die "$0: $self->{ct}->{type}: Unknown token type";
2323 }
2324 ## XML5: "Tag attribute name before state".
2325 $self->{state} = DATA_STATE;
2326 $self->{s_kwd} = '';
2327 ## Reconsume.
2328 return ($self->{ct}); # start tag or end tag
2329 redo A;
2330 } else {
2331
2332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2333 ## TODO: This error type is wrong.
2334 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2335 ## Reconsume.
2336 redo A;
2337 }
2338 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2339 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2340
2341 ## NOTE: Unlike spec's "bogus comment state", this implementation
2342 ## consumes characters on a one-by-one basis.
2343
2344 if ($self->{nc} == 0x003E) { # >
2345 if ($self->{in_subset}) {
2346
2347 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2348 } else {
2349
2350 $self->{state} = DATA_STATE;
2351 $self->{s_kwd} = '';
2352 }
2353
2354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2355 $self->{line_prev} = $self->{line};
2356 $self->{column_prev} = $self->{column};
2357 $self->{column}++;
2358 $self->{nc}
2359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2360 } else {
2361 $self->{set_nc}->($self);
2362 }
2363
2364
2365 return ($self->{ct}); # comment
2366 redo A;
2367 } elsif ($self->{nc} == -1) {
2368 if ($self->{in_subset}) {
2369
2370 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2371 } else {
2372
2373 $self->{state} = DATA_STATE;
2374 $self->{s_kwd} = '';
2375 }
2376 ## reconsume
2377
2378 return ($self->{ct}); # comment
2379 redo A;
2380 } else {
2381
2382 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2383 $self->{read_until}->($self->{ct}->{data},
2384 q[>],
2385 length $self->{ct}->{data});
2386
2387 ## Stay in the state.
2388
2389 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390 $self->{line_prev} = $self->{line};
2391 $self->{column_prev} = $self->{column};
2392 $self->{column}++;
2393 $self->{nc}
2394 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395 } else {
2396 $self->{set_nc}->($self);
2397 }
2398
2399 redo A;
2400 }
2401 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2402 ## XML5: "Markup declaration state".
2403
2404 if ($self->{nc} == 0x002D) { # -
2405
2406 $self->{state} = MD_HYPHEN_STATE;
2407
2408 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2409 $self->{line_prev} = $self->{line};
2410 $self->{column_prev} = $self->{column};
2411 $self->{column}++;
2412 $self->{nc}
2413 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2414 } else {
2415 $self->{set_nc}->($self);
2416 }
2417
2418 redo A;
2419 } elsif ($self->{nc} == 0x0044 or # D
2420 $self->{nc} == 0x0064) { # d
2421 ## ASCII case-insensitive.
2422
2423 $self->{state} = MD_DOCTYPE_STATE;
2424 $self->{kwd} = chr $self->{nc};
2425
2426 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2427 $self->{line_prev} = $self->{line};
2428 $self->{column_prev} = $self->{column};
2429 $self->{column}++;
2430 $self->{nc}
2431 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2432 } else {
2433 $self->{set_nc}->($self);
2434 }
2435
2436 redo A;
2437 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2438 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2439 $self->{is_xml}) and
2440 $self->{nc} == 0x005B) { # [
2441
2442 $self->{state} = MD_CDATA_STATE;
2443 $self->{kwd} = '[';
2444
2445 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2446 $self->{line_prev} = $self->{line};
2447 $self->{column_prev} = $self->{column};
2448 $self->{column}++;
2449 $self->{nc}
2450 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2451 } else {
2452 $self->{set_nc}->($self);
2453 }
2454
2455 redo A;
2456 } else {
2457
2458 }
2459
2460 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2461 line => $self->{line_prev},
2462 column => $self->{column_prev} - 1);
2463 ## Reconsume.
2464 $self->{state} = BOGUS_COMMENT_STATE;
2465 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2466 line => $self->{line_prev},
2467 column => $self->{column_prev} - 1,
2468 };
2469 redo A;
2470 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2471 if ($self->{nc} == 0x002D) { # -
2472
2473 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2474 line => $self->{line_prev},
2475 column => $self->{column_prev} - 2,
2476 };
2477 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2478
2479 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480 $self->{line_prev} = $self->{line};
2481 $self->{column_prev} = $self->{column};
2482 $self->{column}++;
2483 $self->{nc}
2484 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485 } else {
2486 $self->{set_nc}->($self);
2487 }
2488
2489 redo A;
2490 } else {
2491
2492 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2493 line => $self->{line_prev},
2494 column => $self->{column_prev} - 2);
2495 $self->{state} = BOGUS_COMMENT_STATE;
2496 ## Reconsume.
2497 $self->{ct} = {type => COMMENT_TOKEN,
2498 data => '-',
2499 line => $self->{line_prev},
2500 column => $self->{column_prev} - 2,
2501 };
2502 redo A;
2503 }
2504 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2505 ## ASCII case-insensitive.
2506 if ($self->{nc} == [
2507 undef,
2508 0x004F, # O
2509 0x0043, # C
2510 0x0054, # T
2511 0x0059, # Y
2512 0x0050, # P
2513 ]->[length $self->{kwd}] or
2514 $self->{nc} == [
2515 undef,
2516 0x006F, # o
2517 0x0063, # c
2518 0x0074, # t
2519 0x0079, # y
2520 0x0070, # p
2521 ]->[length $self->{kwd}]) {
2522
2523 ## Stay in the state.
2524 $self->{kwd} .= chr $self->{nc};
2525
2526 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2527 $self->{line_prev} = $self->{line};
2528 $self->{column_prev} = $self->{column};
2529 $self->{column}++;
2530 $self->{nc}
2531 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2532 } else {
2533 $self->{set_nc}->($self);
2534 }
2535
2536 redo A;
2537 } elsif ((length $self->{kwd}) == 6 and
2538 ($self->{nc} == 0x0045 or # E
2539 $self->{nc} == 0x0065)) { # e
2540 if ($self->{is_xml} and
2541 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2542
2543 ## XML5: case-sensitive.
2544 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2545 text => 'DOCTYPE',
2546 line => $self->{line_prev},
2547 column => $self->{column_prev} - 5);
2548 } else {
2549
2550 }
2551 $self->{state} = DOCTYPE_STATE;
2552 $self->{ct} = {type => DOCTYPE_TOKEN,
2553 quirks => 1,
2554 line => $self->{line_prev},
2555 column => $self->{column_prev} - 7,
2556 };
2557
2558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559 $self->{line_prev} = $self->{line};
2560 $self->{column_prev} = $self->{column};
2561 $self->{column}++;
2562 $self->{nc}
2563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564 } else {
2565 $self->{set_nc}->($self);
2566 }
2567
2568 redo A;
2569 } else {
2570
2571 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2572 line => $self->{line_prev},
2573 column => $self->{column_prev} - 1 - length $self->{kwd});
2574 $self->{state} = BOGUS_COMMENT_STATE;
2575 ## Reconsume.
2576 $self->{ct} = {type => COMMENT_TOKEN,
2577 data => $self->{kwd},
2578 line => $self->{line_prev},
2579 column => $self->{column_prev} - 1 - length $self->{kwd},
2580 };
2581 redo A;
2582 }
2583 } elsif ($self->{state} == MD_CDATA_STATE) {
2584 if ($self->{nc} == {
2585 '[' => 0x0043, # C
2586 '[C' => 0x0044, # D
2587 '[CD' => 0x0041, # A
2588 '[CDA' => 0x0054, # T
2589 '[CDAT' => 0x0041, # A
2590 }->{$self->{kwd}}) {
2591
2592 ## Stay in the state.
2593 $self->{kwd} .= chr $self->{nc};
2594
2595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2596 $self->{line_prev} = $self->{line};
2597 $self->{column_prev} = $self->{column};
2598 $self->{column}++;
2599 $self->{nc}
2600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2601 } else {
2602 $self->{set_nc}->($self);
2603 }
2604
2605 redo A;
2606 } elsif ($self->{kwd} eq '[CDATA' and
2607 $self->{nc} == 0x005B) { # [
2608 if ($self->{is_xml} and
2609 not $self->{tainted} and
2610 @{$self->{open_elements} or []} == 0) {
2611
2612 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2613 line => $self->{line_prev},
2614 column => $self->{column_prev} - 7);
2615 $self->{tainted} = 1;
2616 } else {
2617
2618 }
2619
2620 $self->{ct} = {type => CHARACTER_TOKEN,
2621 data => '',
2622 line => $self->{line_prev},
2623 column => $self->{column_prev} - 7};
2624 $self->{state} = CDATA_SECTION_STATE;
2625
2626 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2627 $self->{line_prev} = $self->{line};
2628 $self->{column_prev} = $self->{column};
2629 $self->{column}++;
2630 $self->{nc}
2631 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2632 } else {
2633 $self->{set_nc}->($self);
2634 }
2635
2636 redo A;
2637 } else {
2638
2639 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2640 line => $self->{line_prev},
2641 column => $self->{column_prev} - 1 - length $self->{kwd});
2642 $self->{state} = BOGUS_COMMENT_STATE;
2643 ## Reconsume.
2644 $self->{ct} = {type => COMMENT_TOKEN,
2645 data => $self->{kwd},
2646 line => $self->{line_prev},
2647 column => $self->{column_prev} - 1 - length $self->{kwd},
2648 };
2649 redo A;
2650 }
2651 } elsif ($self->{state} == COMMENT_START_STATE) {
2652 if ($self->{nc} == 0x002D) { # -
2653
2654 $self->{state} = COMMENT_START_DASH_STATE;
2655
2656 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2657 $self->{line_prev} = $self->{line};
2658 $self->{column_prev} = $self->{column};
2659 $self->{column}++;
2660 $self->{nc}
2661 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2662 } else {
2663 $self->{set_nc}->($self);
2664 }
2665
2666 redo A;
2667 } elsif ($self->{nc} == 0x003E) { # >
2668 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2669 if ($self->{in_subset}) {
2670
2671 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2672 } else {
2673
2674 $self->{state} = DATA_STATE;
2675 $self->{s_kwd} = '';
2676 }
2677
2678 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2679 $self->{line_prev} = $self->{line};
2680 $self->{column_prev} = $self->{column};
2681 $self->{column}++;
2682 $self->{nc}
2683 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2684 } else {
2685 $self->{set_nc}->($self);
2686 }
2687
2688
2689 return ($self->{ct}); # comment
2690
2691 redo A;
2692 } elsif ($self->{nc} == -1) {
2693 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2694 if ($self->{in_subset}) {
2695
2696 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2697 } else {
2698
2699 $self->{state} = DATA_STATE;
2700 $self->{s_kwd} = '';
2701 }
2702 ## reconsume
2703
2704 return ($self->{ct}); # comment
2705
2706 redo A;
2707 } else {
2708
2709 $self->{ct}->{data} # comment
2710 .= chr ($self->{nc});
2711 $self->{state} = COMMENT_STATE;
2712
2713 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714 $self->{line_prev} = $self->{line};
2715 $self->{column_prev} = $self->{column};
2716 $self->{column}++;
2717 $self->{nc}
2718 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2719 } else {
2720 $self->{set_nc}->($self);
2721 }
2722
2723 redo A;
2724 }
2725 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2726 if ($self->{nc} == 0x002D) { # -
2727
2728 $self->{state} = COMMENT_END_STATE;
2729
2730 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2731 $self->{line_prev} = $self->{line};
2732 $self->{column_prev} = $self->{column};
2733 $self->{column}++;
2734 $self->{nc}
2735 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2736 } else {
2737 $self->{set_nc}->($self);
2738 }
2739
2740 redo A;
2741 } elsif ($self->{nc} == 0x003E) { # >
2742 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2743 if ($self->{in_subset}) {
2744
2745 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2746 } else {
2747
2748 $self->{state} = DATA_STATE;
2749 $self->{s_kwd} = '';
2750 }
2751
2752 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2753 $self->{line_prev} = $self->{line};
2754 $self->{column_prev} = $self->{column};
2755 $self->{column}++;
2756 $self->{nc}
2757 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2758 } else {
2759 $self->{set_nc}->($self);
2760 }
2761
2762
2763 return ($self->{ct}); # comment
2764
2765 redo A;
2766 } elsif ($self->{nc} == -1) {
2767 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2768 if ($self->{in_subset}) {
2769
2770 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2771 } else {
2772
2773 $self->{state} = DATA_STATE;
2774 $self->{s_kwd} = '';
2775 }
2776 ## reconsume
2777
2778 return ($self->{ct}); # comment
2779
2780 redo A;
2781 } else {
2782
2783 $self->{ct}->{data} # comment
2784 .= '-' . chr ($self->{nc});
2785 $self->{state} = COMMENT_STATE;
2786
2787 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788 $self->{line_prev} = $self->{line};
2789 $self->{column_prev} = $self->{column};
2790 $self->{column}++;
2791 $self->{nc}
2792 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793 } else {
2794 $self->{set_nc}->($self);
2795 }
2796
2797 redo A;
2798 }
2799 } elsif ($self->{state} == COMMENT_STATE) {
2800 ## XML5: "Comment state" and "DOCTYPE comment state".
2801
2802 if ($self->{nc} == 0x002D) { # -
2803
2804 $self->{state} = COMMENT_END_DASH_STATE;
2805
2806 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2807 $self->{line_prev} = $self->{line};
2808 $self->{column_prev} = $self->{column};
2809 $self->{column}++;
2810 $self->{nc}
2811 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2812 } else {
2813 $self->{set_nc}->($self);
2814 }
2815
2816 redo A;
2817 } elsif ($self->{nc} == -1) {
2818 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2819 if ($self->{in_subset}) {
2820
2821 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822 } else {
2823
2824 $self->{state} = DATA_STATE;
2825 $self->{s_kwd} = '';
2826 }
2827 ## reconsume
2828
2829 return ($self->{ct}); # comment
2830
2831 redo A;
2832 } else {
2833
2834 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2835 $self->{read_until}->($self->{ct}->{data},
2836 q[-],
2837 length $self->{ct}->{data});
2838
2839 ## Stay in the state
2840
2841 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2842 $self->{line_prev} = $self->{line};
2843 $self->{column_prev} = $self->{column};
2844 $self->{column}++;
2845 $self->{nc}
2846 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2847 } else {
2848 $self->{set_nc}->($self);
2849 }
2850
2851 redo A;
2852 }
2853 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2854 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2855
2856 if ($self->{nc} == 0x002D) { # -
2857
2858 $self->{state} = COMMENT_END_STATE;
2859
2860 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2861 $self->{line_prev} = $self->{line};
2862 $self->{column_prev} = $self->{column};
2863 $self->{column}++;
2864 $self->{nc}
2865 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2866 } else {
2867 $self->{set_nc}->($self);
2868 }
2869
2870 redo A;
2871 } elsif ($self->{nc} == -1) {
2872 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2873 if ($self->{in_subset}) {
2874
2875 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2876 } else {
2877
2878 $self->{state} = DATA_STATE;
2879 $self->{s_kwd} = '';
2880 }
2881 ## reconsume
2882
2883 return ($self->{ct}); # comment
2884
2885 redo A;
2886 } else {
2887
2888 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2889 $self->{state} = COMMENT_STATE;
2890
2891 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2892 $self->{line_prev} = $self->{line};
2893 $self->{column_prev} = $self->{column};
2894 $self->{column}++;
2895 $self->{nc}
2896 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2897 } else {
2898 $self->{set_nc}->($self);
2899 }
2900
2901 redo A;
2902 }
2903 } elsif ($self->{state} == COMMENT_END_STATE) {
2904 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2905
2906 if ($self->{nc} == 0x003E) { # >
2907 if ($self->{in_subset}) {
2908
2909 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910 } else {
2911
2912 $self->{state} = DATA_STATE;
2913 $self->{s_kwd} = '';
2914 }
2915
2916 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2917 $self->{line_prev} = $self->{line};
2918 $self->{column_prev} = $self->{column};
2919 $self->{column}++;
2920 $self->{nc}
2921 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2922 } else {
2923 $self->{set_nc}->($self);
2924 }
2925
2926
2927 return ($self->{ct}); # comment
2928
2929 redo A;
2930 } elsif ($self->{nc} == 0x002D) { # -
2931
2932 ## XML5: Not a parse error.
2933 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2934 line => $self->{line_prev},
2935 column => $self->{column_prev});
2936 $self->{ct}->{data} .= '-'; # comment
2937 ## Stay in the state
2938
2939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2940 $self->{line_prev} = $self->{line};
2941 $self->{column_prev} = $self->{column};
2942 $self->{column}++;
2943 $self->{nc}
2944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2945 } else {
2946 $self->{set_nc}->($self);
2947 }
2948
2949 redo A;
2950 } elsif ($self->{nc} == -1) {
2951 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2952 if ($self->{in_subset}) {
2953
2954 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955 } else {
2956
2957 $self->{state} = DATA_STATE;
2958 $self->{s_kwd} = '';
2959 }
2960 ## reconsume
2961
2962 return ($self->{ct}); # comment
2963
2964 redo A;
2965 } else {
2966
2967 ## XML5: Not a parse error.
2968 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969 line => $self->{line_prev},
2970 column => $self->{column_prev});
2971 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2972 $self->{state} = COMMENT_STATE;
2973
2974 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975 $self->{line_prev} = $self->{line};
2976 $self->{column_prev} = $self->{column};
2977 $self->{column}++;
2978 $self->{nc}
2979 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980 } else {
2981 $self->{set_nc}->($self);
2982 }
2983
2984 redo A;
2985 }
2986 } elsif ($self->{state} == DOCTYPE_STATE) {
2987 if ($is_space->{$self->{nc}}) {
2988
2989 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2990
2991 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992 $self->{line_prev} = $self->{line};
2993 $self->{column_prev} = $self->{column};
2994 $self->{column}++;
2995 $self->{nc}
2996 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997 } else {
2998 $self->{set_nc}->($self);
2999 }
3000
3001 redo A;
3002 } else {
3003
3004 ## XML5: Unless EOF, switch to the bogus comment state.
3005 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3006 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3007 ## reconsume
3008 redo A;
3009 }
3010 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3011 ## XML5: "DOCTYPE root name before state".
3012
3013 if ($is_space->{$self->{nc}}) {
3014
3015 ## Stay in the state
3016
3017 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3018 $self->{line_prev} = $self->{line};
3019 $self->{column_prev} = $self->{column};
3020 $self->{column}++;
3021 $self->{nc}
3022 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3023 } else {
3024 $self->{set_nc}->($self);
3025 }
3026
3027 redo A;
3028 } elsif ($self->{nc} == 0x003E) { # >
3029
3030 ## XML5: No parse error.
3031 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3032 $self->{state} = DATA_STATE;
3033 $self->{s_kwd} = '';
3034
3035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3036 $self->{line_prev} = $self->{line};
3037 $self->{column_prev} = $self->{column};
3038 $self->{column}++;
3039 $self->{nc}
3040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3041 } else {
3042 $self->{set_nc}->($self);
3043 }
3044
3045
3046 return ($self->{ct}); # DOCTYPE (quirks)
3047
3048 redo A;
3049 } elsif ($self->{nc} == -1) {
3050
3051 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3052 $self->{state} = DATA_STATE;
3053 $self->{s_kwd} = '';
3054 ## reconsume
3055
3056 return ($self->{ct}); # DOCTYPE (quirks)
3057
3058 redo A;
3059 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3060
3061 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3062 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3063 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3064 $self->{in_subset} = 1;
3065
3066 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3067 $self->{line_prev} = $self->{line};
3068 $self->{column_prev} = $self->{column};
3069 $self->{column}++;
3070 $self->{nc}
3071 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3072 } else {
3073 $self->{set_nc}->($self);
3074 }
3075
3076 return ($self->{ct}); # DOCTYPE
3077 redo A;
3078 } else {
3079
3080 $self->{ct}->{name} = chr $self->{nc};
3081 delete $self->{ct}->{quirks};
3082 $self->{state} = DOCTYPE_NAME_STATE;
3083
3084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085 $self->{line_prev} = $self->{line};
3086 $self->{column_prev} = $self->{column};
3087 $self->{column}++;
3088 $self->{nc}
3089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3090 } else {
3091 $self->{set_nc}->($self);
3092 }
3093
3094 redo A;
3095 }
3096 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3097 ## XML5: "DOCTYPE root name state".
3098
3099 ## ISSUE: Redundant "First," in the spec.
3100
3101 if ($is_space->{$self->{nc}}) {
3102
3103 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3104
3105 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106 $self->{line_prev} = $self->{line};
3107 $self->{column_prev} = $self->{column};
3108 $self->{column}++;
3109 $self->{nc}
3110 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111 } else {
3112 $self->{set_nc}->($self);
3113 }
3114
3115 redo A;
3116 } elsif ($self->{nc} == 0x003E) { # >
3117
3118 $self->{state} = DATA_STATE;
3119 $self->{s_kwd} = '';
3120
3121 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122 $self->{line_prev} = $self->{line};
3123 $self->{column_prev} = $self->{column};
3124 $self->{column}++;
3125 $self->{nc}
3126 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127 } else {
3128 $self->{set_nc}->($self);
3129 }
3130
3131
3132 return ($self->{ct}); # DOCTYPE
3133
3134 redo A;
3135 } elsif ($self->{nc} == -1) {
3136
3137 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3138 $self->{state} = DATA_STATE;
3139 $self->{s_kwd} = '';
3140 ## reconsume
3141
3142 $self->{ct}->{quirks} = 1;
3143 return ($self->{ct}); # DOCTYPE
3144
3145 redo A;
3146 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3147
3148 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3149 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3150 $self->{in_subset} = 1;
3151
3152 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3153 $self->{line_prev} = $self->{line};
3154 $self->{column_prev} = $self->{column};
3155 $self->{column}++;
3156 $self->{nc}
3157 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3158 } else {
3159 $self->{set_nc}->($self);
3160 }
3161
3162 return ($self->{ct}); # DOCTYPE
3163 redo A;
3164 } else {
3165
3166 $self->{ct}->{name}
3167 .= chr ($self->{nc}); # DOCTYPE
3168 ## Stay in the state
3169
3170 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3171 $self->{line_prev} = $self->{line};
3172 $self->{column_prev} = $self->{column};
3173 $self->{column}++;
3174 $self->{nc}
3175 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3176 } else {
3177 $self->{set_nc}->($self);
3178 }
3179
3180 redo A;
3181 }
3182 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3183 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3184 ## state", but implemented differently.
3185
3186 if ($is_space->{$self->{nc}}) {
3187
3188 ## Stay in the state
3189
3190 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3191 $self->{line_prev} = $self->{line};
3192 $self->{column_prev} = $self->{column};
3193 $self->{column}++;
3194 $self->{nc}
3195 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3196 } else {
3197 $self->{set_nc}->($self);
3198 }
3199
3200 redo A;
3201 } elsif ($self->{nc} == 0x003E) { # >
3202 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3203
3204 $self->{state} = DATA_STATE;
3205 $self->{s_kwd} = '';
3206 } else {
3207
3208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3209 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3210 }
3211
3212
3213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3214 $self->{line_prev} = $self->{line};
3215 $self->{column_prev} = $self->{column};
3216 $self->{column}++;
3217 $self->{nc}
3218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3219 } else {
3220 $self->{set_nc}->($self);
3221 }
3222
3223 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3224 redo A;
3225 } elsif ($self->{nc} == -1) {
3226 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3227
3228 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3229 $self->{state} = DATA_STATE;
3230 $self->{s_kwd} = '';
3231 $self->{ct}->{quirks} = 1;
3232 } else {
3233
3234 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3235 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3236 }
3237
3238 ## Reconsume.
3239 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3240 redo A;
3241 } elsif ($self->{nc} == 0x0050 or # P
3242 $self->{nc} == 0x0070) { # p
3243
3244 $self->{state} = PUBLIC_STATE;
3245 $self->{kwd} = chr $self->{nc};
3246
3247 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3248 $self->{line_prev} = $self->{line};
3249 $self->{column_prev} = $self->{column};
3250 $self->{column}++;
3251 $self->{nc}
3252 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3253 } else {
3254 $self->{set_nc}->($self);
3255 }
3256
3257 redo A;
3258 } elsif ($self->{nc} == 0x0053 or # S
3259 $self->{nc} == 0x0073) { # s
3260
3261 $self->{state} = SYSTEM_STATE;
3262 $self->{kwd} = chr $self->{nc};
3263
3264 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265 $self->{line_prev} = $self->{line};
3266 $self->{column_prev} = $self->{column};
3267 $self->{column}++;
3268 $self->{nc}
3269 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270 } else {
3271 $self->{set_nc}->($self);
3272 }
3273
3274 redo A;
3275 } elsif ($self->{nc} == 0x0022 and # "
3276 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3277 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3278
3279 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3280 $self->{ct}->{value} = ''; # ENTITY
3281
3282 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283 $self->{line_prev} = $self->{line};
3284 $self->{column_prev} = $self->{column};
3285 $self->{column}++;
3286 $self->{nc}
3287 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288 } else {
3289 $self->{set_nc}->($self);
3290 }
3291
3292 redo A;
3293 } elsif ($self->{nc} == 0x0027 and # '
3294 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3295 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3296
3297 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3298 $self->{ct}->{value} = ''; # ENTITY
3299
3300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301 $self->{line_prev} = $self->{line};
3302 $self->{column_prev} = $self->{column};
3303 $self->{column}++;
3304 $self->{nc}
3305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306 } else {
3307 $self->{set_nc}->($self);
3308 }
3309
3310 redo A;
3311 } elsif ($self->{is_xml} and
3312 $self->{ct}->{type} == DOCTYPE_TOKEN and
3313 $self->{nc} == 0x005B) { # [
3314
3315 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3316 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3317 $self->{in_subset} = 1;
3318
3319 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3320 $self->{line_prev} = $self->{line};
3321 $self->{column_prev} = $self->{column};
3322 $self->{column}++;
3323 $self->{nc}
3324 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3325 } else {
3326 $self->{set_nc}->($self);
3327 }
3328
3329 return ($self->{ct}); # DOCTYPE
3330 redo A;
3331 } else {
3332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3333
3334 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3335
3336 $self->{ct}->{quirks} = 1;
3337 $self->{state} = BOGUS_DOCTYPE_STATE;
3338 } else {
3339
3340 $self->{state} = BOGUS_MD_STATE;
3341 }
3342
3343
3344 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3345 $self->{line_prev} = $self->{line};
3346 $self->{column_prev} = $self->{column};
3347 $self->{column}++;
3348 $self->{nc}
3349 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3350 } else {
3351 $self->{set_nc}->($self);
3352 }
3353
3354 redo A;
3355 }
3356 } elsif ($self->{state} == PUBLIC_STATE) {
3357 ## ASCII case-insensitive
3358 if ($self->{nc} == [
3359 undef,
3360 0x0055, # U
3361 0x0042, # B
3362 0x004C, # L
3363 0x0049, # I
3364 ]->[length $self->{kwd}] or
3365 $self->{nc} == [
3366 undef,
3367 0x0075, # u
3368 0x0062, # b
3369 0x006C, # l
3370 0x0069, # i
3371 ]->[length $self->{kwd}]) {
3372
3373 ## Stay in the state.
3374 $self->{kwd} .= chr $self->{nc};
3375
3376 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3377 $self->{line_prev} = $self->{line};
3378 $self->{column_prev} = $self->{column};
3379 $self->{column}++;
3380 $self->{nc}
3381 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3382 } else {
3383 $self->{set_nc}->($self);
3384 }
3385
3386 redo A;
3387 } elsif ((length $self->{kwd}) == 5 and
3388 ($self->{nc} == 0x0043 or # C
3389 $self->{nc} == 0x0063)) { # c
3390 if ($self->{is_xml} and
3391 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3392
3393 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3394 text => 'PUBLIC',
3395 line => $self->{line_prev},
3396 column => $self->{column_prev} - 4);
3397 } else {
3398
3399 }
3400 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3401
3402 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403 $self->{line_prev} = $self->{line};
3404 $self->{column_prev} = $self->{column};
3405 $self->{column}++;
3406 $self->{nc}
3407 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408 } else {
3409 $self->{set_nc}->($self);
3410 }
3411
3412 redo A;
3413 } else {
3414 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3415 line => $self->{line_prev},
3416 column => $self->{column_prev} + 1 - length $self->{kwd});
3417 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3418
3419 $self->{ct}->{quirks} = 1;
3420 $self->{state} = BOGUS_DOCTYPE_STATE;
3421 } else {
3422
3423 $self->{state} = BOGUS_MD_STATE;
3424 }
3425 ## Reconsume.
3426 redo A;
3427 }
3428 } elsif ($self->{state} == SYSTEM_STATE) {
3429 ## ASCII case-insensitive
3430 if ($self->{nc} == [
3431 undef,
3432 0x0059, # Y
3433 0x0053, # S
3434 0x0054, # T
3435 0x0045, # E
3436 ]->[length $self->{kwd}] or
3437 $self->{nc} == [
3438 undef,
3439 0x0079, # y
3440 0x0073, # s
3441 0x0074, # t
3442 0x0065, # e
3443 ]->[length $self->{kwd}]) {
3444
3445 ## Stay in the state.
3446 $self->{kwd} .= chr $self->{nc};
3447
3448 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3449 $self->{line_prev} = $self->{line};
3450 $self->{column_prev} = $self->{column};
3451 $self->{column}++;
3452 $self->{nc}
3453 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3454 } else {
3455 $self->{set_nc}->($self);
3456 }
3457
3458 redo A;
3459 } elsif ((length $self->{kwd}) == 5 and
3460 ($self->{nc} == 0x004D or # M
3461 $self->{nc} == 0x006D)) { # m
3462 if ($self->{is_xml} and
3463 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3464
3465 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3466 text => 'SYSTEM',
3467 line => $self->{line_prev},
3468 column => $self->{column_prev} - 4);
3469 } else {
3470
3471 }
3472 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3473
3474 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3475 $self->{line_prev} = $self->{line};
3476 $self->{column_prev} = $self->{column};
3477 $self->{column}++;
3478 $self->{nc}
3479 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3480 } else {
3481 $self->{set_nc}->($self);
3482 }
3483
3484 redo A;
3485 } else {
3486 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3487 line => $self->{line_prev},
3488 column => $self->{column_prev} + 1 - length $self->{kwd});
3489 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3490
3491 $self->{ct}->{quirks} = 1;
3492 $self->{state} = BOGUS_DOCTYPE_STATE;
3493 } else {
3494
3495 $self->{state} = BOGUS_MD_STATE;
3496 }
3497 ## Reconsume.
3498 redo A;
3499 }
3500 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3501 if ($is_space->{$self->{nc}}) {
3502
3503 ## Stay in the state
3504
3505 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3506 $self->{line_prev} = $self->{line};
3507 $self->{column_prev} = $self->{column};
3508 $self->{column}++;
3509 $self->{nc}
3510 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3511 } else {
3512 $self->{set_nc}->($self);
3513 }
3514
3515 redo A;
3516 } elsif ($self->{nc} eq 0x0022) { # "
3517
3518 $self->{ct}->{pubid} = ''; # DOCTYPE
3519 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3520
3521 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3522 $self->{line_prev} = $self->{line};
3523 $self->{column_prev} = $self->{column};
3524 $self->{column}++;
3525 $self->{nc}
3526 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3527 } else {
3528 $self->{set_nc}->($self);
3529 }
3530
3531 redo A;
3532 } elsif ($self->{nc} eq 0x0027) { # '
3533
3534 $self->{ct}->{pubid} = ''; # DOCTYPE
3535 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3536
3537 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3538 $self->{line_prev} = $self->{line};
3539 $self->{column_prev} = $self->{column};
3540 $self->{column}++;
3541 $self->{nc}
3542 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3543 } else {
3544 $self->{set_nc}->($self);
3545 }
3546
3547 redo A;
3548 } elsif ($self->{nc} eq 0x003E) { # >
3549 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550
3551 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3552
3553 $self->{state} = DATA_STATE;
3554 $self->{s_kwd} = '';
3555 $self->{ct}->{quirks} = 1;
3556 } else {
3557
3558 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3559 }
3560
3561
3562 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563 $self->{line_prev} = $self->{line};
3564 $self->{column_prev} = $self->{column};
3565 $self->{column}++;
3566 $self->{nc}
3567 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568 } else {
3569 $self->{set_nc}->($self);
3570 }
3571
3572 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3573 redo A;
3574 } elsif ($self->{nc} == -1) {
3575 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3576
3577 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3578 $self->{state} = DATA_STATE;
3579 $self->{s_kwd} = '';
3580 $self->{ct}->{quirks} = 1;
3581 } else {
3582
3583 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3584 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3585 }
3586
3587 ## reconsume
3588 return ($self->{ct}); # DOCTYPE
3589 redo A;
3590 } elsif ($self->{is_xml} and
3591 $self->{ct}->{type} == DOCTYPE_TOKEN and
3592 $self->{nc} == 0x005B) { # [
3593
3594 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3595 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3596 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3597 $self->{in_subset} = 1;
3598
3599 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3600 $self->{line_prev} = $self->{line};
3601 $self->{column_prev} = $self->{column};
3602 $self->{column}++;
3603 $self->{nc}
3604 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3605 } else {
3606 $self->{set_nc}->($self);
3607 }
3608
3609 return ($self->{ct}); # DOCTYPE
3610 redo A;
3611 } else {
3612 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3613
3614 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3615
3616 $self->{ct}->{quirks} = 1;
3617 $self->{state} = BOGUS_DOCTYPE_STATE;
3618 } else {
3619
3620 $self->{state} = BOGUS_MD_STATE;
3621 }
3622
3623
3624 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3625 $self->{line_prev} = $self->{line};
3626 $self->{column_prev} = $self->{column};
3627 $self->{column}++;
3628 $self->{nc}
3629 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3630 } else {
3631 $self->{set_nc}->($self);
3632 }
3633
3634 redo A;
3635 }
3636 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3637 if ($self->{nc} == 0x0022) { # "
3638
3639 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3640
3641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3642 $self->{line_prev} = $self->{line};
3643 $self->{column_prev} = $self->{column};
3644 $self->{column}++;
3645 $self->{nc}
3646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3647 } else {
3648 $self->{set_nc}->($self);
3649 }
3650
3651 redo A;
3652 } elsif ($self->{nc} == 0x003E) { # >
3653 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3654
3655 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3656
3657 $self->{state} = DATA_STATE;
3658 $self->{s_kwd} = '';
3659 $self->{ct}->{quirks} = 1;
3660 } else {
3661
3662 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3663 }
3664
3665
3666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3667 $self->{line_prev} = $self->{line};
3668 $self->{column_prev} = $self->{column};
3669 $self->{column}++;
3670 $self->{nc}
3671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3672 } else {
3673 $self->{set_nc}->($self);
3674 }
3675
3676 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3677 redo A;
3678 } elsif ($self->{nc} == -1) {
3679 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3680
3681 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3682
3683 $self->{state} = DATA_STATE;
3684 $self->{s_kwd} = '';
3685 $self->{ct}->{quirks} = 1;
3686 } else {
3687
3688 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3689 }
3690
3691 ## Reconsume.
3692 return ($self->{ct}); # DOCTYPE
3693 redo A;
3694 } else {
3695
3696 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3697 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3698 length $self->{ct}->{pubid});
3699
3700 ## Stay in the state
3701
3702 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3703 $self->{line_prev} = $self->{line};
3704 $self->{column_prev} = $self->{column};
3705 $self->{column}++;
3706 $self->{nc}
3707 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3708 } else {
3709 $self->{set_nc}->($self);
3710 }
3711
3712 redo A;
3713 }
3714 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3715 if ($self->{nc} == 0x0027) { # '
3716
3717 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3718
3719 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3720 $self->{line_prev} = $self->{line};
3721 $self->{column_prev} = $self->{column};
3722 $self->{column}++;
3723 $self->{nc}
3724 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3725 } else {
3726 $self->{set_nc}->($self);
3727 }
3728
3729 redo A;
3730 } elsif ($self->{nc} == 0x003E) { # >
3731 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3732
3733 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3734
3735 $self->{state} = DATA_STATE;
3736 $self->{s_kwd} = '';
3737 $self->{ct}->{quirks} = 1;
3738 } else {
3739
3740 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3741 }
3742
3743
3744 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745 $self->{line_prev} = $self->{line};
3746 $self->{column_prev} = $self->{column};
3747 $self->{column}++;
3748 $self->{nc}
3749 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750 } else {
3751 $self->{set_nc}->($self);
3752 }
3753
3754 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3755 redo A;
3756 } elsif ($self->{nc} == -1) {
3757 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3758
3759 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3760
3761 $self->{state} = DATA_STATE;
3762 $self->{s_kwd} = '';
3763 $self->{ct}->{quirks} = 1;
3764 } else {
3765
3766 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3767 }
3768
3769 ## reconsume
3770 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3771 redo A;
3772 } else {
3773
3774 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3775 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3776 length $self->{ct}->{pubid});
3777
3778 ## Stay in the state
3779
3780 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3781 $self->{line_prev} = $self->{line};
3782 $self->{column_prev} = $self->{column};
3783 $self->{column}++;
3784 $self->{nc}
3785 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3786 } else {
3787 $self->{set_nc}->($self);
3788 }
3789
3790 redo A;
3791 }
3792 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3793 if ($is_space->{$self->{nc}}) {
3794
3795 ## Stay in the state
3796
3797 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3798 $self->{line_prev} = $self->{line};
3799 $self->{column_prev} = $self->{column};
3800 $self->{column}++;
3801 $self->{nc}
3802 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3803 } else {
3804 $self->{set_nc}->($self);
3805 }
3806
3807 redo A;
3808 } elsif ($self->{nc} == 0x0022) { # "
3809
3810 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3811 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3812
3813 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3814 $self->{line_prev} = $self->{line};
3815 $self->{column_prev} = $self->{column};
3816 $self->{column}++;
3817 $self->{nc}
3818 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3819 } else {
3820 $self->{set_nc}->($self);
3821 }
3822
3823 redo A;
3824 } elsif ($self->{nc} == 0x0027) { # '
3825
3826 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3827 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3828
3829 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3830 $self->{line_prev} = $self->{line};
3831 $self->{column_prev} = $self->{column};
3832 $self->{column}++;
3833 $self->{nc}
3834 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3835 } else {
3836 $self->{set_nc}->($self);
3837 }
3838
3839 redo A;
3840 } elsif ($self->{nc} == 0x003E) { # >
3841 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3842 if ($self->{is_xml}) {
3843
3844 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3845 } else {
3846
3847 }
3848 $self->{state} = DATA_STATE;
3849 $self->{s_kwd} = '';
3850 } else {
3851 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3852
3853 } else {
3854
3855 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3856 }
3857 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3858 }
3859
3860
3861 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3862 $self->{line_prev} = $self->{line};
3863 $self->{column_prev} = $self->{column};
3864 $self->{column}++;
3865 $self->{nc}
3866 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3867 } else {
3868 $self->{set_nc}->($self);
3869 }
3870
3871 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3872 redo A;
3873 } elsif ($self->{nc} == -1) {
3874 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3875
3876 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3877
3878 $self->{state} = DATA_STATE;
3879 $self->{s_kwd} = '';
3880 $self->{ct}->{quirks} = 1;
3881 } else {
3882 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3883 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3884 }
3885
3886 ## reconsume
3887 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3888 redo A;
3889 } elsif ($self->{is_xml} and
3890 $self->{ct}->{type} == DOCTYPE_TOKEN and
3891 $self->{nc} == 0x005B) { # [
3892
3893 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3894 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3895 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3896 $self->{in_subset} = 1;
3897
3898 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3899 $self->{line_prev} = $self->{line};
3900 $self->{column_prev} = $self->{column};
3901 $self->{column}++;
3902 $self->{nc}
3903 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3904 } else {
3905 $self->{set_nc}->($self);
3906 }
3907
3908 return ($self->{ct}); # DOCTYPE
3909 redo A;
3910 } else {
3911 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3912
3913 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3914
3915 $self->{ct}->{quirks} = 1;
3916 $self->{state} = BOGUS_DOCTYPE_STATE;
3917 } else {
3918
3919 $self->{state} = BOGUS_MD_STATE;
3920 }
3921
3922
3923 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3924 $self->{line_prev} = $self->{line};
3925 $self->{column_prev} = $self->{column};
3926 $self->{column}++;
3927 $self->{nc}
3928 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3929 } else {
3930 $self->{set_nc}->($self);
3931 }
3932
3933 redo A;
3934 }
3935 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3936 if ($is_space->{$self->{nc}}) {
3937
3938 ## Stay in the state
3939
3940 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3941 $self->{line_prev} = $self->{line};
3942 $self->{column_prev} = $self->{column};
3943 $self->{column}++;
3944 $self->{nc}
3945 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3946 } else {
3947 $self->{set_nc}->($self);
3948 }
3949
3950 redo A;
3951 } elsif ($self->{nc} == 0x0022) { # "
3952
3953 $self->{ct}->{sysid} = ''; # DOCTYPE
3954 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3955
3956 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3957 $self->{line_prev} = $self->{line};
3958 $self->{column_prev} = $self->{column};
3959 $self->{column}++;
3960 $self->{nc}
3961 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3962 } else {
3963 $self->{set_nc}->($self);
3964 }
3965
3966 redo A;
3967 } elsif ($self->{nc} == 0x0027) { # '
3968
3969 $self->{ct}->{sysid} = ''; # DOCTYPE
3970 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3971
3972 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3973 $self->{line_prev} = $self->{line};
3974 $self->{column_prev} = $self->{column};
3975 $self->{column}++;
3976 $self->{nc}
3977 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3978 } else {
3979 $self->{set_nc}->($self);
3980 }
3981
3982 redo A;
3983 } elsif ($self->{nc} == 0x003E) { # >
3984 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3985
3986 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3987 $self->{line_prev} = $self->{line};
3988 $self->{column_prev} = $self->{column};
3989 $self->{column}++;
3990 $self->{nc}
3991 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3992 } else {
3993 $self->{set_nc}->($self);
3994 }
3995
3996
3997 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3998
3999 $self->{state} = DATA_STATE;
4000 $self->{s_kwd} = '';
4001 $self->{ct}->{quirks} = 1;
4002 } else {
4003
4004 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4005 }
4006
4007 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4008 redo A;
4009 } elsif ($self->{nc} == -1) {
4010 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4011
4012 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4013 $self->{state} = DATA_STATE;
4014 $self->{s_kwd} = '';
4015 $self->{ct}->{quirks} = 1;
4016 } else {
4017
4018 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4019 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4020 }
4021
4022 ## reconsume
4023 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4024 redo A;
4025 } elsif ($self->{is_xml} and
4026 $self->{ct}->{type} == DOCTYPE_TOKEN and
4027 $self->{nc} == 0x005B) { # [
4028
4029 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4030
4031 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4032 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4033 $self->{in_subset} = 1;
4034
4035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036 $self->{line_prev} = $self->{line};
4037 $self->{column_prev} = $self->{column};
4038 $self->{column}++;
4039 $self->{nc}
4040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041 } else {
4042 $self->{set_nc}->($self);
4043 }
4044
4045 return ($self->{ct}); # DOCTYPE
4046 redo A;
4047 } else {
4048 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4049
4050 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4051
4052 $self->{ct}->{quirks} = 1;
4053 $self->{state} = BOGUS_DOCTYPE_STATE;
4054 } else {
4055
4056 $self->{state} = BOGUS_MD_STATE;
4057 }
4058
4059
4060 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4061 $self->{line_prev} = $self->{line};
4062 $self->{column_prev} = $self->{column};
4063 $self->{column}++;
4064 $self->{nc}
4065 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4066 } else {
4067 $self->{set_nc}->($self);
4068 }
4069
4070 redo A;
4071 }
4072 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4073 if ($self->{nc} == 0x0022) { # "
4074
4075 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4076
4077 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4078 $self->{line_prev} = $self->{line};
4079 $self->{column_prev} = $self->{column};
4080 $self->{column}++;
4081 $self->{nc}
4082 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4083 } else {
4084 $self->{set_nc}->($self);
4085 }
4086
4087 redo A;
4088 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4089 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4090
4091 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4092
4093 $self->{state} = DATA_STATE;
4094 $self->{s_kwd} = '';
4095 $self->{ct}->{quirks} = 1;
4096 } else {
4097
4098 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4099 }
4100
4101
4102 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4103 $self->{line_prev} = $self->{line};
4104 $self->{column_prev} = $self->{column};
4105 $self->{column}++;
4106 $self->{nc}
4107 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4108 } else {
4109 $self->{set_nc}->($self);
4110 }
4111
4112 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4113 redo A;
4114 } elsif ($self->{nc} == -1) {
4115 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4116
4117 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4118
4119 $self->{state} = DATA_STATE;
4120 $self->{s_kwd} = '';
4121 $self->{ct}->{quirks} = 1;
4122 } else {
4123
4124 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4125 }
4126
4127 ## reconsume
4128 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4129 redo A;
4130 } else {
4131
4132 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4133 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4134 length $self->{ct}->{sysid});
4135
4136 ## Stay in the state
4137
4138 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4139 $self->{line_prev} = $self->{line};
4140 $self->{column_prev} = $self->{column};
4141 $self->{column}++;
4142 $self->{nc}
4143 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4144 } else {
4145 $self->{set_nc}->($self);
4146 }
4147
4148 redo A;
4149 }
4150 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4151 if ($self->{nc} == 0x0027) { # '
4152
4153 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4154
4155 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4156 $self->{line_prev} = $self->{line};
4157 $self->{column_prev} = $self->{column};
4158 $self->{column}++;
4159 $self->{nc}
4160 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4161 } else {
4162 $self->{set_nc}->($self);
4163 }
4164
4165 redo A;
4166 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4167
4168 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4169
4170 $self->{state} = DATA_STATE;
4171 $self->{s_kwd} = '';
4172
4173 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4174 $self->{line_prev} = $self->{line};
4175 $self->{column_prev} = $self->{column};
4176 $self->{column}++;
4177 $self->{nc}
4178 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4179 } else {
4180 $self->{set_nc}->($self);
4181 }
4182
4183
4184 $self->{ct}->{quirks} = 1;
4185 return ($self->{ct}); # DOCTYPE
4186
4187 redo A;
4188 } elsif ($self->{nc} == -1) {
4189 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4190
4191 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4192
4193 $self->{state} = DATA_STATE;
4194 $self->{s_kwd} = '';
4195 $self->{ct}->{quirks} = 1;
4196 } else {
4197
4198 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4199 }
4200
4201 ## reconsume
4202 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4203 redo A;
4204 } else {
4205
4206 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4207 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4208 length $self->{ct}->{sysid});
4209
4210 ## Stay in the state
4211
4212 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4213 $self->{line_prev} = $self->{line};
4214 $self->{column_prev} = $self->{column};
4215 $self->{column}++;
4216 $self->{nc}
4217 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4218 } else {
4219 $self->{set_nc}->($self);
4220 }
4221
4222 redo A;
4223 }
4224 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4225 if ($is_space->{$self->{nc}}) {
4226 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4227
4228 $self->{state} = BEFORE_NDATA_STATE;
4229 } else {
4230
4231 ## Stay in the state
4232 }
4233
4234 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235 $self->{line_prev} = $self->{line};
4236 $self->{column_prev} = $self->{column};
4237 $self->{column}++;
4238 $self->{nc}
4239 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4240 } else {
4241 $self->{set_nc}->($self);
4242 }
4243
4244 redo A;
4245 } elsif ($self->{nc} == 0x003E) { # >
4246 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4247
4248 $self->{state} = DATA_STATE;
4249 $self->{s_kwd} = '';
4250 } else {
4251
4252 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253 }
4254
4255
4256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257 $self->{line_prev} = $self->{line};
4258 $self->{column_prev} = $self->{column};
4259 $self->{column}++;
4260 $self->{nc}
4261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262 } else {
4263 $self->{set_nc}->($self);
4264 }
4265
4266 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4267 redo A;
4268 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4269 ($self->{nc} == 0x004E or # N
4270 $self->{nc} == 0x006E)) { # n
4271
4272 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4273 $self->{state} = NDATA_STATE;
4274 $self->{kwd} = chr $self->{nc};
4275
4276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4277 $self->{line_prev} = $self->{line};
4278 $self->{column_prev} = $self->{column};
4279 $self->{column}++;
4280 $self->{nc}
4281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4282 } else {
4283 $self->{set_nc}->($self);
4284 }
4285
4286 redo A;
4287 } elsif ($self->{nc} == -1) {
4288 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4289
4290 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4291 $self->{state} = DATA_STATE;
4292 $self->{s_kwd} = '';
4293 $self->{ct}->{quirks} = 1;
4294 } else {
4295
4296 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4297 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4298 }
4299
4300 ## reconsume
4301 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4302 redo A;
4303 } elsif ($self->{is_xml} and
4304 $self->{ct}->{type} == DOCTYPE_TOKEN and
4305 $self->{nc} == 0x005B) { # [
4306
4307 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4308 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4309 $self->{in_subset} = 1;
4310
4311 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4312 $self->{line_prev} = $self->{line};
4313 $self->{column_prev} = $self->{column};
4314 $self->{column}++;
4315 $self->{nc}
4316 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4317 } else {
4318 $self->{set_nc}->($self);
4319 }
4320
4321 return ($self->{ct}); # DOCTYPE
4322 redo A;
4323 } else {
4324 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4325
4326 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4327
4328 #$self->{ct}->{quirks} = 1;
4329 $self->{state} = BOGUS_DOCTYPE_STATE;
4330 } else {
4331
4332 $self->{state} = BOGUS_MD_STATE;
4333 }
4334
4335
4336 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4337 $self->{line_prev} = $self->{line};
4338 $self->{column_prev} = $self->{column};
4339 $self->{column}++;
4340 $self->{nc}
4341 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4342 } else {
4343 $self->{set_nc}->($self);
4344 }
4345
4346 redo A;
4347 }
4348 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4349 if ($is_space->{$self->{nc}}) {
4350
4351 ## Stay in the state.
4352
4353 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4354 $self->{line_prev} = $self->{line};
4355 $self->{column_prev} = $self->{column};
4356 $self->{column}++;
4357 $self->{nc}
4358 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4359 } else {
4360 $self->{set_nc}->($self);
4361 }
4362
4363 redo A;
4364 } elsif ($self->{nc} == 0x003E) { # >
4365
4366 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367
4368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4369 $self->{line_prev} = $self->{line};
4370 $self->{column_prev} = $self->{column};
4371 $self->{column}++;
4372 $self->{nc}
4373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4374 } else {
4375 $self->{set_nc}->($self);
4376 }
4377
4378 return ($self->{ct}); # ENTITY
4379 redo A;
4380 } elsif ($self->{nc} == 0x004E or # N
4381 $self->{nc} == 0x006E) { # n
4382
4383 $self->{state} = NDATA_STATE;
4384 $self->{kwd} = chr $self->{nc};
4385
4386 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4387 $self->{line_prev} = $self->{line};
4388 $self->{column_prev} = $self->{column};
4389 $self->{column}++;
4390 $self->{nc}
4391 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4392 } else {
4393 $self->{set_nc}->($self);
4394 }
4395
4396 redo A;
4397 } elsif ($self->{nc} == -1) {
4398
4399 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4400 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4401 ## reconsume
4402 return ($self->{ct}); # ENTITY
4403 redo A;
4404 } else {
4405
4406 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4407 $self->{state} = BOGUS_MD_STATE;
4408
4409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410 $self->{line_prev} = $self->{line};
4411 $self->{column_prev} = $self->{column};
4412 $self->{column}++;
4413 $self->{nc}
4414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415 } else {
4416 $self->{set_nc}->($self);
4417 }
4418
4419 redo A;
4420 }
4421 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4422 if ($self->{nc} == 0x003E) { # >
4423
4424 $self->{state} = DATA_STATE;
4425 $self->{s_kwd} = '';
4426
4427 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4428 $self->{line_prev} = $self->{line};
4429 $self->{column_prev} = $self->{column};
4430 $self->{column}++;
4431 $self->{nc}
4432 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4433 } else {
4434 $self->{set_nc}->($self);
4435 }
4436
4437
4438 return ($self->{ct}); # DOCTYPE
4439
4440 redo A;
4441 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4442
4443 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4444 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4445 $self->{in_subset} = 1;
4446
4447 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4448 $self->{line_prev} = $self->{line};
4449 $self->{column_prev} = $self->{column};
4450 $self->{column}++;
4451 $self->{nc}
4452 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4453 } else {
4454 $self->{set_nc}->($self);
4455 }
4456
4457 return ($self->{ct}); # DOCTYPE
4458 redo A;
4459 } elsif ($self->{nc} == -1) {
4460
4461 $self->{state} = DATA_STATE;
4462 $self->{s_kwd} = '';
4463 ## reconsume
4464
4465 return ($self->{ct}); # DOCTYPE
4466
4467 redo A;
4468 } else {
4469
4470 my $s = '';
4471 $self->{read_until}->($s, q{>[}, 0);
4472
4473 ## Stay in the state
4474
4475 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4476 $self->{line_prev} = $self->{line};
4477 $self->{column_prev} = $self->{column};
4478 $self->{column}++;
4479 $self->{nc}
4480 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4481 } else {
4482 $self->{set_nc}->($self);
4483 }
4484
4485 redo A;
4486 }
4487 } elsif ($self->{state} == CDATA_SECTION_STATE) {
4488 ## NOTE: "CDATA section state" in the spec is jointly implemented
4489 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4490 ## and |CDATA_SECTION_MSE2_STATE|.
4491
4492 ## XML5: "CDATA state".
4493
4494 if ($self->{nc} == 0x005D) { # ]
4495
4496 $self->{state} = CDATA_SECTION_MSE1_STATE;
4497
4498 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499 $self->{line_prev} = $self->{line};
4500 $self->{column_prev} = $self->{column};
4501 $self->{column}++;
4502 $self->{nc}
4503 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504 } else {
4505 $self->{set_nc}->($self);
4506 }
4507
4508 redo A;
4509 } elsif ($self->{nc} == -1) {
4510 if ($self->{is_xml}) {
4511
4512 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4513 } else {
4514
4515 }
4516
4517 $self->{state} = DATA_STATE;
4518 $self->{s_kwd} = '';
4519 ## Reconsume.
4520 if (length $self->{ct}->{data}) { # character
4521
4522 return ($self->{ct}); # character
4523 } else {
4524
4525 ## No token to emit. $self->{ct} is discarded.
4526 }
4527 redo A;
4528 } else {
4529
4530 $self->{ct}->{data} .= chr $self->{nc};
4531 $self->{read_until}->($self->{ct}->{data},
4532 q<]>,
4533 length $self->{ct}->{data});
4534
4535 ## Stay in the state.
4536
4537 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4538 $self->{line_prev} = $self->{line};
4539 $self->{column_prev} = $self->{column};
4540 $self->{column}++;
4541 $self->{nc}
4542 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4543 } else {
4544 $self->{set_nc}->($self);
4545 }
4546
4547 redo A;
4548 }
4549
4550 ## ISSUE: "text tokens" in spec.
4551 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4552 ## XML5: "CDATA bracket state".
4553
4554 if ($self->{nc} == 0x005D) { # ]
4555
4556 $self->{state} = CDATA_SECTION_MSE2_STATE;
4557
4558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4559 $self->{line_prev} = $self->{line};
4560 $self->{column_prev} = $self->{column};
4561 $self->{column}++;
4562 $self->{nc}
4563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4564 } else {
4565 $self->{set_nc}->($self);
4566 }
4567
4568 redo A;
4569 } else {
4570
4571 ## XML5: If EOF, "]" is not appended and changed to the data state.
4572 $self->{ct}->{data} .= ']';
4573 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4574 ## Reconsume.
4575 redo A;
4576 }
4577 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4578 ## XML5: "CDATA end state".
4579
4580 if ($self->{nc} == 0x003E) { # >
4581 $self->{state} = DATA_STATE;
4582 $self->{s_kwd} = '';
4583
4584 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4585 $self->{line_prev} = $self->{line};
4586 $self->{column_prev} = $self->{column};
4587 $self->{column}++;
4588 $self->{nc}
4589 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4590 } else {
4591 $self->{set_nc}->($self);
4592 }
4593
4594 if (length $self->{ct}->{data}) { # character
4595
4596 return ($self->{ct}); # character
4597 } else {
4598
4599 ## No token to emit. $self->{ct} is discarded.
4600 }
4601 redo A;
4602 } elsif ($self->{nc} == 0x005D) { # ]
4603 # character
4604 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4605 ## Stay in the state.
4606
4607 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4608 $self->{line_prev} = $self->{line};
4609 $self->{column_prev} = $self->{column};
4610 $self->{column}++;
4611 $self->{nc}
4612 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4613 } else {
4614 $self->{set_nc}->($self);
4615 }
4616
4617 redo A;
4618 } else {
4619
4620 $self->{ct}->{data} .= ']]'; # character
4621 $self->{state} = CDATA_SECTION_STATE;
4622 ## Reconsume. ## XML5: Emit.
4623 redo A;
4624 }
4625 } elsif ($self->{state} == ENTITY_STATE) {
4626 if ($is_space->{$self->{nc}} or
4627 {
4628 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4629 $self->{entity_add} => 1,
4630 }->{$self->{nc}}) {
4631 if ($self->{is_xml}) {
4632
4633 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4634 line => $self->{line_prev},
4635 column => $self->{column_prev}
4636 + ($self->{nc} == -1 ? 1 : 0));
4637 } else {
4638
4639 ## No error
4640 }
4641 ## Don't consume
4642 ## Return nothing.
4643 #
4644 } elsif ($self->{nc} == 0x0023) { # #
4645
4646 $self->{state} = ENTITY_HASH_STATE;
4647 $self->{kwd} = '#';
4648
4649 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4650 $self->{line_prev} = $self->{line};
4651 $self->{column_prev} = $self->{column};
4652 $self->{column}++;
4653 $self->{nc}
4654 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4655 } else {
4656 $self->{set_nc}->($self);
4657 }
4658
4659 redo A;
4660 } elsif ($self->{is_xml} or
4661 (0x0041 <= $self->{nc} and
4662 $self->{nc} <= 0x005A) or # A..Z
4663 (0x0061 <= $self->{nc} and
4664 $self->{nc} <= 0x007A)) { # a..z
4665
4666 require Whatpm::_NamedEntityList;
4667 $self->{state} = ENTITY_NAME_STATE;
4668 $self->{kwd} = chr $self->{nc};
4669 $self->{entity__value} = $self->{kwd};
4670 $self->{entity__match} = 0;
4671
4672 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4673 $self->{line_prev} = $self->{line};
4674 $self->{column_prev} = $self->{column};
4675 $self->{column}++;
4676 $self->{nc}
4677 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4678 } else {
4679 $self->{set_nc}->($self);
4680 }
4681
4682 redo A;
4683 } else {
4684
4685 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4686 ## Return nothing.
4687 #
4688 }
4689
4690 ## NOTE: No character is consumed by the "consume a character
4691 ## reference" algorithm. In other words, there is an "&" character
4692 ## that does not introduce a character reference, which would be
4693 ## appended to the parent element or the attribute value in the later
4694 ## processing of the tokenizer.
4695
4696 if ($self->{prev_state} == DATA_STATE) {
4697
4698 $self->{state} = $self->{prev_state};
4699 $self->{s_kwd} = '';
4700 ## Reconsume.
4701 return ({type => CHARACTER_TOKEN, data => '&',
4702 line => $self->{line_prev},
4703 column => $self->{column_prev},
4704 });
4705 redo A;
4706 } else {
4707
4708 $self->{ca}->{value} .= '&';
4709 $self->{state} = $self->{prev_state};
4710 $self->{s_kwd} = '';
4711 ## Reconsume.
4712 redo A;
4713 }
4714 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4715 if ($self->{nc} == 0x0078) { # x
4716
4717 $self->{state} = HEXREF_X_STATE;
4718 $self->{kwd} .= chr $self->{nc};
4719
4720 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4721 $self->{line_prev} = $self->{line};
4722 $self->{column_prev} = $self->{column};
4723 $self->{column}++;
4724 $self->{nc}
4725 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4726 } else {
4727 $self->{set_nc}->($self);
4728 }
4729
4730 redo A;
4731 } elsif ($self->{nc} == 0x0058) { # X
4732
4733 if ($self->{is_xml}) {
4734 $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4735 }
4736 $self->{state} = HEXREF_X_STATE;
4737 $self->{kwd} .= chr $self->{nc};
4738
4739 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4740 $self->{line_prev} = $self->{line};
4741 $self->{column_prev} = $self->{column};
4742 $self->{column}++;
4743 $self->{nc}
4744 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4745 } else {
4746 $self->{set_nc}->($self);
4747 }
4748
4749 redo A;
4750 } elsif (0x0030 <= $self->{nc} and
4751 $self->{nc} <= 0x0039) { # 0..9
4752
4753 $self->{state} = NCR_NUM_STATE;
4754 $self->{kwd} = $self->{nc} - 0x0030;
4755
4756 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4757 $self->{line_prev} = $self->{line};
4758 $self->{column_prev} = $self->{column};
4759 $self->{column}++;
4760 $self->{nc}
4761 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4762 } else {
4763 $self->{set_nc}->($self);
4764 }
4765
4766 redo A;
4767 } else {
4768 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4769 line => $self->{line_prev},
4770 column => $self->{column_prev} - 1);
4771
4772 ## NOTE: According to the spec algorithm, nothing is returned,
4773 ## and then "&#" is appended to the parent element or the attribute
4774 ## value in the later processing.
4775
4776 if ($self->{prev_state} == DATA_STATE) {
4777
4778 $self->{state} = $self->{prev_state};
4779 $self->{s_kwd} = '';
4780 ## Reconsume.
4781 return ({type => CHARACTER_TOKEN,
4782 data => '&#',
4783 line => $self->{line_prev},
4784 column => $self->{column_prev} - 1,
4785 });
4786 redo A;
4787 } else {
4788
4789 $self->{ca}->{value} .= '&#';
4790 $self->{state} = $self->{prev_state};
4791 $self->{s_kwd} = '';
4792 ## Reconsume.
4793 redo A;
4794 }
4795 }
4796 } elsif ($self->{state} == NCR_NUM_STATE) {
4797 if (0x0030 <= $self->{nc} and
4798 $self->{nc} <= 0x0039) { # 0..9
4799
4800 $self->{kwd} *= 10;
4801 $self->{kwd} += $self->{nc} - 0x0030;
4802
4803 ## Stay in the state.
4804
4805 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4806 $self->{line_prev} = $self->{line};
4807 $self->{column_prev} = $self->{column};
4808 $self->{column}++;
4809 $self->{nc}
4810 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4811 } else {
4812 $self->{set_nc}->($self);
4813 }
4814
4815 redo A;
4816 } elsif ($self->{nc} == 0x003B) { # ;
4817
4818
4819 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4820 $self->{line_prev} = $self->{line};
4821 $self->{column_prev} = $self->{column};
4822 $self->{column}++;
4823 $self->{nc}
4824 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4825 } else {
4826 $self->{set_nc}->($self);
4827 }
4828
4829 #
4830 } else {
4831
4832 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4833 ## Reconsume.
4834 #
4835 }
4836
4837 my $code = $self->{kwd};
4838 my $l = $self->{line_prev};
4839 my $c = $self->{column_prev};
4840 if ($charref_map->{$code}) {
4841
4842 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4843 text => (sprintf 'U+%04X', $code),
4844 line => $l, column => $c);
4845 $code = $charref_map->{$code};
4846 } elsif ($code > 0x10FFFF) {
4847
4848 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4849 text => (sprintf 'U-%08X', $code),
4850 line => $l, column => $c);
4851 $code = 0xFFFD;
4852 }
4853
4854 if ($self->{prev_state} == DATA_STATE) {
4855
4856 $self->{state} = $self->{prev_state};
4857 $self->{s_kwd} = '';
4858 ## Reconsume.
4859 return ({type => CHARACTER_TOKEN, data => chr $code,
4860 has_reference => 1,
4861 line => $l, column => $c,
4862 });
4863 redo A;
4864 } else {
4865
4866 $self->{ca}->{value} .= chr $code;
4867 $self->{ca}->{has_reference} = 1;
4868 $self->{state} = $self->{prev_state};
4869 $self->{s_kwd} = '';
4870 ## Reconsume.
4871 redo A;
4872 }
4873 } elsif ($self->{state} == HEXREF_X_STATE) {
4874 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4875 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4876 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4877 # 0..9, A..F, a..f
4878
4879 $self->{state} = HEXREF_HEX_STATE;
4880 $self->{kwd} = 0;
4881 ## Reconsume.
4882 redo A;
4883 } else {
4884 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4885 line => $self->{line_prev},
4886 column => $self->{column_prev} - 2);
4887
4888 ## NOTE: According to the spec algorithm, nothing is returned,
4889 ## and then "&#" followed by "X" or "x" is appended to the parent
4890 ## element or the attribute value in the later processing.
4891
4892 if ($self->{prev_state} == DATA_STATE) {
4893
4894 $self->{state} = $self->{prev_state};
4895 $self->{s_kwd} = '';
4896 ## Reconsume.
4897 return ({type => CHARACTER_TOKEN,
4898 data => '&' . $self->{kwd},
4899 line => $self->{line_prev},
4900 column => $self->{column_prev} - length $self->{kwd},
4901 });
4902 redo A;
4903 } else {
4904
4905 $self->{ca}->{value} .= '&' . $self->{kwd};
4906 $self->{state} = $self->{prev_state};
4907 $self->{s_kwd} = '';
4908 ## Reconsume.
4909 redo A;
4910 }
4911 }
4912 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4913 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4914 # 0..9
4915
4916 $self->{kwd} *= 0x10;
4917 $self->{kwd} += $self->{nc} - 0x0030;
4918 ## Stay in the state.
4919
4920 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4921 $self->{line_prev} = $self->{line};
4922 $self->{column_prev} = $self->{column};
4923 $self->{column}++;
4924 $self->{nc}
4925 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4926 } else {
4927 $self->{set_nc}->($self);
4928 }
4929
4930 redo A;
4931 } elsif (0x0061 <= $self->{nc} and
4932 $self->{nc} <= 0x0066) { # a..f
4933
4934 $self->{kwd} *= 0x10;
4935 $self->{kwd} += $self->{nc} - 0x0060 + 9;
4936 ## Stay in the state.
4937
4938 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4939 $self->{line_prev} = $self->{line};
4940 $self->{column_prev} = $self->{column};
4941 $self->{column}++;
4942 $self->{nc}
4943 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4944 } else {
4945 $self->{set_nc}->($self);
4946 }
4947
4948 redo A;
4949 } elsif (0x0041 <= $self->{nc} and
4950 $self->{nc} <= 0x0046) { # A..F
4951
4952 $self->{kwd} *= 0x10;
4953 $self->{kwd} += $self->{nc} - 0x0040 + 9;
4954 ## Stay in the state.
4955
4956 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4957 $self->{line_prev} = $self->{line};
4958 $self->{column_prev} = $self->{column};
4959 $self->{column}++;
4960 $self->{nc}
4961 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4962 } else {
4963 $self->{set_nc}->($self);
4964 }
4965
4966 redo A;
4967 } elsif ($self->{nc} == 0x003B) { # ;
4968
4969
4970 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4971 $self->{line_prev} = $self->{line};
4972 $self->{column_prev} = $self->{column};
4973 $self->{column}++;
4974 $self->{nc}
4975 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4976 } else {
4977 $self->{set_nc}->($self);
4978 }
4979
4980 #
4981 } else {
4982
4983 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4984 line => $self->{line},
4985 column => $self->{column});
4986 ## Reconsume.
4987 #
4988 }
4989
4990 my $code = $self->{kwd};
4991 my $l = $self->{line_prev};
4992 my $c = $self->{column_prev};
4993 if ($charref_map->{$code}) {
4994
4995 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4996 text => (sprintf 'U+%04X', $code),
4997 line => $l, column => $c);
4998 $code = $charref_map->{$code};
4999 } elsif ($code > 0x10FFFF) {
5000
5001 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5002 text => (sprintf 'U-%08X', $code),
5003 line => $l, column => $c);
5004 $code = 0xFFFD;
5005 }
5006
5007 if ($self->{prev_state} == DATA_STATE) {
5008
5009 $self->{state} = $self->{prev_state};
5010 $self->{s_kwd} = '';
5011 ## Reconsume.
5012 return ({type => CHARACTER_TOKEN, data => chr $code,
5013 has_reference => 1,
5014 line => $l, column => $c,
5015 });
5016 redo A;
5017 } else {
5018
5019 $self->{ca}->{value} .= chr $code;
5020 $self->{ca}->{has_reference} = 1;
5021 $self->{state} = $self->{prev_state};
5022 $self->{s_kwd} = '';
5023 ## Reconsume.
5024 redo A;
5025 }
5026 } elsif ($self->{state} == ENTITY_NAME_STATE) {
5027 if ((0x0041 <= $self->{nc} and # A
5028 $self->{nc} <= 0x005A) or # Z
5029 (0x0061 <= $self->{nc} and # a
5030 $self->{nc} <= 0x007A) or # z
5031 (0x0030 <= $self->{nc} and # 0
5032 $self->{nc} <= 0x0039) or # 9
5033 $self->{nc} == 0x003B or # ;
5034 ($self->{is_xml} and
5035 not ($is_space->{$self->{nc}} or
5036 {
5037 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5038 $self->{entity_add} => 1,
5039 }->{$self->{nc}}))) {
5040 our $EntityChar;
5041 $self->{kwd} .= chr $self->{nc};
5042 if (defined $EntityChar->{$self->{kwd}} or
5043 $self->{ge}->{$self->{kwd}}) {
5044 if ($self->{nc} == 0x003B) { # ;
5045 if (defined $self->{ge}->{$self->{kwd}}) {
5046 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5047
5048 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5049 } else {
5050 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5051
5052 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5053 value => $self->{kwd});
5054 } else {
5055
5056 }
5057 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5058 }
5059 } else {
5060 if ($self->{is_xml}) {
5061
5062 $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5063 value => $self->{kwd},
5064 level => {
5065 'amp;' => $self->{level}->{warn},
5066 'quot;' => $self->{level}->{warn},
5067 'lt;' => $self->{level}->{warn},
5068 'gt;' => $self->{level}->{warn},
5069 'apos;' => $self->{level}->{warn},
5070 }->{$self->{kwd}} ||
5071 $self->{level}->{must});
5072 } else {
5073
5074 }
5075 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5076 }
5077 $self->{entity__match} = 1;
5078
5079 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5080 $self->{line_prev} = $self->{line};
5081 $self->{column_prev} = $self->{column};
5082 $self->{column}++;
5083 $self->{nc}
5084 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5085 } else {
5086 $self->{set_nc}->($self);
5087 }
5088
5089 #
5090 } else {
5091
5092 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5093 $self->{entity__match} = -1;
5094 ## Stay in the state.
5095
5096 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5097 $self->{line_prev} = $self->{line};
5098 $self->{column_prev} = $self->{column};
5099 $self->{column}++;
5100 $self->{nc}
5101 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5102 } else {
5103 $self->{set_nc}->($self);
5104 }
5105
5106 redo A;
5107 }
5108 } else {
5109
5110 $self->{entity__value} .= chr $self->{nc};
5111 $self->{entity__match} *= 2;
5112 ## Stay in the state.
5113
5114 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5115 $self->{line_prev} = $self->{line};
5116 $self->{column_prev} = $self->{column};
5117 $self->{column}++;
5118 $self->{nc}
5119 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5120 } else {
5121 $self->{set_nc}->($self);
5122 }
5123
5124 redo A;
5125 }
5126 }
5127
5128 my $data;
5129 my $has_ref;
5130 if ($self->{entity__match} > 0) {
5131
5132 $data = $self->{entity__value};
5133 $has_ref = 1;
5134 #
5135 } elsif ($self->{entity__match} < 0) {
5136 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5137 if ($self->{prev_state} != DATA_STATE and # in attribute
5138 $self->{entity__match} < -1) {
5139
5140 $data = '&' . $self->{kwd};
5141 #
5142 } else {
5143
5144 $data = $self->{entity__value};
5145 $has_ref = 1;
5146 #
5147 }
5148 } else {
5149
5150 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5151 line => $self->{line_prev},
5152 column => $self->{column_prev} - length $self->{kwd});
5153 $data = '&' . $self->{kwd};
5154 #
5155 }
5156
5157 ## NOTE: In these cases, when a character reference is found,
5158 ## it is consumed and a character token is returned, or, otherwise,
5159 ## nothing is consumed and returned, according to the spec algorithm.
5160 ## In this implementation, anything that has been examined by the
5161 ## tokenizer is appended to the parent element or the attribute value
5162 ## as string, either literal string when no character reference or
5163 ## entity-replaced string otherwise, in this stage, since any characters
5164 ## that would not be consumed are appended in the data state or in an
5165 ## appropriate attribute value state anyway.
5166
5167 if ($self->{prev_state} == DATA_STATE) {
5168
5169 $self->{state} = $self->{prev_state};
5170 $self->{s_kwd} = '';
5171 ## Reconsume.
5172 return ({type => CHARACTER_TOKEN,
5173 data => $data,
5174 has_reference => $has_ref,
5175 line => $self->{line_prev},
5176 column => $self->{column_prev} + 1 - length $self->{kwd},
5177 });
5178 redo A;
5179 } else {
5180
5181 $self->{ca}->{value} .= $data;
5182 $self->{ca}->{has_reference} = 1 if $has_ref;
5183 $self->{state} = $self->{prev_state};
5184 $self->{s_kwd} = '';
5185 ## Reconsume.
5186 redo A;
5187 }
5188
5189 ## XML-only states
5190
5191 } elsif ($self->{state} == PI_STATE) {
5192 ## XML5: "Pi state" and "DOCTYPE pi state".
5193
5194 if ($is_space->{$self->{nc}} or
5195 $self->{nc} == 0x003F or # ?
5196 $self->{nc} == -1) {
5197 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5198 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5199 ## "DOCTYPE pi state": Parse error, switch to the "data
5200 ## state".
5201 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5202 line => $self->{line_prev},
5203 column => $self->{column_prev}
5204 - 1 * ($self->{nc} != -1));
5205 $self->{state} = BOGUS_COMMENT_STATE;
5206 ## Reconsume.
5207 $self->{ct} = {type => COMMENT_TOKEN,
5208 data => '?',
5209 line => $self->{line_prev},
5210 column => $self->{column_prev}
5211 - 1 * ($self->{nc} != -1),
5212 };
5213 redo A;
5214 } else {
5215 ## XML5: "DOCTYPE pi state": Stay in the state.
5216 $self->{ct} = {type => PI_TOKEN,
5217 target => chr $self->{nc},
5218 data => '',
5219 line => $self->{line_prev},
5220 column => $self->{column_prev} - 1,
5221 };
5222 $self->{state} = PI_TARGET_STATE;
5223
5224 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5225 $self->{line_prev} = $self->{line};
5226 $self->{column_prev} = $self->{column};
5227 $self->{column}++;
5228 $self->{nc}
5229 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5230 } else {
5231 $self->{set_nc}->($self);
5232 }
5233
5234 redo A;
5235 }
5236 } elsif ($self->{state} == PI_TARGET_STATE) {
5237 if ($is_space->{$self->{nc}}) {
5238 $self->{state} = PI_TARGET_AFTER_STATE;
5239
5240 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5241 $self->{line_prev} = $self->{line};
5242 $self->{column_prev} = $self->{column};
5243 $self->{column}++;
5244 $self->{nc}
5245 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5246 } else {
5247 $self->{set_nc}->($self);
5248 }
5249
5250 redo A;
5251 } elsif ($self->{nc} == -1) {
5252 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5253 if ($self->{in_subset}) {
5254 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5255 } else {
5256 $self->{state} = DATA_STATE;
5257 $self->{s_kwd} = '';
5258 }
5259 ## Reconsume.
5260 return ($self->{ct}); # pi
5261 redo A;
5262 } elsif ($self->{nc} == 0x003F) { # ?
5263 $self->{state} = PI_AFTER_STATE;
5264
5265 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5266 $self->{line_prev} = $self->{line};
5267 $self->{column_prev} = $self->{column};
5268 $self->{column}++;
5269 $self->{nc}
5270 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5271 } else {
5272 $self->{set_nc}->($self);
5273 }
5274
5275 redo A;
5276 } else {
5277 ## XML5: typo ("tag name" -> "target")
5278 $self->{ct}->{target} .= chr $self->{nc}; # pi
5279
5280 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5281 $self->{line_prev} = $self->{line};
5282 $self->{column_prev} = $self->{column};
5283 $self->{column}++;
5284 $self->{nc}
5285 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5286 } else {
5287 $self->{set_nc}->($self);
5288 }
5289
5290 redo A;
5291 }
5292 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5293 if ($is_space->{$self->{nc}}) {
5294 ## Stay in the state.
5295
5296 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5297 $self->{line_prev} = $self->{line};
5298 $self->{column_prev} = $self->{column};
5299 $self->{column}++;
5300 $self->{nc}
5301 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5302 } else {
5303 $self->{set_nc}->($self);
5304 }
5305
5306 redo A;
5307 } else {
5308 $self->{state} = PI_DATA_STATE;
5309 ## Reprocess.
5310 redo A;
5311 }
5312 } elsif ($self->{state} == PI_DATA_STATE) {
5313 if ($self->{nc} == 0x003F) { # ?
5314 $self->{state} = PI_DATA_AFTER_STATE;
5315
5316 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5317 $self->{line_prev} = $self->{line};
5318 $self->{column_prev} = $self->{column};
5319 $self->{column}++;
5320 $self->{nc}
5321 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5322 } else {
5323 $self->{set_nc}->($self);
5324 }
5325
5326 redo A;
5327 } elsif ($self->{nc} == -1) {
5328 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5329 if ($self->{in_subset}) {
5330 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5331 } else {
5332 $self->{state} = DATA_STATE;
5333 $self->{s_kwd} = '';
5334 }
5335 ## Reprocess.
5336 return ($self->{ct}); # pi
5337 redo A;
5338 } else {
5339 $self->{ct}->{data} .= chr $self->{nc}; # pi
5340 $self->{read_until}->($self->{ct}->{data}, q[?],
5341 length $self->{ct}->{data});
5342 ## Stay in the state.
5343
5344 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5345 $self->{line_prev} = $self->{line};
5346 $self->{column_prev} = $self->{column};
5347 $self->{column}++;
5348 $self->{nc}
5349 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5350 } else {
5351 $self->{set_nc}->($self);
5352 }
5353
5354 ## Reprocess.
5355 redo A;
5356 }
5357 } elsif ($self->{state} == PI_AFTER_STATE) {
5358 ## XML5: Part of "Pi after state".
5359
5360 if ($self->{nc} == 0x003E) { # >
5361 if ($self->{in_subset}) {
5362 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5363 } else {
5364 $self->{state} = DATA_STATE;
5365 $self->{s_kwd} = '';
5366 }
5367
5368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5369 $self->{line_prev} = $self->{line};
5370 $self->{column_prev} = $self->{column};
5371 $self->{column}++;
5372 $self->{nc}
5373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5374 } else {
5375 $self->{set_nc}->($self);
5376 }
5377
5378 return ($self->{ct}); # pi
5379 redo A;
5380 } elsif ($self->{nc} == 0x003F) { # ?
5381 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5382 line => $self->{line_prev},
5383 column => $self->{column_prev}); ## XML5: no error
5384 $self->{ct}->{data} .= '?';
5385 $self->{state} = PI_DATA_AFTER_STATE;
5386
5387 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5388 $self->{line_prev} = $self->{line};
5389 $self->{column_prev} = $self->{column};
5390 $self->{column}++;
5391 $self->{nc}
5392 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5393 } else {
5394 $self->{set_nc}->($self);
5395 }
5396
5397 redo A;
5398 } else {
5399 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5400 line => $self->{line_prev},
5401 column => $self->{column_prev}
5402 + 1 * ($self->{nc} == -1)); ## XML5: no error
5403 $self->{ct}->{data} .= '?'; ## XML5: not appended
5404 $self->{state} = PI_DATA_STATE;
5405 ## Reprocess.
5406 redo A;
5407 }
5408 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5409 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5410
5411 if ($self->{nc} == 0x003E) { # >
5412 if ($self->{in_subset}) {
5413 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5414 } else {
5415 $self->{state} = DATA_STATE;
5416 $self->{s_kwd} = '';
5417 }
5418
5419 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5420 $self->{line_prev} = $self->{line};
5421 $self->{column_prev} = $self->{column};
5422 $self->{column}++;
5423 $self->{nc}
5424 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5425 } else {
5426 $self->{set_nc}->($self);
5427 }
5428
5429 return ($self->{ct}); # pi
5430 redo A;
5431 } elsif ($self->{nc} == 0x003F) { # ?
5432 $self->{ct}->{data} .= '?';
5433 ## Stay in the state.
5434
5435 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5436 $self->{line_prev} = $self->{line};
5437 $self->{column_prev} = $self->{column};
5438 $self->{column}++;
5439 $self->{nc}
5440 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5441 } else {
5442 $self->{set_nc}->($self);
5443 }
5444
5445 redo A;
5446 } else {
5447 $self->{ct}->{data} .= '?'; ## XML5: not appended
5448 $self->{state} = PI_DATA_STATE;
5449 ## Reprocess.
5450 redo A;
5451 }
5452
5453 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5454 if ($self->{nc} == 0x003C) { # <
5455 $self->{state} = DOCTYPE_TAG_STATE;
5456
5457 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5458 $self->{line_prev} = $self->{line};
5459 $self->{column_prev} = $self->{column};
5460 $self->{column}++;
5461 $self->{nc}
5462 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5463 } else {
5464 $self->{set_nc}->($self);
5465 }
5466
5467 redo A;
5468 } elsif ($self->{nc} == 0x0025) { # %
5469 ## XML5: Not defined yet.
5470
5471 ## TODO:
5472
5473 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5474 $self->{line_prev} = $self->{line};
5475 $self->{column_prev} = $self->{column};
5476 $self->{column}++;
5477 $self->{nc}
5478 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5479 } else {
5480 $self->{set_nc}->($self);
5481 }
5482
5483 redo A;
5484 } elsif ($self->{nc} == 0x005D) { # ]
5485 delete $self->{in_subset};
5486 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5487
5488 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5489 $self->{line_prev} = $self->{line};
5490 $self->{column_prev} = $self->{column};
5491 $self->{column}++;
5492 $self->{nc}
5493 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5494 } else {
5495 $self->{set_nc}->($self);
5496 }
5497
5498 redo A;
5499 } elsif ($is_space->{$self->{nc}}) {
5500 ## Stay in the state.
5501
5502 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5503 $self->{line_prev} = $self->{line};
5504 $self->{column_prev} = $self->{column};
5505 $self->{column}++;
5506 $self->{nc}
5507 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5508 } else {
5509 $self->{set_nc}->($self);
5510 }
5511
5512 redo A;
5513 } elsif ($self->{nc} == -1) {
5514 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5515 delete $self->{in_subset};
5516 $self->{state} = DATA_STATE;
5517 $self->{s_kwd} = '';
5518 ## Reconsume.
5519 return ({type => END_OF_DOCTYPE_TOKEN});
5520 redo A;
5521 } else {
5522 unless ($self->{internal_subset_tainted}) {
5523 ## XML5: No parse error.
5524 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5525 $self->{internal_subset_tainted} = 1;
5526 }
5527 ## Stay in the state.
5528
5529 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5530 $self->{line_prev} = $self->{line};
5531 $self->{column_prev} = $self->{column};
5532 $self->{column}++;
5533 $self->{nc}
5534 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5535 } else {
5536 $self->{set_nc}->($self);
5537 }
5538
5539 redo A;
5540 }
5541 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5542 if ($self->{nc} == 0x003E) { # >
5543 $self->{state} = DATA_STATE;
5544 $self->{s_kwd} = '';
5545
5546 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5547 $self->{line_prev} = $self->{line};
5548 $self->{column_prev} = $self->{column};
5549 $self->{column}++;
5550 $self->{nc}
5551 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5552 } else {
5553 $self->{set_nc}->($self);
5554 }
5555
5556 return ({type => END_OF_DOCTYPE_TOKEN});
5557 redo A;
5558 } elsif ($self->{nc} == -1) {
5559 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5560 $self->{state} = DATA_STATE;
5561 $self->{s_kwd} = '';
5562 ## Reconsume.
5563 return ({type => END_OF_DOCTYPE_TOKEN});
5564 redo A;
5565 } else {
5566 ## XML5: No parse error and stay in the state.
5567 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5568
5569 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5570
5571 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5572 $self->{line_prev} = $self->{line};
5573 $self->{column_prev} = $self->{column};
5574 $self->{column}++;
5575 $self->{nc}
5576 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5577 } else {
5578 $self->{set_nc}->($self);
5579 }
5580
5581 redo A;
5582 }
5583 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5584 if ($self->{nc} == 0x003E) { # >
5585 $self->{state} = DATA_STATE;
5586 $self->{s_kwd} = '';
5587
5588 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5589 $self->{line_prev} = $self->{line};
5590 $self->{column_prev} = $self->{column};
5591 $self->{column}++;
5592 $self->{nc}
5593 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5594 } else {
5595 $self->{set_nc}->($self);
5596 }
5597
5598 return ({type => END_OF_DOCTYPE_TOKEN});
5599 redo A;
5600 } elsif ($self->{nc} == -1) {
5601 $self->{state} = DATA_STATE;
5602 $self->{s_kwd} = '';
5603 ## Reconsume.
5604 return ({type => END_OF_DOCTYPE_TOKEN});
5605 redo A;
5606 } else {
5607 ## Stay in the state.
5608
5609 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5610 $self->{line_prev} = $self->{line};
5611 $self->{column_prev} = $self->{column};
5612 $self->{column}++;
5613 $self->{nc}
5614 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5615 } else {
5616 $self->{set_nc}->($self);
5617 }
5618
5619 redo A;
5620 }
5621 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5622 if ($self->{nc} == 0x0021) { # !
5623 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5624
5625 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5626 $self->{line_prev} = $self->{line};
5627 $self->{column_prev} = $self->{column};
5628 $self->{column}++;
5629 $self->{nc}
5630 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5631 } else {
5632 $self->{set_nc}->($self);
5633 }
5634
5635 redo A;
5636 } elsif ($self->{nc} == 0x003F) { # ?
5637 $self->{state} = PI_STATE;
5638
5639 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5640 $self->{line_prev} = $self->{line};
5641 $self->{column_prev} = $self->{column};
5642 $self->{column}++;
5643 $self->{nc}
5644 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5645 } else {
5646 $self->{set_nc}->($self);
5647 }
5648
5649 redo A;
5650 } elsif ($self->{nc} == -1) {
5651 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5652 $self->{state} = DATA_STATE;
5653 $self->{s_kwd} = '';
5654 ## Reconsume.
5655 redo A;
5656 } else {
5657 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5658 line => $self->{line_prev},
5659 column => $self->{column_prev});
5660 $self->{state} = BOGUS_COMMENT_STATE;
5661 $self->{ct} = {type => COMMENT_TOKEN,
5662 data => '',
5663 }; ## NOTE: Will be discarded.
5664
5665 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5666 $self->{line_prev} = $self->{line};
5667 $self->{column_prev} = $self->{column};
5668 $self->{column}++;
5669 $self->{nc}
5670 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5671 } else {
5672 $self->{set_nc}->($self);
5673 }
5674
5675 redo A;
5676 }
5677 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5678 ## XML5: "DOCTYPE markup declaration state".
5679
5680 if ($self->{nc} == 0x002D) { # -
5681 $self->{state} = MD_HYPHEN_STATE;
5682
5683 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5684 $self->{line_prev} = $self->{line};
5685 $self->{column_prev} = $self->{column};
5686 $self->{column}++;
5687 $self->{nc}
5688 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5689 } else {
5690 $self->{set_nc}->($self);
5691 }
5692
5693 redo A;
5694 } elsif ($self->{nc} == 0x0045 or # E
5695 $self->{nc} == 0x0065) { # e
5696 $self->{state} = MD_E_STATE;
5697 $self->{kwd} = chr $self->{nc};
5698
5699 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5700 $self->{line_prev} = $self->{line};
5701 $self->{column_prev} = $self->{column};
5702 $self->{column}++;
5703 $self->{nc}
5704 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5705 } else {
5706 $self->{set_nc}->($self);
5707 }
5708
5709 redo A;
5710 } elsif ($self->{nc} == 0x0041 or # A
5711 $self->{nc} == 0x0061) { # a
5712 $self->{state} = MD_ATTLIST_STATE;
5713 $self->{kwd} = chr $self->{nc};
5714
5715 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5716 $self->{line_prev} = $self->{line};
5717 $self->{column_prev} = $self->{column};
5718 $self->{column}++;
5719 $self->{nc}
5720 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5721 } else {
5722 $self->{set_nc}->($self);
5723 }
5724
5725 redo A;
5726 } elsif ($self->{nc} == 0x004E or # N
5727 $self->{nc} == 0x006E) { # n
5728 $self->{state} = MD_NOTATION_STATE;
5729 $self->{kwd} = chr $self->{nc};
5730
5731 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5732 $self->{line_prev} = $self->{line};
5733 $self->{column_prev} = $self->{column};
5734 $self->{column}++;
5735 $self->{nc}
5736 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5737 } else {
5738 $self->{set_nc}->($self);
5739 }
5740
5741 redo A;
5742 } else {
5743 #
5744 }
5745
5746 ## XML5: No parse error.
5747 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5748 line => $self->{line_prev},
5749 column => $self->{column_prev} - 1);
5750 ## Reconsume.
5751 $self->{state} = BOGUS_COMMENT_STATE;
5752 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5753 redo A;
5754 } elsif ($self->{state} == MD_E_STATE) {
5755 if ($self->{nc} == 0x004E or # N
5756 $self->{nc} == 0x006E) { # n
5757 $self->{state} = MD_ENTITY_STATE;
5758 $self->{kwd} .= chr $self->{nc};
5759
5760 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5761 $self->{line_prev} = $self->{line};
5762 $self->{column_prev} = $self->{column};
5763 $self->{column}++;
5764 $self->{nc}
5765 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5766 } else {
5767 $self->{set_nc}->($self);
5768 }
5769
5770 redo A;
5771 } elsif ($self->{nc} == 0x004C or # L
5772 $self->{nc} == 0x006C) { # l
5773 ## XML5: <!ELEMENT> not supported.
5774 $self->{state} = MD_ELEMENT_STATE;
5775 $self->{kwd} .= chr $self->{nc};
5776
5777 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5778 $self->{line_prev} = $self->{line};
5779 $self->{column_prev} = $self->{column};
5780 $self->{column}++;
5781 $self->{nc}
5782 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5783 } else {
5784 $self->{set_nc}->($self);
5785 }
5786
5787 redo A;
5788 } else {
5789 ## XML5: No parse error.
5790 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5791 line => $self->{line_prev},
5792 column => $self->{column_prev} - 2
5793 + 1 * ($self->{nc} == -1));
5794 ## Reconsume.
5795 $self->{state} = BOGUS_COMMENT_STATE;
5796 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5797 redo A;
5798 }
5799 } elsif ($self->{state} == MD_ENTITY_STATE) {
5800 if ($self->{nc} == [
5801 undef,
5802 undef,
5803 0x0054, # T
5804 0x0049, # I
5805 0x0054, # T
5806 ]->[length $self->{kwd}] or
5807 $self->{nc} == [
5808 undef,
5809 undef,
5810 0x0074, # t
5811 0x0069, # i
5812 0x0074, # t
5813 ]->[length $self->{kwd}]) {
5814 ## Stay in the state.
5815 $self->{kwd} .= chr $self->{nc};
5816
5817 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5818 $self->{line_prev} = $self->{line};
5819 $self->{column_prev} = $self->{column};
5820 $self->{column}++;
5821 $self->{nc}
5822 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5823 } else {
5824 $self->{set_nc}->($self);
5825 }
5826
5827 redo A;
5828 } elsif ((length $self->{kwd}) == 5 and
5829 ($self->{nc} == 0x0059 or # Y
5830 $self->{nc} == 0x0079)) { # y
5831 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5832 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5833 text => 'ENTITY',
5834 line => $self->{line_prev},
5835 column => $self->{column_prev} - 4);
5836 }
5837 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5838 line => $self->{line_prev},
5839 column => $self->{column_prev} - 6};
5840 $self->{state} = DOCTYPE_MD_STATE;
5841
5842 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5843 $self->{line_prev} = $self->{line};
5844 $self->{column_prev} = $self->{column};
5845 $self->{column}++;
5846 $self->{nc}
5847 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5848 } else {
5849 $self->{set_nc}->($self);
5850 }
5851
5852 redo A;
5853 } else {
5854 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5855 line => $self->{line_prev},
5856 column => $self->{column_prev} - 1
5857 - (length $self->{kwd})
5858 + 1 * ($self->{nc} == -1));
5859 $self->{state} = BOGUS_COMMENT_STATE;
5860 ## Reconsume.
5861 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5862 redo A;
5863 }
5864 } elsif ($self->{state} == MD_ELEMENT_STATE) {
5865 if ($self->{nc} == [
5866 undef,
5867 undef,
5868 0x0045, # E
5869 0x004D, # M
5870 0x0045, # E
5871 0x004E, # N
5872 ]->[length $self->{kwd}] or
5873 $self->{nc} == [
5874 undef,
5875 undef,
5876 0x0065, # e
5877 0x006D, # m
5878 0x0065, # e
5879 0x006E, # n
5880 ]->[length $self->{kwd}]) {
5881 ## Stay in the state.
5882 $self->{kwd} .= chr $self->{nc};
5883
5884 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5885 $self->{line_prev} = $self->{line};
5886 $self->{column_prev} = $self->{column};
5887 $self->{column}++;
5888 $self->{nc}
5889 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5890 } else {
5891 $self->{set_nc}->($self);
5892 }
5893
5894 redo A;
5895 } elsif ((length $self->{kwd}) == 6 and
5896 ($self->{nc} == 0x0054 or # T
5897 $self->{nc} == 0x0074)) { # t
5898 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5899 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5900 text => 'ELEMENT',
5901 line => $self->{line_prev},
5902 column => $self->{column_prev} - 5);
5903 }
5904 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5905 line => $self->{line_prev},
5906 column => $self->{column_prev} - 7};
5907 $self->{state} = DOCTYPE_MD_STATE;
5908
5909 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5910 $self->{line_prev} = $self->{line};
5911 $self->{column_prev} = $self->{column};
5912 $self->{column}++;
5913 $self->{nc}
5914 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5915 } else {
5916 $self->{set_nc}->($self);
5917 }
5918
5919 redo A;
5920 } else {
5921 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5922 line => $self->{line_prev},
5923 column => $self->{column_prev} - 1
5924 - (length $self->{kwd})
5925 + 1 * ($self->{nc} == -1));
5926 $self->{state} = BOGUS_COMMENT_STATE;
5927 ## Reconsume.
5928 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5929 redo A;
5930 }
5931 } elsif ($self->{state} == MD_ATTLIST_STATE) {
5932 if ($self->{nc} == [
5933 undef,
5934 0x0054, # T
5935 0x0054, # T
5936 0x004C, # L
5937 0x0049, # I
5938 0x0053, # S
5939 ]->[length $self->{kwd}] or
5940 $self->{nc} == [
5941 undef,
5942 0x0074, # t
5943 0x0074, # t
5944 0x006C, # l
5945 0x0069, # i
5946 0x0073, # s
5947 ]->[length $self->{kwd}]) {
5948 ## Stay in the state.
5949 $self->{kwd} .= chr $self->{nc};
5950
5951 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5952 $self->{line_prev} = $self->{line};
5953 $self->{column_prev} = $self->{column};
5954 $self->{column}++;
5955 $self->{nc}
5956 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5957 } else {
5958 $self->{set_nc}->($self);
5959 }
5960
5961 redo A;
5962 } elsif ((length $self->{kwd}) == 6 and
5963 ($self->{nc} == 0x0054 or # T
5964 $self->{nc} == 0x0074)) { # t
5965 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5966 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5967 text => 'ATTLIST',
5968 line => $self->{line_prev},
5969 column => $self->{column_prev} - 5);
5970 }
5971 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5972 attrdefs => [],
5973 line => $self->{line_prev},
5974 column => $self->{column_prev} - 7};
5975 $self->{state} = DOCTYPE_MD_STATE;
5976
5977 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5978 $self->{line_prev} = $self->{line};
5979 $self->{column_prev} = $self->{column};
5980 $self->{column}++;
5981 $self->{nc}
5982 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5983 } else {
5984 $self->{set_nc}->($self);
5985 }
5986
5987 redo A;
5988 } else {
5989 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5990 line => $self->{line_prev},
5991 column => $self->{column_prev} - 1
5992 - (length $self->{kwd})
5993 + 1 * ($self->{nc} == -1));
5994 $self->{state} = BOGUS_COMMENT_STATE;
5995 ## Reconsume.
5996 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5997 redo A;
5998 }
5999 } elsif ($self->{state} == MD_NOTATION_STATE) {
6000 if ($self->{nc} == [
6001 undef,
6002 0x004F, # O
6003 0x0054, # T
6004 0x0041, # A
6005 0x0054, # T
6006 0x0049, # I
6007 0x004F, # O
6008 ]->[length $self->{kwd}] or
6009 $self->{nc} == [
6010 undef,
6011 0x006F, # o
6012 0x0074, # t
6013 0x0061, # a
6014 0x0074, # t
6015 0x0069, # i
6016 0x006F, # o
6017 ]->[length $self->{kwd}]) {
6018 ## Stay in the state.
6019 $self->{kwd} .= chr $self->{nc};
6020
6021 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6022 $self->{line_prev} = $self->{line};
6023 $self->{column_prev} = $self->{column};
6024 $self->{column}++;
6025 $self->{nc}
6026 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6027 } else {
6028 $self->{set_nc}->($self);
6029 }
6030
6031 redo A;
6032 } elsif ((length $self->{kwd}) == 7 and
6033 ($self->{nc} == 0x004E or # N
6034 $self->{nc} == 0x006E)) { # n
6035 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6036 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6037 text => 'NOTATION',
6038 line => $self->{line_prev},
6039 column => $self->{column_prev} - 6);
6040 }
6041 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6042 line => $self->{line_prev},
6043 column => $self->{column_prev} - 8};
6044 $self->{state} = DOCTYPE_MD_STATE;
6045
6046 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6047 $self->{line_prev} = $self->{line};
6048 $self->{column_prev} = $self->{column};
6049 $self->{column}++;
6050 $self->{nc}
6051 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6052 } else {
6053 $self->{set_nc}->($self);
6054 }
6055
6056 redo A;
6057 } else {
6058 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6059 line => $self->{line_prev},
6060 column => $self->{column_prev} - 1
6061 - (length $self->{kwd})
6062 + 1 * ($self->{nc} == -1));
6063 $self->{state} = BOGUS_COMMENT_STATE;
6064 ## Reconsume.
6065 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6066 redo A;
6067 }
6068 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6069 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6070 ## "DOCTYPE NOTATION state".
6071
6072 if ($is_space->{$self->{nc}}) {
6073 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6074 $self->{state} = BEFORE_MD_NAME_STATE;
6075
6076 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6077 $self->{line_prev} = $self->{line};
6078 $self->{column_prev} = $self->{column};
6079 $self->{column}++;
6080 $self->{nc}
6081 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6082 } else {
6083 $self->{set_nc}->($self);
6084 }
6085
6086 redo A;
6087 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6088 $self->{nc} == 0x0025) { # %
6089 ## XML5: Switch to the "DOCTYPE bogus comment state".
6090 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6091 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6092
6093 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6094 $self->{line_prev} = $self->{line};
6095 $self->{column_prev} = $self->{column};
6096 $self->{column}++;
6097 $self->{nc}
6098 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6099 } else {
6100 $self->{set_nc}->($self);
6101 }
6102
6103 redo A;
6104 } elsif ($self->{nc} == -1) {
6105 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6106 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6107 ## Reconsume.
6108 redo A;
6109 } elsif ($self->{nc} == 0x003E) { # >
6110 ## XML5: Switch to the "DOCTYPE bogus comment state".
6111 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6112 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6113
6114 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6115 $self->{line_prev} = $self->{line};
6116 $self->{column_prev} = $self->{column};
6117 $self->{column}++;
6118 $self->{nc}
6119 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6120 } else {
6121 $self->{set_nc}->($self);
6122 }
6123
6124 redo A;
6125 } else {
6126 ## XML5: Switch to the "DOCTYPE bogus comment state".
6127 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6128 $self->{state} = BEFORE_MD_NAME_STATE;
6129 redo A;
6130 }
6131 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6132 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6133 ## before state", "DOCTYPE ATTLIST name before state".
6134
6135 if ($is_space->{$self->{nc}}) {
6136 ## Stay in the state.
6137
6138 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6139 $self->{line_prev} = $self->{line};
6140 $self->{column_prev} = $self->{column};
6141 $self->{column}++;
6142 $self->{nc}
6143 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6144 } else {
6145 $self->{set_nc}->($self);
6146 }
6147
6148 redo A;
6149 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6150 $self->{nc} == 0x0025) { # %
6151 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6152
6153 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6154 $self->{line_prev} = $self->{line};
6155 $self->{column_prev} = $self->{column};
6156 $self->{column}++;
6157 $self->{nc}
6158 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6159 } else {
6160 $self->{set_nc}->($self);
6161 }
6162
6163 redo A;
6164 } elsif ($self->{nc} == 0x003E) { # >
6165 ## XML5: Same as "Anything else".
6166 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6167 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6168
6169 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6170 $self->{line_prev} = $self->{line};
6171 $self->{column_prev} = $self->{column};
6172 $self->{column}++;
6173 $self->{nc}
6174 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6175 } else {
6176 $self->{set_nc}->($self);
6177 }
6178
6179 redo A;
6180 } elsif ($self->{nc} == -1) {
6181 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6182 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6183 ## Reconsume.
6184 redo A;
6185 } else {
6186 ## XML5: [ATTLIST] Not defined yet.
6187 $self->{ct}->{name} .= chr $self->{nc};
6188 $self->{state} = MD_NAME_STATE;
6189
6190 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6191 $self->{line_prev} = $self->{line};
6192 $self->{column_prev} = $self->{column};
6193 $self->{column}++;
6194 $self->{nc}
6195 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6196 } else {
6197 $self->{set_nc}->($self);
6198 }
6199
6200 redo A;
6201 }
6202 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6203 if ($is_space->{$self->{nc}}) {
6204 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6205 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6206 $self->{state} = BEFORE_MD_NAME_STATE;
6207
6208 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6209 $self->{line_prev} = $self->{line};
6210 $self->{column_prev} = $self->{column};
6211 $self->{column}++;
6212 $self->{nc}
6213 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6214 } else {
6215 $self->{set_nc}->($self);
6216 }
6217
6218 redo A;
6219 } elsif ($self->{nc} == 0x003E) { # >
6220 ## XML5: Same as "Anything else".
6221 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6222 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6223
6224 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6225 $self->{line_prev} = $self->{line};
6226 $self->{column_prev} = $self->{column};
6227 $self->{column}++;
6228 $self->{nc}
6229 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6230 } else {
6231 $self->{set_nc}->($self);
6232 }
6233
6234 redo A;
6235 } elsif ($self->{nc} == -1) {
6236 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6237 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6238 ## Reconsume.
6239 redo A;
6240 } else {
6241 ## XML5: No parse error.
6242 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6243 $self->{state} = BOGUS_COMMENT_STATE;
6244 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6245 ## Reconsume.
6246 redo A;
6247 }
6248 } elsif ($self->{state} == MD_NAME_STATE) {
6249 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6250
6251 if ($is_space->{$self->{nc}}) {
6252 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6253 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6254 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6255 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6256 } else { # ENTITY/NOTATION
6257 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6258 }
6259
6260 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6261 $self->{line_prev} = $self->{line};
6262 $self->{column_prev} = $self->{column};
6263 $self->{column}++;
6264 $self->{nc}
6265 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6266 } else {
6267 $self->{set_nc}->($self);
6268 }
6269
6270 redo A;
6271 } elsif ($self->{nc} == 0x003E) { # >
6272 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6273 #
6274 } else {
6275 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6276 }
6277 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6278
6279 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6280 $self->{line_prev} = $self->{line};
6281 $self->{column_prev} = $self->{column};
6282 $self->{column}++;
6283 $self->{nc}
6284 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6285 } else {
6286 $self->{set_nc}->($self);
6287 }
6288
6289 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6290 redo A;
6291 } elsif ($self->{nc} == -1) {
6292 ## XML5: [ATTLIST] No parse error.
6293 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6294 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6295 ## Reconsume.
6296 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6297 redo A;
6298 } else {
6299 ## XML5: [ATTLIST] Not defined yet.
6300 $self->{ct}->{name} .= chr $self->{nc};
6301 ## Stay in the state.
6302
6303 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6304 $self->{line_prev} = $self->{line};
6305 $self->{column_prev} = $self->{column};
6306 $self->{column}++;
6307 $self->{nc}
6308 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6309 } else {
6310 $self->{set_nc}->($self);
6311 }
6312
6313 redo A;
6314 }
6315 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6316 if ($is_space->{$self->{nc}}) {
6317 ## Stay in the state.
6318
6319 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6320 $self->{line_prev} = $self->{line};
6321 $self->{column_prev} = $self->{column};
6322 $self->{column}++;
6323 $self->{nc}
6324 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6325 } else {
6326 $self->{set_nc}->($self);
6327 }
6328
6329 redo A;
6330 } elsif ($self->{nc} == 0x003E) { # >
6331 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6332
6333 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6334 $self->{line_prev} = $self->{line};
6335 $self->{column_prev} = $self->{column};
6336 $self->{column}++;
6337 $self->{nc}
6338 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6339 } else {
6340 $self->{set_nc}->($self);
6341 }
6342
6343 return ($self->{ct}); # ATTLIST
6344 redo A;
6345 } elsif ($self->{nc} == -1) {
6346 ## XML5: No parse error.
6347 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6348 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6349 return ($self->{ct});
6350 redo A;
6351 } else {
6352 ## XML5: Not defined yet.
6353 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6354 tokens => [],
6355 line => $self->{line}, column => $self->{column}};
6356 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6357
6358 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6359 $self->{line_prev} = $self->{line};
6360 $self->{column_prev} = $self->{column};
6361 $self->{column}++;
6362 $self->{nc}
6363 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6364 } else {
6365 $self->{set_nc}->($self);
6366 }
6367
6368 redo A;
6369 }
6370 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6371 if ($is_space->{$self->{nc}}) {
6372 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6373
6374 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6375 $self->{line_prev} = $self->{line};
6376 $self->{column_prev} = $self->{column};
6377 $self->{column}++;
6378 $self->{nc}
6379 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6380 } else {
6381 $self->{set_nc}->($self);
6382 }
6383
6384 redo A;
6385 } elsif ($self->{nc} == 0x003E) { # >
6386 ## XML5: Same as "anything else".
6387 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6388 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6389
6390 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6391 $self->{line_prev} = $self->{line};
6392 $self->{column_prev} = $self->{column};
6393 $self->{column}++;
6394 $self->{nc}
6395 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6396 } else {
6397 $self->{set_nc}->($self);
6398 }
6399
6400 return ($self->{ct}); # ATTLIST
6401 redo A;
6402 } elsif ($self->{nc} == 0x0028) { # (
6403 ## XML5: Same as "anything else".
6404 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6405 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6406
6407 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6408 $self->{line_prev} = $self->{line};
6409 $self->{column_prev} = $self->{column};
6410 $self->{column}++;
6411 $self->{nc}
6412 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6413 } else {
6414 $self->{set_nc}->($self);
6415 }
6416
6417 redo A;
6418 } elsif ($self->{nc} == -1) {
6419 ## XML5: No parse error.
6420 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6421 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6422
6423 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6424 $self->{line_prev} = $self->{line};
6425 $self->{column_prev} = $self->{column};
6426 $self->{column}++;
6427 $self->{nc}
6428 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6429 } else {
6430 $self->{set_nc}->($self);
6431 }
6432
6433 return ($self->{ct}); # ATTLIST
6434 redo A;
6435 } else {
6436 ## XML5: Not defined yet.
6437 $self->{ca}->{name} .= chr $self->{nc};
6438 ## Stay in the state.
6439
6440 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6441 $self->{line_prev} = $self->{line};
6442 $self->{column_prev} = $self->{column};
6443 $self->{column}++;
6444 $self->{nc}
6445 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6446 } else {
6447 $self->{set_nc}->($self);
6448 }
6449
6450 redo A;
6451 }
6452 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6453 if ($is_space->{$self->{nc}}) {
6454 ## Stay in the state.
6455
6456 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6457 $self->{line_prev} = $self->{line};
6458 $self->{column_prev} = $self->{column};
6459 $self->{column}++;
6460 $self->{nc}
6461 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6462 } else {
6463 $self->{set_nc}->($self);
6464 }
6465
6466 redo A;
6467 } elsif ($self->{nc} == 0x003E) { # >
6468 ## XML5: Same as "anything else".
6469 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6470 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6471
6472 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6473 $self->{line_prev} = $self->{line};
6474 $self->{column_prev} = $self->{column};
6475 $self->{column}++;
6476 $self->{nc}
6477 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6478 } else {
6479 $self->{set_nc}->($self);
6480 }
6481
6482 return ($self->{ct}); # ATTLIST
6483 redo A;
6484 } elsif ($self->{nc} == 0x0028) { # (
6485 ## XML5: Same as "anything else".
6486 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6487
6488 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6489 $self->{line_prev} = $self->{line};
6490 $self->{column_prev} = $self->{column};
6491 $self->{column}++;
6492 $self->{nc}
6493 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6494 } else {
6495 $self->{set_nc}->($self);
6496 }
6497
6498 redo A;
6499 } elsif ($self->{nc} == -1) {
6500 ## XML5: No parse error.
6501 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6502 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6503
6504 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6505 $self->{line_prev} = $self->{line};
6506 $self->{column_prev} = $self->{column};
6507 $self->{column}++;
6508 $self->{nc}
6509 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6510 } else {
6511 $self->{set_nc}->($self);
6512 }
6513
6514 return ($self->{ct});
6515 redo A;
6516 } else {
6517 ## XML5: Not defined yet.
6518 $self->{ca}->{type} = chr $self->{nc};
6519 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6520
6521 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6522 $self->{line_prev} = $self->{line};
6523 $self->{column_prev} = $self->{column};
6524 $self->{column}++;
6525 $self->{nc}
6526 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6527 } else {
6528 $self->{set_nc}->($self);
6529 }
6530
6531 redo A;
6532 }
6533 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6534 if ($is_space->{$self->{nc}}) {
6535 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6536
6537 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6538 $self->{line_prev} = $self->{line};
6539 $self->{column_prev} = $self->{column};
6540 $self->{column}++;
6541 $self->{nc}
6542 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6543 } else {
6544 $self->{set_nc}->($self);
6545 }
6546
6547 redo A;
6548 } elsif ($self->{nc} == 0x0023) { # #
6549 ## XML5: Same as "anything else".
6550 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6551 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6552
6553 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6554 $self->{line_prev} = $self->{line};
6555 $self->{column_prev} = $self->{column};
6556 $self->{column}++;
6557 $self->{nc}
6558 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6559 } else {
6560 $self->{set_nc}->($self);
6561 }
6562
6563 redo A;
6564 } elsif ($self->{nc} == 0x0022) { # "
6565 ## XML5: Same as "anything else".
6566 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6567 $self->{ca}->{value} = '';
6568 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6569
6570 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6571 $self->{line_prev} = $self->{line};
6572 $self->{column_prev} = $self->{column};
6573 $self->{column}++;
6574 $self->{nc}
6575 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6576 } else {
6577 $self->{set_nc}->($self);
6578 }
6579
6580 redo A;
6581 } elsif ($self->{nc} == 0x0027) { # '
6582 ## XML5: Same as "anything else".
6583 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6584 $self->{ca}->{value} = '';
6585 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6586
6587 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6588 $self->{line_prev} = $self->{line};
6589 $self->{column_prev} = $self->{column};
6590 $self->{column}++;
6591 $self->{nc}
6592 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6593 } else {
6594 $self->{set_nc}->($self);
6595 }
6596
6597 redo A;
6598 } elsif ($self->{nc} == 0x003E) { # >
6599 ## XML5: Same as "anything else".
6600 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6601 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6602
6603 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6604 $self->{line_prev} = $self->{line};
6605 $self->{column_prev} = $self->{column};
6606 $self->{column}++;
6607 $self->{nc}
6608 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6609 } else {
6610 $self->{set_nc}->($self);
6611 }
6612
6613 return ($self->{ct}); # ATTLIST
6614 redo A;
6615 } elsif ($self->{nc} == 0x0028) { # (
6616 ## XML5: Same as "anything else".
6617 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6618 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6619
6620 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6621 $self->{line_prev} = $self->{line};
6622 $self->{column_prev} = $self->{column};
6623 $self->{column}++;
6624 $self->{nc}
6625 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6626 } else {
6627 $self->{set_nc}->($self);
6628 }
6629
6630 redo A;
6631 } elsif ($self->{nc} == -1) {
6632 ## XML5: No parse error.
6633 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6634 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6635
6636 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6637 $self->{line_prev} = $self->{line};
6638 $self->{column_prev} = $self->{column};
6639 $self->{column}++;
6640 $self->{nc}
6641 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6642 } else {
6643 $self->{set_nc}->($self);
6644 }
6645
6646 return ($self->{ct});
6647 redo A;
6648 } else {
6649 ## XML5: Not defined yet.
6650 $self->{ca}->{type} .= chr $self->{nc};
6651 ## Stay in the state.
6652
6653 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6654 $self->{line_prev} = $self->{line};
6655 $self->{column_prev} = $self->{column};
6656 $self->{column}++;
6657 $self->{nc}
6658 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6659 } else {
6660 $self->{set_nc}->($self);
6661 }
6662
6663 redo A;
6664 }
6665 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6666 if ($is_space->{$self->{nc}}) {
6667 ## Stay in the state.
6668
6669 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6670 $self->{line_prev} = $self->{line};
6671 $self->{column_prev} = $self->{column};
6672 $self->{column}++;
6673 $self->{nc}
6674 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6675 } else {
6676 $self->{set_nc}->($self);
6677 }
6678
6679 redo A;
6680 } elsif ($self->{nc} == 0x0028) { # (
6681 ## XML5: Same as "anything else".
6682 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6683
6684 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6685 $self->{line_prev} = $self->{line};
6686 $self->{column_prev} = $self->{column};
6687 $self->{column}++;
6688 $self->{nc}
6689 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6690 } else {
6691 $self->{set_nc}->($self);
6692 }
6693
6694 redo A;
6695 } elsif ($self->{nc} == 0x0023) { # #
6696 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6697
6698 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6699 $self->{line_prev} = $self->{line};
6700 $self->{column_prev} = $self->{column};
6701 $self->{column}++;
6702 $self->{nc}
6703 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6704 } else {
6705 $self->{set_nc}->($self);
6706 }
6707
6708 redo A;
6709 } elsif ($self->{nc} == 0x0022) { # "
6710 ## XML5: Same as "anything else".
6711 $self->{ca}->{value} = '';
6712 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6713
6714 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6715 $self->{line_prev} = $self->{line};
6716 $self->{column_prev} = $self->{column};
6717 $self->{column}++;
6718 $self->{nc}
6719 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6720 } else {
6721 $self->{set_nc}->($self);
6722 }
6723
6724 redo A;
6725 } elsif ($self->{nc} == 0x0027) { # '
6726 ## XML5: Same as "anything else".
6727 $self->{ca}->{value} = '';
6728 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6729
6730 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6731 $self->{line_prev} = $self->{line};
6732 $self->{column_prev} = $self->{column};
6733 $self->{column}++;
6734 $self->{nc}
6735 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6736 } else {
6737 $self->{set_nc}->($self);
6738 }
6739
6740 redo A;
6741 } elsif ($self->{nc} == 0x003E) { # >
6742 ## XML5: Same as "anything else".
6743 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6744 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6745
6746 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6747 $self->{line_prev} = $self->{line};
6748 $self->{column_prev} = $self->{column};
6749 $self->{column}++;
6750 $self->{nc}
6751 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6752 } else {
6753 $self->{set_nc}->($self);
6754 }
6755
6756 return ($self->{ct}); # ATTLIST
6757 redo A;
6758 } elsif ($self->{nc} == -1) {
6759 ## XML5: No parse error.
6760 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6761 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6762
6763 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6764 $self->{line_prev} = $self->{line};
6765 $self->{column_prev} = $self->{column};
6766 $self->{column}++;
6767 $self->{nc}
6768 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6769 } else {
6770 $self->{set_nc}->($self);
6771 }
6772
6773 return ($self->{ct});
6774 redo A;
6775 } else {
6776 ## XML5: Switch to the "DOCTYPE bogus comment state".
6777 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6778 $self->{ca}->{value} = '';
6779 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6780 ## Reconsume.
6781 redo A;
6782 }
6783 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6784 if ($is_space->{$self->{nc}}) {
6785 ## Stay in the state.
6786
6787 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6788 $self->{line_prev} = $self->{line};
6789 $self->{column_prev} = $self->{column};
6790 $self->{column}++;
6791 $self->{nc}
6792 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6793 } else {
6794 $self->{set_nc}->($self);
6795 }
6796
6797 redo A;
6798 } elsif ($self->{nc} == 0x007C) { # |
6799 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6800 ## Stay in the state.
6801
6802 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6803 $self->{line_prev} = $self->{line};
6804 $self->{column_prev} = $self->{column};
6805 $self->{column}++;
6806 $self->{nc}
6807 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6808 } else {
6809 $self->{set_nc}->($self);
6810 }
6811
6812 redo A;
6813 } elsif ($self->{nc} == 0x0029) { # )
6814 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6815 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6816
6817 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6818 $self->{line_prev} = $self->{line};
6819 $self->{column_prev} = $self->{column};
6820 $self->{column}++;
6821 $self->{nc}
6822 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6823 } else {
6824 $self->{set_nc}->($self);
6825 }
6826
6827 redo A;
6828 } elsif ($self->{nc} == 0x003E) { # >
6829 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6830 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6831
6832 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6833 $self->{line_prev} = $self->{line};
6834 $self->{column_prev} = $self->{column};
6835 $self->{column}++;
6836 $self->{nc}
6837 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6838 } else {
6839 $self->{set_nc}->($self);
6840 }
6841
6842 return ($self->{ct}); # ATTLIST
6843 redo A;
6844 } elsif ($self->{nc} == -1) {
6845 ## XML5: No parse error.
6846 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6847 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6848
6849 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6850 $self->{line_prev} = $self->{line};
6851 $self->{column_prev} = $self->{column};
6852 $self->{column}++;
6853 $self->{nc}
6854 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6855 } else {
6856 $self->{set_nc}->($self);
6857 }
6858
6859 return ($self->{ct});
6860 redo A;
6861 } else {
6862 push @{$self->{ca}->{tokens}}, chr $self->{nc};
6863 $self->{state} = ALLOWED_TOKEN_STATE;
6864
6865 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6866 $self->{line_prev} = $self->{line};
6867 $self->{column_prev} = $self->{column};
6868 $self->{column}++;
6869 $self->{nc}
6870 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6871 } else {
6872 $self->{set_nc}->($self);
6873 }
6874
6875 redo A;
6876 }
6877 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6878 if ($is_space->{$self->{nc}}) {
6879 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6880
6881 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6882 $self->{line_prev} = $self->{line};
6883 $self->{column_prev} = $self->{column};
6884 $self->{column}++;
6885 $self->{nc}
6886 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6887 } else {
6888 $self->{set_nc}->($self);
6889 }
6890
6891 redo A;
6892 } elsif ($self->{nc} == 0x007C) { # |
6893 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6894
6895 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6896 $self->{line_prev} = $self->{line};
6897 $self->{column_prev} = $self->{column};
6898 $self->{column}++;
6899 $self->{nc}
6900 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6901 } else {
6902 $self->{set_nc}->($self);
6903 }
6904
6905 redo A;
6906 } elsif ($self->{nc} == 0x0029) { # )
6907 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6908
6909 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6910 $self->{line_prev} = $self->{line};
6911 $self->{column_prev} = $self->{column};
6912 $self->{column}++;
6913 $self->{nc}
6914 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6915 } else {
6916 $self->{set_nc}->($self);
6917 }
6918
6919 redo A;
6920 } elsif ($self->{nc} == 0x003E) { # >
6921 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6922 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6923
6924 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6925 $self->{line_prev} = $self->{line};
6926 $self->{column_prev} = $self->{column};
6927 $self->{column}++;
6928 $self->{nc}
6929 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6930 } else {
6931 $self->{set_nc}->($self);
6932 }
6933
6934 return ($self->{ct}); # ATTLIST
6935 redo A;
6936 } elsif ($self->{nc} == -1) {
6937 ## XML5: No parse error.
6938 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6939 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6940
6941 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6942 $self->{line_prev} = $self->{line};
6943 $self->{column_prev} = $self->{column};
6944 $self->{column}++;
6945 $self->{nc}
6946 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6947 } else {
6948 $self->{set_nc}->($self);
6949 }
6950
6951 return ($self->{ct});
6952 redo A;
6953 } else {
6954 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6955 ## Stay in the state.
6956
6957 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6958 $self->{line_prev} = $self->{line};
6959 $self->{column_prev} = $self->{column};
6960 $self->{column}++;
6961 $self->{nc}
6962 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6963 } else {
6964 $self->{set_nc}->($self);
6965 }
6966
6967 redo A;
6968 }
6969 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6970 if ($is_space->{$self->{nc}}) {
6971 ## Stay in the state.
6972
6973 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6974 $self->{line_prev} = $self->{line};
6975 $self->{column_prev} = $self->{column};
6976 $self->{column}++;
6977 $self->{nc}
6978 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6979 } else {
6980 $self->{set_nc}->($self);
6981 }
6982
6983 redo A;
6984 } elsif ($self->{nc} == 0x007C) { # |
6985 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6986
6987 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6988 $self->{line_prev} = $self->{line};
6989 $self->{column_prev} = $self->{column};
6990 $self->{column}++;
6991 $self->{nc}
6992 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6993 } else {
6994 $self->{set_nc}->($self);
6995 }
6996
6997 redo A;
6998 } elsif ($self->{nc} == 0x0029) { # )
6999 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7000
7001 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7002 $self->{line_prev} = $self->{line};
7003 $self->{column_prev} = $self->{column};
7004 $self->{column}++;
7005 $self->{nc}
7006 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7007 } else {
7008 $self->{set_nc}->($self);
7009 }
7010
7011 redo A;
7012 } elsif ($self->{nc} == 0x003E) { # >
7013 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7014 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7015
7016 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7017 $self->{line_prev} = $self->{line};
7018 $self->{column_prev} = $self->{column};
7019 $self->{column}++;
7020 $self->{nc}
7021 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7022 } else {
7023 $self->{set_nc}->($self);
7024 }
7025
7026 return ($self->{ct}); # ATTLIST
7027 redo A;
7028 } elsif ($self->{nc} == -1) {
7029 ## XML5: No parse error.
7030 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7031 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7032
7033 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7034 $self->{line_prev} = $self->{line};
7035 $self->{column_prev} = $self->{column};
7036 $self->{column}++;
7037 $self->{nc}
7038 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7039 } else {
7040 $self->{set_nc}->($self);
7041 }
7042
7043 return ($self->{ct});
7044 redo A;
7045 } else {
7046 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7047 line => $self->{line_prev},
7048 column => $self->{column_prev});
7049 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7050 $self->{state} = ALLOWED_TOKEN_STATE;
7051
7052 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7053 $self->{line_prev} = $self->{line};
7054 $self->{column_prev} = $self->{column};
7055 $self->{column}++;
7056 $self->{nc}
7057 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7058 } else {
7059 $self->{set_nc}->($self);
7060 }
7061
7062 redo A;
7063 }
7064 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7065 if ($is_space->{$self->{nc}}) {
7066 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7067
7068 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069 $self->{line_prev} = $self->{line};
7070 $self->{column_prev} = $self->{column};
7071 $self->{column}++;
7072 $self->{nc}
7073 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074 } else {
7075 $self->{set_nc}->($self);
7076 }
7077
7078 redo A;
7079 } elsif ($self->{nc} == 0x0023) { # #
7080 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7081 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7082
7083 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7084 $self->{line_prev} = $self->{line};
7085 $self->{column_prev} = $self->{column};
7086 $self->{column}++;
7087 $self->{nc}
7088 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7089 } else {
7090 $self->{set_nc}->($self);
7091 }
7092
7093 redo A;
7094 } elsif ($self->{nc} == 0x0022) { # "
7095 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7096 $self->{ca}->{value} = '';
7097 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7098
7099 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7100 $self->{line_prev} = $self->{line};
7101 $self->{column_prev} = $self->{column};
7102 $self->{column}++;
7103 $self->{nc}
7104 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7105 } else {
7106 $self->{set_nc}->($self);
7107 }
7108
7109 redo A;
7110 } elsif ($self->{nc} == 0x0027) { # '
7111 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7112 $self->{ca}->{value} = '';
7113 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7114
7115 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7116 $self->{line_prev} = $self->{line};
7117 $self->{column_prev} = $self->{column};
7118 $self->{column}++;
7119 $self->{nc}
7120 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7121 } else {
7122 $self->{set_nc}->($self);
7123 }
7124
7125 redo A;
7126 } elsif ($self->{nc} == 0x003E) { # >
7127 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7128 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7129
7130 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7131 $self->{line_prev} = $self->{line};
7132 $self->{column_prev} = $self->{column};
7133 $self->{column}++;
7134 $self->{nc}
7135 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7136 } else {
7137 $self->{set_nc}->($self);
7138 }
7139
7140 return ($self->{ct}); # ATTLIST
7141 redo A;
7142 } elsif ($self->{nc} == -1) {
7143 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7144 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7145
7146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7147 $self->{line_prev} = $self->{line};
7148 $self->{column_prev} = $self->{column};
7149 $self->{column}++;
7150 $self->{nc}
7151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7152 } else {
7153 $self->{set_nc}->($self);
7154 }
7155
7156 return ($self->{ct});
7157 redo A;
7158 } else {
7159 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7160 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7161 ## Reconsume.
7162 redo A;
7163 }
7164 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7165 if ($is_space->{$self->{nc}}) {
7166 ## Stay in the state.
7167
7168 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7169 $self->{line_prev} = $self->{line};
7170 $self->{column_prev} = $self->{column};
7171 $self->{column}++;
7172 $self->{nc}
7173 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7174 } else {
7175 $self->{set_nc}->($self);
7176 }
7177
7178 redo A;
7179 } elsif ($self->{nc} == 0x0023) { # #
7180 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7181
7182 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7183 $self->{line_prev} = $self->{line};
7184 $self->{column_prev} = $self->{column};
7185 $self->{column}++;
7186 $self->{nc}
7187 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7188 } else {
7189 $self->{set_nc}->($self);
7190 }
7191
7192 redo A;
7193 } elsif ($self->{nc} == 0x0022) { # "
7194 $self->{ca}->{value} = '';
7195 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7196
7197 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7198 $self->{line_prev} = $self->{line};
7199 $self->{column_prev} = $self->{column};
7200 $self->{column}++;
7201 $self->{nc}
7202 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7203 } else {
7204 $self->{set_nc}->($self);
7205 }
7206
7207 redo A;
7208 } elsif ($self->{nc} == 0x0027) { # '
7209 $self->{ca}->{value} = '';
7210 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7211
7212 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7213 $self->{line_prev} = $self->{line};
7214 $self->{column_prev} = $self->{column};
7215 $self->{column}++;
7216 $self->{nc}
7217 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7218 } else {
7219 $self->{set_nc}->($self);
7220 }
7221
7222 redo A;
7223 } elsif ($self->{nc} == 0x003E) { # >
7224 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7225 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7226
7227 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7228 $self->{line_prev} = $self->{line};
7229 $self->{column_prev} = $self->{column};
7230 $self->{column}++;
7231 $self->{nc}
7232 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7233 } else {
7234 $self->{set_nc}->($self);
7235 }
7236
7237 return ($self->{ct}); # ATTLIST
7238 redo A;
7239 } elsif ($self->{nc} == -1) {
7240 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7241 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7242
7243 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7244 $self->{line_prev} = $self->{line};
7245 $self->{column_prev} = $self->{column};
7246 $self->{column}++;
7247 $self->{nc}
7248 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7249 } else {
7250 $self->{set_nc}->($self);
7251 }
7252
7253 return ($self->{ct});
7254 redo A;
7255 } else {
7256 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7257 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7258 ## Reconsume.
7259 redo A;
7260 }
7261 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7262 if ($is_space->{$self->{nc}}) {
7263 ## XML5: No parse error.
7264 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7265 $self->{state} = BOGUS_MD_STATE;
7266 ## Reconsume.
7267 redo A;
7268 } elsif ($self->{nc} == 0x0022) { # "
7269 ## XML5: Same as "anything else".
7270 $self->{ca}->{value} = '';
7271 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7272
7273 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7274 $self->{line_prev} = $self->{line};
7275 $self->{column_prev} = $self->{column};
7276 $self->{column}++;
7277 $self->{nc}
7278 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7279 } else {
7280 $self->{set_nc}->($self);
7281 }
7282
7283 redo A;
7284 } elsif ($self->{nc} == 0x0027) { # '
7285 ## XML5: Same as "anything else".
7286 $self->{ca}->{value} = '';
7287 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7288
7289 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7290 $self->{line_prev} = $self->{line};
7291 $self->{column_prev} = $self->{column};
7292 $self->{column}++;
7293 $self->{nc}
7294 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7295 } else {
7296 $self->{set_nc}->($self);
7297 }
7298
7299 redo A;
7300 } elsif ($self->{nc} == 0x003E) { # >
7301 ## XML5: Same as "anything else".
7302 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7303 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7304
7305 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7306 $self->{line_prev} = $self->{line};
7307 $self->{column_prev} = $self->{column};
7308 $self->{column}++;
7309 $self->{nc}
7310 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7311 } else {
7312 $self->{set_nc}->($self);
7313 }
7314
7315 return ($self->{ct}); # ATTLIST
7316 redo A;
7317 } elsif ($self->{nc} == -1) {
7318 ## XML5: No parse error.
7319 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7320 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7321
7322 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7323 $self->{line_prev} = $self->{line};
7324 $self->{column_prev} = $self->{column};
7325 $self->{column}++;
7326 $self->{nc}
7327 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7328 } else {
7329 $self->{set_nc}->($self);
7330 }
7331
7332 return ($self->{ct});
7333 redo A;
7334 } else {
7335 $self->{ca}->{default} = chr $self->{nc};
7336 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7337
7338 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7339 $self->{line_prev} = $self->{line};
7340 $self->{column_prev} = $self->{column};
7341 $self->{column}++;
7342 $self->{nc}
7343 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7344 } else {
7345 $self->{set_nc}->($self);
7346 }
7347
7348 redo A;
7349 }
7350 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7351 if ($is_space->{$self->{nc}}) {
7352 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7353
7354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7355 $self->{line_prev} = $self->{line};
7356 $self->{column_prev} = $self->{column};
7357 $self->{column}++;
7358 $self->{nc}
7359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7360 } else {
7361 $self->{set_nc}->($self);
7362 }
7363
7364 redo A;
7365 } elsif ($self->{nc} == 0x0022) { # "
7366 ## XML5: Same as "anything else".
7367 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7368 $self->{ca}->{value} = '';
7369 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7370
7371 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7372 $self->{line_prev} = $self->{line};
7373 $self->{column_prev} = $self->{column};
7374 $self->{column}++;
7375 $self->{nc}
7376 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7377 } else {
7378 $self->{set_nc}->($self);
7379 }
7380
7381 redo A;
7382 } elsif ($self->{nc} == 0x0027) { # '
7383 ## XML5: Same as "anything else".
7384 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7385 $self->{ca}->{value} = '';
7386 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7387
7388 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7389 $self->{line_prev} = $self->{line};
7390 $self->{column_prev} = $self->{column};
7391 $self->{column}++;
7392 $self->{nc}
7393 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7394 } else {
7395 $self->{set_nc}->($self);
7396 }
7397
7398 redo A;
7399 } elsif ($self->{nc} == 0x003E) { # >
7400 ## XML5: Same as "anything else".
7401 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7402 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7403
7404 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7405 $self->{line_prev} = $self->{line};
7406 $self->{column_prev} = $self->{column};
7407 $self->{column}++;
7408 $self->{nc}
7409 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7410 } else {
7411 $self->{set_nc}->($self);
7412 }
7413
7414 return ($self->{ct}); # ATTLIST
7415 redo A;
7416 } elsif ($self->{nc} == -1) {
7417 ## XML5: No parse error.
7418 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7419 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7420 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7421
7422 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7423 $self->{line_prev} = $self->{line};
7424 $self->{column_prev} = $self->{column};
7425 $self->{column}++;
7426 $self->{nc}
7427 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7428 } else {
7429 $self->{set_nc}->($self);
7430 }
7431
7432 return ($self->{ct});
7433 redo A;
7434 } else {
7435 $self->{ca}->{default} .= chr $self->{nc};
7436 ## Stay in the state.
7437
7438 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7439 $self->{line_prev} = $self->{line};
7440 $self->{column_prev} = $self->{column};
7441 $self->{column}++;
7442 $self->{nc}
7443 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7444 } else {
7445 $self->{set_nc}->($self);
7446 }
7447
7448 redo A;
7449 }
7450 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7451 if ($is_space->{$self->{nc}}) {
7452 ## Stay in the state.
7453
7454 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7455 $self->{line_prev} = $self->{line};
7456 $self->{column_prev} = $self->{column};
7457 $self->{column}++;
7458 $self->{nc}
7459 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7460 } else {
7461 $self->{set_nc}->($self);
7462 }
7463
7464 redo A;
7465 } elsif ($self->{nc} == 0x0022) { # "
7466 $self->{ca}->{value} = '';
7467 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7468
7469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7470 $self->{line_prev} = $self->{line};
7471 $self->{column_prev} = $self->{column};
7472 $self->{column}++;
7473 $self->{nc}
7474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7475 } else {
7476 $self->{set_nc}->($self);
7477 }
7478
7479 redo A;
7480 } elsif ($self->{nc} == 0x0027) { # '
7481 $self->{ca}->{value} = '';
7482 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7483
7484 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7485 $self->{line_prev} = $self->{line};
7486 $self->{column_prev} = $self->{column};
7487 $self->{column}++;
7488 $self->{nc}
7489 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7490 } else {
7491 $self->{set_nc}->($self);
7492 }
7493
7494 redo A;
7495 } elsif ($self->{nc} == 0x003E) { # >
7496 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7497 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7498
7499 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7500 $self->{line_prev} = $self->{line};
7501 $self->{column_prev} = $self->{column};
7502 $self->{column}++;
7503 $self->{nc}
7504 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7505 } else {
7506 $self->{set_nc}->($self);
7507 }
7508
7509 return ($self->{ct}); # ATTLIST
7510 redo A;
7511 } elsif ($self->{nc} == -1) {
7512 ## XML5: No parse error.
7513 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7514 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7515 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7516
7517 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7518 $self->{line_prev} = $self->{line};
7519 $self->{column_prev} = $self->{column};
7520 $self->{column}++;
7521 $self->{nc}
7522 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7523 } else {
7524 $self->{set_nc}->($self);
7525 }
7526
7527 return ($self->{ct});
7528 redo A;
7529 } else {
7530 ## XML5: Not defined yet.
7531 if ($self->{ca}->{default} eq 'FIXED') {
7532 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7533 } else {
7534 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7535 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7536 }
7537 ## Reconsume.
7538 redo A;
7539 }
7540 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7541 if ($is_space->{$self->{nc}} or
7542 $self->{nc} == -1 or
7543 $self->{nc} == 0x003E) { # >
7544 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7545 ## Reconsume.
7546 redo A;
7547 } else {
7548 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7549 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7550 ## Reconsume.
7551 redo A;
7552 }
7553 } elsif ($self->{state} == NDATA_STATE) {
7554 ## ASCII case-insensitive
7555 if ($self->{nc} == [
7556 undef,
7557 0x0044, # D
7558 0x0041, # A
7559 0x0054, # T
7560 ]->[length $self->{kwd}] or
7561 $self->{nc} == [
7562 undef,
7563 0x0064, # d
7564 0x0061, # a
7565 0x0074, # t
7566 ]->[length $self->{kwd}]) {
7567
7568 ## Stay in the state.
7569 $self->{kwd} .= chr $self->{nc};
7570
7571 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7572 $self->{line_prev} = $self->{line};
7573 $self->{column_prev} = $self->{column};
7574 $self->{column}++;
7575 $self->{nc}
7576 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7577 } else {
7578 $self->{set_nc}->($self);
7579 }
7580
7581 redo A;
7582 } elsif ((length $self->{kwd}) == 4 and
7583 ($self->{nc} == 0x0041 or # A
7584 $self->{nc} == 0x0061)) { # a
7585 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7586
7587 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7588 text => 'NDATA',
7589 line => $self->{line_prev},
7590 column => $self->{column_prev} - 4);
7591 } else {
7592
7593 }
7594 $self->{state} = AFTER_NDATA_STATE;
7595
7596 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7597 $self->{line_prev} = $self->{line};
7598 $self->{column_prev} = $self->{column};
7599 $self->{column}++;
7600 $self->{nc}
7601 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7602 } else {
7603 $self->{set_nc}->($self);
7604 }
7605
7606 redo A;
7607 } else {
7608 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7609 line => $self->{line_prev},
7610 column => $self->{column_prev} + 1
7611 - length $self->{kwd});
7612
7613 $self->{state} = BOGUS_MD_STATE;
7614 ## Reconsume.
7615 redo A;
7616 }
7617 } elsif ($self->{state} == AFTER_NDATA_STATE) {
7618 if ($is_space->{$self->{nc}}) {
7619 $self->{state} = BEFORE_NOTATION_NAME_STATE;
7620
7621 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7622 $self->{line_prev} = $self->{line};
7623 $self->{column_prev} = $self->{column};
7624 $self->{column}++;
7625 $self->{nc}
7626 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7627 } else {
7628 $self->{set_nc}->($self);
7629 }
7630
7631 redo A;
7632 } elsif ($self->{nc} == 0x003E) { # >
7633 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7634 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7635
7636 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7637 $self->{line_prev} = $self->{line};
7638 $self->{column_prev} = $self->{column};
7639 $self->{column}++;
7640 $self->{nc}
7641 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7642 } else {
7643 $self->{set_nc}->($self);
7644 }
7645
7646 return ($self->{ct}); # ENTITY
7647 redo A;
7648 } elsif ($self->{nc} == -1) {
7649 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7650 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7651
7652 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7653 $self->{line_prev} = $self->{line};
7654 $self->{column_prev} = $self->{column};
7655 $self->{column}++;
7656 $self->{nc}
7657 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7658 } else {
7659 $self->{set_nc}->($self);
7660 }
7661
7662 return ($self->{ct}); # ENTITY
7663 redo A;
7664 } else {
7665 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7666 line => $self->{line_prev},
7667 column => $self->{column_prev} + 1
7668 - length $self->{kwd});
7669 $self->{state} = BOGUS_MD_STATE;
7670 ## Reconsume.
7671 redo A;
7672 }
7673 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7674 if ($is_space->{$self->{nc}}) {
7675 ## Stay in the state.
7676
7677 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7678 $self->{line_prev} = $self->{line};
7679 $self->{column_prev} = $self->{column};
7680 $self->{column}++;
7681 $self->{nc}
7682 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7683 } else {
7684 $self->{set_nc}->($self);
7685 }
7686
7687 redo A;
7688 } elsif ($self->{nc} == 0x003E) { # >
7689 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7690 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7691
7692 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7693 $self->{line_prev} = $self->{line};
7694 $self->{column_prev} = $self->{column};
7695 $self->{column}++;
7696 $self->{nc}
7697 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7698 } else {
7699 $self->{set_nc}->($self);
7700 }
7701
7702 return ($self->{ct}); # ENTITY
7703 redo A;
7704 } elsif ($self->{nc} == -1) {
7705 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7706 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7707
7708 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7709 $self->{line_prev} = $self->{line};
7710 $self->{column_prev} = $self->{column};
7711 $self->{column}++;
7712 $self->{nc}
7713 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7714 } else {
7715 $self->{set_nc}->($self);
7716 }
7717
7718 return ($self->{ct}); # ENTITY
7719 redo A;
7720 } else {
7721 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7722 $self->{state} = NOTATION_NAME_STATE;
7723
7724 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7725 $self->{line_prev} = $self->{line};
7726 $self->{column_prev} = $self->{column};
7727 $self->{column}++;
7728 $self->{nc}
7729 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7730 } else {
7731 $self->{set_nc}->($self);
7732 }
7733
7734 redo A;
7735 }
7736 } elsif ($self->{state} == NOTATION_NAME_STATE) {
7737 if ($is_space->{$self->{nc}}) {
7738 $self->{state} = AFTER_MD_DEF_STATE;
7739
7740 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7741 $self->{line_prev} = $self->{line};
7742 $self->{column_prev} = $self->{column};
7743 $self->{column}++;
7744 $self->{nc}
7745 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7746 } else {
7747 $self->{set_nc}->($self);
7748 }
7749
7750 redo A;
7751 } elsif ($self->{nc} == 0x003E) { # >
7752 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7753
7754 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7755 $self->{line_prev} = $self->{line};
7756 $self->{column_prev} = $self->{column};
7757 $self->{column}++;
7758 $self->{nc}
7759 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7760 } else {
7761 $self->{set_nc}->($self);
7762 }
7763
7764 return ($self->{ct}); # ENTITY
7765 redo A;
7766 } elsif ($self->{nc} == -1) {
7767 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7768 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7769
7770 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7771 $self->{line_prev} = $self->{line};
7772 $self->{column_prev} = $self->{column};
7773 $self->{column}++;
7774 $self->{nc}
7775 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7776 } else {
7777 $self->{set_nc}->($self);
7778 }
7779
7780 return ($self->{ct}); # ENTITY
7781 redo A;
7782 } else {
7783 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7784 ## Stay in the state.
7785
7786 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7787 $self->{line_prev} = $self->{line};
7788 $self->{column_prev} = $self->{column};
7789 $self->{column}++;
7790 $self->{nc}
7791 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7792 } else {
7793 $self->{set_nc}->($self);
7794 }
7795
7796 redo A;
7797 }
7798 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7799 if ($self->{nc} == 0x0022) { # "
7800 $self->{state} = AFTER_MD_DEF_STATE;
7801
7802 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7803 $self->{line_prev} = $self->{line};
7804 $self->{column_prev} = $self->{column};
7805 $self->{column}++;
7806 $self->{nc}
7807 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7808 } else {
7809 $self->{set_nc}->($self);
7810 }
7811
7812 redo A;
7813 } elsif ($self->{nc} == 0x0026) { # &
7814 $self->{prev_state} = $self->{state};
7815 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7816 $self->{entity_add} = 0x0022; # "
7817
7818 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7819 $self->{line_prev} = $self->{line};
7820 $self->{column_prev} = $self->{column};
7821 $self->{column}++;
7822 $self->{nc}
7823 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7824 } else {
7825 $self->{set_nc}->($self);
7826 }
7827
7828 redo A;
7829 ## TODO: %
7830 } elsif ($self->{nc} == -1) {
7831 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7832 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7833 ## Reconsume.
7834 return ($self->{ct}); # ENTITY
7835 redo A;
7836 } else {
7837 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7838
7839 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7840 $self->{line_prev} = $self->{line};
7841 $self->{column_prev} = $self->{column};
7842 $self->{column}++;
7843 $self->{nc}
7844 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7845 } else {
7846 $self->{set_nc}->($self);
7847 }
7848
7849 redo A;
7850 }
7851 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7852 if ($self->{nc} == 0x0027) { # '
7853 $self->{state} = AFTER_MD_DEF_STATE;
7854
7855 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7856 $self->{line_prev} = $self->{line};
7857 $self->{column_prev} = $self->{column};
7858 $self->{column}++;
7859 $self->{nc}
7860 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7861 } else {
7862 $self->{set_nc}->($self);
7863 }
7864
7865 redo A;
7866 } elsif ($self->{nc} == 0x0026) { # &
7867 $self->{prev_state} = $self->{state};
7868 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7869 $self->{entity_add} = 0x0027; # '
7870
7871 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7872 $self->{line_prev} = $self->{line};
7873 $self->{column_prev} = $self->{column};
7874 $self->{column}++;
7875 $self->{nc}
7876 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7877 } else {
7878 $self->{set_nc}->($self);
7879 }
7880
7881 redo A;
7882 ## TODO: %
7883 } elsif ($self->{nc} == -1) {
7884 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7885 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7886 ## Reconsume.
7887 return ($self->{ct}); # ENTITY
7888 redo A;
7889 } else {
7890 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7891
7892 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7893 $self->{line_prev} = $self->{line};
7894 $self->{column_prev} = $self->{column};
7895 $self->{column}++;
7896 $self->{nc}
7897 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7898 } else {
7899 $self->{set_nc}->($self);
7900 }
7901
7902 redo A;
7903 }
7904 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7905 if ($is_space->{$self->{nc}} or
7906 {
7907 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7908 $self->{entity_add} => 1,
7909 }->{$self->{nc}}) {
7910 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7911 line => $self->{line_prev},
7912 column => $self->{column_prev}
7913 + ($self->{nc} == -1 ? 1 : 0));
7914 ## Don't consume
7915 ## Return nothing.
7916 #
7917 } elsif ($self->{nc} == 0x0023) { # #
7918 $self->{ca} = $self->{ct};
7919 $self->{state} = ENTITY_HASH_STATE;
7920 $self->{kwd} = '#';
7921
7922 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7923 $self->{line_prev} = $self->{line};
7924 $self->{column_prev} = $self->{column};
7925 $self->{column}++;
7926 $self->{nc}
7927 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7928 } else {
7929 $self->{set_nc}->($self);
7930 }
7931
7932 redo A;
7933 } else {
7934 #
7935 }
7936
7937 $self->{ct}->{value} .= '&';
7938 $self->{state} = $self->{prev_state};
7939 ## Reconsume.
7940 redo A;
7941 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7942 if ($is_space->{$self->{nc}}) {
7943 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7944
7945 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7946 $self->{line_prev} = $self->{line};
7947 $self->{column_prev} = $self->{column};
7948 $self->{column}++;
7949 $self->{nc}
7950 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7951 } else {
7952 $self->{set_nc}->($self);
7953 }
7954
7955 redo A;
7956 } elsif ($self->{nc} == 0x0028) { # (
7957 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
7958 $self->{ct}->{content} = ['('];
7959 $self->{group_depth} = 1;
7960
7961 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7962 $self->{line_prev} = $self->{line};
7963 $self->{column_prev} = $self->{column};
7964 $self->{column}++;
7965 $self->{nc}
7966 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7967 } else {
7968 $self->{set_nc}->($self);
7969 }
7970
7971 redo A;
7972 } elsif ($self->{nc} == 0x003E) { # >
7973 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
7974 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7975
7976 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7977 $self->{line_prev} = $self->{line};
7978 $self->{column_prev} = $self->{column};
7979 $self->{column}++;
7980 $self->{nc}
7981 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7982 } else {
7983 $self->{set_nc}->($self);
7984 }
7985
7986 return ($self->{ct}); # ELEMENT
7987 redo A;
7988 } elsif ($self->{nc} == -1) {
7989 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7990 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7991
7992 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7993 $self->{line_prev} = $self->{line};
7994 $self->{column_prev} = $self->{column};
7995 $self->{column}++;
7996 $self->{nc}
7997 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7998 } else {
7999 $self->{set_nc}->($self);
8000 }
8001
8002 return ($self->{ct}); # ELEMENT
8003 redo A;
8004 } else {
8005 $self->{ct}->{content} = [chr $self->{nc}];
8006 $self->{state} = CONTENT_KEYWORD_STATE;
8007
8008 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8009 $self->{line_prev} = $self->{line};
8010 $self->{column_prev} = $self->{column};
8011 $self->{column}++;
8012 $self->{nc}
8013 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8014 } else {
8015 $self->{set_nc}->($self);
8016 }
8017
8018 redo A;
8019 }
8020 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8021 if ($is_space->{$self->{nc}}) {
8022 $self->{state} = AFTER_MD_DEF_STATE;
8023
8024 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8025 $self->{line_prev} = $self->{line};
8026 $self->{column_prev} = $self->{column};
8027 $self->{column}++;
8028 $self->{nc}
8029 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8030 } else {
8031 $self->{set_nc}->($self);
8032 }
8033
8034 redo A;
8035 } elsif ($self->{nc} == 0x003E) { # >
8036 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8037
8038 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8039 $self->{line_prev} = $self->{line};
8040 $self->{column_prev} = $self->{column};
8041 $self->{column}++;
8042 $self->{nc}
8043 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8044 } else {
8045 $self->{set_nc}->($self);
8046 }
8047
8048 return ($self->{ct}); # ELEMENT
8049 redo A;
8050 } elsif ($self->{nc} == -1) {
8051 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8052 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8053
8054 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8055 $self->{line_prev} = $self->{line};
8056 $self->{column_prev} = $self->{column};
8057 $self->{column}++;
8058 $self->{nc}
8059 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8060 } else {
8061 $self->{set_nc}->($self);
8062 }
8063
8064 return ($self->{ct}); # ELEMENT
8065 redo A;
8066 } else {
8067 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8068 ## Stay in the state.
8069
8070 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8071 $self->{line_prev} = $self->{line};
8072 $self->{column_prev} = $self->{column};
8073 $self->{column}++;
8074 $self->{nc}
8075 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8076 } else {
8077 $self->{set_nc}->($self);
8078 }
8079
8080 redo A;
8081 }
8082 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8083 if ($is_space->{$self->{nc}}) {
8084 ## Stay in the state.
8085
8086 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8087 $self->{line_prev} = $self->{line};
8088 $self->{column_prev} = $self->{column};
8089 $self->{column}++;
8090 $self->{nc}
8091 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8092 } else {
8093 $self->{set_nc}->($self);
8094 }
8095
8096 redo A;
8097 } elsif ($self->{nc} == 0x0028) { # (
8098 $self->{group_depth}++;
8099 push @{$self->{ct}->{content}}, chr $self->{nc};
8100 ## Stay in the state.
8101
8102 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8103 $self->{line_prev} = $self->{line};
8104 $self->{column_prev} = $self->{column};
8105 $self->{column}++;
8106 $self->{nc}
8107 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8108 } else {
8109 $self->{set_nc}->($self);
8110 }
8111
8112 redo A;
8113 } elsif ($self->{nc} == 0x007C or # |
8114 $self->{nc} == 0x002C) { # ,
8115 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8116 ## Stay in the state.
8117
8118 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8119 $self->{line_prev} = $self->{line};
8120 $self->{column_prev} = $self->{column};
8121 $self->{column}++;
8122 $self->{nc}
8123 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8124 } else {
8125 $self->{set_nc}->($self);
8126 }
8127
8128 redo A;
8129 } elsif ($self->{nc} == 0x0029) { # )
8130 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8131 push @{$self->{ct}->{content}}, chr $self->{nc};
8132 $self->{group_depth}--;
8133 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8134
8135 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8136 $self->{line_prev} = $self->{line};
8137 $self->{column_prev} = $self->{column};
8138 $self->{column}++;
8139 $self->{nc}
8140 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8141 } else {
8142 $self->{set_nc}->($self);
8143 }
8144
8145 redo A;
8146 } elsif ($self->{nc} == 0x003E) { # >
8147 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8148 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8149 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8150
8151 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8152 $self->{line_prev} = $self->{line};
8153 $self->{column_prev} = $self->{column};
8154 $self->{column}++;
8155 $self->{nc}
8156 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8157 } else {
8158 $self->{set_nc}->($self);
8159 }
8160
8161 return ($self->{ct}); # ELEMENT
8162 redo A;
8163 } elsif ($self->{nc} == -1) {
8164 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8165 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8166 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8167
8168 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8169 $self->{line_prev} = $self->{line};
8170 $self->{column_prev} = $self->{column};
8171 $self->{column}++;
8172 $self->{nc}
8173 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8174 } else {
8175 $self->{set_nc}->($self);
8176 }
8177
8178 return ($self->{ct}); # ELEMENT
8179 redo A;
8180 } else {
8181 push @{$self->{ct}->{content}}, chr $self->{nc};
8182 $self->{state} = CM_ELEMENT_NAME_STATE;
8183
8184 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8185 $self->{line_prev} = $self->{line};
8186 $self->{column_prev} = $self->{column};
8187 $self->{column}++;
8188 $self->{nc}
8189 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8190 } else {
8191 $self->{set_nc}->($self);
8192 }
8193
8194 redo A;
8195 }
8196 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8197 if ($is_space->{$self->{nc}}) {
8198 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8199
8200 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8201 $self->{line_prev} = $self->{line};
8202 $self->{column_prev} = $self->{column};
8203 $self->{column}++;
8204 $self->{nc}
8205 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8206 } else {
8207 $self->{set_nc}->($self);
8208 }
8209
8210 redo A;
8211 } elsif ($self->{nc} == 0x002A or # *
8212 $self->{nc} == 0x002B or # +
8213 $self->{nc} == 0x003F) { # ?
8214 push @{$self->{ct}->{content}}, chr $self->{nc};
8215 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8216
8217 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8218 $self->{line_prev} = $self->{line};
8219 $self->{column_prev} = $self->{column};
8220 $self->{column}++;
8221 $self->{nc}
8222 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8223 } else {
8224 $self->{set_nc}->($self);
8225 }
8226
8227 redo A;
8228 } elsif ($self->{nc} == 0x007C or # |
8229 $self->{nc} == 0x002C) { # ,
8230 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8231 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8232
8233 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8234 $self->{line_prev} = $self->{line};
8235 $self->{column_prev} = $self->{column};
8236 $self->{column}++;
8237 $self->{nc}
8238 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8239 } else {
8240 $self->{set_nc}->($self);
8241 }
8242
8243 redo A;
8244 } elsif ($self->{nc} == 0x0029) { # )
8245 $self->{group_depth}--;
8246 push @{$self->{ct}->{content}}, chr $self->{nc};
8247 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8248
8249 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8250 $self->{line_prev} = $self->{line};
8251 $self->{column_prev} = $self->{column};
8252 $self->{column}++;
8253 $self->{nc}
8254 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8255 } else {
8256 $self->{set_nc}->($self);
8257 }
8258
8259 redo A;
8260 } elsif ($self->{nc} == 0x003E) { # >
8261 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8262 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8263 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8264
8265 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8266 $self->{line_prev} = $self->{line};
8267 $self->{column_prev} = $self->{column};
8268 $self->{column}++;
8269 $self->{nc}
8270 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8271 } else {
8272 $self->{set_nc}->($self);
8273 }
8274
8275 return ($self->{ct}); # ELEMENT
8276 redo A;
8277 } elsif ($self->{nc} == -1) {
8278 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8279 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8280 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8281
8282 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8283 $self->{line_prev} = $self->{line};
8284 $self->{column_prev} = $self->{column};
8285 $self->{column}++;
8286 $self->{nc}
8287 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8288 } else {
8289 $self->{set_nc}->($self);
8290 }
8291
8292 return ($self->{ct}); # ELEMENT
8293 redo A;
8294 } else {
8295 $self->{ct}->{content}->[-1] .= chr $self->{nc};
8296 ## Stay in the state.
8297
8298 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8299 $self->{line_prev} = $self->{line};
8300 $self->{column_prev} = $self->{column};
8301 $self->{column}++;
8302 $self->{nc}
8303 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8304 } else {
8305 $self->{set_nc}->($self);
8306 }
8307
8308 redo A;
8309 }
8310 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8311 if ($is_space->{$self->{nc}}) {
8312 ## Stay in the state.
8313
8314 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8315 $self->{line_prev} = $self->{line};
8316 $self->{column_prev} = $self->{column};
8317 $self->{column}++;
8318 $self->{nc}
8319 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8320 } else {
8321 $self->{set_nc}->($self);
8322 }
8323
8324 redo A;
8325 } elsif ($self->{nc} == 0x007C or # |
8326 $self->{nc} == 0x002C) { # ,
8327 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8328 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8329
8330 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8331 $self->{line_prev} = $self->{line};
8332 $self->{column_prev} = $self->{column};
8333 $self->{column}++;
8334 $self->{nc}
8335 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8336 } else {
8337 $self->{set_nc}->($self);
8338 }
8339
8340 redo A;
8341 } elsif ($self->{nc} == 0x0029) { # )
8342 $self->{group_depth}--;
8343 push @{$self->{ct}->{content}}, chr $self->{nc};
8344 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8345
8346 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8347 $self->{line_prev} = $self->{line};
8348 $self->{column_prev} = $self->{column};
8349 $self->{column}++;
8350 $self->{nc}
8351 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8352 } else {
8353 $self->{set_nc}->($self);
8354 }
8355
8356 redo A;
8357 } elsif ($self->{nc} == 0x003E) { # >
8358 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8359 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8360 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8361
8362 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8363 $self->{line_prev} = $self->{line};
8364 $self->{column_prev} = $self->{column};
8365 $self->{column}++;
8366 $self->{nc}
8367 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8368 } else {
8369 $self->{set_nc}->($self);
8370 }
8371
8372 return ($self->{ct}); # ELEMENT
8373 redo A;
8374 } elsif ($self->{nc} == -1) {
8375 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8376 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8377 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8378
8379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8380 $self->{line_prev} = $self->{line};
8381 $self->{column_prev} = $self->{column};
8382 $self->{column}++;
8383 $self->{nc}
8384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8385 } else {
8386 $self->{set_nc}->($self);
8387 }
8388
8389 return ($self->{ct}); # ELEMENT
8390 redo A;
8391 } else {
8392 $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8393 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8394 $self->{state} = BOGUS_MD_STATE;
8395
8396 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8397 $self->{line_prev} = $self->{line};
8398 $self->{column_prev} = $self->{column};
8399 $self->{column}++;
8400 $self->{nc}
8401 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8402 } else {
8403 $self->{set_nc}->($self);
8404 }
8405
8406 redo A;
8407 }
8408 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8409 if ($is_space->{$self->{nc}}) {
8410 if ($self->{group_depth}) {
8411 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8412 } else {
8413 $self->{state} = AFTER_MD_DEF_STATE;
8414 }
8415
8416 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8417 $self->{line_prev} = $self->{line};
8418 $self->{column_prev} = $self->{column};
8419 $self->{column}++;
8420 $self->{nc}
8421 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8422 } else {
8423 $self->{set_nc}->($self);
8424 }
8425
8426 redo A;
8427 } elsif ($self->{nc} == 0x002A or # *
8428 $self->{nc} == 0x002B or # +
8429 $self->{nc} == 0x003F) { # ?
8430 push @{$self->{ct}->{content}}, chr $self->{nc};
8431 if ($self->{group_depth}) {
8432 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8433 } else {
8434 $self->{state} = AFTER_MD_DEF_STATE;
8435 }
8436
8437 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8438 $self->{line_prev} = $self->{line};
8439 $self->{column_prev} = $self->{column};
8440 $self->{column}++;
8441 $self->{nc}
8442 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8443 } else {
8444 $self->{set_nc}->($self);
8445 }
8446
8447 redo A;
8448 } elsif ($self->{nc} == 0x0029) { # )
8449 if ($self->{group_depth}) {
8450 $self->{group_depth}--;
8451 push @{$self->{ct}->{content}}, chr $self->{nc};
8452 ## Stay in the state.
8453
8454 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8455 $self->{line_prev} = $self->{line};
8456 $self->{column_prev} = $self->{column};
8457 $self->{column}++;
8458 $self->{nc}
8459 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8460 } else {
8461 $self->{set_nc}->($self);
8462 }
8463
8464 redo A;
8465 } else {
8466 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8467 $self->{state} = BOGUS_MD_STATE;
8468 ## Reconsume.
8469 redo A;
8470 }
8471 } elsif ($self->{nc} == 0x003E) { # >
8472 if ($self->{group_depth}) {
8473 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8474 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8475 }
8476 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8477
8478 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8479 $self->{line_prev} = $self->{line};
8480 $self->{column_prev} = $self->{column};
8481 $self->{column}++;
8482 $self->{nc}
8483 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8484 } else {
8485 $self->{set_nc}->($self);
8486 }
8487
8488 return ($self->{ct}); # ELEMENT
8489 redo A;
8490 } elsif ($self->{nc} == -1) {
8491 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8492 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8493 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8494
8495 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8496 $self->{line_prev} = $self->{line};
8497 $self->{column_prev} = $self->{column};
8498 $self->{column}++;
8499 $self->{nc}
8500 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8501 } else {
8502 $self->{set_nc}->($self);
8503 }
8504
8505 return ($self->{ct}); # ELEMENT
8506 redo A;
8507 } else {
8508 if ($self->{group_depth}) {
8509 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8510 } else {
8511 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8512 $self->{state} = BOGUS_MD_STATE;
8513 }
8514 ## Reconsume.
8515 redo A;
8516 }
8517 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8518 if ($is_space->{$self->{nc}}) {
8519 ## Stay in the state.
8520
8521 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8522 $self->{line_prev} = $self->{line};
8523 $self->{column_prev} = $self->{column};
8524 $self->{column}++;
8525 $self->{nc}
8526 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8527 } else {
8528 $self->{set_nc}->($self);
8529 }
8530
8531 redo A;
8532 } elsif ($self->{nc} == 0x003E) { # >
8533 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8534
8535 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8536 $self->{line_prev} = $self->{line};
8537 $self->{column_prev} = $self->{column};
8538 $self->{column}++;
8539 $self->{nc}
8540 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8541 } else {
8542 $self->{set_nc}->($self);
8543 }
8544
8545 return ($self->{ct}); # ENTITY/ELEMENT
8546 redo A;
8547 } elsif ($self->{nc} == -1) {
8548 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8549 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8550
8551 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8552 $self->{line_prev} = $self->{line};
8553 $self->{column_prev} = $self->{column};
8554 $self->{column}++;
8555 $self->{nc}
8556 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8557 } else {
8558 $self->{set_nc}->($self);
8559 }
8560
8561 return ($self->{ct}); # ENTITY/ELEMENT
8562 redo A;
8563 } else {
8564 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8565 $self->{state} = BOGUS_MD_STATE;
8566 ## Reconsume.
8567 redo A;
8568 }
8569 } elsif ($self->{state} == BOGUS_MD_STATE) {
8570 if ($self->{nc} == 0x003E) { # >
8571 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8572
8573 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8574 $self->{line_prev} = $self->{line};
8575 $self->{column_prev} = $self->{column};
8576 $self->{column}++;
8577 $self->{nc}
8578 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8579 } else {
8580 $self->{set_nc}->($self);
8581 }
8582
8583 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8584 redo A;
8585 } elsif ($self->{nc} == -1) {
8586 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8587 ## Reconsume.
8588 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8589 redo A;
8590 } else {
8591 ## Stay in the state.
8592
8593 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8594 $self->{line_prev} = $self->{line};
8595 $self->{column_prev} = $self->{column};
8596 $self->{column}++;
8597 $self->{nc}
8598 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8599 } else {
8600 $self->{set_nc}->($self);
8601 }
8602
8603 redo A;
8604 }
8605 } else {
8606 die "$0: $self->{state}: Unknown state";
8607 }
8608 } # A
8609
8610 die "$0: _get_next_token: unexpected case";
8611 } # _get_next_token
8612
8613 1;
8614 ## $Date: 2008/10/19 10:12:54 $
8615

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24