/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.34 - (show annotations) (download)
Sat Sep 5 11:31:58 2009 UTC (15 years, 2 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.33: +11 -10 lines
++ whatpm/t/ChangeLog	5 Sep 2009 11:31:07 -0000
	* tokenizer-test-1.test: Changed to keep non-normal character
	references (HTML5 revision 3374).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 11:31:46 -0000
	* Tokenizer.pm.src: Changed to keep non-normal character
	references as is (HTML5 revision 3374).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.33 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## Names of the token type constants.  Nothing is exported by
  ## default; a caller may request the names one by one, or all of
  ## them at once with the |:token| tag.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_type_names;
  our %EXPORT_TAGS = (token => [@token_type_names]);
}
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49
## Token type codes.  Each name is an argument-less constant in the
## current package; the values run from 1 to 14.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78
## Content model flags: one bit per kind of markup that is recognized
## in character data.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, written as combinations of the flags above.  They
## are declared in a second statement because the flag constants have
## to exist before these values can be computed.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
87
88 ## Tokenizer states
89
## Tokenizer state codes.  Each name is an argument-less constant in
## the current package.  The numbers 1 and 12 are not assigned (their
## entries are disabled below), and 102 and 103 are assigned out of
## sequence.
use constant {
  DATA_STATE                                    => 0,
  #ENTITY_DATA_STATE                            => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  #ENTITY_IN_ATTRIBUTE_VALUE_STATE              => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_BANG_STATE                        => 102,
  ## LAST (presumably: the highest state number in use; it is the
  ## highest one in this list)
  COMMENT_END_SPACE_STATE                       => 103,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_SECTION_STATE                           => 35,
  MD_HYPHEN_STATE              => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE             => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE               => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE     => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE     => 41, # "CDATA section state" in the spec
  PUBLIC_STATE                 => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE                 => 43, # "after DOCTYPE name state" in the spec
  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE                                  => 44,
  ENTITY_HASH_STATE                             => 45,
  NCR_NUM_STATE                                 => 46,
  HEXREF_X_STATE                                => 47,
  HEXREF_HEX_STATE                              => 48,
  ENTITY_NAME_STATE                             => 49,
  PCDATA_STATE                 => 50, # "data state" in the spec
};

## XML-only states
use constant {
  PI_STATE                                           => 51,
  PI_TARGET_STATE                                    => 52,
  PI_TARGET_AFTER_STATE                              => 53,
  PI_DATA_STATE                                      => 54,
  PI_AFTER_STATE                                     => 55,
  PI_DATA_AFTER_STATE                                => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE                      => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE                => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE          => 59,
  DOCTYPE_TAG_STATE                                  => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE              => 61,
  MD_ATTLIST_STATE                                   => 62,
  MD_E_STATE                                         => 63,
  MD_ELEMENT_STATE                                   => 64,
  MD_ENTITY_STATE                                    => 65,
  MD_NOTATION_STATE                                  => 66,
  DOCTYPE_MD_STATE                                   => 67,
  BEFORE_MD_NAME_STATE                               => 68,
  MD_NAME_STATE                                      => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE              => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE                   => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE               => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE         => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE               => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE         => 75,
  BEFORE_ALLOWED_TOKEN_STATE                         => 76,
  ALLOWED_TOKEN_STATE                                => 77,
  AFTER_ALLOWED_TOKEN_STATE                          => 78,
  AFTER_ALLOWED_TOKENS_STATE                         => 79,
  BEFORE_ATTR_DEFAULT_STATE                          => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE        => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE  => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE              => 84,
  BEFORE_NDATA_STATE                                 => 85,
  NDATA_STATE                                        => 86,
  AFTER_NDATA_STATE                                  => 87,
  BEFORE_NOTATION_NAME_STATE                         => 88,
  NOTATION_NAME_STATE                                => 89,
  DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE           => 90,
  DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE           => 91,
  ENTITY_VALUE_ENTITY_STATE                          => 92,
  AFTER_ELEMENT_NAME_STATE                           => 93,
  BEFORE_ELEMENT_CONTENT_STATE                       => 94,
  CONTENT_KEYWORD_STATE                              => 95,
  AFTER_CM_GROUP_OPEN_STATE                          => 96,
  CM_ELEMENT_NAME_STATE                              => 97,
  AFTER_CM_ELEMENT_NAME_STATE                        => 98,
  AFTER_CM_GROUP_CLOSE_STATE                         => 99,
  AFTER_MD_DEF_STATE                                 => 100,
  BOGUS_MD_STATE                                     => 101,
};
199
200 ## Tree constructor state constants (see Whatpm::HTML for the full
201 ## list and descriptions)
202
## Both constants are the single bit 11 (2048, i.e. 0b100000000000).
## NOTE(review): presumably they belong to two separate sets of flags
## and so never meet in one value -- confirm against Whatpm::HTML.
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
sub FOREIGN_EL () { 1 << 11 }
205
206 ## Character reference mappings
207
## Table keyed by code point, giving the code point to use in its
## place.  NOTE(review): presumably consulted when a numeric character
## reference is resolved; the code doing the lookup is outside this
## part of the file.
my $charref_map = {
  0x00 => 0xFFFD, # REPLACEMENT CHARACTER
  0x0D => 0x000A,
};

## Entries for 0x80..0x9F, listed in key order, eight per row.  (The
## values look like the Windows-1252 readings of those bytes; where
## the value equals the key the code point is kept unchanged.)
@{$charref_map}{0x80 .. 0x9F} = (
  0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, # 0x88..0x8F
  0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, # 0x98..0x9F
);

## Code points that are present in the table but map to themselves.
{
  my @kept_as_is = (
    0x0001 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
    0xD800 .. 0xDFFF,
    0xFDD0 .. 0xFDEF,
    ## The last two code points of each of the 17 planes:
    ## 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, ..., 0x10FFFE, 0x10FFFF.
    (map {
      my $plane_base = $_ << 16;
      ($plane_base | 0xFFFE, $plane_base | 0xFFFF);
    } 0x00 .. 0x10),
  );
  @{$charref_map}{@kept_as_is} = @kept_as_is;
}
252
253 ## Implementations MUST act as if they used the state machine in the spec.
254
## Put the tokenizer fields of |$self| into their initial state and
## fetch the first input character.
##
## Argument: |$self|, a hash-based object whose |set_nc| field is a
## code reference; it is called once, with |$self| as its only
## argument, after the fields have been reset.
##
## Return value: the value of the last assignment, i.e. the new empty
## array reference stored in |$self->{token}|.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## Get the first input character.  The character buffer has just
  ## been emptied, so there is never a buffered character to take
  ## here (the test "position < length of buffer" would be 0 < 0);
  ## the character always has to come from the |set_nc| callback.
  $self->{set_nc}->($self);

  $self->{token} = [];
  ## NOTE(review): |$self->{escape}| is not reset by this method.  If
  ## an object is initialized a second time, a value left over from
  ## the previous run would still be in effect -- confirm whether that
  ## is intended.
} # _initialize_tokenizer
293
294 ## A token has:
295 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
296 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
297 ## ->{name} (DOCTYPE_TOKEN)
298 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
299 ## ->{target} (PI_TOKEN)
300 ## ->{pubid} (DOCTYPE_TOKEN)
301 ## ->{sysid} (DOCTYPE_TOKEN)
302 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
303 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
304 ## ->{name}
305 ## ->{value}
306 ## ->{has_reference} == 1 or 0
307 ## ->{index}: Index of the attribute in a tag.
308 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
309 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
310 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
311 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
312
313 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
314 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
315 ## while the token is pushed back to the stack.
316
317 ## Emitted token MUST immediately be handled by the tree construction state.
318
319 ## Before each step, UA MAY check to see if either one of the scripts in
320 ## "list of scripts that will execute as soon as possible" or the first
321 ## script in the "list of scripts that will execute asynchronously",
322 ## has completed loading. If one has, then it MUST be executed
323 ## and removed from the list.
324
325 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
326 ## (This requirement was dropped from HTML5 spec, unfortunately.)
327
## Set of code points treated as space characters, stored as
## code point => 1.  Anything not listed gives a false (undefined)
## value on lookup.
my $is_space = {
  map { ($_ => 1) } (
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    # 0x000B: LINE TABULATION (VT) -- not a member
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    # 0x000D: CARRIAGE RETURN (CR) -- not a member (entry disabled)
    0x0020, # SPACE (SP)
  ),
};
336
337 sub _get_next_token ($) {
338 my $self = shift;
339
340 if ($self->{self_closing}) {
341 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
342 ## NOTE: The |self_closing| flag is only set by start tag token.
343 ## In addition, when a start tag token is emitted, it is always set to
344 ## |ct|.
345 delete $self->{self_closing};
346 }
347
348 if (@{$self->{token}}) {
349 $self->{self_closing} = $self->{token}->[0]->{self_closing};
350 return shift @{$self->{token}};
351 }
352
353 A: {
354 if ($self->{state} == PCDATA_STATE) {
355 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
356
357 if ($self->{nc} == 0x0026) { # &
358
359 ## NOTE: In the spec, the tokenizer is switched to the
360 ## "entity data state". In this implementation, the tokenizer
361 ## is switched to the |ENTITY_STATE|, which is an implementation
362 ## of the "consume a character reference" algorithm.
363 $self->{entity_add} = -1;
364 $self->{prev_state} = DATA_STATE;
365 $self->{state} = ENTITY_STATE;
366
367 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
368 $self->{line_prev} = $self->{line};
369 $self->{column_prev} = $self->{column};
370 $self->{column}++;
371 $self->{nc}
372 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
373 } else {
374 $self->{set_nc}->($self);
375 }
376
377 redo A;
378 } elsif ($self->{nc} == 0x003C) { # <
379
380 $self->{state} = TAG_OPEN_STATE;
381
382 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
383 $self->{line_prev} = $self->{line};
384 $self->{column_prev} = $self->{column};
385 $self->{column}++;
386 $self->{nc}
387 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
388 } else {
389 $self->{set_nc}->($self);
390 }
391
392 redo A;
393 } elsif ($self->{nc} == -1) {
394
395 return ({type => END_OF_FILE_TOKEN,
396 line => $self->{line}, column => $self->{column}});
397 last A; ## TODO: ok?
398 } else {
399
400 #
401 }
402
403 # Anything else
404 my $token = {type => CHARACTER_TOKEN,
405 data => chr $self->{nc},
406 line => $self->{line}, column => $self->{column},
407 };
408 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
409
410 ## Stay in the state.
411
412 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
413 $self->{line_prev} = $self->{line};
414 $self->{column_prev} = $self->{column};
415 $self->{column}++;
416 $self->{nc}
417 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
418 } else {
419 $self->{set_nc}->($self);
420 }
421
422 return ($token);
423 redo A;
424 } elsif ($self->{state} == DATA_STATE) {
425 $self->{s_kwd} = '' unless defined $self->{s_kwd};
426 if ($self->{nc} == 0x0026) { # &
427 $self->{s_kwd} = '';
428 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
429 not $self->{escape}) {
430
431 ## NOTE: In the spec, the tokenizer is switched to the
432 ## "entity data state". In this implementation, the tokenizer
433 ## is switched to the |ENTITY_STATE|, which is an implementation
434 ## of the "consume a character reference" algorithm.
435 $self->{entity_add} = -1;
436 $self->{prev_state} = DATA_STATE;
437 $self->{state} = ENTITY_STATE;
438
439 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
440 $self->{line_prev} = $self->{line};
441 $self->{column_prev} = $self->{column};
442 $self->{column}++;
443 $self->{nc}
444 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
445 } else {
446 $self->{set_nc}->($self);
447 }
448
449 redo A;
450 } else {
451
452 #
453 }
454 } elsif ($self->{nc} == 0x002D) { # -
455 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
456 if ($self->{s_kwd} eq '<!-') {
457
458 $self->{escape} = 1; # unless $self->{escape};
459 $self->{s_kwd} = '--';
460 #
461 } elsif ($self->{s_kwd} eq '-') {
462
463 $self->{s_kwd} = '--';
464 #
465 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
466
467 $self->{s_kwd} .= '-';
468 #
469 } else {
470
471 $self->{s_kwd} = '-';
472 #
473 }
474 }
475
476 #
477 } elsif ($self->{nc} == 0x0021) { # !
478 if (length $self->{s_kwd}) {
479
480 $self->{s_kwd} .= '!';
481 #
482 } else {
483
484 #$self->{s_kwd} = '';
485 #
486 }
487 #
488 } elsif ($self->{nc} == 0x003C) { # <
489 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
490 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
491 not $self->{escape})) {
492
493 $self->{state} = TAG_OPEN_STATE;
494
495 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
496 $self->{line_prev} = $self->{line};
497 $self->{column_prev} = $self->{column};
498 $self->{column}++;
499 $self->{nc}
500 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
501 } else {
502 $self->{set_nc}->($self);
503 }
504
505 redo A;
506 } else {
507
508 $self->{s_kwd} = '';
509 #
510 }
511 } elsif ($self->{nc} == 0x003E) { # >
512 if ($self->{escape} and
513 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
514 if ($self->{s_kwd} eq '--') {
515
516 delete $self->{escape};
517 #
518 } else {
519
520 #
521 }
522 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
523
524 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
525 line => $self->{line_prev},
526 column => $self->{column_prev} - 1);
527 #
528 } else {
529
530 #
531 }
532
533 $self->{s_kwd} = '';
534 #
535 } elsif ($self->{nc} == 0x005D) { # ]
536 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
537
538 $self->{s_kwd} .= ']';
539 } elsif ($self->{s_kwd} eq ']]') {
540
541 #
542 } else {
543
544 $self->{s_kwd} = '';
545 }
546 #
547 } elsif ($self->{nc} == -1) {
548
549 $self->{s_kwd} = '';
550 return ({type => END_OF_FILE_TOKEN,
551 line => $self->{line}, column => $self->{column}});
552 last A; ## TODO: ok?
553 } else {
554
555 $self->{s_kwd} = '';
556 #
557 }
558
559 # Anything else
560 my $token = {type => CHARACTER_TOKEN,
561 data => chr $self->{nc},
562 line => $self->{line}, column => $self->{column},
563 };
564 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
565 length $token->{data})) {
566 $self->{s_kwd} = '';
567 }
568
569 ## Stay in the data state.
570 if (not $self->{is_xml} and
571 $self->{content_model} == PCDATA_CONTENT_MODEL) {
572
573 $self->{state} = PCDATA_STATE;
574 } else {
575
576 ## Stay in the state.
577 }
578
579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
580 $self->{line_prev} = $self->{line};
581 $self->{column_prev} = $self->{column};
582 $self->{column}++;
583 $self->{nc}
584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
585 } else {
586 $self->{set_nc}->($self);
587 }
588
589 return ($token);
590 redo A;
591 } elsif ($self->{state} == TAG_OPEN_STATE) {
592 ## XML5: "tag state".
593
594 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
595 if ($self->{nc} == 0x002F) { # /
596
597
598 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
599 $self->{line_prev} = $self->{line};
600 $self->{column_prev} = $self->{column};
601 $self->{column}++;
602 $self->{nc}
603 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
604 } else {
605 $self->{set_nc}->($self);
606 }
607
608 $self->{state} = CLOSE_TAG_OPEN_STATE;
609 redo A;
610 } elsif ($self->{nc} == 0x0021) { # !
611
612 $self->{s_kwd} = $self->{escaped} ? '' : '<';
613 #
614 } else {
615
616 $self->{s_kwd} = '';
617 #
618 }
619
620 ## reconsume
621 $self->{state} = DATA_STATE;
622 return ({type => CHARACTER_TOKEN, data => '<',
623 line => $self->{line_prev},
624 column => $self->{column_prev},
625 });
626 redo A;
627 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
628 if ($self->{nc} == 0x0021) { # !
629
630 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
631
632 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
633 $self->{line_prev} = $self->{line};
634 $self->{column_prev} = $self->{column};
635 $self->{column}++;
636 $self->{nc}
637 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
638 } else {
639 $self->{set_nc}->($self);
640 }
641
642 redo A;
643 } elsif ($self->{nc} == 0x002F) { # /
644
645 $self->{state} = CLOSE_TAG_OPEN_STATE;
646
647 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
648 $self->{line_prev} = $self->{line};
649 $self->{column_prev} = $self->{column};
650 $self->{column}++;
651 $self->{nc}
652 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
653 } else {
654 $self->{set_nc}->($self);
655 }
656
657 redo A;
658 } elsif (0x0041 <= $self->{nc} and
659 $self->{nc} <= 0x005A) { # A..Z
660
661 $self->{ct}
662 = {type => START_TAG_TOKEN,
663 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
664 line => $self->{line_prev},
665 column => $self->{column_prev}};
666 $self->{state} = TAG_NAME_STATE;
667
668 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
669 $self->{line_prev} = $self->{line};
670 $self->{column_prev} = $self->{column};
671 $self->{column}++;
672 $self->{nc}
673 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
674 } else {
675 $self->{set_nc}->($self);
676 }
677
678 redo A;
679 } elsif (0x0061 <= $self->{nc} and
680 $self->{nc} <= 0x007A) { # a..z
681
682 $self->{ct} = {type => START_TAG_TOKEN,
683 tag_name => chr ($self->{nc}),
684 line => $self->{line_prev},
685 column => $self->{column_prev}};
686 $self->{state} = TAG_NAME_STATE;
687
688 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
689 $self->{line_prev} = $self->{line};
690 $self->{column_prev} = $self->{column};
691 $self->{column}++;
692 $self->{nc}
693 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
694 } else {
695 $self->{set_nc}->($self);
696 }
697
698 redo A;
699 } elsif ($self->{nc} == 0x003E) { # >
700
701 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
702 line => $self->{line_prev},
703 column => $self->{column_prev});
704 $self->{state} = DATA_STATE;
705 $self->{s_kwd} = '';
706
707 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
708 $self->{line_prev} = $self->{line};
709 $self->{column_prev} = $self->{column};
710 $self->{column}++;
711 $self->{nc}
712 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
713 } else {
714 $self->{set_nc}->($self);
715 }
716
717
718 return ({type => CHARACTER_TOKEN, data => '<>',
719 line => $self->{line_prev},
720 column => $self->{column_prev},
721 });
722
723 redo A;
724 } elsif ($self->{nc} == 0x003F) { # ?
725 if ($self->{is_xml}) {
726
727 $self->{state} = PI_STATE;
728
729 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
730 $self->{line_prev} = $self->{line};
731 $self->{column_prev} = $self->{column};
732 $self->{column}++;
733 $self->{nc}
734 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
735 } else {
736 $self->{set_nc}->($self);
737 }
738
739 redo A;
740 } else {
741
742 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
743 line => $self->{line_prev},
744 column => $self->{column_prev});
745 $self->{state} = BOGUS_COMMENT_STATE;
746 $self->{ct} = {type => COMMENT_TOKEN, data => '',
747 line => $self->{line_prev},
748 column => $self->{column_prev},
749 };
750 ## $self->{nc} is intentionally left as is
751 redo A;
752 }
753 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
754
755 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
756 line => $self->{line_prev},
757 column => $self->{column_prev});
758 $self->{state} = DATA_STATE;
759 $self->{s_kwd} = '';
760 ## reconsume
761
762 return ({type => CHARACTER_TOKEN, data => '<',
763 line => $self->{line_prev},
764 column => $self->{column_prev},
765 });
766
767 redo A;
768 } else {
769 ## XML5: "<:" is a parse error.
770
771 $self->{ct} = {type => START_TAG_TOKEN,
772 tag_name => chr ($self->{nc}),
773 line => $self->{line_prev},
774 column => $self->{column_prev}};
775 $self->{state} = TAG_NAME_STATE;
776
777 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
778 $self->{line_prev} = $self->{line};
779 $self->{column_prev} = $self->{column};
780 $self->{column}++;
781 $self->{nc}
782 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
783 } else {
784 $self->{set_nc}->($self);
785 }
786
787 redo A;
788 }
789 } else {
790 die "$0: $self->{content_model} in tag open";
791 }
792 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
793 ## NOTE: The "close tag open state" in the spec is implemented as
794 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
795
796 ## XML5: "end tag state".
797
798 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
799 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
800 if (defined $self->{last_stag_name}) {
801 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
802 $self->{kwd} = '';
803 ## Reconsume.
804 redo A;
805 } else {
806 ## No start tag token has ever been emitted
807 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
808
809 $self->{state} = DATA_STATE;
810 $self->{s_kwd} = '';
811 ## Reconsume.
812 return ({type => CHARACTER_TOKEN, data => '</',
813 line => $l, column => $c,
814 });
815 redo A;
816 }
817 }
818
819 if (0x0041 <= $self->{nc} and
820 $self->{nc} <= 0x005A) { # A..Z
821
822 $self->{ct}
823 = {type => END_TAG_TOKEN,
824 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
825 line => $l, column => $c};
826 $self->{state} = TAG_NAME_STATE;
827
828 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
829 $self->{line_prev} = $self->{line};
830 $self->{column_prev} = $self->{column};
831 $self->{column}++;
832 $self->{nc}
833 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
834 } else {
835 $self->{set_nc}->($self);
836 }
837
838 redo A;
839 } elsif (0x0061 <= $self->{nc} and
840 $self->{nc} <= 0x007A) { # a..z
841
842 $self->{ct} = {type => END_TAG_TOKEN,
843 tag_name => chr ($self->{nc}),
844 line => $l, column => $c};
845 $self->{state} = TAG_NAME_STATE;
846
847 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
848 $self->{line_prev} = $self->{line};
849 $self->{column_prev} = $self->{column};
850 $self->{column}++;
851 $self->{nc}
852 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
853 } else {
854 $self->{set_nc}->($self);
855 }
856
857 redo A;
858 } elsif ($self->{nc} == 0x003E) { # >
859 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
860 line => $self->{line_prev}, ## "<" in "</>"
861 column => $self->{column_prev} - 1);
862 $self->{state} = DATA_STATE;
863 $self->{s_kwd} = '';
864 if ($self->{is_xml}) {
865
866 ## XML5: No parse error.
867
868 ## NOTE: This parser raises a parse error, since it supports
869 ## XML1, not XML5.
870
871 ## NOTE: A short end tag token.
872 my $ct = {type => END_TAG_TOKEN,
873 tag_name => '',
874 line => $self->{line_prev},
875 column => $self->{column_prev} - 1,
876 };
877
878 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
879 $self->{line_prev} = $self->{line};
880 $self->{column_prev} = $self->{column};
881 $self->{column}++;
882 $self->{nc}
883 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
884 } else {
885 $self->{set_nc}->($self);
886 }
887
888 return ($ct);
889 } else {
890
891
892 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
893 $self->{line_prev} = $self->{line};
894 $self->{column_prev} = $self->{column};
895 $self->{column}++;
896 $self->{nc}
897 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
898 } else {
899 $self->{set_nc}->($self);
900 }
901
902 }
903 redo A;
904 } elsif ($self->{nc} == -1) {
905
906 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
907 $self->{s_kwd} = '';
908 $self->{state} = DATA_STATE;
909 # reconsume
910
911 return ({type => CHARACTER_TOKEN, data => '</',
912 line => $l, column => $c,
913 });
914
915 redo A;
916 } elsif (not $self->{is_xml} or
917 $is_space->{$self->{nc}}) {
918
919 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
920 line => $self->{line_prev}, # "<" of "</"
921 column => $self->{column_prev} - 1);
922 $self->{state} = BOGUS_COMMENT_STATE;
923 $self->{ct} = {type => COMMENT_TOKEN, data => '',
924 line => $self->{line_prev}, # "<" of "</"
925 column => $self->{column_prev} - 1,
926 };
927 ## NOTE: $self->{nc} is intentionally left as is.
928 ## Although the "anything else" case of the spec not explicitly
929 ## states that the next input character is to be reconsumed,
930 ## it will be included to the |data| of the comment token
931 ## generated from the bogus end tag, as defined in the
932 ## "bogus comment state" entry.
933 redo A;
934 } else {
935 ## XML5: "</:" is a parse error.
936
937 $self->{ct} = {type => END_TAG_TOKEN,
938 tag_name => chr ($self->{nc}),
939 line => $l, column => $c};
940 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
941
942 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
943 $self->{line_prev} = $self->{line};
944 $self->{column_prev} = $self->{column};
945 $self->{column}++;
946 $self->{nc}
947 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
948 } else {
949 $self->{set_nc}->($self);
950 }
951
952 redo A;
953 }
954 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
955 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
956 if (length $ch) {
957 my $CH = $ch;
958 $ch =~ tr/a-z/A-Z/;
959 my $nch = chr $self->{nc};
960 if ($nch eq $ch or $nch eq $CH) {
961
962 ## Stay in the state.
963 $self->{kwd} .= $nch;
964
965 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
966 $self->{line_prev} = $self->{line};
967 $self->{column_prev} = $self->{column};
968 $self->{column}++;
969 $self->{nc}
970 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
971 } else {
972 $self->{set_nc}->($self);
973 }
974
975 redo A;
976 } else {
977
978 $self->{state} = DATA_STATE;
979 $self->{s_kwd} = '';
980 ## Reconsume.
981 return ({type => CHARACTER_TOKEN,
982 data => '</' . $self->{kwd},
983 line => $self->{line_prev},
984 column => $self->{column_prev} - 1 - length $self->{kwd},
985 });
986 redo A;
987 }
988 } else { # after "<{tag-name}"
989 unless ($is_space->{$self->{nc}} or
990 {
991 0x003E => 1, # >
992 0x002F => 1, # /
993 -1 => 1, # EOF
994 }->{$self->{nc}}) {
995
996 ## Reconsume.
997 $self->{state} = DATA_STATE;
998 $self->{s_kwd} = '';
999 return ({type => CHARACTER_TOKEN,
1000 data => '</' . $self->{kwd},
1001 line => $self->{line_prev},
1002 column => $self->{column_prev} - 1 - length $self->{kwd},
1003 });
1004 redo A;
1005 } else {
1006
1007 $self->{ct}
1008 = {type => END_TAG_TOKEN,
1009 tag_name => $self->{last_stag_name},
1010 line => $self->{line_prev},
1011 column => $self->{column_prev} - 1 - length $self->{kwd}};
1012 $self->{state} = TAG_NAME_STATE;
1013 ## Reconsume.
1014 redo A;
1015 }
1016 }
1017 } elsif ($self->{state} == TAG_NAME_STATE) {
1018 if ($is_space->{$self->{nc}}) {
1019
1020 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1021
1022 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1023 $self->{line_prev} = $self->{line};
1024 $self->{column_prev} = $self->{column};
1025 $self->{column}++;
1026 $self->{nc}
1027 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1028 } else {
1029 $self->{set_nc}->($self);
1030 }
1031
1032 redo A;
1033 } elsif ($self->{nc} == 0x003E) { # >
1034 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1035
1036 $self->{last_stag_name} = $self->{ct}->{tag_name};
1037 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1038 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1039 #if ($self->{ct}->{attributes}) {
1040 # ## NOTE: This should never be reached.
1041 # !!! cp (36);
1042 # !!! parse-error (type => 'end tag attribute');
1043 #} else {
1044
1045 #}
1046 } else {
1047 die "$0: $self->{ct}->{type}: Unknown token type";
1048 }
1049 $self->{state} = DATA_STATE;
1050 $self->{s_kwd} = '';
1051
1052 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1053 $self->{line_prev} = $self->{line};
1054 $self->{column_prev} = $self->{column};
1055 $self->{column}++;
1056 $self->{nc}
1057 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1058 } else {
1059 $self->{set_nc}->($self);
1060 }
1061
1062
1063 return ($self->{ct}); # start tag or end tag
1064
1065 redo A;
1066 } elsif (0x0041 <= $self->{nc} and
1067 $self->{nc} <= 0x005A) { # A..Z
1068
1069 $self->{ct}->{tag_name}
1070 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1071 # start tag or end tag
1072 ## Stay in this state
1073
1074 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1075 $self->{line_prev} = $self->{line};
1076 $self->{column_prev} = $self->{column};
1077 $self->{column}++;
1078 $self->{nc}
1079 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1080 } else {
1081 $self->{set_nc}->($self);
1082 }
1083
1084 redo A;
1085 } elsif ($self->{nc} == -1) {
1086 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1087 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1088
1089 $self->{last_stag_name} = $self->{ct}->{tag_name};
1090 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1091 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1092 #if ($self->{ct}->{attributes}) {
1093 # ## NOTE: This state should never be reached.
1094 # !!! cp (40);
1095 # !!! parse-error (type => 'end tag attribute');
1096 #} else {
1097
1098 #}
1099 } else {
1100 die "$0: $self->{ct}->{type}: Unknown token type";
1101 }
1102 $self->{state} = DATA_STATE;
1103 $self->{s_kwd} = '';
1104 # reconsume
1105
1106 ## Discard the token.
1107 #return ($self->{ct}); # start tag or end tag
1108
1109 redo A;
1110 } elsif ($self->{nc} == 0x002F) { # /
1111
1112 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1113
1114 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1115 $self->{line_prev} = $self->{line};
1116 $self->{column_prev} = $self->{column};
1117 $self->{column}++;
1118 $self->{nc}
1119 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1120 } else {
1121 $self->{set_nc}->($self);
1122 }
1123
1124 redo A;
1125 } else {
1126
1127 $self->{ct}->{tag_name} .= chr $self->{nc};
1128 # start tag or end tag
1129 ## Stay in the state
1130
1131 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1132 $self->{line_prev} = $self->{line};
1133 $self->{column_prev} = $self->{column};
1134 $self->{column}++;
1135 $self->{nc}
1136 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1137 } else {
1138 $self->{set_nc}->($self);
1139 }
1140
1141 redo A;
1142 }
1143 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1144 ## XML5: "Tag attribute name before state".
1145
1146 if ($is_space->{$self->{nc}}) {
1147
1148 ## Stay in the state
1149
1150 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1151 $self->{line_prev} = $self->{line};
1152 $self->{column_prev} = $self->{column};
1153 $self->{column}++;
1154 $self->{nc}
1155 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1156 } else {
1157 $self->{set_nc}->($self);
1158 }
1159
1160 redo A;
1161 } elsif ($self->{nc} == 0x003E) { # >
1162 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1163
1164 $self->{last_stag_name} = $self->{ct}->{tag_name};
1165 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1166 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1167 if ($self->{ct}->{attributes}) {
1168
1169 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1170 } else {
1171
1172 }
1173 } else {
1174 die "$0: $self->{ct}->{type}: Unknown token type";
1175 }
1176 $self->{state} = DATA_STATE;
1177 $self->{s_kwd} = '';
1178
1179 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1180 $self->{line_prev} = $self->{line};
1181 $self->{column_prev} = $self->{column};
1182 $self->{column}++;
1183 $self->{nc}
1184 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1185 } else {
1186 $self->{set_nc}->($self);
1187 }
1188
1189
1190 return ($self->{ct}); # start tag or end tag
1191
1192 redo A;
1193 } elsif (0x0041 <= $self->{nc} and
1194 $self->{nc} <= 0x005A) { # A..Z
1195
1196 $self->{ca}
1197 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1198 value => '',
1199 line => $self->{line}, column => $self->{column}};
1200 $self->{state} = ATTRIBUTE_NAME_STATE;
1201
1202 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1203 $self->{line_prev} = $self->{line};
1204 $self->{column_prev} = $self->{column};
1205 $self->{column}++;
1206 $self->{nc}
1207 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1208 } else {
1209 $self->{set_nc}->($self);
1210 }
1211
1212 redo A;
1213 } elsif ($self->{nc} == 0x002F) { # /
1214
1215 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1216
1217 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1218 $self->{line_prev} = $self->{line};
1219 $self->{column_prev} = $self->{column};
1220 $self->{column}++;
1221 $self->{nc}
1222 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1223 } else {
1224 $self->{set_nc}->($self);
1225 }
1226
1227 redo A;
1228 } elsif ($self->{nc} == -1) {
1229 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1230 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1231
1232 $self->{last_stag_name} = $self->{ct}->{tag_name};
1233 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1234 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1235 if ($self->{ct}->{attributes}) {
1236
1237 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1238 } else {
1239
1240 }
1241 } else {
1242 die "$0: $self->{ct}->{type}: Unknown token type";
1243 }
1244 $self->{state} = DATA_STATE;
1245 $self->{s_kwd} = '';
1246 # reconsume
1247
1248 ## Discard the token.
1249 #return ($self->{ct}); # start tag or end tag
1250
1251 redo A;
1252 } else {
1253 if ({
1254 0x0022 => 1, # "
1255 0x0027 => 1, # '
1256 0x003C => 1, # <
1257 0x003D => 1, # =
1258 }->{$self->{nc}}) {
1259
1260 ## XML5: Not a parse error.
1261 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1262 } else {
1263
1264 ## XML5: ":" raises a parse error and is ignored.
1265 }
1266 $self->{ca}
1267 = {name => chr ($self->{nc}),
1268 value => '',
1269 line => $self->{line}, column => $self->{column}};
1270 $self->{state} = ATTRIBUTE_NAME_STATE;
1271
1272 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1273 $self->{line_prev} = $self->{line};
1274 $self->{column_prev} = $self->{column};
1275 $self->{column}++;
1276 $self->{nc}
1277 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1278 } else {
1279 $self->{set_nc}->($self);
1280 }
1281
1282 redo A;
1283 }
1284 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1285 ## XML5: "Tag attribute name state".
1286
## Called whenever the tokenizer leaves the attribute name state:
## registers the attribute being built ($self->{ca}) on the current
## start tag or end tag token ($self->{ct}), unless an attribute with
## the same name is already present on that token.
my $before_leave = sub {
  my $name = $self->{ca}->{name};

  unless (exists $self->{ct}->{attributes}->{$name}) {
    ## First occurrence of the name: keep the attribute and record
    ## its position among the token's attributes.
    $self->{ct}->{attributes}->{$name} = $self->{ca};
    $self->{ca}->{index} = ++$self->{ct}->{last_index};
    return;
  }

  ## Duplicate name: report it at the position where the duplicate
  ## attribute started.  $self->{ca} is not attached to the token,
  ## i.e. it is discarded.  # MUST
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $name, line => $self->{ca}->{line}, column => $self->{ca}->{column});
}; # $before_leave
1300
1301 if ($is_space->{$self->{nc}}) {
1302
1303 $before_leave->();
1304 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1305
1306 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1307 $self->{line_prev} = $self->{line};
1308 $self->{column_prev} = $self->{column};
1309 $self->{column}++;
1310 $self->{nc}
1311 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1312 } else {
1313 $self->{set_nc}->($self);
1314 }
1315
1316 redo A;
1317 } elsif ($self->{nc} == 0x003D) { # =
1318
1319 $before_leave->();
1320 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1321
1322 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1323 $self->{line_prev} = $self->{line};
1324 $self->{column_prev} = $self->{column};
1325 $self->{column}++;
1326 $self->{nc}
1327 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1328 } else {
1329 $self->{set_nc}->($self);
1330 }
1331
1332 redo A;
1333 } elsif ($self->{nc} == 0x003E) { # >
1334 if ($self->{is_xml}) {
1335
1336 ## XML5: Not a parse error.
1337 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1338 } else {
1339
1340 }
1341
1342 $before_leave->();
1343 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1344
1345 $self->{last_stag_name} = $self->{ct}->{tag_name};
1346 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1347
1348 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1349 if ($self->{ct}->{attributes}) {
1350 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1351 }
1352 } else {
1353 die "$0: $self->{ct}->{type}: Unknown token type";
1354 }
1355 $self->{state} = DATA_STATE;
1356 $self->{s_kwd} = '';
1357
1358 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1359 $self->{line_prev} = $self->{line};
1360 $self->{column_prev} = $self->{column};
1361 $self->{column}++;
1362 $self->{nc}
1363 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1364 } else {
1365 $self->{set_nc}->($self);
1366 }
1367
1368
1369 return ($self->{ct}); # start tag or end tag
1370
1371 redo A;
1372 } elsif (0x0041 <= $self->{nc} and
1373 $self->{nc} <= 0x005A) { # A..Z
1374
1375 $self->{ca}->{name}
1376 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1377 ## Stay in the state
1378
1379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1380 $self->{line_prev} = $self->{line};
1381 $self->{column_prev} = $self->{column};
1382 $self->{column}++;
1383 $self->{nc}
1384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1385 } else {
1386 $self->{set_nc}->($self);
1387 }
1388
1389 redo A;
1390 } elsif ($self->{nc} == 0x002F) { # /
1391 if ($self->{is_xml}) {
1392
1393 ## XML5: Not a parse error.
1394 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1395 } else {
1396
1397 }
1398
1399 $before_leave->();
1400 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1401
1402 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1403 $self->{line_prev} = $self->{line};
1404 $self->{column_prev} = $self->{column};
1405 $self->{column}++;
1406 $self->{nc}
1407 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1408 } else {
1409 $self->{set_nc}->($self);
1410 }
1411
1412 redo A;
1413 } elsif ($self->{nc} == -1) {
1414 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1415 $before_leave->();
1416 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1417
1418 $self->{last_stag_name} = $self->{ct}->{tag_name};
1419 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1420 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1421 if ($self->{ct}->{attributes}) {
1422
1423 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1424 } else {
1425 ## NOTE: This state should never be reached.
1426
1427 }
1428 } else {
1429 die "$0: $self->{ct}->{type}: Unknown token type";
1430 }
1431 $self->{state} = DATA_STATE;
1432 $self->{s_kwd} = '';
1433 # reconsume
1434
1435 ## Discard the token.
1436 #return ($self->{ct}); # start tag or end tag
1437
1438 redo A;
1439 } else {
1440 if ({
1441 0x0022 => 1, # "
1442 0x0027 => 1, # '
1443 0x003C => 1, # <
1444 }->{$self->{nc}}) {
1445
1446 ## XML5: Not a parse error.
1447 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1448 } else {
1449
1450 }
1451 $self->{ca}->{name} .= chr ($self->{nc});
1452 ## Stay in the state
1453
1454 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1455 $self->{line_prev} = $self->{line};
1456 $self->{column_prev} = $self->{column};
1457 $self->{column}++;
1458 $self->{nc}
1459 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1460 } else {
1461 $self->{set_nc}->($self);
1462 }
1463
1464 redo A;
1465 }
1466 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1467 ## XML5: "Tag attribute name after state".
1468
1469 if ($is_space->{$self->{nc}}) {
1470
1471 ## Stay in the state
1472
1473 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1474 $self->{line_prev} = $self->{line};
1475 $self->{column_prev} = $self->{column};
1476 $self->{column}++;
1477 $self->{nc}
1478 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1479 } else {
1480 $self->{set_nc}->($self);
1481 }
1482
1483 redo A;
1484 } elsif ($self->{nc} == 0x003D) { # =
1485
1486 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1487
1488 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1489 $self->{line_prev} = $self->{line};
1490 $self->{column_prev} = $self->{column};
1491 $self->{column}++;
1492 $self->{nc}
1493 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1494 } else {
1495 $self->{set_nc}->($self);
1496 }
1497
1498 redo A;
1499 } elsif ($self->{nc} == 0x003E) { # >
1500 if ($self->{is_xml}) {
1501
1502 ## XML5: Not a parse error.
1503 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1504 } else {
1505
1506 }
1507
1508 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1509
1510 $self->{last_stag_name} = $self->{ct}->{tag_name};
1511 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1512 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1513 if ($self->{ct}->{attributes}) {
1514
1515 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1516 } else {
1517 ## NOTE: This state should never be reached.
1518
1519 }
1520 } else {
1521 die "$0: $self->{ct}->{type}: Unknown token type";
1522 }
1523 $self->{state} = DATA_STATE;
1524 $self->{s_kwd} = '';
1525
1526 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1527 $self->{line_prev} = $self->{line};
1528 $self->{column_prev} = $self->{column};
1529 $self->{column}++;
1530 $self->{nc}
1531 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1532 } else {
1533 $self->{set_nc}->($self);
1534 }
1535
1536
1537 return ($self->{ct}); # start tag or end tag
1538
1539 redo A;
1540 } elsif (0x0041 <= $self->{nc} and
1541 $self->{nc} <= 0x005A) { # A..Z
1542
1543 $self->{ca}
1544 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1545 value => '',
1546 line => $self->{line}, column => $self->{column}};
1547 $self->{state} = ATTRIBUTE_NAME_STATE;
1548
1549 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1550 $self->{line_prev} = $self->{line};
1551 $self->{column_prev} = $self->{column};
1552 $self->{column}++;
1553 $self->{nc}
1554 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1555 } else {
1556 $self->{set_nc}->($self);
1557 }
1558
1559 redo A;
1560 } elsif ($self->{nc} == 0x002F) { # /
1561 if ($self->{is_xml}) {
1562
1563 ## XML5: Not a parse error.
1564 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1565 } else {
1566
1567 }
1568
1569 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1570
1571 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1572 $self->{line_prev} = $self->{line};
1573 $self->{column_prev} = $self->{column};
1574 $self->{column}++;
1575 $self->{nc}
1576 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1577 } else {
1578 $self->{set_nc}->($self);
1579 }
1580
1581 redo A;
1582 } elsif ($self->{nc} == -1) {
1583 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1584 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1585
1586 $self->{last_stag_name} = $self->{ct}->{tag_name};
1587 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1588 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1589 if ($self->{ct}->{attributes}) {
1590
1591 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1592 } else {
1593 ## NOTE: This state should never be reached.
1594
1595 }
1596 } else {
1597 die "$0: $self->{ct}->{type}: Unknown token type";
1598 }
1599 $self->{s_kwd} = '';
1600 $self->{state} = DATA_STATE;
1601 # reconsume
1602
1603 ## Discard the token.
1604 #return ($self->{ct}); # start tag or end tag
1605
1606 redo A;
1607 } else {
1608 if ($self->{is_xml}) {
1609
1610 ## XML5: Not a parse error.
1611 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1612 } else {
1613
1614 }
1615
1616 if ({
1617 0x0022 => 1, # "
1618 0x0027 => 1, # '
1619 0x003C => 1, # <
1620 }->{$self->{nc}}) {
1621
1622 ## XML5: Not a parse error.
1623 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1624 } else {
1625
1626 }
1627 $self->{ca}
1628 = {name => chr ($self->{nc}),
1629 value => '',
1630 line => $self->{line}, column => $self->{column}};
1631 $self->{state} = ATTRIBUTE_NAME_STATE;
1632
1633 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1634 $self->{line_prev} = $self->{line};
1635 $self->{column_prev} = $self->{column};
1636 $self->{column}++;
1637 $self->{nc}
1638 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1639 } else {
1640 $self->{set_nc}->($self);
1641 }
1642
1643 redo A;
1644 }
1645 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1646 ## XML5: "Tag attribute value before state".
1647
1648 if ($is_space->{$self->{nc}}) {
1649
1650 ## Stay in the state
1651
1652 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1653 $self->{line_prev} = $self->{line};
1654 $self->{column_prev} = $self->{column};
1655 $self->{column}++;
1656 $self->{nc}
1657 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1658 } else {
1659 $self->{set_nc}->($self);
1660 }
1661
1662 redo A;
1663 } elsif ($self->{nc} == 0x0022) { # "
1664
1665 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1666
1667 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1668 $self->{line_prev} = $self->{line};
1669 $self->{column_prev} = $self->{column};
1670 $self->{column}++;
1671 $self->{nc}
1672 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1673 } else {
1674 $self->{set_nc}->($self);
1675 }
1676
1677 redo A;
1678 } elsif ($self->{nc} == 0x0026) { # &
1679
1680 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1681 ## reconsume
1682 redo A;
1683 } elsif ($self->{nc} == 0x0027) { # '
1684
1685 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1686
1687 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1688 $self->{line_prev} = $self->{line};
1689 $self->{column_prev} = $self->{column};
1690 $self->{column}++;
1691 $self->{nc}
1692 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1693 } else {
1694 $self->{set_nc}->($self);
1695 }
1696
1697 redo A;
1698 } elsif ($self->{nc} == 0x003E) { # >
1699 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1700 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1701
1702 $self->{last_stag_name} = $self->{ct}->{tag_name};
1703 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1704 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1705 if ($self->{ct}->{attributes}) {
1706
1707 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1708 } else {
1709 ## NOTE: This state should never be reached.
1710
1711 }
1712 } else {
1713 die "$0: $self->{ct}->{type}: Unknown token type";
1714 }
1715 $self->{state} = DATA_STATE;
1716 $self->{s_kwd} = '';
1717
1718 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1719 $self->{line_prev} = $self->{line};
1720 $self->{column_prev} = $self->{column};
1721 $self->{column}++;
1722 $self->{nc}
1723 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1724 } else {
1725 $self->{set_nc}->($self);
1726 }
1727
1728
1729 return ($self->{ct}); # start tag or end tag
1730
1731 redo A;
1732 } elsif ($self->{nc} == -1) {
1733 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1734 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1735
1736 $self->{last_stag_name} = $self->{ct}->{tag_name};
1737 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1738 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1739 if ($self->{ct}->{attributes}) {
1740
1741 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1742 } else {
1743 ## NOTE: This state should never be reached.
1744
1745 }
1746 } else {
1747 die "$0: $self->{ct}->{type}: Unknown token type";
1748 }
1749 $self->{state} = DATA_STATE;
1750 $self->{s_kwd} = '';
1751 ## reconsume
1752
1753 ## Discard the token.
1754 #return ($self->{ct}); # start tag or end tag
1755
1756 redo A;
1757 } else {
1758 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1759
1760 ## XML5: Not a parse error.
1761 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1762 } elsif ($self->{is_xml}) {
1763
1764 ## XML5: No parse error.
1765 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1766 } else {
1767
1768 }
1769 $self->{ca}->{value} .= chr ($self->{nc});
1770 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1771
1772 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1773 $self->{line_prev} = $self->{line};
1774 $self->{column_prev} = $self->{column};
1775 $self->{column}++;
1776 $self->{nc}
1777 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1778 } else {
1779 $self->{set_nc}->($self);
1780 }
1781
1782 redo A;
1783 }
1784 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1785 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1786 ## ATTLIST attribute value double quoted state".
1787
1788 if ($self->{nc} == 0x0022) { # "
1789 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1790
1791 ## XML5: "DOCTYPE ATTLIST name after state".
1792 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1793 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1794 } else {
1795
1796 ## XML5: "Tag attribute name before state".
1797 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1798 }
1799
1800 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1801 $self->{line_prev} = $self->{line};
1802 $self->{column_prev} = $self->{column};
1803 $self->{column}++;
1804 $self->{nc}
1805 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1806 } else {
1807 $self->{set_nc}->($self);
1808 }
1809
1810 redo A;
1811 } elsif ($self->{nc} == 0x0026) { # &
1812
1813 ## XML5: Not defined yet.
1814
1815 ## NOTE: In the spec, the tokenizer is switched to the
1816 ## "entity in attribute value state". In this implementation, the
1817 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1818 ## implementation of the "consume a character reference" algorithm.
1819 $self->{prev_state} = $self->{state};
1820 $self->{entity_add} = 0x0022; # "
1821 $self->{state} = ENTITY_STATE;
1822
1823 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1824 $self->{line_prev} = $self->{line};
1825 $self->{column_prev} = $self->{column};
1826 $self->{column}++;
1827 $self->{nc}
1828 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1829 } else {
1830 $self->{set_nc}->($self);
1831 }
1832
1833 redo A;
1834 } elsif ($self->{is_xml} and
1835 $is_space->{$self->{nc}}) {
1836
1837 $self->{ca}->{value} .= ' ';
1838 ## Stay in the state.
1839
1840 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1841 $self->{line_prev} = $self->{line};
1842 $self->{column_prev} = $self->{column};
1843 $self->{column}++;
1844 $self->{nc}
1845 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1846 } else {
1847 $self->{set_nc}->($self);
1848 }
1849
1850 redo A;
1851 } elsif ($self->{nc} == -1) {
1852 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
## EOF inside a double-quoted attribute value.  The 'unclosed
## attribute value' parse error has already been reported just above.
## What remains depends on the kind of token under construction; in
## every case the unfinished token is discarded and the EOF is
## reconsumed in the next state.
if ($self->{ct}->{type} == START_TAG_TOKEN) {
  $self->{last_stag_name} = $self->{ct}->{tag_name};

  $self->{state} = DATA_STATE;
  $self->{s_kwd} = '';
  ## reconsume

  ## Discard the token.  The unclosed start tag is dropped here, in
  ## the same way as in the EOF branch of the single-quoted attribute
  ## value state and the other tag states.
  #return ($self->{ct}); # start tag

  redo A;
} elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
  $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
  if ($self->{ct}->{attributes}) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
  } else {
    ## NOTE: This state should never be reached.
  }

  $self->{state} = DATA_STATE;
  $self->{s_kwd} = '';
  ## reconsume

  ## Discard the token.
  #return ($self->{ct}); # end tag

  redo A;
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
  ## XML5: No parse error above; not defined yet.
  ## The attribute definition read so far is still recorded on the
  ## ATTLIST token before the token itself is dropped.
  push @{$self->{ct}->{attrdefs}}, $self->{ca};
  $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
  ## Reconsume.

  ## Discard the token.
  #return ($self->{ct}); # ATTLIST

  redo A;
} else {
  die "$0: $self->{ct}->{type}: Unknown token type";
}
1893 } else {
1894 ## XML5 [ATTLIST]: Not defined yet.
1895 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1896
1897 ## XML5: Not a parse error.
1898 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1899 } else {
1900
1901 }
1902 $self->{ca}->{value} .= chr ($self->{nc});
1903 $self->{read_until}->($self->{ca}->{value},
1904 qq["&<\x09\x0C\x20],
1905 length $self->{ca}->{value});
1906
1907 ## Stay in the state
1908
1909 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1910 $self->{line_prev} = $self->{line};
1911 $self->{column_prev} = $self->{column};
1912 $self->{column}++;
1913 $self->{nc}
1914 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1915 } else {
1916 $self->{set_nc}->($self);
1917 }
1918
1919 redo A;
1920 }
1921 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1922 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1923 ## ATTLIST attribute value single quoted state".
1924
1925 if ($self->{nc} == 0x0027) { # '
1926 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1927
1928 ## XML5: "DOCTYPE ATTLIST name after state".
1929 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1930 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1931 } else {
1932
1933 ## XML5: "Before attribute name state" (sic).
1934 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1935 }
1936
1937 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1938 $self->{line_prev} = $self->{line};
1939 $self->{column_prev} = $self->{column};
1940 $self->{column}++;
1941 $self->{nc}
1942 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1943 } else {
1944 $self->{set_nc}->($self);
1945 }
1946
1947 redo A;
1948 } elsif ($self->{nc} == 0x0026) { # &
1949
1950 ## XML5: Not defined yet.
1951
1952 ## NOTE: In the spec, the tokenizer is switched to the
1953 ## "entity in attribute value state". In this implementation, the
1954 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1955 ## implementation of the "consume a character reference" algorithm.
1956 $self->{entity_add} = 0x0027; # '
1957 $self->{prev_state} = $self->{state};
1958 $self->{state} = ENTITY_STATE;
1959
1960 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1961 $self->{line_prev} = $self->{line};
1962 $self->{column_prev} = $self->{column};
1963 $self->{column}++;
1964 $self->{nc}
1965 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1966 } else {
1967 $self->{set_nc}->($self);
1968 }
1969
1970 redo A;
1971 } elsif ($self->{is_xml} and
1972 $is_space->{$self->{nc}}) {
1973
1974 $self->{ca}->{value} .= ' ';
1975 ## Stay in the state.
1976
1977 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1978 $self->{line_prev} = $self->{line};
1979 $self->{column_prev} = $self->{column};
1980 $self->{column}++;
1981 $self->{nc}
1982 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1983 } else {
1984 $self->{set_nc}->($self);
1985 }
1986
1987 redo A;
1988 } elsif ($self->{nc} == -1) {
1989 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1990 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1991
1992 $self->{last_stag_name} = $self->{ct}->{tag_name};
1993
1994 $self->{state} = DATA_STATE;
1995 $self->{s_kwd} = '';
1996 ## reconsume
1997
1998 ## Discard the token.
1999 #return ($self->{ct}); # start tag
2000
2001 redo A;
2002 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2003 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2004 if ($self->{ct}->{attributes}) {
2005
2006 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2007 } else {
2008 ## NOTE: This state should never be reached.
2009
2010 }
2011
2012 $self->{state} = DATA_STATE;
2013 $self->{s_kwd} = '';
2014 ## reconsume
2015
2016 ## Discard the token.
2017 #return ($self->{ct}); # end tag
2018
2019 redo A;
2020 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2021 ## XML5: No parse error above; not defined yet.
2022 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2023 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2024 ## Reconsume.
2025
2026 ## Discard the token.
2027 #return ($self->{ct}); # ATTLIST
2028
2029 redo A;
2030 } else {
2031 die "$0: $self->{ct}->{type}: Unknown token type";
2032 }
2033 } else {
2034 ## XML5 [ATTLIST]: Not defined yet.
2035 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2036
2037 ## XML5: Not a parse error.
2038 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2039 } else {
2040
2041 }
2042 $self->{ca}->{value} .= chr ($self->{nc});
2043 $self->{read_until}->($self->{ca}->{value},
2044 qq['&<\x09\x0C\x20],
2045 length $self->{ca}->{value});
2046
2047 ## Stay in the state
2048
2049 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2050 $self->{line_prev} = $self->{line};
2051 $self->{column_prev} = $self->{column};
2052 $self->{column}++;
2053 $self->{nc}
2054 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2055 } else {
2056 $self->{set_nc}->($self);
2057 }
2058
2059 redo A;
2060 }
2061 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2062 ## XML5: "Tag attribute value unquoted state".
2063
2064 if ($is_space->{$self->{nc}}) {
2065 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2066
2067 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2068 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2069 } else {
2070
2071 ## XML5: "Tag attribute name before state".
2072 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2073 }
2074
2075 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2076 $self->{line_prev} = $self->{line};
2077 $self->{column_prev} = $self->{column};
2078 $self->{column}++;
2079 $self->{nc}
2080 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2081 } else {
2082 $self->{set_nc}->($self);
2083 }
2084
2085 redo A;
2086 } elsif ($self->{nc} == 0x0026) { # &
2087
2088
2089 ## XML5: Not defined yet.
2090
2091 ## NOTE: In the spec, the tokenizer is switched to the
2092 ## "entity in attribute value state". In this implementation, the
2093 ## tokenizer is switched to the |ENTITY_STATE|, which is an
2094 ## implementation of the "consume a character reference" algorithm.
2095 $self->{entity_add} = -1;
2096 $self->{prev_state} = $self->{state};
2097 $self->{state} = ENTITY_STATE;
2098
2099 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2100 $self->{line_prev} = $self->{line};
2101 $self->{column_prev} = $self->{column};
2102 $self->{column}++;
2103 $self->{nc}
2104 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2105 } else {
2106 $self->{set_nc}->($self);
2107 }
2108
2109 redo A;
2110 } elsif ($self->{nc} == 0x003E) { # >
2111 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2112
2113 $self->{last_stag_name} = $self->{ct}->{tag_name};
2114
2115 $self->{state} = DATA_STATE;
2116 $self->{s_kwd} = '';
2117
2118 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2119 $self->{line_prev} = $self->{line};
2120 $self->{column_prev} = $self->{column};
2121 $self->{column}++;
2122 $self->{nc}
2123 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2124 } else {
2125 $self->{set_nc}->($self);
2126 }
2127
2128 return ($self->{ct}); # start tag
2129 redo A;
2130 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2131 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2132 if ($self->{ct}->{attributes}) {
2133
2134 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2135 } else {
2136 ## NOTE: This state should never be reached.
2137
2138 }
2139
2140 $self->{state} = DATA_STATE;
2141 $self->{s_kwd} = '';
2142
2143 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2144 $self->{line_prev} = $self->{line};
2145 $self->{column_prev} = $self->{column};
2146 $self->{column}++;
2147 $self->{nc}
2148 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2149 } else {
2150 $self->{set_nc}->($self);
2151 }
2152
2153 return ($self->{ct}); # end tag
2154 redo A;
2155 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2156 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2157 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2158
2159 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2160 $self->{line_prev} = $self->{line};
2161 $self->{column_prev} = $self->{column};
2162 $self->{column}++;
2163 $self->{nc}
2164 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2165 } else {
2166 $self->{set_nc}->($self);
2167 }
2168
2169 return ($self->{ct}); # ATTLIST
2170 redo A;
2171 } else {
2172 die "$0: $self->{ct}->{type}: Unknown token type";
2173 }
2174 } elsif ($self->{nc} == -1) {
2175 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2176
2177 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2178 $self->{last_stag_name} = $self->{ct}->{tag_name};
2179
2180 $self->{state} = DATA_STATE;
2181 $self->{s_kwd} = '';
2182 ## reconsume
2183
2184 ## Discard the token.
2185 #return ($self->{ct}); # start tag
2186
2187 redo A;
2188 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2189 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2190 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2191 if ($self->{ct}->{attributes}) {
2192
2193 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2194 } else {
2195 ## NOTE: This state should never be reached.
2196
2197 }
2198
2199 $self->{state} = DATA_STATE;
2200 $self->{s_kwd} = '';
2201 ## reconsume
2202
2203 ## Discard the token.
2204 #return ($self->{ct}); # end tag
2205
2206 redo A;
2207 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2209 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2210 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2211 ## Reconsume.
2212
2213 ## Discard the token.
2214 #return ($self->{ct}); # ATTLIST
2215
2216 redo A;
2217 } else {
2218 die "$0: $self->{ct}->{type}: Unknown token type";
2219 }
2220 } else {
2221 if ({
2222 0x0022 => 1, # "
2223 0x0027 => 1, # '
2224 0x003D => 1, # =
2225 0x003C => 1, # <
2226 }->{$self->{nc}}) {
2227
2228 ## XML5: Not a parse error.
2229 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2230 } else {
2231
2232 }
2233 $self->{ca}->{value} .= chr ($self->{nc});
2234 $self->{read_until}->($self->{ca}->{value},
2235 qq["'=& \x09\x0C>],
2236 length $self->{ca}->{value});
2237
2238 ## Stay in the state
2239
2240 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2241 $self->{line_prev} = $self->{line};
2242 $self->{column_prev} = $self->{column};
2243 $self->{column}++;
2244 $self->{nc}
2245 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2246 } else {
2247 $self->{set_nc}->($self);
2248 }
2249
2250 redo A;
2251 }
2252 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2253 if ($is_space->{$self->{nc}}) {
2254
2255 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2256
2257 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2258 $self->{line_prev} = $self->{line};
2259 $self->{column_prev} = $self->{column};
2260 $self->{column}++;
2261 $self->{nc}
2262 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2263 } else {
2264 $self->{set_nc}->($self);
2265 }
2266
2267 redo A;
2268 } elsif ($self->{nc} == 0x003E) { # >
2269 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2270
2271 $self->{last_stag_name} = $self->{ct}->{tag_name};
2272 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2273 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2274 if ($self->{ct}->{attributes}) {
2275
2276 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2277 } else {
2278 ## NOTE: This state should never be reached.
2279
2280 }
2281 } else {
2282 die "$0: $self->{ct}->{type}: Unknown token type";
2283 }
2284 $self->{state} = DATA_STATE;
2285 $self->{s_kwd} = '';
2286
2287 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2288 $self->{line_prev} = $self->{line};
2289 $self->{column_prev} = $self->{column};
2290 $self->{column}++;
2291 $self->{nc}
2292 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2293 } else {
2294 $self->{set_nc}->($self);
2295 }
2296
2297
2298 return ($self->{ct}); # start tag or end tag
2299
2300 redo A;
2301 } elsif ($self->{nc} == 0x002F) { # /
2302
2303 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2304
2305 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2306 $self->{line_prev} = $self->{line};
2307 $self->{column_prev} = $self->{column};
2308 $self->{column}++;
2309 $self->{nc}
2310 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2311 } else {
2312 $self->{set_nc}->($self);
2313 }
2314
2315 redo A;
2316 } elsif ($self->{nc} == -1) {
2317 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2318 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2319
2320 $self->{last_stag_name} = $self->{ct}->{tag_name};
2321 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2322 if ($self->{ct}->{attributes}) {
2323
2324 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2325 } else {
2326 ## NOTE: This state should never be reached.
2327
2328 }
2329 } else {
2330 die "$0: $self->{ct}->{type}: Unknown token type";
2331 }
2332 $self->{state} = DATA_STATE;
2333 $self->{s_kwd} = '';
2334 ## Reconsume.
2335
2336 ## Discard the token.
2337 #return ($self->{ct}); # start tag or end tag
2338
2339 redo A;
2340 } else {
2341
2342 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2343 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2344 ## reconsume
2345 redo A;
2346 }
2347 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2348 ## XML5: "Empty tag state".
2349
2350 if ($self->{nc} == 0x003E) { # >
2351 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2352
2353 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2354 ## TODO: Different type than slash in start tag
2355 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2356 if ($self->{ct}->{attributes}) {
2357
2358 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2359 } else {
2360
2361 }
2362 ## TODO: Test |<title></title/>|
2363 } else {
2364
2365 $self->{self_closing} = 1;
2366 }
2367
2368 $self->{state} = DATA_STATE;
2369 $self->{s_kwd} = '';
2370
2371 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2372 $self->{line_prev} = $self->{line};
2373 $self->{column_prev} = $self->{column};
2374 $self->{column}++;
2375 $self->{nc}
2376 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2377 } else {
2378 $self->{set_nc}->($self);
2379 }
2380
2381
2382 return ($self->{ct}); # start tag or end tag
2383
2384 redo A;
2385 } elsif ($self->{nc} == -1) {
2386 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2387 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2388
2389 $self->{last_stag_name} = $self->{ct}->{tag_name};
2390 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2391 if ($self->{ct}->{attributes}) {
2392
2393 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2394 } else {
2395 ## NOTE: This state should never be reached.
2396
2397 }
2398 } else {
2399 die "$0: $self->{ct}->{type}: Unknown token type";
2400 }
2401 ## XML5: "Tag attribute name before state".
2402 $self->{state} = DATA_STATE;
2403 $self->{s_kwd} = '';
2404 ## Reconsume.
2405
2406 ## Discard the token.
2407 #return ($self->{ct}); # start tag or end tag
2408
2409 redo A;
2410 } else {
2411
2412 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2413 ## TODO: This error type is wrong.
2414 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2415 ## Reconsume.
2416 redo A;
2417 }
2418 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2419 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2420
2421 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2422 ## consumes characters on a one-by-one basis.
2423
2424 if ($self->{nc} == 0x003E) { # >
2425 if ($self->{in_subset}) {
2426
2427 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2428 } else {
2429
2430 $self->{state} = DATA_STATE;
2431 $self->{s_kwd} = '';
2432 }
2433
2434 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2435 $self->{line_prev} = $self->{line};
2436 $self->{column_prev} = $self->{column};
2437 $self->{column}++;
2438 $self->{nc}
2439 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2440 } else {
2441 $self->{set_nc}->($self);
2442 }
2443
2444
2445 return ($self->{ct}); # comment
2446 redo A;
2447 } elsif ($self->{nc} == -1) {
2448 if ($self->{in_subset}) {
2449
2450 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2451 } else {
2452
2453 $self->{state} = DATA_STATE;
2454 $self->{s_kwd} = '';
2455 }
2456 ## reconsume
2457
2458 return ($self->{ct}); # comment
2459 redo A;
2460 } else {
2461
2462 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2463 $self->{read_until}->($self->{ct}->{data},
2464 q[>],
2465 length $self->{ct}->{data});
2466
2467 ## Stay in the state.
2468
2469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2470 $self->{line_prev} = $self->{line};
2471 $self->{column_prev} = $self->{column};
2472 $self->{column}++;
2473 $self->{nc}
2474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2475 } else {
2476 $self->{set_nc}->($self);
2477 }
2478
2479 redo A;
2480 }
2481 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2482 ## XML5: "Markup declaration state".
2483
2484 if ($self->{nc} == 0x002D) { # -
2485
2486 $self->{state} = MD_HYPHEN_STATE;
2487
2488 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2489 $self->{line_prev} = $self->{line};
2490 $self->{column_prev} = $self->{column};
2491 $self->{column}++;
2492 $self->{nc}
2493 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2494 } else {
2495 $self->{set_nc}->($self);
2496 }
2497
2498 redo A;
2499 } elsif ($self->{nc} == 0x0044 or # D
2500 $self->{nc} == 0x0064) { # d
2501 ## ASCII case-insensitive.
2502
2503 $self->{state} = MD_DOCTYPE_STATE;
2504 $self->{kwd} = chr $self->{nc};
2505
2506 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2507 $self->{line_prev} = $self->{line};
2508 $self->{column_prev} = $self->{column};
2509 $self->{column}++;
2510 $self->{nc}
2511 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2512 } else {
2513 $self->{set_nc}->($self);
2514 }
2515
2516 redo A;
2517 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2518 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2519 $self->{is_xml}) and
2520 $self->{nc} == 0x005B) { # [
2521
2522 $self->{state} = MD_CDATA_STATE;
2523 $self->{kwd} = '[';
2524
2525 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2526 $self->{line_prev} = $self->{line};
2527 $self->{column_prev} = $self->{column};
2528 $self->{column}++;
2529 $self->{nc}
2530 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2531 } else {
2532 $self->{set_nc}->($self);
2533 }
2534
2535 redo A;
2536 } else {
2537
2538 }
2539
2540 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2541 line => $self->{line_prev},
2542 column => $self->{column_prev} - 1);
2543 ## Reconsume.
2544 $self->{state} = BOGUS_COMMENT_STATE;
2545 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2546 line => $self->{line_prev},
2547 column => $self->{column_prev} - 1,
2548 };
2549 redo A;
2550 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2551 if ($self->{nc} == 0x002D) { # -
2552
2553 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2554 line => $self->{line_prev},
2555 column => $self->{column_prev} - 2,
2556 };
2557 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2558
2559 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2560 $self->{line_prev} = $self->{line};
2561 $self->{column_prev} = $self->{column};
2562 $self->{column}++;
2563 $self->{nc}
2564 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2565 } else {
2566 $self->{set_nc}->($self);
2567 }
2568
2569 redo A;
2570 } else {
2571
2572 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2573 line => $self->{line_prev},
2574 column => $self->{column_prev} - 2);
2575 $self->{state} = BOGUS_COMMENT_STATE;
2576 ## Reconsume.
2577 $self->{ct} = {type => COMMENT_TOKEN,
2578 data => '-',
2579 line => $self->{line_prev},
2580 column => $self->{column_prev} - 2,
2581 };
2582 redo A;
2583 }
2584 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2585 ## ASCII case-insensitive.
2586 if ($self->{nc} == [
2587 undef,
2588 0x004F, # O
2589 0x0043, # C
2590 0x0054, # T
2591 0x0059, # Y
2592 0x0050, # P
2593 ]->[length $self->{kwd}] or
2594 $self->{nc} == [
2595 undef,
2596 0x006F, # o
2597 0x0063, # c
2598 0x0074, # t
2599 0x0079, # y
2600 0x0070, # p
2601 ]->[length $self->{kwd}]) {
2602
2603 ## Stay in the state.
2604 $self->{kwd} .= chr $self->{nc};
2605
2606 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2607 $self->{line_prev} = $self->{line};
2608 $self->{column_prev} = $self->{column};
2609 $self->{column}++;
2610 $self->{nc}
2611 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2612 } else {
2613 $self->{set_nc}->($self);
2614 }
2615
2616 redo A;
2617 } elsif ((length $self->{kwd}) == 6 and
2618 ($self->{nc} == 0x0045 or # E
2619 $self->{nc} == 0x0065)) { # e
2620 if ($self->{is_xml} and
2621 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2622
2623 ## XML5: case-sensitive.
2624 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2625 text => 'DOCTYPE',
2626 line => $self->{line_prev},
2627 column => $self->{column_prev} - 5);
2628 } else {
2629
2630 }
2631 $self->{state} = DOCTYPE_STATE;
2632 $self->{ct} = {type => DOCTYPE_TOKEN,
2633 quirks => 1,
2634 line => $self->{line_prev},
2635 column => $self->{column_prev} - 7,
2636 };
2637
2638 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2639 $self->{line_prev} = $self->{line};
2640 $self->{column_prev} = $self->{column};
2641 $self->{column}++;
2642 $self->{nc}
2643 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2644 } else {
2645 $self->{set_nc}->($self);
2646 }
2647
2648 redo A;
2649 } else {
2650
2651 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2652 line => $self->{line_prev},
2653 column => $self->{column_prev} - 1 - length $self->{kwd});
2654 $self->{state} = BOGUS_COMMENT_STATE;
2655 ## Reconsume.
2656 $self->{ct} = {type => COMMENT_TOKEN,
2657 data => $self->{kwd},
2658 line => $self->{line_prev},
2659 column => $self->{column_prev} - 1 - length $self->{kwd},
2660 };
2661 redo A;
2662 }
2663 } elsif ($self->{state} == MD_CDATA_STATE) {
2664 if ($self->{nc} == {
2665 '[' => 0x0043, # C
2666 '[C' => 0x0044, # D
2667 '[CD' => 0x0041, # A
2668 '[CDA' => 0x0054, # T
2669 '[CDAT' => 0x0041, # A
2670 }->{$self->{kwd}}) {
2671
2672 ## Stay in the state.
2673 $self->{kwd} .= chr $self->{nc};
2674
2675 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2676 $self->{line_prev} = $self->{line};
2677 $self->{column_prev} = $self->{column};
2678 $self->{column}++;
2679 $self->{nc}
2680 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2681 } else {
2682 $self->{set_nc}->($self);
2683 }
2684
2685 redo A;
2686 } elsif ($self->{kwd} eq '[CDATA' and
2687 $self->{nc} == 0x005B) { # [
2688 if ($self->{is_xml} and
2689 not $self->{tainted} and
2690 @{$self->{open_elements} or []} == 0) {
2691
2692 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2693 line => $self->{line_prev},
2694 column => $self->{column_prev} - 7);
2695 $self->{tainted} = 1;
2696 } else {
2697
2698 }
2699
2700 $self->{ct} = {type => CHARACTER_TOKEN,
2701 data => '',
2702 line => $self->{line_prev},
2703 column => $self->{column_prev} - 7};
2704 $self->{state} = CDATA_SECTION_STATE;
2705
2706 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2707 $self->{line_prev} = $self->{line};
2708 $self->{column_prev} = $self->{column};
2709 $self->{column}++;
2710 $self->{nc}
2711 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2712 } else {
2713 $self->{set_nc}->($self);
2714 }
2715
2716 redo A;
2717 } else {
2718
2719 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2720 line => $self->{line_prev},
2721 column => $self->{column_prev} - 1 - length $self->{kwd});
2722 $self->{state} = BOGUS_COMMENT_STATE;
2723 ## Reconsume.
2724 $self->{ct} = {type => COMMENT_TOKEN,
2725 data => $self->{kwd},
2726 line => $self->{line_prev},
2727 column => $self->{column_prev} - 1 - length $self->{kwd},
2728 };
2729 redo A;
2730 }
2731 } elsif ($self->{state} == COMMENT_START_STATE) {
2732 if ($self->{nc} == 0x002D) { # -
2733
2734 $self->{state} = COMMENT_START_DASH_STATE;
2735
2736 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2737 $self->{line_prev} = $self->{line};
2738 $self->{column_prev} = $self->{column};
2739 $self->{column}++;
2740 $self->{nc}
2741 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2742 } else {
2743 $self->{set_nc}->($self);
2744 }
2745
2746 redo A;
2747 } elsif ($self->{nc} == 0x003E) { # >
2748 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2749 if ($self->{in_subset}) {
2750
2751 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2752 } else {
2753
2754 $self->{state} = DATA_STATE;
2755 $self->{s_kwd} = '';
2756 }
2757
2758 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2759 $self->{line_prev} = $self->{line};
2760 $self->{column_prev} = $self->{column};
2761 $self->{column}++;
2762 $self->{nc}
2763 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2764 } else {
2765 $self->{set_nc}->($self);
2766 }
2767
2768
2769 return ($self->{ct}); # comment
2770
2771 redo A;
2772 } elsif ($self->{nc} == -1) {
2773 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2774 if ($self->{in_subset}) {
2775
2776 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2777 } else {
2778
2779 $self->{state} = DATA_STATE;
2780 $self->{s_kwd} = '';
2781 }
2782 ## reconsume
2783
2784 return ($self->{ct}); # comment
2785
2786 redo A;
2787 } else {
2788
2789 $self->{ct}->{data} # comment
2790 .= chr ($self->{nc});
2791 $self->{state} = COMMENT_STATE;
2792
2793 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2794 $self->{line_prev} = $self->{line};
2795 $self->{column_prev} = $self->{column};
2796 $self->{column}++;
2797 $self->{nc}
2798 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2799 } else {
2800 $self->{set_nc}->($self);
2801 }
2802
2803 redo A;
2804 }
2805 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2806 if ($self->{nc} == 0x002D) { # -
2807
2808 $self->{state} = COMMENT_END_STATE;
2809
2810 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2811 $self->{line_prev} = $self->{line};
2812 $self->{column_prev} = $self->{column};
2813 $self->{column}++;
2814 $self->{nc}
2815 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2816 } else {
2817 $self->{set_nc}->($self);
2818 }
2819
2820 redo A;
2821 } elsif ($self->{nc} == 0x003E) { # >
2822 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2823 if ($self->{in_subset}) {
2824
2825 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2826 } else {
2827
2828 $self->{state} = DATA_STATE;
2829 $self->{s_kwd} = '';
2830 }
2831
2832 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2833 $self->{line_prev} = $self->{line};
2834 $self->{column_prev} = $self->{column};
2835 $self->{column}++;
2836 $self->{nc}
2837 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2838 } else {
2839 $self->{set_nc}->($self);
2840 }
2841
2842
2843 return ($self->{ct}); # comment
2844
2845 redo A;
2846 } elsif ($self->{nc} == -1) {
2847 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2848 if ($self->{in_subset}) {
2849
2850 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2851 } else {
2852
2853 $self->{state} = DATA_STATE;
2854 $self->{s_kwd} = '';
2855 }
2856 ## reconsume
2857
2858 return ($self->{ct}); # comment
2859
2860 redo A;
2861 } else {
2862
2863 $self->{ct}->{data} # comment
2864 .= '-' . chr ($self->{nc});
2865 $self->{state} = COMMENT_STATE;
2866
2867 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2868 $self->{line_prev} = $self->{line};
2869 $self->{column_prev} = $self->{column};
2870 $self->{column}++;
2871 $self->{nc}
2872 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2873 } else {
2874 $self->{set_nc}->($self);
2875 }
2876
2877 redo A;
2878 }
2879 } elsif ($self->{state} == COMMENT_STATE) {
2880 ## XML5: "Comment state" and "DOCTYPE comment state".
2881
2882 if ($self->{nc} == 0x002D) { # -
2883
2884 $self->{state} = COMMENT_END_DASH_STATE;
2885
2886 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2887 $self->{line_prev} = $self->{line};
2888 $self->{column_prev} = $self->{column};
2889 $self->{column}++;
2890 $self->{nc}
2891 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2892 } else {
2893 $self->{set_nc}->($self);
2894 }
2895
2896 redo A;
2897 } elsif ($self->{nc} == -1) {
2898 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2899 if ($self->{in_subset}) {
2900
2901 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2902 } else {
2903
2904 $self->{state} = DATA_STATE;
2905 $self->{s_kwd} = '';
2906 }
2907 ## reconsume
2908
2909 return ($self->{ct}); # comment
2910
2911 redo A;
2912 } else {
2913
2914 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2915 $self->{read_until}->($self->{ct}->{data},
2916 q[-],
2917 length $self->{ct}->{data});
2918
2919 ## Stay in the state
2920
2921 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2922 $self->{line_prev} = $self->{line};
2923 $self->{column_prev} = $self->{column};
2924 $self->{column}++;
2925 $self->{nc}
2926 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2927 } else {
2928 $self->{set_nc}->($self);
2929 }
2930
2931 redo A;
2932 }
2933 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2934 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2935
2936 if ($self->{nc} == 0x002D) { # -
2937
2938 $self->{state} = COMMENT_END_STATE;
2939
2940 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2941 $self->{line_prev} = $self->{line};
2942 $self->{column_prev} = $self->{column};
2943 $self->{column}++;
2944 $self->{nc}
2945 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2946 } else {
2947 $self->{set_nc}->($self);
2948 }
2949
2950 redo A;
2951 } elsif ($self->{nc} == -1) {
2952 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2953 if ($self->{in_subset}) {
2954
2955 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2956 } else {
2957
2958 $self->{state} = DATA_STATE;
2959 $self->{s_kwd} = '';
2960 }
2961 ## reconsume
2962
2963 return ($self->{ct}); # comment
2964
2965 redo A;
2966 } else {
2967
2968 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2969 $self->{state} = COMMENT_STATE;
2970
2971 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2972 $self->{line_prev} = $self->{line};
2973 $self->{column_prev} = $self->{column};
2974 $self->{column}++;
2975 $self->{nc}
2976 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2977 } else {
2978 $self->{set_nc}->($self);
2979 }
2980
2981 redo A;
2982 }
2983 } elsif ($self->{state} == COMMENT_END_STATE or
2984 $self->{state} == COMMENT_END_BANG_STATE) {
2985 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2986 ## (No comment end bang state.)
2987
2988 if ($self->{nc} == 0x003E) { # >
2989 if ($self->{in_subset}) {
2990
2991 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2992 } else {
2993
2994 $self->{state} = DATA_STATE;
2995 $self->{s_kwd} = '';
2996 }
2997
2998 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2999 $self->{line_prev} = $self->{line};
3000 $self->{column_prev} = $self->{column};
3001 $self->{column}++;
3002 $self->{nc}
3003 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3004 } else {
3005 $self->{set_nc}->($self);
3006 }
3007
3008
3009 return ($self->{ct}); # comment
3010
3011 redo A;
3012 } elsif ($self->{nc} == 0x002D) { # -
3013 if ($self->{state} == COMMENT_END_BANG_STATE) {
3014
3015 $self->{ct}->{data} .= '--!'; # comment
3016 $self->{state} = COMMENT_END_DASH_STATE;
3017 } else {
3018
3019 ## XML5: Not a parse error.
3020 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3021 line => $self->{line_prev},
3022 column => $self->{column_prev});
3023 $self->{ct}->{data} .= '-'; # comment
3024 ## Stay in the state
3025 }
3026
3027 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3028 $self->{line_prev} = $self->{line};
3029 $self->{column_prev} = $self->{column};
3030 $self->{column}++;
3031 $self->{nc}
3032 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3033 } else {
3034 $self->{set_nc}->($self);
3035 }
3036
3037 redo A;
3038 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3039 $is_space->{$self->{nc}}) {
3040
3041 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3042 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3043 $self->{state} = COMMENT_END_SPACE_STATE;
3044
3045 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3046 $self->{line_prev} = $self->{line};
3047 $self->{column_prev} = $self->{column};
3048 $self->{column}++;
3049 $self->{nc}
3050 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3051 } else {
3052 $self->{set_nc}->($self);
3053 }
3054
3055 redo A;
3056 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3057 $self->{nc} == 0x0021) { # !
3058
3059 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3060 $self->{state} = COMMENT_END_BANG_STATE;
3061
3062 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3063 $self->{line_prev} = $self->{line};
3064 $self->{column_prev} = $self->{column};
3065 $self->{column}++;
3066 $self->{nc}
3067 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3068 } else {
3069 $self->{set_nc}->($self);
3070 }
3071
3072 redo A;
3073 } elsif ($self->{nc} == -1) {
3074 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3075 if ($self->{in_subset}) {
3076
3077 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3078 } else {
3079
3080 $self->{state} = DATA_STATE;
3081 $self->{s_kwd} = '';
3082 }
3083 ## Reconsume.
3084
3085 return ($self->{ct}); # comment
3086
3087 redo A;
3088 } else {
3089
3090 if ($self->{state} == COMMENT_END_BANG_STATE) {
3091 $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3092 } else {
3093 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3094 }
3095 $self->{state} = COMMENT_STATE;
3096
3097 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3098 $self->{line_prev} = $self->{line};
3099 $self->{column_prev} = $self->{column};
3100 $self->{column}++;
3101 $self->{nc}
3102 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3103 } else {
3104 $self->{set_nc}->($self);
3105 }
3106
3107 redo A;
3108 }
3109 } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3110 ## XML5: Does not exist.
3111
3112 if ($self->{nc} == 0x003E) { # >
3113 if ($self->{in_subset}) {
3114
3115 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3116 } else {
3117
3118 $self->{state} = DATA_STATE;
3119 $self->{s_kwd} = '';
3120 }
3121
3122 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3123 $self->{line_prev} = $self->{line};
3124 $self->{column_prev} = $self->{column};
3125 $self->{column}++;
3126 $self->{nc}
3127 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3128 } else {
3129 $self->{set_nc}->($self);
3130 }
3131
3132
3133 return ($self->{ct}); # comment
3134
3135 redo A;
3136 } elsif ($is_space->{$self->{nc}}) {
3137
3138 $self->{ct}->{data} .= chr ($self->{nc}); # comment
3139 ## Stay in the state.
3140
3141 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3142 $self->{line_prev} = $self->{line};
3143 $self->{column_prev} = $self->{column};
3144 $self->{column}++;
3145 $self->{nc}
3146 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3147 } else {
3148 $self->{set_nc}->($self);
3149 }
3150
3151 redo A;
3152 } elsif ($self->{nc} == -1) {
3153 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3154 if ($self->{in_subset}) {
3155
3156 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3157 } else {
3158
3159 $self->{state} = DATA_STATE;
3160 $self->{s_kwd} = '';
3161 }
3162 ## Reconsume.
3163
3164 return ($self->{ct}); # comment
3165
3166 redo A;
3167 } else {
3168
3169 $self->{ct}->{data} .= chr ($self->{nc}); # comment
3170 $self->{state} = COMMENT_STATE;
3171
3172 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3173 $self->{line_prev} = $self->{line};
3174 $self->{column_prev} = $self->{column};
3175 $self->{column}++;
3176 $self->{nc}
3177 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3178 } else {
3179 $self->{set_nc}->($self);
3180 }
3181
3182 redo A;
3183 }
3184 } elsif ($self->{state} == DOCTYPE_STATE) {
3185 if ($is_space->{$self->{nc}}) {
3186
3187 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3188
3189 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3190 $self->{line_prev} = $self->{line};
3191 $self->{column_prev} = $self->{column};
3192 $self->{column}++;
3193 $self->{nc}
3194 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3195 } else {
3196 $self->{set_nc}->($self);
3197 }
3198
3199 redo A;
3200 } elsif ($self->{nc} == -1) {
3201
3202 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3203 $self->{ct}->{quirks} = 1;
3204
3205 $self->{state} = DATA_STATE;
3206 ## Reconsume.
3207 return ($self->{ct}); # DOCTYPE (quirks)
3208
3209 redo A;
3210 } else {
3211
3212 ## XML5: Switch to the bogus comment state.
3213 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3214 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3215 ## reconsume
3216 redo A;
3217 }
3218 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3219 ## XML5: "DOCTYPE root name before state".
3220
3221 if ($is_space->{$self->{nc}}) {
3222
3223 ## Stay in the state
3224
3225 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3226 $self->{line_prev} = $self->{line};
3227 $self->{column_prev} = $self->{column};
3228 $self->{column}++;
3229 $self->{nc}
3230 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3231 } else {
3232 $self->{set_nc}->($self);
3233 }
3234
3235 redo A;
3236 } elsif ($self->{nc} == 0x003E) { # >
3237
3238 ## XML5: No parse error.
3239 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3240 $self->{state} = DATA_STATE;
3241 $self->{s_kwd} = '';
3242
3243 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3244 $self->{line_prev} = $self->{line};
3245 $self->{column_prev} = $self->{column};
3246 $self->{column}++;
3247 $self->{nc}
3248 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3249 } else {
3250 $self->{set_nc}->($self);
3251 }
3252
3253
3254 return ($self->{ct}); # DOCTYPE (quirks)
3255
3256 redo A;
3257 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3258
3259 $self->{ct}->{name} # DOCTYPE
3260 = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3261 delete $self->{ct}->{quirks};
3262 $self->{state} = DOCTYPE_NAME_STATE;
3263
3264 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265 $self->{line_prev} = $self->{line};
3266 $self->{column_prev} = $self->{column};
3267 $self->{column}++;
3268 $self->{nc}
3269 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270 } else {
3271 $self->{set_nc}->($self);
3272 }
3273
3274 redo A;
3275 } elsif ($self->{nc} == -1) {
3276
3277 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3278 $self->{state} = DATA_STATE;
3279 $self->{s_kwd} = '';
3280 ## reconsume
3281
3282 return ($self->{ct}); # DOCTYPE (quirks)
3283
3284 redo A;
3285 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3286
3287 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3288 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3289 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3290 $self->{in_subset} = 1;
3291
3292 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3293 $self->{line_prev} = $self->{line};
3294 $self->{column_prev} = $self->{column};
3295 $self->{column}++;
3296 $self->{nc}
3297 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3298 } else {
3299 $self->{set_nc}->($self);
3300 }
3301
3302 return ($self->{ct}); # DOCTYPE
3303 redo A;
3304 } else {
3305
3306 $self->{ct}->{name} = chr $self->{nc};
3307 delete $self->{ct}->{quirks};
3308 $self->{state} = DOCTYPE_NAME_STATE;
3309
3310 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3311 $self->{line_prev} = $self->{line};
3312 $self->{column_prev} = $self->{column};
3313 $self->{column}++;
3314 $self->{nc}
3315 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3316 } else {
3317 $self->{set_nc}->($self);
3318 }
3319
3320 redo A;
3321 }
3322 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3323 ## XML5: "DOCTYPE root name state".
3324
3325 ## ISSUE: Redundant "First," in the spec.
3326
3327 if ($is_space->{$self->{nc}}) {
3328
3329 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3330
3331 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3332 $self->{line_prev} = $self->{line};
3333 $self->{column_prev} = $self->{column};
3334 $self->{column}++;
3335 $self->{nc}
3336 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3337 } else {
3338 $self->{set_nc}->($self);
3339 }
3340
3341 redo A;
3342 } elsif ($self->{nc} == 0x003E) { # >
3343
3344 $self->{state} = DATA_STATE;
3345 $self->{s_kwd} = '';
3346
3347 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3348 $self->{line_prev} = $self->{line};
3349 $self->{column_prev} = $self->{column};
3350 $self->{column}++;
3351 $self->{nc}
3352 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3353 } else {
3354 $self->{set_nc}->($self);
3355 }
3356
3357
3358 return ($self->{ct}); # DOCTYPE
3359
3360 redo A;
3361 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3362
3363 $self->{ct}->{name} # DOCTYPE
3364 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3365 delete $self->{ct}->{quirks};
3366 ## Stay in the state.
3367
3368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3369 $self->{line_prev} = $self->{line};
3370 $self->{column_prev} = $self->{column};
3371 $self->{column}++;
3372 $self->{nc}
3373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3374 } else {
3375 $self->{set_nc}->($self);
3376 }
3377
3378 redo A;
3379 } elsif ($self->{nc} == -1) {
3380
3381 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3382 $self->{state} = DATA_STATE;
3383 $self->{s_kwd} = '';
3384 ## reconsume
3385
3386 $self->{ct}->{quirks} = 1;
3387 return ($self->{ct}); # DOCTYPE
3388
3389 redo A;
3390 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3391
3392 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3393 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3394 $self->{in_subset} = 1;
3395
3396 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3397 $self->{line_prev} = $self->{line};
3398 $self->{column_prev} = $self->{column};
3399 $self->{column}++;
3400 $self->{nc}
3401 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3402 } else {
3403 $self->{set_nc}->($self);
3404 }
3405
3406 return ($self->{ct}); # DOCTYPE
3407 redo A;
3408 } else {
3409
3410 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3411 ## Stay in the state.
3412
3413 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3414 $self->{line_prev} = $self->{line};
3415 $self->{column_prev} = $self->{column};
3416 $self->{column}++;
3417 $self->{nc}
3418 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3419 } else {
3420 $self->{set_nc}->($self);
3421 }
3422
3423 redo A;
3424 }
3425 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3426 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3427 ## state", but implemented differently.
3428
3429 if ($is_space->{$self->{nc}}) {
3430
3431 ## Stay in the state
3432
3433 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3434 $self->{line_prev} = $self->{line};
3435 $self->{column_prev} = $self->{column};
3436 $self->{column}++;
3437 $self->{nc}
3438 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3439 } else {
3440 $self->{set_nc}->($self);
3441 }
3442
3443 redo A;
3444 } elsif ($self->{nc} == 0x003E) { # >
3445 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3446
3447 $self->{state} = DATA_STATE;
3448 $self->{s_kwd} = '';
3449 } else {
3450
3451 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3452 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3453 }
3454
3455
3456 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3457 $self->{line_prev} = $self->{line};
3458 $self->{column_prev} = $self->{column};
3459 $self->{column}++;
3460 $self->{nc}
3461 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3462 } else {
3463 $self->{set_nc}->($self);
3464 }
3465
3466 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3467 redo A;
3468 } elsif ($self->{nc} == -1) {
3469 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3470
3471 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3472 $self->{state} = DATA_STATE;
3473 $self->{s_kwd} = '';
3474 $self->{ct}->{quirks} = 1;
3475 } else {
3476
3477 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3478 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3479 }
3480
3481 ## Reconsume.
3482 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3483 redo A;
3484 } elsif ($self->{nc} == 0x0050 or # P
3485 $self->{nc} == 0x0070) { # p
3486
3487 $self->{state} = PUBLIC_STATE;
3488 $self->{kwd} = chr $self->{nc};
3489
3490 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3491 $self->{line_prev} = $self->{line};
3492 $self->{column_prev} = $self->{column};
3493 $self->{column}++;
3494 $self->{nc}
3495 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3496 } else {
3497 $self->{set_nc}->($self);
3498 }
3499
3500 redo A;
3501 } elsif ($self->{nc} == 0x0053 or # S
3502 $self->{nc} == 0x0073) { # s
3503
3504 $self->{state} = SYSTEM_STATE;
3505 $self->{kwd} = chr $self->{nc};
3506
3507 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3508 $self->{line_prev} = $self->{line};
3509 $self->{column_prev} = $self->{column};
3510 $self->{column}++;
3511 $self->{nc}
3512 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3513 } else {
3514 $self->{set_nc}->($self);
3515 }
3516
3517 redo A;
3518 } elsif ($self->{nc} == 0x0022 and # "
3519 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3520 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3521
3522 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3523 $self->{ct}->{value} = ''; # ENTITY
3524
3525 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3526 $self->{line_prev} = $self->{line};
3527 $self->{column_prev} = $self->{column};
3528 $self->{column}++;
3529 $self->{nc}
3530 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3531 } else {
3532 $self->{set_nc}->($self);
3533 }
3534
3535 redo A;
3536 } elsif ($self->{nc} == 0x0027 and # '
3537 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3538 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3539
3540 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3541 $self->{ct}->{value} = ''; # ENTITY
3542
3543 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3544 $self->{line_prev} = $self->{line};
3545 $self->{column_prev} = $self->{column};
3546 $self->{column}++;
3547 $self->{nc}
3548 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3549 } else {
3550 $self->{set_nc}->($self);
3551 }
3552
3553 redo A;
3554 } elsif ($self->{is_xml} and
3555 $self->{ct}->{type} == DOCTYPE_TOKEN and
3556 $self->{nc} == 0x005B) { # [
3557
3558 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3559 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3560 $self->{in_subset} = 1;
3561
3562 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563 $self->{line_prev} = $self->{line};
3564 $self->{column_prev} = $self->{column};
3565 $self->{column}++;
3566 $self->{nc}
3567 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568 } else {
3569 $self->{set_nc}->($self);
3570 }
3571
3572 return ($self->{ct}); # DOCTYPE
3573 redo A;
3574 } else {
3575 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3576
3577 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3578
3579 $self->{ct}->{quirks} = 1;
3580 $self->{state} = BOGUS_DOCTYPE_STATE;
3581 } else {
3582
3583 $self->{state} = BOGUS_MD_STATE;
3584 }
3585
3586
3587 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3588 $self->{line_prev} = $self->{line};
3589 $self->{column_prev} = $self->{column};
3590 $self->{column}++;
3591 $self->{nc}
3592 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3593 } else {
3594 $self->{set_nc}->($self);
3595 }
3596
3597 redo A;
3598 }
3599 } elsif ($self->{state} == PUBLIC_STATE) {
3600 ## ASCII case-insensitive
3601 if ($self->{nc} == [
3602 undef,
3603 0x0055, # U
3604 0x0042, # B
3605 0x004C, # L
3606 0x0049, # I
3607 ]->[length $self->{kwd}] or
3608 $self->{nc} == [
3609 undef,
3610 0x0075, # u
3611 0x0062, # b
3612 0x006C, # l
3613 0x0069, # i
3614 ]->[length $self->{kwd}]) {
3615
3616 ## Stay in the state.
3617 $self->{kwd} .= chr $self->{nc};
3618
3619 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3620 $self->{line_prev} = $self->{line};
3621 $self->{column_prev} = $self->{column};
3622 $self->{column}++;
3623 $self->{nc}
3624 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3625 } else {
3626 $self->{set_nc}->($self);
3627 }
3628
3629 redo A;
3630 } elsif ((length $self->{kwd}) == 5 and
3631 ($self->{nc} == 0x0043 or # C
3632 $self->{nc} == 0x0063)) { # c
3633 if ($self->{is_xml} and
3634 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3635
3636 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3637 text => 'PUBLIC',
3638 line => $self->{line_prev},
3639 column => $self->{column_prev} - 4);
3640 } else {
3641
3642 }
3643 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3644
3645 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3646 $self->{line_prev} = $self->{line};
3647 $self->{column_prev} = $self->{column};
3648 $self->{column}++;
3649 $self->{nc}
3650 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3651 } else {
3652 $self->{set_nc}->($self);
3653 }
3654
3655 redo A;
3656 } else {
3657 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3658 line => $self->{line_prev},
3659 column => $self->{column_prev} + 1 - length $self->{kwd});
3660 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3661
3662 $self->{ct}->{quirks} = 1;
3663 $self->{state} = BOGUS_DOCTYPE_STATE;
3664 } else {
3665
3666 $self->{state} = BOGUS_MD_STATE;
3667 }
3668 ## Reconsume.
3669 redo A;
3670 }
3671 } elsif ($self->{state} == SYSTEM_STATE) {
3672 ## ASCII case-insensitive
3673 if ($self->{nc} == [
3674 undef,
3675 0x0059, # Y
3676 0x0053, # S
3677 0x0054, # T
3678 0x0045, # E
3679 ]->[length $self->{kwd}] or
3680 $self->{nc} == [
3681 undef,
3682 0x0079, # y
3683 0x0073, # s
3684 0x0074, # t
3685 0x0065, # e
3686 ]->[length $self->{kwd}]) {
3687
3688 ## Stay in the state.
3689 $self->{kwd} .= chr $self->{nc};
3690
3691 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3692 $self->{line_prev} = $self->{line};
3693 $self->{column_prev} = $self->{column};
3694 $self->{column}++;
3695 $self->{nc}
3696 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3697 } else {
3698 $self->{set_nc}->($self);
3699 }
3700
3701 redo A;
3702 } elsif ((length $self->{kwd}) == 5 and
3703 ($self->{nc} == 0x004D or # M
3704 $self->{nc} == 0x006D)) { # m
3705 if ($self->{is_xml} and
3706 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3707
3708 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3709 text => 'SYSTEM',
3710 line => $self->{line_prev},
3711 column => $self->{column_prev} - 4);
3712 } else {
3713
3714 }
3715 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3716
3717 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3718 $self->{line_prev} = $self->{line};
3719 $self->{column_prev} = $self->{column};
3720 $self->{column}++;
3721 $self->{nc}
3722 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3723 } else {
3724 $self->{set_nc}->($self);
3725 }
3726
3727 redo A;
3728 } else {
3729 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3730 line => $self->{line_prev},
3731 column => $self->{column_prev} + 1 - length $self->{kwd});
3732 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3733
3734 $self->{ct}->{quirks} = 1;
3735 $self->{state} = BOGUS_DOCTYPE_STATE;
3736 } else {
3737
3738 $self->{state} = BOGUS_MD_STATE;
3739 }
3740 ## Reconsume.
3741 redo A;
3742 }
3743 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3744 if ($is_space->{$self->{nc}}) {
3745
3746 ## Stay in the state
3747
3748 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3749 $self->{line_prev} = $self->{line};
3750 $self->{column_prev} = $self->{column};
3751 $self->{column}++;
3752 $self->{nc}
3753 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3754 } else {
3755 $self->{set_nc}->($self);
3756 }
3757
3758 redo A;
3759 } elsif ($self->{nc} eq 0x0022) { # "
3760
3761 $self->{ct}->{pubid} = ''; # DOCTYPE
3762 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3763
3764 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3765 $self->{line_prev} = $self->{line};
3766 $self->{column_prev} = $self->{column};
3767 $self->{column}++;
3768 $self->{nc}
3769 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3770 } else {
3771 $self->{set_nc}->($self);
3772 }
3773
3774 redo A;
3775 } elsif ($self->{nc} eq 0x0027) { # '
3776
3777 $self->{ct}->{pubid} = ''; # DOCTYPE
3778 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3779
3780 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3781 $self->{line_prev} = $self->{line};
3782 $self->{column_prev} = $self->{column};
3783 $self->{column}++;
3784 $self->{nc}
3785 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3786 } else {
3787 $self->{set_nc}->($self);
3788 }
3789
3790 redo A;
3791 } elsif ($self->{nc} eq 0x003E) { # >
3792 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3793
3794 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3795
3796 $self->{state} = DATA_STATE;
3797 $self->{s_kwd} = '';
3798 $self->{ct}->{quirks} = 1;
3799 } else {
3800
3801 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3802 }
3803
3804
3805 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3806 $self->{line_prev} = $self->{line};
3807 $self->{column_prev} = $self->{column};
3808 $self->{column}++;
3809 $self->{nc}
3810 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3811 } else {
3812 $self->{set_nc}->($self);
3813 }
3814
3815 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3816 redo A;
3817 } elsif ($self->{nc} == -1) {
3818 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3819
3820 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3821 $self->{state} = DATA_STATE;
3822 $self->{s_kwd} = '';
3823 $self->{ct}->{quirks} = 1;
3824 } else {
3825
3826 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3827 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3828 }
3829
3830 ## reconsume
3831 return ($self->{ct}); # DOCTYPE
3832 redo A;
3833 } elsif ($self->{is_xml} and
3834 $self->{ct}->{type} == DOCTYPE_TOKEN and
3835 $self->{nc} == 0x005B) { # [
3836
3837 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3838 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3839 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3840 $self->{in_subset} = 1;
3841
3842 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3843 $self->{line_prev} = $self->{line};
3844 $self->{column_prev} = $self->{column};
3845 $self->{column}++;
3846 $self->{nc}
3847 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3848 } else {
3849 $self->{set_nc}->($self);
3850 }
3851
3852 return ($self->{ct}); # DOCTYPE
3853 redo A;
3854 } else {
3855 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3856
3857 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3858
3859 $self->{ct}->{quirks} = 1;
3860 $self->{state} = BOGUS_DOCTYPE_STATE;
3861 } else {
3862
3863 $self->{state} = BOGUS_MD_STATE;
3864 }
3865
3866
3867 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3868 $self->{line_prev} = $self->{line};
3869 $self->{column_prev} = $self->{column};
3870 $self->{column}++;
3871 $self->{nc}
3872 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3873 } else {
3874 $self->{set_nc}->($self);
3875 }
3876
3877 redo A;
3878 }
3879 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3880 if ($self->{nc} == 0x0022) { # "
3881
3882 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3883
3884 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3885 $self->{line_prev} = $self->{line};
3886 $self->{column_prev} = $self->{column};
3887 $self->{column}++;
3888 $self->{nc}
3889 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3890 } else {
3891 $self->{set_nc}->($self);
3892 }
3893
3894 redo A;
3895 } elsif ($self->{nc} == 0x003E) { # >
3896 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3897
3898 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3899
3900 $self->{state} = DATA_STATE;
3901 $self->{s_kwd} = '';
3902 $self->{ct}->{quirks} = 1;
3903 } else {
3904
3905 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3906 }
3907
3908
3909 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3910 $self->{line_prev} = $self->{line};
3911 $self->{column_prev} = $self->{column};
3912 $self->{column}++;
3913 $self->{nc}
3914 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3915 } else {
3916 $self->{set_nc}->($self);
3917 }
3918
3919 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3920 redo A;
3921 } elsif ($self->{nc} == -1) {
3922 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3923
3924 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3925
3926 $self->{state} = DATA_STATE;
3927 $self->{s_kwd} = '';
3928 $self->{ct}->{quirks} = 1;
3929 } else {
3930
3931 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3932 }
3933
3934 ## Reconsume.
3935 return ($self->{ct}); # DOCTYPE
3936 redo A;
3937 } else {
3938
3939 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3940 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3941 length $self->{ct}->{pubid});
3942
3943 ## Stay in the state
3944
3945 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3946 $self->{line_prev} = $self->{line};
3947 $self->{column_prev} = $self->{column};
3948 $self->{column}++;
3949 $self->{nc}
3950 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3951 } else {
3952 $self->{set_nc}->($self);
3953 }
3954
3955 redo A;
3956 }
3957 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3958 if ($self->{nc} == 0x0027) { # '
3959
3960 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3961
3962 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3963 $self->{line_prev} = $self->{line};
3964 $self->{column_prev} = $self->{column};
3965 $self->{column}++;
3966 $self->{nc}
3967 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3968 } else {
3969 $self->{set_nc}->($self);
3970 }
3971
3972 redo A;
3973 } elsif ($self->{nc} == 0x003E) { # >
3974 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3975
3976 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3977
3978 $self->{state} = DATA_STATE;
3979 $self->{s_kwd} = '';
3980 $self->{ct}->{quirks} = 1;
3981 } else {
3982
3983 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3984 }
3985
3986
3987 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3988 $self->{line_prev} = $self->{line};
3989 $self->{column_prev} = $self->{column};
3990 $self->{column}++;
3991 $self->{nc}
3992 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3993 } else {
3994 $self->{set_nc}->($self);
3995 }
3996
3997 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3998 redo A;
3999 } elsif ($self->{nc} == -1) {
4000 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
4001
4002 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4003
4004 $self->{state} = DATA_STATE;
4005 $self->{s_kwd} = '';
4006 $self->{ct}->{quirks} = 1;
4007 } else {
4008
4009 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4010 }
4011
4012 ## reconsume
4013 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4014 redo A;
4015 } else {
4016
4017 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4018 $self->{read_until}->($self->{ct}->{pubid}, q['>],
4019 length $self->{ct}->{pubid});
4020
4021 ## Stay in the state
4022
4023 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4024 $self->{line_prev} = $self->{line};
4025 $self->{column_prev} = $self->{column};
4026 $self->{column}++;
4027 $self->{nc}
4028 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4029 } else {
4030 $self->{set_nc}->($self);
4031 }
4032
4033 redo A;
4034 }
4035 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
4036 if ($is_space->{$self->{nc}}) {
4037
4038 ## Stay in the state
4039
4040 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4041 $self->{line_prev} = $self->{line};
4042 $self->{column_prev} = $self->{column};
4043 $self->{column}++;
4044 $self->{nc}
4045 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4046 } else {
4047 $self->{set_nc}->($self);
4048 }
4049
4050 redo A;
4051 } elsif ($self->{nc} == 0x0022) { # "
4052
4053 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4054 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4055
4056 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4057 $self->{line_prev} = $self->{line};
4058 $self->{column_prev} = $self->{column};
4059 $self->{column}++;
4060 $self->{nc}
4061 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4062 } else {
4063 $self->{set_nc}->($self);
4064 }
4065
4066 redo A;
4067 } elsif ($self->{nc} == 0x0027) { # '
4068
4069 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4070 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4071
4072 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4073 $self->{line_prev} = $self->{line};
4074 $self->{column_prev} = $self->{column};
4075 $self->{column}++;
4076 $self->{nc}
4077 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4078 } else {
4079 $self->{set_nc}->($self);
4080 }
4081
4082 redo A;
4083 } elsif ($self->{nc} == 0x003E) { # >
4084 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4085 if ($self->{is_xml}) {
4086
4087 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4088 } else {
4089
4090 }
4091 $self->{state} = DATA_STATE;
4092 $self->{s_kwd} = '';
4093 } else {
4094 if ($self->{ct}->{type} == NOTATION_TOKEN) {
4095
4096 } else {
4097
4098 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4099 }
4100 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4101 }
4102
4103
4104 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4105 $self->{line_prev} = $self->{line};
4106 $self->{column_prev} = $self->{column};
4107 $self->{column}++;
4108 $self->{nc}
4109 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4110 } else {
4111 $self->{set_nc}->($self);
4112 }
4113
4114 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4115 redo A;
4116 } elsif ($self->{nc} == -1) {
4117 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4118
4119 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4120
4121 $self->{state} = DATA_STATE;
4122 $self->{s_kwd} = '';
4123 $self->{ct}->{quirks} = 1;
4124 } else {
4125 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4126 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4127 }
4128
4129 ## reconsume
4130 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4131 redo A;
4132 } elsif ($self->{is_xml} and
4133 $self->{ct}->{type} == DOCTYPE_TOKEN and
4134 $self->{nc} == 0x005B) { # [
4135
4136 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4137 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4138 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4139 $self->{in_subset} = 1;
4140
4141 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4142 $self->{line_prev} = $self->{line};
4143 $self->{column_prev} = $self->{column};
4144 $self->{column}++;
4145 $self->{nc}
4146 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4147 } else {
4148 $self->{set_nc}->($self);
4149 }
4150
4151 return ($self->{ct}); # DOCTYPE
4152 redo A;
4153 } else {
4154 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
4155
4156 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4157
4158 $self->{ct}->{quirks} = 1;
4159 $self->{state} = BOGUS_DOCTYPE_STATE;
4160 } else {
4161
4162 $self->{state} = BOGUS_MD_STATE;
4163 }
4164
4165
4166 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4167 $self->{line_prev} = $self->{line};
4168 $self->{column_prev} = $self->{column};
4169 $self->{column}++;
4170 $self->{nc}
4171 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4172 } else {
4173 $self->{set_nc}->($self);
4174 }
4175
4176 redo A;
4177 }
4178 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4179 if ($is_space->{$self->{nc}}) {
4180
4181 ## Stay in the state
4182
4183 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4184 $self->{line_prev} = $self->{line};
4185 $self->{column_prev} = $self->{column};
4186 $self->{column}++;
4187 $self->{nc}
4188 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4189 } else {
4190 $self->{set_nc}->($self);
4191 }
4192
4193 redo A;
4194 } elsif ($self->{nc} == 0x0022) { # "
4195
4196 $self->{ct}->{sysid} = ''; # DOCTYPE
4197 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4198
4199 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4200 $self->{line_prev} = $self->{line};
4201 $self->{column_prev} = $self->{column};
4202 $self->{column}++;
4203 $self->{nc}
4204 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4205 } else {
4206 $self->{set_nc}->($self);
4207 }
4208
4209 redo A;
4210 } elsif ($self->{nc} == 0x0027) { # '
4211
4212 $self->{ct}->{sysid} = ''; # DOCTYPE
4213 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4214
4215 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4216 $self->{line_prev} = $self->{line};
4217 $self->{column_prev} = $self->{column};
4218 $self->{column}++;
4219 $self->{nc}
4220 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4221 } else {
4222 $self->{set_nc}->($self);
4223 }
4224
4225 redo A;
4226 } elsif ($self->{nc} == 0x003E) { # >
4227 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4228
4229 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4230 $self->{line_prev} = $self->{line};
4231 $self->{column_prev} = $self->{column};
4232 $self->{column}++;
4233 $self->{nc}
4234 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4235 } else {
4236 $self->{set_nc}->($self);
4237 }
4238
4239
4240 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4241
4242 $self->{state} = DATA_STATE;
4243 $self->{s_kwd} = '';
4244 $self->{ct}->{quirks} = 1;
4245 } else {
4246
4247 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4248 }
4249
4250 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4251 redo A;
4252 } elsif ($self->{nc} == -1) {
4253 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4254
4255 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4256 $self->{state} = DATA_STATE;
4257 $self->{s_kwd} = '';
4258 $self->{ct}->{quirks} = 1;
4259 } else {
4260
4261 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4262 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4263 }
4264
4265 ## reconsume
4266 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4267 redo A;
4268 } elsif ($self->{is_xml} and
4269 $self->{ct}->{type} == DOCTYPE_TOKEN and
4270 $self->{nc} == 0x005B) { # [
4271
4272 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4273
4274 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4275 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4276 $self->{in_subset} = 1;
4277
4278 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4279 $self->{line_prev} = $self->{line};
4280 $self->{column_prev} = $self->{column};
4281 $self->{column}++;
4282 $self->{nc}
4283 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4284 } else {
4285 $self->{set_nc}->($self);
4286 }
4287
4288 return ($self->{ct}); # DOCTYPE
4289 redo A;
4290 } else {
4291 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4292
4293 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4294
4295 $self->{ct}->{quirks} = 1;
4296 $self->{state} = BOGUS_DOCTYPE_STATE;
4297 } else {
4298
4299 $self->{state} = BOGUS_MD_STATE;
4300 }
4301
4302
4303 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4304 $self->{line_prev} = $self->{line};
4305 $self->{column_prev} = $self->{column};
4306 $self->{column}++;
4307 $self->{nc}
4308 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4309 } else {
4310 $self->{set_nc}->($self);
4311 }
4312
4313 redo A;
4314 }
4315 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4316 if ($self->{nc} == 0x0022) { # "
4317
4318 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4319
4320 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4321 $self->{line_prev} = $self->{line};
4322 $self->{column_prev} = $self->{column};
4323 $self->{column}++;
4324 $self->{nc}
4325 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4326 } else {
4327 $self->{set_nc}->($self);
4328 }
4329
4330 redo A;
4331 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4333
4334 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4335
4336 $self->{state} = DATA_STATE;
4337 $self->{s_kwd} = '';
4338 $self->{ct}->{quirks} = 1;
4339 } else {
4340
4341 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4342 }
4343
4344
4345 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4346 $self->{line_prev} = $self->{line};
4347 $self->{column_prev} = $self->{column};
4348 $self->{column}++;
4349 $self->{nc}
4350 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4351 } else {
4352 $self->{set_nc}->($self);
4353 }
4354
4355 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4356 redo A;
4357 } elsif ($self->{nc} == -1) {
4358 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4359
4360 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4361
4362 $self->{state} = DATA_STATE;
4363 $self->{s_kwd} = '';
4364 $self->{ct}->{quirks} = 1;
4365 } else {
4366
4367 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4368 }
4369
4370 ## reconsume
4371 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4372 redo A;
4373 } else {
4374
4375 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4376 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4377 length $self->{ct}->{sysid});
4378
4379 ## Stay in the state
4380
4381 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4382 $self->{line_prev} = $self->{line};
4383 $self->{column_prev} = $self->{column};
4384 $self->{column}++;
4385 $self->{nc}
4386 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4387 } else {
4388 $self->{set_nc}->($self);
4389 }
4390
4391 redo A;
4392 }
4393 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4394 if ($self->{nc} == 0x0027) { # '
4395
4396 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4397
4398 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4399 $self->{line_prev} = $self->{line};
4400 $self->{column_prev} = $self->{column};
4401 $self->{column}++;
4402 $self->{nc}
4403 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4404 } else {
4405 $self->{set_nc}->($self);
4406 }
4407
4408 redo A;
4409 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4410
4411 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4412
4413 $self->{state} = DATA_STATE;
4414 $self->{s_kwd} = '';
4415
4416 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4417 $self->{line_prev} = $self->{line};
4418 $self->{column_prev} = $self->{column};
4419 $self->{column}++;
4420 $self->{nc}
4421 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4422 } else {
4423 $self->{set_nc}->($self);
4424 }
4425
4426
4427 $self->{ct}->{quirks} = 1;
4428 return ($self->{ct}); # DOCTYPE
4429
4430 redo A;
4431 } elsif ($self->{nc} == -1) {
4432 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4433
4434 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4435
4436 $self->{state} = DATA_STATE;
4437 $self->{s_kwd} = '';
4438 $self->{ct}->{quirks} = 1;
4439 } else {
4440
4441 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4442 }
4443
4444 ## reconsume
4445 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4446 redo A;
4447 } else {
4448
4449 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4450 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4451 length $self->{ct}->{sysid});
4452
4453 ## Stay in the state
4454
4455 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4456 $self->{line_prev} = $self->{line};
4457 $self->{column_prev} = $self->{column};
4458 $self->{column}++;
4459 $self->{nc}
4460 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4461 } else {
4462 $self->{set_nc}->($self);
4463 }
4464
4465 redo A;
4466 }
4467 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4468 if ($is_space->{$self->{nc}}) {
4469 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4470
4471 $self->{state} = BEFORE_NDATA_STATE;
4472 } else {
4473
4474 ## Stay in the state
4475 }
4476
4477 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4478 $self->{line_prev} = $self->{line};
4479 $self->{column_prev} = $self->{column};
4480 $self->{column}++;
4481 $self->{nc}
4482 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4483 } else {
4484 $self->{set_nc}->($self);
4485 }
4486
4487 redo A;
4488 } elsif ($self->{nc} == 0x003E) { # >
4489 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4490
4491 $self->{state} = DATA_STATE;
4492 $self->{s_kwd} = '';
4493 } else {
4494
4495 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4496 }
4497
4498
4499 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4500 $self->{line_prev} = $self->{line};
4501 $self->{column_prev} = $self->{column};
4502 $self->{column}++;
4503 $self->{nc}
4504 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4505 } else {
4506 $self->{set_nc}->($self);
4507 }
4508
4509 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4510 redo A;
4511 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4512 ($self->{nc} == 0x004E or # N
4513 $self->{nc} == 0x006E)) { # n
4514
4515 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4516 $self->{state} = NDATA_STATE;
4517 $self->{kwd} = chr $self->{nc};
4518
4519 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4520 $self->{line_prev} = $self->{line};
4521 $self->{column_prev} = $self->{column};
4522 $self->{column}++;
4523 $self->{nc}
4524 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4525 } else {
4526 $self->{set_nc}->($self);
4527 }
4528
4529 redo A;
4530 } elsif ($self->{nc} == -1) {
4531 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4532
4533 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4534 $self->{state} = DATA_STATE;
4535 $self->{s_kwd} = '';
4536 $self->{ct}->{quirks} = 1;
4537 } else {
4538
4539 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4540 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4541 }
4542
4543 ## reconsume
4544 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4545 redo A;
4546 } elsif ($self->{is_xml} and
4547 $self->{ct}->{type} == DOCTYPE_TOKEN and
4548 $self->{nc} == 0x005B) { # [
4549
4550 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4551 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4552 $self->{in_subset} = 1;
4553
4554 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4555 $self->{line_prev} = $self->{line};
4556 $self->{column_prev} = $self->{column};
4557 $self->{column}++;
4558 $self->{nc}
4559 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4560 } else {
4561 $self->{set_nc}->($self);
4562 }
4563
4564 return ($self->{ct}); # DOCTYPE
4565 redo A;
4566 } else {
4567 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4568
4569 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4570
4571 #$self->{ct}->{quirks} = 1;
4572 $self->{state} = BOGUS_DOCTYPE_STATE;
4573 } else {
4574
4575 $self->{state} = BOGUS_MD_STATE;
4576 }
4577
4578
4579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4580 $self->{line_prev} = $self->{line};
4581 $self->{column_prev} = $self->{column};
4582 $self->{column}++;
4583 $self->{nc}
4584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4585 } else {
4586 $self->{set_nc}->($self);
4587 }
4588
4589 redo A;
4590 }
4591 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4592 if ($is_space->{$self->{nc}}) {
4593
4594 ## Stay in the state.
4595
4596 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4597 $self->{line_prev} = $self->{line};
4598 $self->{column_prev} = $self->{column};
4599 $self->{column}++;
4600 $self->{nc}
4601 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4602 } else {
4603 $self->{set_nc}->($self);
4604 }
4605
4606 redo A;
4607 } elsif ($self->{nc} == 0x003E) { # >
4608
4609 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4610
4611 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4612 $self->{line_prev} = $self->{line};
4613 $self->{column_prev} = $self->{column};
4614 $self->{column}++;
4615 $self->{nc}
4616 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4617 } else {
4618 $self->{set_nc}->($self);
4619 }
4620
4621 return ($self->{ct}); # ENTITY
4622 redo A;
4623 } elsif ($self->{nc} == 0x004E or # N
4624 $self->{nc} == 0x006E) { # n
4625
4626 $self->{state} = NDATA_STATE;
4627 $self->{kwd} = chr $self->{nc};
4628
4629 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4630 $self->{line_prev} = $self->{line};
4631 $self->{column_prev} = $self->{column};
4632 $self->{column}++;
4633 $self->{nc}
4634 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4635 } else {
4636 $self->{set_nc}->($self);
4637 }
4638
4639 redo A;
4640 } elsif ($self->{nc} == -1) {
4641
4642 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4643 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4644 ## reconsume
4645 return ($self->{ct}); # ENTITY
4646 redo A;
4647 } else {
4648
4649 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4650 $self->{state} = BOGUS_MD_STATE;
4651
4652 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4653 $self->{line_prev} = $self->{line};
4654 $self->{column_prev} = $self->{column};
4655 $self->{column}++;
4656 $self->{nc}
4657 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4658 } else {
4659 $self->{set_nc}->($self);
4660 }
4661
4662 redo A;
4663 }
4664 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4665 if ($self->{nc} == 0x003E) { # >
4666
4667 $self->{state} = DATA_STATE;
4668 $self->{s_kwd} = '';
4669
4670 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4671 $self->{line_prev} = $self->{line};
4672 $self->{column_prev} = $self->{column};
4673 $self->{column}++;
4674 $self->{nc}
4675 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4676 } else {
4677 $self->{set_nc}->($self);
4678 }
4679
4680
4681 return ($self->{ct}); # DOCTYPE
4682
4683 redo A;
4684 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4685
4686 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4687 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4688 $self->{in_subset} = 1;
4689
4690 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4691 $self->{line_prev} = $self->{line};
4692 $self->{column_prev} = $self->{column};
4693 $self->{column}++;
4694 $self->{nc}
4695 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4696 } else {
4697 $self->{set_nc}->($self);
4698 }
4699
4700 return ($self->{ct}); # DOCTYPE
4701 redo A;
4702 } elsif ($self->{nc} == -1) {
4703
4704 $self->{state} = DATA_STATE;
4705 $self->{s_kwd} = '';
4706 ## reconsume
4707
4708 return ($self->{ct}); # DOCTYPE
4709
4710 redo A;
4711 } else {
4712
4713 my $s = '';
4714 $self->{read_until}->($s, q{>[}, 0);
4715
4716 ## Stay in the state
4717
4718 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4719 $self->{line_prev} = $self->{line};
4720 $self->{column_prev} = $self->{column};
4721 $self->{column}++;
4722 $self->{nc}
4723 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4724 } else {
4725 $self->{set_nc}->($self);
4726 }
4727
4728 redo A;
4729 }
4730 } elsif ($self->{state} == CDATA_SECTION_STATE) {
4731 ## NOTE: "CDATA section state" in the spec is jointly implemented
4732 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4733 ## and |CDATA_SECTION_MSE2_STATE|.
4734
4735 ## XML5: "CDATA state".
4736
4737 if ($self->{nc} == 0x005D) { # ]
4738
4739 $self->{state} = CDATA_SECTION_MSE1_STATE;
4740
4741 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4742 $self->{line_prev} = $self->{line};
4743 $self->{column_prev} = $self->{column};
4744 $self->{column}++;
4745 $self->{nc}
4746 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4747 } else {
4748 $self->{set_nc}->($self);
4749 }
4750
4751 redo A;
4752 } elsif ($self->{nc} == -1) {
4753 if ($self->{is_xml}) {
4754
4755 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4756 } else {
4757
4758 }
4759
4760 $self->{state} = DATA_STATE;
4761 $self->{s_kwd} = '';
4762 ## Reconsume.
4763 if (length $self->{ct}->{data}) { # character
4764
4765 return ($self->{ct}); # character
4766 } else {
4767
4768 ## No token to emit. $self->{ct} is discarded.
4769 }
4770 redo A;
4771 } else {
4772
4773 $self->{ct}->{data} .= chr $self->{nc};
4774 $self->{read_until}->($self->{ct}->{data},
4775 q<]>,
4776 length $self->{ct}->{data});
4777
4778 ## Stay in the state.
4779
4780 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4781 $self->{line_prev} = $self->{line};
4782 $self->{column_prev} = $self->{column};
4783 $self->{column}++;
4784 $self->{nc}
4785 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4786 } else {
4787 $self->{set_nc}->($self);
4788 }
4789
4790 redo A;
4791 }
4792
4793 ## ISSUE: "text tokens" in spec.
4794 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4795 ## XML5: "CDATA bracket state".
4796
4797 if ($self->{nc} == 0x005D) { # ]
4798
4799 $self->{state} = CDATA_SECTION_MSE2_STATE;
4800
4801 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4802 $self->{line_prev} = $self->{line};
4803 $self->{column_prev} = $self->{column};
4804 $self->{column}++;
4805 $self->{nc}
4806 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4807 } else {
4808 $self->{set_nc}->($self);
4809 }
4810
4811 redo A;
4812 } else {
4813
4814 ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
4815 $self->{ct}->{data} .= ']';
4816 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4817 ## Reconsume.
4818 redo A;
4819 }
4820 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4821 ## XML5: "CDATA end state".
4822
4823 if ($self->{nc} == 0x003E) { # >
4824 $self->{state} = DATA_STATE;
4825 $self->{s_kwd} = '';
4826
4827 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4828 $self->{line_prev} = $self->{line};
4829 $self->{column_prev} = $self->{column};
4830 $self->{column}++;
4831 $self->{nc}
4832 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4833 } else {
4834 $self->{set_nc}->($self);
4835 }
4836
4837 if (length $self->{ct}->{data}) { # character
4838
4839 return ($self->{ct}); # character
4840 } else {
4841
4842 ## No token to emit. $self->{ct} is discarded.
4843 }
4844 redo A;
4845 } elsif ($self->{nc} == 0x005D) { # ]
4846 # character
4847 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4848 ## Stay in the state.
4849
4850 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4851 $self->{line_prev} = $self->{line};
4852 $self->{column_prev} = $self->{column};
4853 $self->{column}++;
4854 $self->{nc}
4855 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4856 } else {
4857 $self->{set_nc}->($self);
4858 }
4859
4860 redo A;
4861 } else {
4862
4863 $self->{ct}->{data} .= ']]'; # character
4864 $self->{state} = CDATA_SECTION_STATE;
4865 ## Reconsume. ## XML5: Emit.
4866 redo A;
4867 }
4868 } elsif ($self->{state} == ENTITY_STATE) {
4869 if ($is_space->{$self->{nc}} or
4870 {
4871 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4872 $self->{entity_add} => 1,
4873 }->{$self->{nc}}) {
4874 if ($self->{is_xml}) {
4875
4876 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4877 line => $self->{line_prev},
4878 column => $self->{column_prev}
4879 + ($self->{nc} == -1 ? 1 : 0));
4880 } else {
4881
4882 ## No error
4883 }
4884 ## Don't consume
4885 ## Return nothing.
4886 #
4887 } elsif ($self->{nc} == 0x0023) { # #
4888
4889 $self->{state} = ENTITY_HASH_STATE;
4890 $self->{kwd} = '#';
4891
4892 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4893 $self->{line_prev} = $self->{line};
4894 $self->{column_prev} = $self->{column};
4895 $self->{column}++;
4896 $self->{nc}
4897 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4898 } else {
4899 $self->{set_nc}->($self);
4900 }
4901
4902 redo A;
4903 } elsif ($self->{is_xml} or
4904 (0x0041 <= $self->{nc} and
4905 $self->{nc} <= 0x005A) or # A..Z
4906 (0x0061 <= $self->{nc} and
4907 $self->{nc} <= 0x007A)) { # a..z
4908
4909 require Whatpm::_NamedEntityList;
4910 $self->{state} = ENTITY_NAME_STATE;
4911 $self->{kwd} = chr $self->{nc};
4912 $self->{entity__value} = $self->{kwd};
4913 $self->{entity__match} = 0;
4914
4915 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4916 $self->{line_prev} = $self->{line};
4917 $self->{column_prev} = $self->{column};
4918 $self->{column}++;
4919 $self->{nc}
4920 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4921 } else {
4922 $self->{set_nc}->($self);
4923 }
4924
4925 redo A;
4926 } else {
4927
4928 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4929 ## Return nothing.
4930 #
4931 }
4932
4933 ## NOTE: No character is consumed by the "consume a character
4934 ## reference" algorithm. In other words, there is an "&" character
4935 ## that does not introduce a character reference, which would be
4936 ## appended to the parent element or the attribute value in later
4937 ## processing by the tokenizer.
4938
4939 if ($self->{prev_state} == DATA_STATE) {
4940
4941 $self->{state} = $self->{prev_state};
4942 $self->{s_kwd} = '';
4943 ## Reconsume.
4944 return ({type => CHARACTER_TOKEN, data => '&',
4945 line => $self->{line_prev},
4946 column => $self->{column_prev},
4947 });
4948 redo A;
4949 } else {
4950
4951 $self->{ca}->{value} .= '&';
4952 $self->{state} = $self->{prev_state};
4953 $self->{s_kwd} = '';
4954 ## Reconsume.
4955 redo A;
4956 }
4957 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4958 if ($self->{nc} == 0x0078) { # x
4959
4960 $self->{state} = HEXREF_X_STATE;
4961 $self->{kwd} .= chr $self->{nc};
4962
4963 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4964 $self->{line_prev} = $self->{line};
4965 $self->{column_prev} = $self->{column};
4966 $self->{column}++;
4967 $self->{nc}
4968 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4969 } else {
4970 $self->{set_nc}->($self);
4971 }
4972
4973 redo A;
4974 } elsif ($self->{nc} == 0x0058) { # X
4975
4976 if ($self->{is_xml}) {
4977 $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4978 }
4979 $self->{state} = HEXREF_X_STATE;
4980 $self->{kwd} .= chr $self->{nc};
4981
4982 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4983 $self->{line_prev} = $self->{line};
4984 $self->{column_prev} = $self->{column};
4985 $self->{column}++;
4986 $self->{nc}
4987 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4988 } else {
4989 $self->{set_nc}->($self);
4990 }
4991
4992 redo A;
4993 } elsif (0x0030 <= $self->{nc} and
4994 $self->{nc} <= 0x0039) { # 0..9
4995
4996 $self->{state} = NCR_NUM_STATE;
4997 $self->{kwd} = $self->{nc} - 0x0030;
4998
4999 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5000 $self->{line_prev} = $self->{line};
5001 $self->{column_prev} = $self->{column};
5002 $self->{column}++;
5003 $self->{nc}
5004 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5005 } else {
5006 $self->{set_nc}->($self);
5007 }
5008
5009 redo A;
5010 } else {
5011 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
5012 line => $self->{line_prev},
5013 column => $self->{column_prev} - 1);
5014
5015 ## NOTE: According to the spec algorithm, nothing is returned,
5016 ## and then "&#" is appended to the parent element or the attribute
5017 ## value in the later processing.
5018
5019 if ($self->{prev_state} == DATA_STATE) {
5020
5021 $self->{state} = $self->{prev_state};
5022 $self->{s_kwd} = '';
5023 ## Reconsume.
5024 return ({type => CHARACTER_TOKEN,
5025 data => '&#',
5026 line => $self->{line_prev},
5027 column => $self->{column_prev} - 1,
5028 });
5029 redo A;
5030 } else {
5031
5032 $self->{ca}->{value} .= '&#';
5033 $self->{state} = $self->{prev_state};
5034 $self->{s_kwd} = '';
5035 ## Reconsume.
5036 redo A;
5037 }
5038 }
5039 } elsif ($self->{state} == NCR_NUM_STATE) {
5040 if (0x0030 <= $self->{nc} and
5041 $self->{nc} <= 0x0039) { # 0..9
5042
5043 $self->{kwd} *= 10;
5044 $self->{kwd} += $self->{nc} - 0x0030;
5045
5046 ## Stay in the state.
5047
5048 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5049 $self->{line_prev} = $self->{line};
5050 $self->{column_prev} = $self->{column};
5051 $self->{column}++;
5052 $self->{nc}
5053 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5054 } else {
5055 $self->{set_nc}->($self);
5056 }
5057
5058 redo A;
5059 } elsif ($self->{nc} == 0x003B) { # ;
5060
5061
5062 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5063 $self->{line_prev} = $self->{line};
5064 $self->{column_prev} = $self->{column};
5065 $self->{column}++;
5066 $self->{nc}
5067 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5068 } else {
5069 $self->{set_nc}->($self);
5070 }
5071
5072 #
5073 } else {
5074
5075 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5076 ## Reconsume.
5077 #
5078 }
5079
5080 my $code = $self->{kwd};
5081 my $l = $self->{line_prev};
5082 my $c = $self->{column_prev};
5083 if ((not $self->{is_xml} and $charref_map->{$code}) or
5084 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5085 ($self->{is_xml} and $code == 0x0000)) {
5086
5087 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5088 text => (sprintf 'U+%04X', $code),
5089 line => $l, column => $c);
5090 $code = $charref_map->{$code};
5091 } elsif ($code > 0x10FFFF) {
5092
5093 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5094 text => (sprintf 'U-%08X', $code),
5095 line => $l, column => $c);
5096 $code = 0xFFFD;
5097 }
5098
5099 if ($self->{prev_state} == DATA_STATE) {
5100
5101 $self->{state} = $self->{prev_state};
5102 $self->{s_kwd} = '';
5103 ## Reconsume.
5104 return ({type => CHARACTER_TOKEN, data => chr $code,
5105 has_reference => 1,
5106 line => $l, column => $c,
5107 });
5108 redo A;
5109 } else {
5110
5111 $self->{ca}->{value} .= chr $code;
5112 $self->{ca}->{has_reference} = 1;
5113 $self->{state} = $self->{prev_state};
5114 $self->{s_kwd} = '';
5115 ## Reconsume.
5116 redo A;
5117 }
5118 } elsif ($self->{state} == HEXREF_X_STATE) {
5119 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
5120 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
5121 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
5122 # 0..9, A..F, a..f
5123
5124 $self->{state} = HEXREF_HEX_STATE;
5125 $self->{kwd} = 0;
5126 ## Reconsume.
5127 redo A;
5128 } else {
5129 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
5130 line => $self->{line_prev},
5131 column => $self->{column_prev} - 2);
5132
5133 ## NOTE: According to the spec algorithm, nothing is returned,
5134 ## and then "&#" followed by "X" or "x" is appended to the parent
5135 ## element or the attribute value in the later processing.
5136
5137 if ($self->{prev_state} == DATA_STATE) {
5138
5139 $self->{state} = $self->{prev_state};
5140 $self->{s_kwd} = '';
5141 ## Reconsume.
5142 return ({type => CHARACTER_TOKEN,
5143 data => '&' . $self->{kwd},
5144 line => $self->{line_prev},
5145 column => $self->{column_prev} - length $self->{kwd},
5146 });
5147 redo A;
5148 } else {
5149
5150 $self->{ca}->{value} .= '&' . $self->{kwd};
5151 $self->{state} = $self->{prev_state};
5152 $self->{s_kwd} = '';
5153 ## Reconsume.
5154 redo A;
5155 }
5156 }
5157 } elsif ($self->{state} == HEXREF_HEX_STATE) {
5158 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
5159 # 0..9
5160
5161 $self->{kwd} *= 0x10;
5162 $self->{kwd} += $self->{nc} - 0x0030;
5163 ## Stay in the state.
5164
5165 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5166 $self->{line_prev} = $self->{line};
5167 $self->{column_prev} = $self->{column};
5168 $self->{column}++;
5169 $self->{nc}
5170 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5171 } else {
5172 $self->{set_nc}->($self);
5173 }
5174
5175 redo A;
5176 } elsif (0x0061 <= $self->{nc} and
5177 $self->{nc} <= 0x0066) { # a..f
5178
5179 $self->{kwd} *= 0x10;
5180 $self->{kwd} += $self->{nc} - 0x0060 + 9;
5181 ## Stay in the state.
5182
5183 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5184 $self->{line_prev} = $self->{line};
5185 $self->{column_prev} = $self->{column};
5186 $self->{column}++;
5187 $self->{nc}
5188 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5189 } else {
5190 $self->{set_nc}->($self);
5191 }
5192
5193 redo A;
5194 } elsif (0x0041 <= $self->{nc} and
5195 $self->{nc} <= 0x0046) { # A..F
5196
5197 $self->{kwd} *= 0x10;
5198 $self->{kwd} += $self->{nc} - 0x0040 + 9;
5199 ## Stay in the state.
5200
5201 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5202 $self->{line_prev} = $self->{line};
5203 $self->{column_prev} = $self->{column};
5204 $self->{column}++;
5205 $self->{nc}
5206 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5207 } else {
5208 $self->{set_nc}->($self);
5209 }
5210
5211 redo A;
5212 } elsif ($self->{nc} == 0x003B) { # ;
5213
5214
5215 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5216 $self->{line_prev} = $self->{line};
5217 $self->{column_prev} = $self->{column};
5218 $self->{column}++;
5219 $self->{nc}
5220 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5221 } else {
5222 $self->{set_nc}->($self);
5223 }
5224
5225 #
5226 } else {
5227
5228 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5229 line => $self->{line},
5230 column => $self->{column});
5231 ## Reconsume.
5232 #
5233 }
5234
5235 my $code = $self->{kwd};
5236 my $l = $self->{line_prev};
5237 my $c = $self->{column_prev};
5238 if ((not $self->{is_xml} and $charref_map->{$code}) or
5239 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5240 ($self->{is_xml} and $code == 0x0000)) {
5241
5242 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5243 text => (sprintf 'U+%04X', $code),
5244 line => $l, column => $c);
5245 $code = $charref_map->{$code};
5246 } elsif ($code > 0x10FFFF) {
5247
5248 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5249 text => (sprintf 'U-%08X', $code),
5250 line => $l, column => $c);
5251 $code = 0xFFFD;
5252 }
5253
5254 if ($self->{prev_state} == DATA_STATE) {
5255
5256 $self->{state} = $self->{prev_state};
5257 $self->{s_kwd} = '';
5258 ## Reconsume.
5259 return ({type => CHARACTER_TOKEN, data => chr $code,
5260 has_reference => 1,
5261 line => $l, column => $c,
5262 });
5263 redo A;
5264 } else {
5265
5266 $self->{ca}->{value} .= chr $code;
5267 $self->{ca}->{has_reference} = 1;
5268 $self->{state} = $self->{prev_state};
5269 $self->{s_kwd} = '';
5270 ## Reconsume.
5271 redo A;
5272 }
5273 } elsif ($self->{state} == ENTITY_NAME_STATE) {
5274 if ((0x0041 <= $self->{nc} and # A
5275 $self->{nc} <= 0x005A) or # Z
5276 (0x0061 <= $self->{nc} and # a
5277 $self->{nc} <= 0x007A) or # z
5278 (0x0030 <= $self->{nc} and # 0
5279 $self->{nc} <= 0x0039) or # 9
5280 $self->{nc} == 0x003B or # ;
5281 ($self->{is_xml} and
5282 not ($is_space->{$self->{nc}} or
5283 {
5284 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5285 $self->{entity_add} => 1,
5286 }->{$self->{nc}}))) {
5287 our $EntityChar;
5288 $self->{kwd} .= chr $self->{nc};
5289 if (defined $EntityChar->{$self->{kwd}} or
5290 $self->{ge}->{$self->{kwd}}) {
5291 if ($self->{nc} == 0x003B) { # ;
5292 if (defined $self->{ge}->{$self->{kwd}}) {
5293 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5294
5295 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5296 } else {
5297 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5298
5299 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5300 value => $self->{kwd});
5301 } else {
5302
5303 }
5304 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5305 }
5306 } else {
5307 if ($self->{is_xml}) {
5308
5309 $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5310 value => $self->{kwd},
5311 level => {
5312 'amp;' => $self->{level}->{warn},
5313 'quot;' => $self->{level}->{warn},
5314 'lt;' => $self->{level}->{warn},
5315 'gt;' => $self->{level}->{warn},
5316 'apos;' => $self->{level}->{warn},
5317 }->{$self->{kwd}} ||
5318 $self->{level}->{must});
5319 } else {
5320
5321 }
5322 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5323 }
5324 $self->{entity__match} = 1;
5325
5326 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5327 $self->{line_prev} = $self->{line};
5328 $self->{column_prev} = $self->{column};
5329 $self->{column}++;
5330 $self->{nc}
5331 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5332 } else {
5333 $self->{set_nc}->($self);
5334 }
5335
5336 #
5337 } else {
5338
5339 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5340 $self->{entity__match} = -1;
5341 ## Stay in the state.
5342
5343 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5344 $self->{line_prev} = $self->{line};
5345 $self->{column_prev} = $self->{column};
5346 $self->{column}++;
5347 $self->{nc}
5348 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5349 } else {
5350 $self->{set_nc}->($self);
5351 }
5352
5353 redo A;
5354 }
5355 } else {
5356
5357 $self->{entity__value} .= chr $self->{nc};
5358 $self->{entity__match} *= 2;
5359 ## Stay in the state.
5360
5361 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5362 $self->{line_prev} = $self->{line};
5363 $self->{column_prev} = $self->{column};
5364 $self->{column}++;
5365 $self->{nc}
5366 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5367 } else {
5368 $self->{set_nc}->($self);
5369 }
5370
5371 redo A;
5372 }
5373 }
5374
5375 my $data;
5376 my $has_ref;
5377 if ($self->{entity__match} > 0) {
5378
5379 $data = $self->{entity__value};
5380 $has_ref = 1;
5381 #
5382 } elsif ($self->{entity__match} < 0) {
5383 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5384 if ($self->{prev_state} != DATA_STATE and # in attribute
5385 $self->{entity__match} < -1) {
5386
5387 $data = '&' . $self->{kwd};
5388 #
5389 } else {
5390
5391 $data = $self->{entity__value};
5392 $has_ref = 1;
5393 #
5394 }
5395 } else {
5396
5397 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5398 line => $self->{line_prev},
5399 column => $self->{column_prev} - length $self->{kwd});
5400 $data = '&' . $self->{kwd};
5401 #
5402 }
5403
5404 ## NOTE: In these cases, when a character reference is found,
5405 ## it is consumed and a character token is returned, or, otherwise,
5406 ## nothing is consumed and returned, according to the spec algorithm.
5407 ## In this implementation, anything that has been examined by the
5408 ## tokenizer is appended to the parent element or the attribute value
5409 ## as string, either literal string when no character reference or
5410 ## entity-replaced string otherwise, in this stage, since any characters
5411 ## that would not be consumed are appended in the data state or in an
5412 ## appropriate attribute value state anyway.
5413
5414 if ($self->{prev_state} == DATA_STATE) {
5415
5416 $self->{state} = $self->{prev_state};
5417 $self->{s_kwd} = '';
5418 ## Reconsume.
5419 return ({type => CHARACTER_TOKEN,
5420 data => $data,
5421 has_reference => $has_ref,
5422 line => $self->{line_prev},
5423 column => $self->{column_prev} + 1 - length $self->{kwd},
5424 });
5425 redo A;
5426 } else {
5427
5428 $self->{ca}->{value} .= $data;
5429 $self->{ca}->{has_reference} = 1 if $has_ref;
5430 $self->{state} = $self->{prev_state};
5431 $self->{s_kwd} = '';
5432 ## Reconsume.
5433 redo A;
5434 }
5435
5436 ## XML-only states
5437
5438 } elsif ($self->{state} == PI_STATE) {
5439 ## XML5: "Pi state" and "DOCTYPE pi state".
5440
5441 if ($is_space->{$self->{nc}} or
5442 $self->{nc} == 0x003F or # ?
5443 $self->{nc} == -1) {
5444 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5445 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5446 ## "DOCTYPE pi state": Parse error, switch to the "data
5447 ## state".
5448 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5449 line => $self->{line_prev},
5450 column => $self->{column_prev}
5451 - 1 * ($self->{nc} != -1));
5452 $self->{state} = BOGUS_COMMENT_STATE;
5453 ## Reconsume.
5454 $self->{ct} = {type => COMMENT_TOKEN,
5455 data => '?',
5456 line => $self->{line_prev},
5457 column => $self->{column_prev}
5458 - 1 * ($self->{nc} != -1),
5459 };
5460 redo A;
5461 } else {
5462 ## XML5: "DOCTYPE pi state": Stay in the state.
5463 $self->{ct} = {type => PI_TOKEN,
5464 target => chr $self->{nc},
5465 data => '',
5466 line => $self->{line_prev},
5467 column => $self->{column_prev} - 1,
5468 };
5469 $self->{state} = PI_TARGET_STATE;
5470
5471 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5472 $self->{line_prev} = $self->{line};
5473 $self->{column_prev} = $self->{column};
5474 $self->{column}++;
5475 $self->{nc}
5476 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5477 } else {
5478 $self->{set_nc}->($self);
5479 }
5480
5481 redo A;
5482 }
5483 } elsif ($self->{state} == PI_TARGET_STATE) {
5484 if ($is_space->{$self->{nc}}) {
5485 $self->{state} = PI_TARGET_AFTER_STATE;
5486
5487 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5488 $self->{line_prev} = $self->{line};
5489 $self->{column_prev} = $self->{column};
5490 $self->{column}++;
5491 $self->{nc}
5492 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5493 } else {
5494 $self->{set_nc}->($self);
5495 }
5496
5497 redo A;
5498 } elsif ($self->{nc} == -1) {
5499 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5500 if ($self->{in_subset}) {
5501 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5502 } else {
5503 $self->{state} = DATA_STATE;
5504 $self->{s_kwd} = '';
5505 }
5506 ## Reconsume.
5507 return ($self->{ct}); # pi
5508 redo A;
5509 } elsif ($self->{nc} == 0x003F) { # ?
5510 $self->{state} = PI_AFTER_STATE;
5511
5512 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5513 $self->{line_prev} = $self->{line};
5514 $self->{column_prev} = $self->{column};
5515 $self->{column}++;
5516 $self->{nc}
5517 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5518 } else {
5519 $self->{set_nc}->($self);
5520 }
5521
5522 redo A;
5523 } else {
5524 ## XML5: typo ("tag name" -> "target")
5525 $self->{ct}->{target} .= chr $self->{nc}; # pi
5526
5527 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5528 $self->{line_prev} = $self->{line};
5529 $self->{column_prev} = $self->{column};
5530 $self->{column}++;
5531 $self->{nc}
5532 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5533 } else {
5534 $self->{set_nc}->($self);
5535 }
5536
5537 redo A;
5538 }
5539 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5540 if ($is_space->{$self->{nc}}) {
5541 ## Stay in the state.
5542
5543 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5544 $self->{line_prev} = $self->{line};
5545 $self->{column_prev} = $self->{column};
5546 $self->{column}++;
5547 $self->{nc}
5548 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5549 } else {
5550 $self->{set_nc}->($self);
5551 }
5552
5553 redo A;
5554 } else {
5555 $self->{state} = PI_DATA_STATE;
5556 ## Reprocess.
5557 redo A;
5558 }
5559 } elsif ($self->{state} == PI_DATA_STATE) {
5560 if ($self->{nc} == 0x003F) { # ?
5561 $self->{state} = PI_DATA_AFTER_STATE;
5562
5563 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5564 $self->{line_prev} = $self->{line};
5565 $self->{column_prev} = $self->{column};
5566 $self->{column}++;
5567 $self->{nc}
5568 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5569 } else {
5570 $self->{set_nc}->($self);
5571 }
5572
5573 redo A;
5574 } elsif ($self->{nc} == -1) {
5575 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5576 if ($self->{in_subset}) {
5577 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5578 } else {
5579 $self->{state} = DATA_STATE;
5580 $self->{s_kwd} = '';
5581 }
5582 ## Reprocess.
5583 return ($self->{ct}); # pi
5584 redo A;
5585 } else {
5586 $self->{ct}->{data} .= chr $self->{nc}; # pi
5587 $self->{read_until}->($self->{ct}->{data}, q[?],
5588 length $self->{ct}->{data});
5589 ## Stay in the state.
5590
5591 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5592 $self->{line_prev} = $self->{line};
5593 $self->{column_prev} = $self->{column};
5594 $self->{column}++;
5595 $self->{nc}
5596 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5597 } else {
5598 $self->{set_nc}->($self);
5599 }
5600
5601 ## Reprocess.
5602 redo A;
5603 }
5604 } elsif ($self->{state} == PI_AFTER_STATE) {
5605 ## XML5: Part of "Pi after state".
5606
5607 if ($self->{nc} == 0x003E) { # >
5608 if ($self->{in_subset}) {
5609 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5610 } else {
5611 $self->{state} = DATA_STATE;
5612 $self->{s_kwd} = '';
5613 }
5614
5615 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5616 $self->{line_prev} = $self->{line};
5617 $self->{column_prev} = $self->{column};
5618 $self->{column}++;
5619 $self->{nc}
5620 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5621 } else {
5622 $self->{set_nc}->($self);
5623 }
5624
5625 return ($self->{ct}); # pi
5626 redo A;
5627 } elsif ($self->{nc} == 0x003F) { # ?
5628 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5629 line => $self->{line_prev},
5630 column => $self->{column_prev}); ## XML5: no error
5631 $self->{ct}->{data} .= '?';
5632 $self->{state} = PI_DATA_AFTER_STATE;
5633
5634 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5635 $self->{line_prev} = $self->{line};
5636 $self->{column_prev} = $self->{column};
5637 $self->{column}++;
5638 $self->{nc}
5639 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5640 } else {
5641 $self->{set_nc}->($self);
5642 }
5643
5644 redo A;
5645 } else {
5646 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5647 line => $self->{line_prev},
5648 column => $self->{column_prev}
5649 + 1 * ($self->{nc} == -1)); ## XML5: no error
5650 $self->{ct}->{data} .= '?'; ## XML5: not appended
5651 $self->{state} = PI_DATA_STATE;
5652 ## Reprocess.
5653 redo A;
5654 }
5655 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5656 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5657
5658 if ($self->{nc} == 0x003E) { # >
5659 if ($self->{in_subset}) {
5660 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5661 } else {
5662 $self->{state} = DATA_STATE;
5663 $self->{s_kwd} = '';
5664 }
5665
5666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5667 $self->{line_prev} = $self->{line};
5668 $self->{column_prev} = $self->{column};
5669 $self->{column}++;
5670 $self->{nc}
5671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5672 } else {
5673 $self->{set_nc}->($self);
5674 }
5675
5676 return ($self->{ct}); # pi
5677 redo A;
5678 } elsif ($self->{nc} == 0x003F) { # ?
5679 $self->{ct}->{data} .= '?';
5680 ## Stay in the state.
5681
5682 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5683 $self->{line_prev} = $self->{line};
5684 $self->{column_prev} = $self->{column};
5685 $self->{column}++;
5686 $self->{nc}
5687 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5688 } else {
5689 $self->{set_nc}->($self);
5690 }
5691
5692 redo A;
5693 } else {
5694 $self->{ct}->{data} .= '?'; ## XML5: not appended
5695 $self->{state} = PI_DATA_STATE;
5696 ## Reprocess.
5697 redo A;
5698 }
5699
5700 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5701 if ($self->{nc} == 0x003C) { # <
5702 $self->{state} = DOCTYPE_TAG_STATE;
5703
5704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5705 $self->{line_prev} = $self->{line};
5706 $self->{column_prev} = $self->{column};
5707 $self->{column}++;
5708 $self->{nc}
5709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5710 } else {
5711 $self->{set_nc}->($self);
5712 }
5713
5714 redo A;
5715 } elsif ($self->{nc} == 0x0025) { # %
5716 ## XML5: Not defined yet.
5717
5718 ## TODO:
5719
5720 if (not $self->{stop_processing} and
5721 not $self->{document}->xml_standalone) {
5722 $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5723 level => $self->{level}->{info});
5724 $self->{stop_processing} = 1;
5725 }
5726
5727
5728 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5729 $self->{line_prev} = $self->{line};
5730 $self->{column_prev} = $self->{column};
5731 $self->{column}++;
5732 $self->{nc}
5733 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5734 } else {
5735 $self->{set_nc}->($self);
5736 }
5737
5738 redo A;
5739 } elsif ($self->{nc} == 0x005D) { # ]
5740 delete $self->{in_subset};
5741 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5742
5743 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5744 $self->{line_prev} = $self->{line};
5745 $self->{column_prev} = $self->{column};
5746 $self->{column}++;
5747 $self->{nc}
5748 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5749 } else {
5750 $self->{set_nc}->($self);
5751 }
5752
5753 redo A;
5754 } elsif ($is_space->{$self->{nc}}) {
5755 ## Stay in the state.
5756
5757 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5758 $self->{line_prev} = $self->{line};
5759 $self->{column_prev} = $self->{column};
5760 $self->{column}++;
5761 $self->{nc}
5762 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5763 } else {
5764 $self->{set_nc}->($self);
5765 }
5766
5767 redo A;
5768 } elsif ($self->{nc} == -1) {
5769 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5770 delete $self->{in_subset};
5771 $self->{state} = DATA_STATE;
5772 $self->{s_kwd} = '';
5773 ## Reconsume.
5774 return ({type => END_OF_DOCTYPE_TOKEN});
5775 redo A;
5776 } else {
5777 unless ($self->{internal_subset_tainted}) {
5778 ## XML5: No parse error.
5779 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5780 $self->{internal_subset_tainted} = 1;
5781 }
5782 ## Stay in the state.
5783
5784 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5785 $self->{line_prev} = $self->{line};
5786 $self->{column_prev} = $self->{column};
5787 $self->{column}++;
5788 $self->{nc}
5789 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5790 } else {
5791 $self->{set_nc}->($self);
5792 }
5793
5794 redo A;
5795 }
5796 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5797 if ($self->{nc} == 0x003E) { # >
5798 $self->{state} = DATA_STATE;
5799 $self->{s_kwd} = '';
5800
5801 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5802 $self->{line_prev} = $self->{line};
5803 $self->{column_prev} = $self->{column};
5804 $self->{column}++;
5805 $self->{nc}
5806 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5807 } else {
5808 $self->{set_nc}->($self);
5809 }
5810
5811 return ({type => END_OF_DOCTYPE_TOKEN});
5812 redo A;
5813 } elsif ($self->{nc} == -1) {
5814 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5815 $self->{state} = DATA_STATE;
5816 $self->{s_kwd} = '';
5817 ## Reconsume.
5818 return ({type => END_OF_DOCTYPE_TOKEN});
5819 redo A;
5820 } else {
5821 ## XML5: No parse error and stay in the state.
5822 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5823
5824 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5825
5826 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5827 $self->{line_prev} = $self->{line};
5828 $self->{column_prev} = $self->{column};
5829 $self->{column}++;
5830 $self->{nc}
5831 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5832 } else {
5833 $self->{set_nc}->($self);
5834 }
5835
5836 redo A;
5837 }
5838 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5839 if ($self->{nc} == 0x003E) { # >
5840 $self->{state} = DATA_STATE;
5841 $self->{s_kwd} = '';
5842
5843 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5844 $self->{line_prev} = $self->{line};
5845 $self->{column_prev} = $self->{column};
5846 $self->{column}++;
5847 $self->{nc}
5848 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5849 } else {
5850 $self->{set_nc}->($self);
5851 }
5852
5853 return ({type => END_OF_DOCTYPE_TOKEN});
5854 redo A;
5855 } elsif ($self->{nc} == -1) {
5856 $self->{state} = DATA_STATE;
5857 $self->{s_kwd} = '';
5858 ## Reconsume.
5859 return ({type => END_OF_DOCTYPE_TOKEN});
5860 redo A;
5861 } else {
5862 ## Stay in the state.
5863
5864 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5865 $self->{line_prev} = $self->{line};
5866 $self->{column_prev} = $self->{column};
5867 $self->{column}++;
5868 $self->{nc}
5869 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5870 } else {
5871 $self->{set_nc}->($self);
5872 }
5873
5874 redo A;
5875 }
5876 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5877 if ($self->{nc} == 0x0021) { # !
5878 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5879
5880 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5881 $self->{line_prev} = $self->{line};
5882 $self->{column_prev} = $self->{column};
5883 $self->{column}++;
5884 $self->{nc}
5885 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5886 } else {
5887 $self->{set_nc}->($self);
5888 }
5889
5890 redo A;
5891 } elsif ($self->{nc} == 0x003F) { # ?
5892 $self->{state} = PI_STATE;
5893
5894 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5895 $self->{line_prev} = $self->{line};
5896 $self->{column_prev} = $self->{column};
5897 $self->{column}++;
5898 $self->{nc}
5899 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5900 } else {
5901 $self->{set_nc}->($self);
5902 }
5903
5904 redo A;
5905 } elsif ($self->{nc} == -1) {
5906 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5907 $self->{state} = DATA_STATE;
5908 $self->{s_kwd} = '';
5909 ## Reconsume.
5910 redo A;
5911 } else {
5912 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5913 line => $self->{line_prev},
5914 column => $self->{column_prev});
5915 $self->{state} = BOGUS_COMMENT_STATE;
5916 $self->{ct} = {type => COMMENT_TOKEN,
5917 data => '',
5918 }; ## NOTE: Will be discarded.
5919
5920 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5921 $self->{line_prev} = $self->{line};
5922 $self->{column_prev} = $self->{column};
5923 $self->{column}++;
5924 $self->{nc}
5925 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5926 } else {
5927 $self->{set_nc}->($self);
5928 }
5929
5930 redo A;
5931 }
5932 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5933 ## XML5: "DOCTYPE markup declaration state".
5934
5935 if ($self->{nc} == 0x002D) { # -
5936 $self->{state} = MD_HYPHEN_STATE;
5937
5938 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5939 $self->{line_prev} = $self->{line};
5940 $self->{column_prev} = $self->{column};
5941 $self->{column}++;
5942 $self->{nc}
5943 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5944 } else {
5945 $self->{set_nc}->($self);
5946 }
5947
5948 redo A;
5949 } elsif ($self->{nc} == 0x0045 or # E
5950 $self->{nc} == 0x0065) { # e
5951 $self->{state} = MD_E_STATE;
5952 $self->{kwd} = chr $self->{nc};
5953
5954 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5955 $self->{line_prev} = $self->{line};
5956 $self->{column_prev} = $self->{column};
5957 $self->{column}++;
5958 $self->{nc}
5959 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5960 } else {
5961 $self->{set_nc}->($self);
5962 }
5963
5964 redo A;
5965 } elsif ($self->{nc} == 0x0041 or # A
5966 $self->{nc} == 0x0061) { # a
5967 $self->{state} = MD_ATTLIST_STATE;
5968 $self->{kwd} = chr $self->{nc};
5969
5970 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5971 $self->{line_prev} = $self->{line};
5972 $self->{column_prev} = $self->{column};
5973 $self->{column}++;
5974 $self->{nc}
5975 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5976 } else {
5977 $self->{set_nc}->($self);
5978 }
5979
5980 redo A;
5981 } elsif ($self->{nc} == 0x004E or # N
5982 $self->{nc} == 0x006E) { # n
5983 $self->{state} = MD_NOTATION_STATE;
5984 $self->{kwd} = chr $self->{nc};
5985
5986 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5987 $self->{line_prev} = $self->{line};
5988 $self->{column_prev} = $self->{column};
5989 $self->{column}++;
5990 $self->{nc}
5991 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5992 } else {
5993 $self->{set_nc}->($self);
5994 }
5995
5996 redo A;
5997 } else {
5998 #
5999 }
6000
6001 ## XML5: No parse error.
6002 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6003 line => $self->{line_prev},
6004 column => $self->{column_prev} - 1);
6005 ## Reconsume.
6006 $self->{state} = BOGUS_COMMENT_STATE;
6007 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
6008 redo A;
6009 } elsif ($self->{state} == MD_E_STATE) {
6010 if ($self->{nc} == 0x004E or # N
6011 $self->{nc} == 0x006E) { # n
6012 $self->{state} = MD_ENTITY_STATE;
6013 $self->{kwd} .= chr $self->{nc};
6014
6015 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6016 $self->{line_prev} = $self->{line};
6017 $self->{column_prev} = $self->{column};
6018 $self->{column}++;
6019 $self->{nc}
6020 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6021 } else {
6022 $self->{set_nc}->($self);
6023 }
6024
6025 redo A;
6026 } elsif ($self->{nc} == 0x004C or # L
6027 $self->{nc} == 0x006C) { # l
6028 ## XML5: <!ELEMENT> not supported.
6029 $self->{state} = MD_ELEMENT_STATE;
6030 $self->{kwd} .= chr $self->{nc};
6031
6032 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6033 $self->{line_prev} = $self->{line};
6034 $self->{column_prev} = $self->{column};
6035 $self->{column}++;
6036 $self->{nc}
6037 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6038 } else {
6039 $self->{set_nc}->($self);
6040 }
6041
6042 redo A;
6043 } else {
6044 ## XML5: No parse error.
6045 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6046 line => $self->{line_prev},
6047 column => $self->{column_prev} - 2
6048 + 1 * ($self->{nc} == -1));
6049 ## Reconsume.
6050 $self->{state} = BOGUS_COMMENT_STATE;
6051 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6052 redo A;
6053 }
6054 } elsif ($self->{state} == MD_ENTITY_STATE) {
6055 if ($self->{nc} == [
6056 undef,
6057 undef,
6058 0x0054, # T
6059 0x0049, # I
6060 0x0054, # T
6061 ]->[length $self->{kwd}] or
6062 $self->{nc} == [
6063 undef,
6064 undef,
6065 0x0074, # t
6066 0x0069, # i
6067 0x0074, # t
6068 ]->[length $self->{kwd}]) {
6069 ## Stay in the state.
6070 $self->{kwd} .= chr $self->{nc};
6071
6072 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6073 $self->{line_prev} = $self->{line};
6074 $self->{column_prev} = $self->{column};
6075 $self->{column}++;
6076 $self->{nc}
6077 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6078 } else {
6079 $self->{set_nc}->($self);
6080 }
6081
6082 redo A;
6083 } elsif ((length $self->{kwd}) == 5 and
6084 ($self->{nc} == 0x0059 or # Y
6085 $self->{nc} == 0x0079)) { # y
6086 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
6087 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6088 text => 'ENTITY',
6089 line => $self->{line_prev},
6090 column => $self->{column_prev} - 4);
6091 }
6092 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
6093 line => $self->{line_prev},
6094 column => $self->{column_prev} - 6};
6095 $self->{state} = DOCTYPE_MD_STATE;
6096
6097 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6098 $self->{line_prev} = $self->{line};
6099 $self->{column_prev} = $self->{column};
6100 $self->{column}++;
6101 $self->{nc}
6102 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6103 } else {
6104 $self->{set_nc}->($self);
6105 }
6106
6107 redo A;
6108 } else {
6109 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6110 line => $self->{line_prev},
6111 column => $self->{column_prev} - 1
6112 - (length $self->{kwd})
6113 + 1 * ($self->{nc} == -1));
6114 $self->{state} = BOGUS_COMMENT_STATE;
6115 ## Reconsume.
6116 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6117 redo A;
6118 }
6119 } elsif ($self->{state} == MD_ELEMENT_STATE) {
6120 if ($self->{nc} == [
6121 undef,
6122 undef,
6123 0x0045, # E
6124 0x004D, # M
6125 0x0045, # E
6126 0x004E, # N
6127 ]->[length $self->{kwd}] or
6128 $self->{nc} == [
6129 undef,
6130 undef,
6131 0x0065, # e
6132 0x006D, # m
6133 0x0065, # e
6134 0x006E, # n
6135 ]->[length $self->{kwd}]) {
6136 ## Stay in the state.
6137 $self->{kwd} .= chr $self->{nc};
6138
6139 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6140 $self->{line_prev} = $self->{line};
6141 $self->{column_prev} = $self->{column};
6142 $self->{column}++;
6143 $self->{nc}
6144 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6145 } else {
6146 $self->{set_nc}->($self);
6147 }
6148
6149 redo A;
6150 } elsif ((length $self->{kwd}) == 6 and
6151 ($self->{nc} == 0x0054 or # T
6152 $self->{nc} == 0x0074)) { # t
6153 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
6154 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6155 text => 'ELEMENT',
6156 line => $self->{line_prev},
6157 column => $self->{column_prev} - 5);
6158 }
6159 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6160 line => $self->{line_prev},
6161 column => $self->{column_prev} - 7};
6162 $self->{state} = DOCTYPE_MD_STATE;
6163
6164 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6165 $self->{line_prev} = $self->{line};
6166 $self->{column_prev} = $self->{column};
6167 $self->{column}++;
6168 $self->{nc}
6169 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6170 } else {
6171 $self->{set_nc}->($self);
6172 }
6173
6174 redo A;
6175 } else {
6176 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6177 line => $self->{line_prev},
6178 column => $self->{column_prev} - 1
6179 - (length $self->{kwd})
6180 + 1 * ($self->{nc} == -1));
6181 $self->{state} = BOGUS_COMMENT_STATE;
6182 ## Reconsume.
6183 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6184 redo A;
6185 }
6186 } elsif ($self->{state} == MD_ATTLIST_STATE) {
6187 if ($self->{nc} == [
6188 undef,
6189 0x0054, # T
6190 0x0054, # T
6191 0x004C, # L
6192 0x0049, # I
6193 0x0053, # S
6194 ]->[length $self->{kwd}] or
6195 $self->{nc} == [
6196 undef,
6197 0x0074, # t
6198 0x0074, # t
6199 0x006C, # l
6200 0x0069, # i
6201 0x0073, # s
6202 ]->[length $self->{kwd}]) {
6203 ## Stay in the state.
6204 $self->{kwd} .= chr $self->{nc};
6205
6206 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6207 $self->{line_prev} = $self->{line};
6208 $self->{column_prev} = $self->{column};
6209 $self->{column}++;
6210 $self->{nc}
6211 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6212 } else {
6213 $self->{set_nc}->($self);
6214 }
6215
6216 redo A;
6217 } elsif ((length $self->{kwd}) == 6 and
6218 ($self->{nc} == 0x0054 or # T
6219 $self->{nc} == 0x0074)) { # t
6220 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6221 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6222 text => 'ATTLIST',
6223 line => $self->{line_prev},
6224 column => $self->{column_prev} - 5);
6225 }
6226 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6227 attrdefs => [],
6228 line => $self->{line_prev},
6229 column => $self->{column_prev} - 7};
6230 $self->{state} = DOCTYPE_MD_STATE;
6231
6232 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6233 $self->{line_prev} = $self->{line};
6234 $self->{column_prev} = $self->{column};
6235 $self->{column}++;
6236 $self->{nc}
6237 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6238 } else {
6239 $self->{set_nc}->($self);
6240 }
6241
6242 redo A;
6243 } else {
6244 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6245 line => $self->{line_prev},
6246 column => $self->{column_prev} - 1
6247 - (length $self->{kwd})
6248 + 1 * ($self->{nc} == -1));
6249 $self->{state} = BOGUS_COMMENT_STATE;
6250 ## Reconsume.
6251 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6252 redo A;
6253 }
6254 } elsif ($self->{state} == MD_NOTATION_STATE) {
6255 if ($self->{nc} == [
6256 undef,
6257 0x004F, # O
6258 0x0054, # T
6259 0x0041, # A
6260 0x0054, # T
6261 0x0049, # I
6262 0x004F, # O
6263 ]->[length $self->{kwd}] or
6264 $self->{nc} == [
6265 undef,
6266 0x006F, # o
6267 0x0074, # t
6268 0x0061, # a
6269 0x0074, # t
6270 0x0069, # i
6271 0x006F, # o
6272 ]->[length $self->{kwd}]) {
6273 ## Stay in the state.
6274 $self->{kwd} .= chr $self->{nc};
6275
6276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6277 $self->{line_prev} = $self->{line};
6278 $self->{column_prev} = $self->{column};
6279 $self->{column}++;
6280 $self->{nc}
6281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6282 } else {
6283 $self->{set_nc}->($self);
6284 }
6285
6286 redo A;
6287 } elsif ((length $self->{kwd}) == 7 and
6288 ($self->{nc} == 0x004E or # N
6289 $self->{nc} == 0x006E)) { # n
6290 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6291 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6292 text => 'NOTATION',
6293 line => $self->{line_prev},
6294 column => $self->{column_prev} - 6);
6295 }
6296 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6297 line => $self->{line_prev},
6298 column => $self->{column_prev} - 8};
6299 $self->{state} = DOCTYPE_MD_STATE;
6300
6301 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6302 $self->{line_prev} = $self->{line};
6303 $self->{column_prev} = $self->{column};
6304 $self->{column}++;
6305 $self->{nc}
6306 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6307 } else {
6308 $self->{set_nc}->($self);
6309 }
6310
6311 redo A;
6312 } else {
6313 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6314 line => $self->{line_prev},
6315 column => $self->{column_prev} - 1
6316 - (length $self->{kwd})
6317 + 1 * ($self->{nc} == -1));
6318 $self->{state} = BOGUS_COMMENT_STATE;
6319 ## Reconsume.
6320 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6321 redo A;
6322 }
6323 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6324 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6325 ## "DOCTYPE NOTATION state".
6326
6327 if ($is_space->{$self->{nc}}) {
6328 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6329 $self->{state} = BEFORE_MD_NAME_STATE;
6330
6331 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6332 $self->{line_prev} = $self->{line};
6333 $self->{column_prev} = $self->{column};
6334 $self->{column}++;
6335 $self->{nc}
6336 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6337 } else {
6338 $self->{set_nc}->($self);
6339 }
6340
6341 redo A;
6342 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6343 $self->{nc} == 0x0025) { # %
6344 ## XML5: Switch to the "DOCTYPE bogus comment state".
6345 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6346 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6347
6348 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6349 $self->{line_prev} = $self->{line};
6350 $self->{column_prev} = $self->{column};
6351 $self->{column}++;
6352 $self->{nc}
6353 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6354 } else {
6355 $self->{set_nc}->($self);
6356 }
6357
6358 redo A;
6359 } elsif ($self->{nc} == -1) {
6360 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6361 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6362 ## Reconsume.
6363 redo A;
6364 } elsif ($self->{nc} == 0x003E) { # >
6365 ## XML5: Switch to the "DOCTYPE bogus comment state".
6366 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6367 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6368
6369 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6370 $self->{line_prev} = $self->{line};
6371 $self->{column_prev} = $self->{column};
6372 $self->{column}++;
6373 $self->{nc}
6374 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6375 } else {
6376 $self->{set_nc}->($self);
6377 }
6378
6379 redo A;
6380 } else {
6381 ## XML5: Switch to the "DOCTYPE bogus comment state".
6382 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6383 $self->{state} = BEFORE_MD_NAME_STATE;
6384 redo A;
6385 }
6386 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6387 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6388 ## before state", "DOCTYPE ATTLIST name before state".
6389
6390 if ($is_space->{$self->{nc}}) {
6391 ## Stay in the state.
6392
6393 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6394 $self->{line_prev} = $self->{line};
6395 $self->{column_prev} = $self->{column};
6396 $self->{column}++;
6397 $self->{nc}
6398 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6399 } else {
6400 $self->{set_nc}->($self);
6401 }
6402
6403 redo A;
6404 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6405 $self->{nc} == 0x0025) { # %
6406 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6407
6408 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6409 $self->{line_prev} = $self->{line};
6410 $self->{column_prev} = $self->{column};
6411 $self->{column}++;
6412 $self->{nc}
6413 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6414 } else {
6415 $self->{set_nc}->($self);
6416 }
6417
6418 redo A;
6419 } elsif ($self->{nc} == 0x003E) { # >
6420 ## XML5: Same as "Anything else".
6421 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6422 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6423
6424 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6425 $self->{line_prev} = $self->{line};
6426 $self->{column_prev} = $self->{column};
6427 $self->{column}++;
6428 $self->{nc}
6429 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6430 } else {
6431 $self->{set_nc}->($self);
6432 }
6433
6434 redo A;
6435 } elsif ($self->{nc} == -1) {
6436 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6437 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6438 ## Reconsume.
6439 redo A;
6440 } else {
6441 ## XML5: [ATTLIST] Not defined yet.
6442 $self->{ct}->{name} .= chr $self->{nc};
6443 $self->{state} = MD_NAME_STATE;
6444
6445 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6446 $self->{line_prev} = $self->{line};
6447 $self->{column_prev} = $self->{column};
6448 $self->{column}++;
6449 $self->{nc}
6450 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6451 } else {
6452 $self->{set_nc}->($self);
6453 }
6454
6455 redo A;
6456 }
6457 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6458 if ($is_space->{$self->{nc}}) {
6459 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6460 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6461 $self->{state} = BEFORE_MD_NAME_STATE;
6462
6463 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6464 $self->{line_prev} = $self->{line};
6465 $self->{column_prev} = $self->{column};
6466 $self->{column}++;
6467 $self->{nc}
6468 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6469 } else {
6470 $self->{set_nc}->($self);
6471 }
6472
6473 redo A;
6474 } elsif ($self->{nc} == 0x003E) { # >
6475 ## XML5: Same as "Anything else".
6476 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6477 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6478
6479 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6480 $self->{line_prev} = $self->{line};
6481 $self->{column_prev} = $self->{column};
6482 $self->{column}++;
6483 $self->{nc}
6484 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6485 } else {
6486 $self->{set_nc}->($self);
6487 }
6488
6489 redo A;
6490 } elsif ($self->{nc} == -1) {
6491 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6492 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6493 ## Reconsume.
6494 redo A;
6495 } else {
6496 ## XML5: No parse error.
6497 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6498 $self->{state} = BOGUS_COMMENT_STATE;
6499 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6500 ## Reconsume.
6501 redo A;
6502 }
6503 } elsif ($self->{state} == MD_NAME_STATE) {
6504 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6505
6506 if ($is_space->{$self->{nc}}) {
6507 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6508 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6509 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6510 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6511 } else { # ENTITY/NOTATION
6512 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6513 }
6514
6515 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6516 $self->{line_prev} = $self->{line};
6517 $self->{column_prev} = $self->{column};
6518 $self->{column}++;
6519 $self->{nc}
6520 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6521 } else {
6522 $self->{set_nc}->($self);
6523 }
6524
6525 redo A;
6526 } elsif ($self->{nc} == 0x003E) { # >
6527 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6528 #
6529 } else {
6530 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6531 }
6532 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6533
6534 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6535 $self->{line_prev} = $self->{line};
6536 $self->{column_prev} = $self->{column};
6537 $self->{column}++;
6538 $self->{nc}
6539 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6540 } else {
6541 $self->{set_nc}->($self);
6542 }
6543
6544 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6545 redo A;
6546 } elsif ($self->{nc} == -1) {
6547 ## XML5: [ATTLIST] No parse error.
6548 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6549 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6550 ## Reconsume.
6551 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6552 redo A;
6553 } else {
6554 ## XML5: [ATTLIST] Not defined yet.
6555 $self->{ct}->{name} .= chr $self->{nc};
6556 ## Stay in the state.
6557
6558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6559 $self->{line_prev} = $self->{line};
6560 $self->{column_prev} = $self->{column};
6561 $self->{column}++;
6562 $self->{nc}
6563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6564 } else {
6565 $self->{set_nc}->($self);
6566 }
6567
6568 redo A;
6569 }
6570 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6571 if ($is_space->{$self->{nc}}) {
6572 ## Stay in the state.
6573
6574 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6575 $self->{line_prev} = $self->{line};
6576 $self->{column_prev} = $self->{column};
6577 $self->{column}++;
6578 $self->{nc}
6579 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6580 } else {
6581 $self->{set_nc}->($self);
6582 }
6583
6584 redo A;
6585 } elsif ($self->{nc} == 0x003E) { # >
6586 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6587
6588 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6589 $self->{line_prev} = $self->{line};
6590 $self->{column_prev} = $self->{column};
6591 $self->{column}++;
6592 $self->{nc}
6593 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6594 } else {
6595 $self->{set_nc}->($self);
6596 }
6597
6598 return ($self->{ct}); # ATTLIST
6599 redo A;
6600 } elsif ($self->{nc} == -1) {
6601 ## XML5: No parse error.
6602 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6603 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6604 return ($self->{ct});
6605 redo A;
6606 } else {
6607 ## XML5: Not defined yet.
6608 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6609 tokens => [],
6610 line => $self->{line}, column => $self->{column}};
6611 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6612
6613 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6614 $self->{line_prev} = $self->{line};
6615 $self->{column_prev} = $self->{column};
6616 $self->{column}++;
6617 $self->{nc}
6618 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6619 } else {
6620 $self->{set_nc}->($self);
6621 }
6622
6623 redo A;
6624 }
6625 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6626 if ($is_space->{$self->{nc}}) {
6627 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6628
6629 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6630 $self->{line_prev} = $self->{line};
6631 $self->{column_prev} = $self->{column};
6632 $self->{column}++;
6633 $self->{nc}
6634 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6635 } else {
6636 $self->{set_nc}->($self);
6637 }
6638
6639 redo A;
6640 } elsif ($self->{nc} == 0x003E) { # >
6641 ## XML5: Same as "anything else".
6642 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6643 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6644
6645 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6646 $self->{line_prev} = $self->{line};
6647 $self->{column_prev} = $self->{column};
6648 $self->{column}++;
6649 $self->{nc}
6650 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6651 } else {
6652 $self->{set_nc}->($self);
6653 }
6654
6655 return ($self->{ct}); # ATTLIST
6656 redo A;
6657 } elsif ($self->{nc} == 0x0028) { # (
6658 ## XML5: Same as "anything else".
6659 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6660 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6661
6662 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6663 $self->{line_prev} = $self->{line};
6664 $self->{column_prev} = $self->{column};
6665 $self->{column}++;
6666 $self->{nc}
6667 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6668 } else {
6669 $self->{set_nc}->($self);
6670 }
6671
6672 redo A;
6673 } elsif ($self->{nc} == -1) {
6674 ## XML5: No parse error.
6675 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6676 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6677
6678 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6679 $self->{line_prev} = $self->{line};
6680 $self->{column_prev} = $self->{column};
6681 $self->{column}++;
6682 $self->{nc}
6683 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6684 } else {
6685 $self->{set_nc}->($self);
6686 }
6687
6688 return ($self->{ct}); # ATTLIST
6689 redo A;
6690 } else {
6691 ## XML5: Not defined yet.
6692 $self->{ca}->{name} .= chr $self->{nc};
6693 ## Stay in the state.
6694
6695 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6696 $self->{line_prev} = $self->{line};
6697 $self->{column_prev} = $self->{column};
6698 $self->{column}++;
6699 $self->{nc}
6700 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6701 } else {
6702 $self->{set_nc}->($self);
6703 }
6704
6705 redo A;
6706 }
6707 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6708 if ($is_space->{$self->{nc}}) {
6709 ## Stay in the state.
6710
6711 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6712 $self->{line_prev} = $self->{line};
6713 $self->{column_prev} = $self->{column};
6714 $self->{column}++;
6715 $self->{nc}
6716 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6717 } else {
6718 $self->{set_nc}->($self);
6719 }
6720
6721 redo A;
6722 } elsif ($self->{nc} == 0x003E) { # >
6723 ## XML5: Same as "anything else".
6724 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6725 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6726
6727 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6728 $self->{line_prev} = $self->{line};
6729 $self->{column_prev} = $self->{column};
6730 $self->{column}++;
6731 $self->{nc}
6732 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6733 } else {
6734 $self->{set_nc}->($self);
6735 }
6736
6737 return ($self->{ct}); # ATTLIST
6738 redo A;
6739 } elsif ($self->{nc} == 0x0028) { # (
6740 ## XML5: Same as "anything else".
6741 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6742
6743 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6744 $self->{line_prev} = $self->{line};
6745 $self->{column_prev} = $self->{column};
6746 $self->{column}++;
6747 $self->{nc}
6748 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6749 } else {
6750 $self->{set_nc}->($self);
6751 }
6752
6753 redo A;
6754 } elsif ($self->{nc} == -1) {
6755 ## XML5: No parse error.
6756 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6757 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6758
6759 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6760 $self->{line_prev} = $self->{line};
6761 $self->{column_prev} = $self->{column};
6762 $self->{column}++;
6763 $self->{nc}
6764 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6765 } else {
6766 $self->{set_nc}->($self);
6767 }
6768
6769 return ($self->{ct});
6770 redo A;
6771 } else {
6772 ## XML5: Not defined yet.
6773 $self->{ca}->{type} = chr $self->{nc};
6774 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6775
6776 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6777 $self->{line_prev} = $self->{line};
6778 $self->{column_prev} = $self->{column};
6779 $self->{column}++;
6780 $self->{nc}
6781 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6782 } else {
6783 $self->{set_nc}->($self);
6784 }
6785
6786 redo A;
6787 }
6788 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6789 if ($is_space->{$self->{nc}}) {
6790 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6791
6792 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6793 $self->{line_prev} = $self->{line};
6794 $self->{column_prev} = $self->{column};
6795 $self->{column}++;
6796 $self->{nc}
6797 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6798 } else {
6799 $self->{set_nc}->($self);
6800 }
6801
6802 redo A;
6803 } elsif ($self->{nc} == 0x0023) { # #
6804 ## XML5: Same as "anything else".
6805 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6806 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6807
6808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6809 $self->{line_prev} = $self->{line};
6810 $self->{column_prev} = $self->{column};
6811 $self->{column}++;
6812 $self->{nc}
6813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6814 } else {
6815 $self->{set_nc}->($self);
6816 }
6817
6818 redo A;
6819 } elsif ($self->{nc} == 0x0022) { # "
6820 ## XML5: Same as "anything else".
6821 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6822 $self->{ca}->{value} = '';
6823 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6824
6825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6826 $self->{line_prev} = $self->{line};
6827 $self->{column_prev} = $self->{column};
6828 $self->{column}++;
6829 $self->{nc}
6830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6831 } else {
6832 $self->{set_nc}->($self);
6833 }
6834
6835 redo A;
6836 } elsif ($self->{nc} == 0x0027) { # '
6837 ## XML5: Same as "anything else".
6838 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6839 $self->{ca}->{value} = '';
6840 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6841
6842 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6843 $self->{line_prev} = $self->{line};
6844 $self->{column_prev} = $self->{column};
6845 $self->{column}++;
6846 $self->{nc}
6847 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6848 } else {
6849 $self->{set_nc}->($self);
6850 }
6851
6852 redo A;
6853 } elsif ($self->{nc} == 0x003E) { # >
6854 ## XML5: Same as "anything else".
6855 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6856 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6857
6858 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6859 $self->{line_prev} = $self->{line};
6860 $self->{column_prev} = $self->{column};
6861 $self->{column}++;
6862 $self->{nc}
6863 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6864 } else {
6865 $self->{set_nc}->($self);
6866 }
6867
6868 return ($self->{ct}); # ATTLIST
6869 redo A;
6870 } elsif ($self->{nc} == 0x0028) { # (
6871 ## XML5: Same as "anything else".
6872 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6873 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6874
6875 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6876 $self->{line_prev} = $self->{line};
6877 $self->{column_prev} = $self->{column};
6878 $self->{column}++;
6879 $self->{nc}
6880 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6881 } else {
6882 $self->{set_nc}->($self);
6883 }
6884
6885 redo A;
6886 } elsif ($self->{nc} == -1) {
6887 ## XML5: No parse error.
6888 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6889 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6890
6891 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6892 $self->{line_prev} = $self->{line};
6893 $self->{column_prev} = $self->{column};
6894 $self->{column}++;
6895 $self->{nc}
6896 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6897 } else {
6898 $self->{set_nc}->($self);
6899 }
6900
6901 return ($self->{ct});
6902 redo A;
6903 } else {
6904 ## XML5: Not defined yet.
6905 $self->{ca}->{type} .= chr $self->{nc};
6906 ## Stay in the state.
6907
6908 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6909 $self->{line_prev} = $self->{line};
6910 $self->{column_prev} = $self->{column};
6911 $self->{column}++;
6912 $self->{nc}
6913 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6914 } else {
6915 $self->{set_nc}->($self);
6916 }
6917
6918 redo A;
6919 }
6920 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6921 if ($is_space->{$self->{nc}}) {
6922 ## Stay in the state.
6923
6924 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6925 $self->{line_prev} = $self->{line};
6926 $self->{column_prev} = $self->{column};
6927 $self->{column}++;
6928 $self->{nc}
6929 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6930 } else {
6931 $self->{set_nc}->($self);
6932 }
6933
6934 redo A;
6935 } elsif ($self->{nc} == 0x0028) { # (
6936 ## XML5: Same as "anything else".
6937 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6938
6939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6940 $self->{line_prev} = $self->{line};
6941 $self->{column_prev} = $self->{column};
6942 $self->{column}++;
6943 $self->{nc}
6944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6945 } else {
6946 $self->{set_nc}->($self);
6947 }
6948
6949 redo A;
6950 } elsif ($self->{nc} == 0x0023) { # #
6951 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6952
6953 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6954 $self->{line_prev} = $self->{line};
6955 $self->{column_prev} = $self->{column};
6956 $self->{column}++;
6957 $self->{nc}
6958 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6959 } else {
6960 $self->{set_nc}->($self);
6961 }
6962
6963 redo A;
6964 } elsif ($self->{nc} == 0x0022) { # "
6965 ## XML5: Same as "anything else".
6966 $self->{ca}->{value} = '';
6967 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6968
6969 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6970 $self->{line_prev} = $self->{line};
6971 $self->{column_prev} = $self->{column};
6972 $self->{column}++;
6973 $self->{nc}
6974 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6975 } else {
6976 $self->{set_nc}->($self);
6977 }
6978
6979 redo A;
6980 } elsif ($self->{nc} == 0x0027) { # '
6981 ## XML5: Same as "anything else".
6982 $self->{ca}->{value} = '';
6983 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6984
6985 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6986 $self->{line_prev} = $self->{line};
6987 $self->{column_prev} = $self->{column};
6988 $self->{column}++;
6989 $self->{nc}
6990 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6991 } else {
6992 $self->{set_nc}->($self);
6993 }
6994
6995 redo A;
6996 } elsif ($self->{nc} == 0x003E) { # >
6997 ## XML5: Same as "anything else".
6998 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6999 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7000
7001 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7002 $self->{line_prev} = $self->{line};
7003 $self->{column_prev} = $self->{column};
7004 $self->{column}++;
7005 $self->{nc}
7006 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7007 } else {
7008 $self->{set_nc}->($self);
7009 }
7010
7011 return ($self->{ct}); # ATTLIST
7012 redo A;
7013 } elsif ($self->{nc} == -1) {
7014 ## XML5: No parse error.
7015 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7016 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7017
7018 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7019 $self->{line_prev} = $self->{line};
7020 $self->{column_prev} = $self->{column};
7021 $self->{column}++;
7022 $self->{nc}
7023 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7024 } else {
7025 $self->{set_nc}->($self);
7026 }
7027
7028 return ($self->{ct});
7029 redo A;
7030 } else {
7031 ## XML5: Switch to the "DOCTYPE bogus comment state".
7032 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7033 $self->{ca}->{value} = '';
7034 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7035 ## Reconsume.
7036 redo A;
7037 }
7038 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
7039 if ($is_space->{$self->{nc}}) {
7040 ## Stay in the state.
7041
7042 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7043 $self->{line_prev} = $self->{line};
7044 $self->{column_prev} = $self->{column};
7045 $self->{column}++;
7046 $self->{nc}
7047 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7048 } else {
7049 $self->{set_nc}->($self);
7050 }
7051
7052 redo A;
7053 } elsif ($self->{nc} == 0x007C) { # |
7054 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7055 ## Stay in the state.
7056
7057 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7058 $self->{line_prev} = $self->{line};
7059 $self->{column_prev} = $self->{column};
7060 $self->{column}++;
7061 $self->{nc}
7062 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7063 } else {
7064 $self->{set_nc}->($self);
7065 }
7066
7067 redo A;
7068 } elsif ($self->{nc} == 0x0029) { # )
7069 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7070 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7071
7072 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7073 $self->{line_prev} = $self->{line};
7074 $self->{column_prev} = $self->{column};
7075 $self->{column}++;
7076 $self->{nc}
7077 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7078 } else {
7079 $self->{set_nc}->($self);
7080 }
7081
7082 redo A;
7083 } elsif ($self->{nc} == 0x003E) { # >
7084 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7085 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7086
7087 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7088 $self->{line_prev} = $self->{line};
7089 $self->{column_prev} = $self->{column};
7090 $self->{column}++;
7091 $self->{nc}
7092 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7093 } else {
7094 $self->{set_nc}->($self);
7095 }
7096
7097 return ($self->{ct}); # ATTLIST
7098 redo A;
7099 } elsif ($self->{nc} == -1) {
7100 ## XML5: No parse error.
7101 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7102 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7103
7104 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7105 $self->{line_prev} = $self->{line};
7106 $self->{column_prev} = $self->{column};
7107 $self->{column}++;
7108 $self->{nc}
7109 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7110 } else {
7111 $self->{set_nc}->($self);
7112 }
7113
7114 return ($self->{ct});
7115 redo A;
7116 } else {
7117 push @{$self->{ca}->{tokens}}, chr $self->{nc};
7118 $self->{state} = ALLOWED_TOKEN_STATE;
7119
7120 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7121 $self->{line_prev} = $self->{line};
7122 $self->{column_prev} = $self->{column};
7123 $self->{column}++;
7124 $self->{nc}
7125 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7126 } else {
7127 $self->{set_nc}->($self);
7128 }
7129
7130 redo A;
7131 }
7132 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
7133 if ($is_space->{$self->{nc}}) {
7134 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7135
7136 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7137 $self->{line_prev} = $self->{line};
7138 $self->{column_prev} = $self->{column};
7139 $self->{column}++;
7140 $self->{nc}
7141 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7142 } else {
7143 $self->{set_nc}->($self);
7144 }
7145
7146 redo A;
7147 } elsif ($self->{nc} == 0x007C) { # |
7148 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7149
7150 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7151 $self->{line_prev} = $self->{line};
7152 $self->{column_prev} = $self->{column};
7153 $self->{column}++;
7154 $self->{nc}
7155 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7156 } else {
7157 $self->{set_nc}->($self);
7158 }
7159
7160 redo A;
7161 } elsif ($self->{nc} == 0x0029) { # )
7162 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7163
7164 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7165 $self->{line_prev} = $self->{line};
7166 $self->{column_prev} = $self->{column};
7167 $self->{column}++;
7168 $self->{nc}
7169 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7170 } else {
7171 $self->{set_nc}->($self);
7172 }
7173
7174 redo A;
7175 } elsif ($self->{nc} == 0x003E) { # >
7176 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7177 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7178
7179 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7180 $self->{line_prev} = $self->{line};
7181 $self->{column_prev} = $self->{column};
7182 $self->{column}++;
7183 $self->{nc}
7184 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7185 } else {
7186 $self->{set_nc}->($self);
7187 }
7188
7189 return ($self->{ct}); # ATTLIST
7190 redo A;
7191 } elsif ($self->{nc} == -1) {
7192 ## XML5: No parse error.
7193 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7194 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7195
7196 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7197 $self->{line_prev} = $self->{line};
7198 $self->{column_prev} = $self->{column};
7199 $self->{column}++;
7200 $self->{nc}
7201 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7202 } else {
7203 $self->{set_nc}->($self);
7204 }
7205
7206 return ($self->{ct});
7207 redo A;
7208 } else {
7209 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7210 ## Stay in the state.
7211
7212 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7213 $self->{line_prev} = $self->{line};
7214 $self->{column_prev} = $self->{column};
7215 $self->{column}++;
7216 $self->{nc}
7217 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7218 } else {
7219 $self->{set_nc}->($self);
7220 }
7221
7222 redo A;
7223 }
7224 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7225 if ($is_space->{$self->{nc}}) {
7226 ## Stay in the state.
7227
7228 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7229 $self->{line_prev} = $self->{line};
7230 $self->{column_prev} = $self->{column};
7231 $self->{column}++;
7232 $self->{nc}
7233 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7234 } else {
7235 $self->{set_nc}->($self);
7236 }
7237
7238 redo A;
7239 } elsif ($self->{nc} == 0x007C) { # |
7240 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7241
7242 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7243 $self->{line_prev} = $self->{line};
7244 $self->{column_prev} = $self->{column};
7245 $self->{column}++;
7246 $self->{nc}
7247 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7248 } else {
7249 $self->{set_nc}->($self);
7250 }
7251
7252 redo A;
7253 } elsif ($self->{nc} == 0x0029) { # )
7254 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7255
7256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7257 $self->{line_prev} = $self->{line};
7258 $self->{column_prev} = $self->{column};
7259 $self->{column}++;
7260 $self->{nc}
7261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7262 } else {
7263 $self->{set_nc}->($self);
7264 }
7265
7266 redo A;
7267 } elsif ($self->{nc} == 0x003E) { # >
7268 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7269 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7270
7271 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7272 $self->{line_prev} = $self->{line};
7273 $self->{column_prev} = $self->{column};
7274 $self->{column}++;
7275 $self->{nc}
7276 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7277 } else {
7278 $self->{set_nc}->($self);
7279 }
7280
7281 return ($self->{ct}); # ATTLIST
7282 redo A;
7283 } elsif ($self->{nc} == -1) {
7284 ## XML5: No parse error.
7285 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7286 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7287
7288 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7289 $self->{line_prev} = $self->{line};
7290 $self->{column_prev} = $self->{column};
7291 $self->{column}++;
7292 $self->{nc}
7293 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7294 } else {
7295 $self->{set_nc}->($self);
7296 }
7297
7298 return ($self->{ct});
7299 redo A;
7300 } else {
7301 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7302 line => $self->{line_prev},
7303 column => $self->{column_prev});
7304 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7305 $self->{state} = ALLOWED_TOKEN_STATE;
7306
7307 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7308 $self->{line_prev} = $self->{line};
7309 $self->{column_prev} = $self->{column};
7310 $self->{column}++;
7311 $self->{nc}
7312 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7313 } else {
7314 $self->{set_nc}->($self);
7315 }
7316
7317 redo A;
7318 }
7319 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7320 if ($is_space->{$self->{nc}}) {
7321 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7322
7323 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7324 $self->{line_prev} = $self->{line};
7325 $self->{column_prev} = $self->{column};
7326 $self->{column}++;
7327 $self->{nc}
7328 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7329 } else {
7330 $self->{set_nc}->($self);
7331 }
7332
7333 redo A;
7334 } elsif ($self->{nc} == 0x0023) { # #
7335 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7336 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7337
7338 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7339 $self->{line_prev} = $self->{line};
7340 $self->{column_prev} = $self->{column};
7341 $self->{column}++;
7342 $self->{nc}
7343 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7344 } else {
7345 $self->{set_nc}->($self);
7346 }
7347
7348 redo A;
7349 } elsif ($self->{nc} == 0x0022) { # "
7350 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7351 $self->{ca}->{value} = '';
7352 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7353
7354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7355 $self->{line_prev} = $self->{line};
7356 $self->{column_prev} = $self->{column};
7357 $self->{column}++;
7358 $self->{nc}
7359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7360 } else {
7361 $self->{set_nc}->($self);
7362 }
7363
7364 redo A;
7365 } elsif ($self->{nc} == 0x0027) { # '
7366 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7367 $self->{ca}->{value} = '';
7368 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7369
7370 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7371 $self->{line_prev} = $self->{line};
7372 $self->{column_prev} = $self->{column};
7373 $self->{column}++;
7374 $self->{nc}
7375 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7376 } else {
7377 $self->{set_nc}->($self);
7378 }
7379
7380 redo A;
7381 } elsif ($self->{nc} == 0x003E) { # >
7382 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7383 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7384
7385 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7386 $self->{line_prev} = $self->{line};
7387 $self->{column_prev} = $self->{column};
7388 $self->{column}++;
7389 $self->{nc}
7390 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7391 } else {
7392 $self->{set_nc}->($self);
7393 }
7394
7395 return ($self->{ct}); # ATTLIST
7396 redo A;
7397 } elsif ($self->{nc} == -1) {
7398 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7399 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7400
7401 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7402 $self->{line_prev} = $self->{line};
7403 $self->{column_prev} = $self->{column};
7404 $self->{column}++;
7405 $self->{nc}
7406 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7407 } else {
7408 $self->{set_nc}->($self);
7409 }
7410
7411 return ($self->{ct});
7412 redo A;
7413 } else {
7414 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7415 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7416 ## Reconsume.
7417 redo A;
7418 }
7419 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7420 if ($is_space->{$self->{nc}}) {
7421 ## Stay in the state.
7422
7423 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7424 $self->{line_prev} = $self->{line};
7425 $self->{column_prev} = $self->{column};
7426 $self->{column}++;
7427 $self->{nc}
7428 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7429 } else {
7430 $self->{set_nc}->($self);
7431 }
7432
7433 redo A;
7434 } elsif ($self->{nc} == 0x0023) { # #
7435 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7436
7437 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7438 $self->{line_prev} = $self->{line};
7439 $self->{column_prev} = $self->{column};
7440 $self->{column}++;
7441 $self->{nc}
7442 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7443 } else {
7444 $self->{set_nc}->($self);
7445 }
7446
7447 redo A;
7448 } elsif ($self->{nc} == 0x0022) { # "
7449 $self->{ca}->{value} = '';
7450 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7451
7452 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7453 $self->{line_prev} = $self->{line};
7454 $self->{column_prev} = $self->{column};
7455 $self->{column}++;
7456 $self->{nc}
7457 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7458 } else {
7459 $self->{set_nc}->($self);
7460 }
7461
7462 redo A;
7463 } elsif ($self->{nc} == 0x0027) { # '
7464 $self->{ca}->{value} = '';
7465 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7466
7467 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7468 $self->{line_prev} = $self->{line};
7469 $self->{column_prev} = $self->{column};
7470 $self->{column}++;
7471 $self->{nc}
7472 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7473 } else {
7474 $self->{set_nc}->($self);
7475 }
7476
7477 redo A;
7478 } elsif ($self->{nc} == 0x003E) { # >
7479 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7480 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7481
7482 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7483 $self->{line_prev} = $self->{line};
7484 $self->{column_prev} = $self->{column};
7485 $self->{column}++;
7486 $self->{nc}
7487 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7488 } else {
7489 $self->{set_nc}->($self);
7490 }
7491
7492 return ($self->{ct}); # ATTLIST
7493 redo A;
7494 } elsif ($self->{nc} == -1) {
7495 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7496 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7497
7498 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7499 $self->{line_prev} = $self->{line};
7500 $self->{column_prev} = $self->{column};
7501 $self->{column}++;
7502 $self->{nc}
7503 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7504 } else {
7505 $self->{set_nc}->($self);
7506 }
7507
7508 return ($self->{ct});
7509 redo A;
7510 } else {
7511 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7512 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7513 ## Reconsume.
7514 redo A;
7515 }
7516 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7517 if ($is_space->{$self->{nc}}) {
7518 ## XML5: No parse error.
7519 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7520 $self->{state} = BOGUS_MD_STATE;
7521 ## Reconsume.
7522 redo A;
7523 } elsif ($self->{nc} == 0x0022) { # "
7524 ## XML5: Same as "anything else".
7525 $self->{ca}->{value} = '';
7526 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7527
7528 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7529 $self->{line_prev} = $self->{line};
7530 $self->{column_prev} = $self->{column};
7531 $self->{column}++;
7532 $self->{nc}
7533 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7534 } else {
7535 $self->{set_nc}->($self);
7536 }
7537
7538 redo A;
7539 } elsif ($self->{nc} == 0x0027) { # '
7540 ## XML5: Same as "anything else".
7541 $self->{ca}->{value} = '';
7542 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7543
7544 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7545 $self->{line_prev} = $self->{line};
7546 $self->{column_prev} = $self->{column};
7547 $self->{column}++;
7548 $self->{nc}
7549 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7550 } else {
7551 $self->{set_nc}->($self);
7552 }
7553
7554 redo A;
7555 } elsif ($self->{nc} == 0x003E) { # >
7556 ## XML5: Same as "anything else".
7557 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7558 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7559
7560 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7561 $self->{line_prev} = $self->{line};
7562 $self->{column_prev} = $self->{column};
7563 $self->{column}++;
7564 $self->{nc}
7565 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7566 } else {
7567 $self->{set_nc}->($self);
7568 }
7569
7570 return ($self->{ct}); # ATTLIST
7571 redo A;
7572 } elsif ($self->{nc} == -1) {
7573 ## XML5: No parse error.
7574 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7575 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7576
7577 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7578 $self->{line_prev} = $self->{line};
7579 $self->{column_prev} = $self->{column};
7580 $self->{column}++;
7581 $self->{nc}
7582 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7583 } else {
7584 $self->{set_nc}->($self);
7585 }
7586
7587 return ($self->{ct});
7588 redo A;
7589 } else {
7590 $self->{ca}->{default} = chr $self->{nc};
7591 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7592
7593 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7594 $self->{line_prev} = $self->{line};
7595 $self->{column_prev} = $self->{column};
7596 $self->{column}++;
7597 $self->{nc}
7598 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7599 } else {
7600 $self->{set_nc}->($self);
7601 }
7602
7603 redo A;
7604 }
7605 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7606 if ($is_space->{$self->{nc}}) {
7607 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7608
7609 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7610 $self->{line_prev} = $self->{line};
7611 $self->{column_prev} = $self->{column};
7612 $self->{column}++;
7613 $self->{nc}
7614 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7615 } else {
7616 $self->{set_nc}->($self);
7617 }
7618
7619 redo A;
7620 } elsif ($self->{nc} == 0x0022) { # "
7621 ## XML5: Same as "anything else".
7622 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7623 $self->{ca}->{value} = '';
7624 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7625
7626 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7627 $self->{line_prev} = $self->{line};
7628 $self->{column_prev} = $self->{column};
7629 $self->{column}++;
7630 $self->{nc}
7631 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7632 } else {
7633 $self->{set_nc}->($self);
7634 }
7635
7636 redo A;
7637 } elsif ($self->{nc} == 0x0027) { # '
7638 ## XML5: Same as "anything else".
7639 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7640 $self->{ca}->{value} = '';
7641 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7642
7643 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7644 $self->{line_prev} = $self->{line};
7645 $self->{column_prev} = $self->{column};
7646 $self->{column}++;
7647 $self->{nc}
7648 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7649 } else {
7650 $self->{set_nc}->($self);
7651 }
7652
7653 redo A;
7654 } elsif ($self->{nc} == 0x003E) { # >
7655 ## XML5: Same as "anything else".
7656 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7657 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7658
7659 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7660 $self->{line_prev} = $self->{line};
7661 $self->{column_prev} = $self->{column};
7662 $self->{column}++;
7663 $self->{nc}
7664 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7665 } else {
7666 $self->{set_nc}->($self);
7667 }
7668
7669 return ($self->{ct}); # ATTLIST
7670 redo A;
7671 } elsif ($self->{nc} == -1) {
7672 ## XML5: No parse error.
7673 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7674 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7675 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7676
7677 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7678 $self->{line_prev} = $self->{line};
7679 $self->{column_prev} = $self->{column};
7680 $self->{column}++;
7681 $self->{nc}
7682 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7683 } else {
7684 $self->{set_nc}->($self);
7685 }
7686
7687 return ($self->{ct});
7688 redo A;
7689 } else {
7690 $self->{ca}->{default} .= chr $self->{nc};
7691 ## Stay in the state.
7692
7693 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7694 $self->{line_prev} = $self->{line};
7695 $self->{column_prev} = $self->{column};
7696 $self->{column}++;
7697 $self->{nc}
7698 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7699 } else {
7700 $self->{set_nc}->($self);
7701 }
7702
7703 redo A;
7704 }
7705 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7706 if ($is_space->{$self->{nc}}) {
7707 ## Stay in the state.
7708
7709 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7710 $self->{line_prev} = $self->{line};
7711 $self->{column_prev} = $self->{column};
7712 $self->{column}++;
7713 $self->{nc}
7714 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7715 } else {
7716 $self->{set_nc}->($self);
7717 }
7718
7719 redo A;
7720 } elsif ($self->{nc} == 0x0022) { # "
7721 $self->{ca}->{value} = '';
7722 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7723
7724 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7725 $self->{line_prev} = $self->{line};
7726 $self->{column_prev} = $self->{column};
7727 $self->{column}++;
7728 $self->{nc}
7729 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7730 } else {
7731 $self->{set_nc}->($self);
7732 }
7733
7734 redo A;
7735 } elsif ($self->{nc} == 0x0027) { # '
7736 $self->{ca}->{value} = '';
7737 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7738
7739 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7740 $self->{line_prev} = $self->{line};
7741 $self->{column_prev} = $self->{column};
7742 $self->{column}++;
7743 $self->{nc}
7744 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7745 } else {
7746 $self->{set_nc}->($self);
7747 }
7748
7749 redo A;
7750 } elsif ($self->{nc} == 0x003E) { # >
7751 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7752 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7753
7754 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7755 $self->{line_prev} = $self->{line};
7756 $self->{column_prev} = $self->{column};
7757 $self->{column}++;
7758 $self->{nc}
7759 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7760 } else {
7761 $self->{set_nc}->($self);
7762 }
7763
7764 return ($self->{ct}); # ATTLIST
7765 redo A;
7766 } elsif ($self->{nc} == -1) {
7767 ## XML5: No parse error.
7768 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7769 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7770 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7771
7772 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7773 $self->{line_prev} = $self->{line};
7774 $self->{column_prev} = $self->{column};
7775 $self->{column}++;
7776 $self->{nc}
7777 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7778 } else {
7779 $self->{set_nc}->($self);
7780 }
7781
7782 return ($self->{ct});
7783 redo A;
7784 } else {
7785 ## XML5: Not defined yet.
7786 if ($self->{ca}->{default} eq 'FIXED') {
7787 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7788 } else {
7789 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7790 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7791 }
7792 ## Reconsume.
7793 redo A;
7794 }
7795 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7796 if ($is_space->{$self->{nc}} or
7797 $self->{nc} == -1 or
7798 $self->{nc} == 0x003E) { # >
7799 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7800 ## Reconsume.
7801 redo A;
7802 } else {
7803 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7804 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7805 ## Reconsume.
7806 redo A;
7807 }
7808 } elsif ($self->{state} == NDATA_STATE) {
7809 ## ASCII case-insensitive
7810 if ($self->{nc} == [
7811 undef,
7812 0x0044, # D
7813 0x0041, # A
7814 0x0054, # T
7815 ]->[length $self->{kwd}] or
7816 $self->{nc} == [
7817 undef,
7818 0x0064, # d
7819 0x0061, # a
7820 0x0074, # t
7821 ]->[length $self->{kwd}]) {
7822
7823 ## Stay in the state.
7824 $self->{kwd} .= chr $self->{nc};
7825
7826 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7827 $self->{line_prev} = $self->{line};
7828 $self->{column_prev} = $self->{column};
7829 $self->{column}++;
7830 $self->{nc}
7831 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7832 } else {
7833 $self->{set_nc}->($self);
7834 }
7835
7836 redo A;
7837 } elsif ((length $self->{kwd}) == 4 and
7838 ($self->{nc} == 0x0041 or # A
7839 $self->{nc} == 0x0061)) { # a
7840 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7841
7842 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7843 text => 'NDATA',
7844 line => $self->{line_prev},
7845 column => $self->{column_prev} - 4);
7846 } else {
7847
7848 }
7849 $self->{state} = AFTER_NDATA_STATE;
7850
7851 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7852 $self->{line_prev} = $self->{line};
7853 $self->{column_prev} = $self->{column};
7854 $self->{column}++;
7855 $self->{nc}
7856 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7857 } else {
7858 $self->{set_nc}->($self);
7859 }
7860
7861 redo A;
7862 } else {
7863 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7864 line => $self->{line_prev},
7865 column => $self->{column_prev} + 1
7866 - length $self->{kwd});
7867
7868 $self->{state} = BOGUS_MD_STATE;
7869 ## Reconsume.
7870 redo A;
7871 }
7872 } elsif ($self->{state} == AFTER_NDATA_STATE) {
7873 if ($is_space->{$self->{nc}}) {
7874 $self->{state} = BEFORE_NOTATION_NAME_STATE;
7875
7876 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7877 $self->{line_prev} = $self->{line};
7878 $self->{column_prev} = $self->{column};
7879 $self->{column}++;
7880 $self->{nc}
7881 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7882 } else {
7883 $self->{set_nc}->($self);
7884 }
7885
7886 redo A;
7887 } elsif ($self->{nc} == 0x003E) { # >
7888 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7889 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7890
7891 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7892 $self->{line_prev} = $self->{line};
7893 $self->{column_prev} = $self->{column};
7894 $self->{column}++;
7895 $self->{nc}
7896 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7897 } else {
7898 $self->{set_nc}->($self);
7899 }
7900
7901 return ($self->{ct}); # ENTITY
7902 redo A;
7903 } elsif ($self->{nc} == -1) {
7904 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7905 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7906
7907 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7908 $self->{line_prev} = $self->{line};
7909 $self->{column_prev} = $self->{column};
7910 $self->{column}++;
7911 $self->{nc}
7912 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7913 } else {
7914 $self->{set_nc}->($self);
7915 }
7916
7917 return ($self->{ct}); # ENTITY
7918 redo A;
7919 } else {
7920 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7921 line => $self->{line_prev},
7922 column => $self->{column_prev} + 1
7923 - length $self->{kwd});
7924 $self->{state} = BOGUS_MD_STATE;
7925 ## Reconsume.
7926 redo A;
7927 }
7928 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7929 if ($is_space->{$self->{nc}}) {
7930 ## Stay in the state.
7931
7932 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7933 $self->{line_prev} = $self->{line};
7934 $self->{column_prev} = $self->{column};
7935 $self->{column}++;
7936 $self->{nc}
7937 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7938 } else {
7939 $self->{set_nc}->($self);
7940 }
7941
7942 redo A;
7943 } elsif ($self->{nc} == 0x003E) { # >
7944 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7945 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7946
7947 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7948 $self->{line_prev} = $self->{line};
7949 $self->{column_prev} = $self->{column};
7950 $self->{column}++;
7951 $self->{nc}
7952 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7953 } else {
7954 $self->{set_nc}->($self);
7955 }
7956
7957 return ($self->{ct}); # ENTITY
7958 redo A;
7959 } elsif ($self->{nc} == -1) {
7960 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7961 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7962
7963 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7964 $self->{line_prev} = $self->{line};
7965 $self->{column_prev} = $self->{column};
7966 $self->{column}++;
7967 $self->{nc}
7968 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7969 } else {
7970 $self->{set_nc}->($self);
7971 }
7972
7973 return ($self->{ct}); # ENTITY
7974 redo A;
7975 } else {
7976 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7977 $self->{state} = NOTATION_NAME_STATE;
7978
7979 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7980 $self->{line_prev} = $self->{line};
7981 $self->{column_prev} = $self->{column};
7982 $self->{column}++;
7983 $self->{nc}
7984 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7985 } else {
7986 $self->{set_nc}->($self);
7987 }
7988
7989 redo A;
7990 }
7991 } elsif ($self->{state} == NOTATION_NAME_STATE) {
7992 if ($is_space->{$self->{nc}}) {
7993 $self->{state} = AFTER_MD_DEF_STATE;
7994
7995 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7996 $self->{line_prev} = $self->{line};
7997 $self->{column_prev} = $self->{column};
7998 $self->{column}++;
7999 $self->{nc}
8000 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8001 } else {
8002 $self->{set_nc}->($self);
8003 }
8004
8005 redo A;
8006 } elsif ($self->{nc} == 0x003E) { # >
8007 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8008
8009 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8010 $self->{line_prev} = $self->{line};
8011 $self->{column_prev} = $self->{column};
8012 $self->{column}++;
8013 $self->{nc}
8014 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8015 } else {
8016 $self->{set_nc}->($self);
8017 }
8018
8019 return ($self->{ct}); # ENTITY
8020 redo A;
8021 } elsif ($self->{nc} == -1) {
8022 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8023 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8024
8025 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8026 $self->{line_prev} = $self->{line};
8027 $self->{column_prev} = $self->{column};
8028 $self->{column}++;
8029 $self->{nc}
8030 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8031 } else {
8032 $self->{set_nc}->($self);
8033 }
8034
8035 return ($self->{ct}); # ENTITY
8036 redo A;
8037 } else {
8038 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
8039 ## Stay in the state.
8040
8041 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8042 $self->{line_prev} = $self->{line};
8043 $self->{column_prev} = $self->{column};
8044 $self->{column}++;
8045 $self->{nc}
8046 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8047 } else {
8048 $self->{set_nc}->($self);
8049 }
8050
8051 redo A;
8052 }
8053 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
8054 if ($self->{nc} == 0x0022) { # "
8055 $self->{state} = AFTER_MD_DEF_STATE;
8056
8057 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8058 $self->{line_prev} = $self->{line};
8059 $self->{column_prev} = $self->{column};
8060 $self->{column}++;
8061 $self->{nc}
8062 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8063 } else {
8064 $self->{set_nc}->($self);
8065 }
8066
8067 redo A;
8068 } elsif ($self->{nc} == 0x0026) { # &
8069 $self->{prev_state} = $self->{state};
8070 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8071 $self->{entity_add} = 0x0022; # "
8072
8073 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8074 $self->{line_prev} = $self->{line};
8075 $self->{column_prev} = $self->{column};
8076 $self->{column}++;
8077 $self->{nc}
8078 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8079 } else {
8080 $self->{set_nc}->($self);
8081 }
8082
8083 redo A;
8084 ## TODO: %
8085 } elsif ($self->{nc} == -1) {
8086 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8087 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8088 ## Reconsume.
8089 return ($self->{ct}); # ENTITY
8090 redo A;
8091 } else {
8092 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8093
8094 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8095 $self->{line_prev} = $self->{line};
8096 $self->{column_prev} = $self->{column};
8097 $self->{column}++;
8098 $self->{nc}
8099 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8100 } else {
8101 $self->{set_nc}->($self);
8102 }
8103
8104 redo A;
8105 }
8106 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8107 if ($self->{nc} == 0x0027) { # '
8108 $self->{state} = AFTER_MD_DEF_STATE;
8109
8110 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8111 $self->{line_prev} = $self->{line};
8112 $self->{column_prev} = $self->{column};
8113 $self->{column}++;
8114 $self->{nc}
8115 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8116 } else {
8117 $self->{set_nc}->($self);
8118 }
8119
8120 redo A;
8121 } elsif ($self->{nc} == 0x0026) { # &
8122 $self->{prev_state} = $self->{state};
8123 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8124 $self->{entity_add} = 0x0027; # '
8125
8126 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8127 $self->{line_prev} = $self->{line};
8128 $self->{column_prev} = $self->{column};
8129 $self->{column}++;
8130 $self->{nc}
8131 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8132 } else {
8133 $self->{set_nc}->($self);
8134 }
8135
8136 redo A;
8137 ## TODO: %
8138 } elsif ($self->{nc} == -1) {
8139 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8140 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8141 ## Reconsume.
8142 return ($self->{ct}); # ENTITY
8143 redo A;
8144 } else {
8145 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8146
8147 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8148 $self->{line_prev} = $self->{line};
8149 $self->{column_prev} = $self->{column};
8150 $self->{column}++;
8151 $self->{nc}
8152 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8153 } else {
8154 $self->{set_nc}->($self);
8155 }
8156
8157 redo A;
8158 }
8159 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8160 if ($is_space->{$self->{nc}} or
8161 {
8162 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8163 $self->{entity_add} => 1,
8164 }->{$self->{nc}}) {
8165 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8166 line => $self->{line_prev},
8167 column => $self->{column_prev}
8168 + ($self->{nc} == -1 ? 1 : 0));
8169 ## Don't consume
8170 ## Return nothing.
8171 #
8172 } elsif ($self->{nc} == 0x0023) { # #
8173 $self->{ca} = $self->{ct};
8174 $self->{state} = ENTITY_HASH_STATE;
8175 $self->{kwd} = '#';
8176
8177 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8178 $self->{line_prev} = $self->{line};
8179 $self->{column_prev} = $self->{column};
8180 $self->{column}++;
8181 $self->{nc}
8182 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8183 } else {
8184 $self->{set_nc}->($self);
8185 }
8186
8187 redo A;
8188 } else {
8189 #
8190 }
8191
8192 $self->{ct}->{value} .= '&';
8193 $self->{state} = $self->{prev_state};
8194 ## Reconsume.
8195 redo A;
8196 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8197 if ($is_space->{$self->{nc}}) {
8198 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8199
8200 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8201 $self->{line_prev} = $self->{line};
8202 $self->{column_prev} = $self->{column};
8203 $self->{column}++;
8204 $self->{nc}
8205 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8206 } else {
8207 $self->{set_nc}->($self);
8208 }
8209
8210 redo A;
8211 } elsif ($self->{nc} == 0x0028) { # (
8212 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8213 $self->{ct}->{content} = ['('];
8214 $self->{group_depth} = 1;
8215
8216 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8217 $self->{line_prev} = $self->{line};
8218 $self->{column_prev} = $self->{column};
8219 $self->{column}++;
8220 $self->{nc}
8221 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8222 } else {
8223 $self->{set_nc}->($self);
8224 }
8225
8226 redo A;
8227 } elsif ($self->{nc} == 0x003E) { # >
8228 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8229 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8230
8231 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8232 $self->{line_prev} = $self->{line};
8233 $self->{column_prev} = $self->{column};
8234 $self->{column}++;
8235 $self->{nc}
8236 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8237 } else {
8238 $self->{set_nc}->($self);
8239 }
8240
8241 return ($self->{ct}); # ELEMENT
8242 redo A;
8243 } elsif ($self->{nc} == -1) {
8244 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8245 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8246
8247 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8248 $self->{line_prev} = $self->{line};
8249 $self->{column_prev} = $self->{column};
8250 $self->{column}++;
8251 $self->{nc}
8252 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8253 } else {
8254 $self->{set_nc}->($self);
8255 }
8256
8257 return ($self->{ct}); # ELEMENT
8258 redo A;
8259 } else {
8260 $self->{ct}->{content} = [chr $self->{nc}];
8261 $self->{state} = CONTENT_KEYWORD_STATE;
8262
8263 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8264 $self->{line_prev} = $self->{line};
8265 $self->{column_prev} = $self->{column};
8266 $self->{column}++;
8267 $self->{nc}
8268 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8269 } else {
8270 $self->{set_nc}->($self);
8271 }
8272
8273 redo A;
8274 }
8275 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8276 if ($is_space->{$self->{nc}}) {
8277 $self->{state} = AFTER_MD_DEF_STATE;
8278
8279 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8280 $self->{line_prev} = $self->{line};
8281 $self->{column_prev} = $self->{column};
8282 $self->{column}++;
8283 $self->{nc}
8284 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8285 } else {
8286 $self->{set_nc}->($self);
8287 }
8288
8289 redo A;
8290 } elsif ($self->{nc} == 0x003E) { # >
8291 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8292
8293 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8294 $self->{line_prev} = $self->{line};
8295 $self->{column_prev} = $self->{column};
8296 $self->{column}++;
8297 $self->{nc}
8298 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8299 } else {
8300 $self->{set_nc}->($self);
8301 }
8302
8303 return ($self->{ct}); # ELEMENT
8304 redo A;
8305 } elsif ($self->{nc} == -1) {
8306 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8307 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8308
8309 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8310 $self->{line_prev} = $self->{line};
8311 $self->{column_prev} = $self->{column};
8312 $self->{column}++;
8313 $self->{nc}
8314 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8315 } else {
8316 $self->{set_nc}->($self);
8317 }
8318
8319 return ($self->{ct}); # ELEMENT
8320 redo A;
8321 } else {
8322 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8323 ## Stay in the state.
8324
8325 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8326 $self->{line_prev} = $self->{line};
8327 $self->{column_prev} = $self->{column};
8328 $self->{column}++;
8329 $self->{nc}
8330 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8331 } else {
8332 $self->{set_nc}->($self);
8333 }
8334
8335 redo A;
8336 }
8337 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8338 if ($is_space->{$self->{nc}}) {
8339 ## Stay in the state.
8340
8341 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8342 $self->{line_prev} = $self->{line};
8343 $self->{column_prev} = $self->{column};
8344 $self->{column}++;
8345 $self->{nc}
8346 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8347 } else {
8348 $self->{set_nc}->($self);
8349 }
8350
8351 redo A;
8352 } elsif ($self->{nc} == 0x0028) { # (
8353 $self->{group_depth}++;
8354 push @{$self->{ct}->{content}}, chr $self->{nc};
8355 ## Stay in the state.
8356
8357 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8358 $self->{line_prev} = $self->{line};
8359 $self->{column_prev} = $self->{column};
8360 $self->{column}++;
8361 $self->{nc}
8362 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8363 } else {
8364 $self->{set_nc}->($self);
8365 }
8366
8367 redo A;
8368 } elsif ($self->{nc} == 0x007C or # |
8369 $self->{nc} == 0x002C) { # ,
8370 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8371 ## Stay in the state.
8372
8373 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8374 $self->{line_prev} = $self->{line};
8375 $self->{column_prev} = $self->{column};
8376 $self->{column}++;
8377 $self->{nc}
8378 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8379 } else {
8380 $self->{set_nc}->($self);
8381 }
8382
8383 redo A;
8384 } elsif ($self->{nc} == 0x0029) { # )
8385 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8386 push @{$self->{ct}->{content}}, chr $self->{nc};
8387 $self->{group_depth}--;
8388 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8389
8390 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8391 $self->{line_prev} = $self->{line};
8392 $self->{column_prev} = $self->{column};
8393 $self->{column}++;
8394 $self->{nc}
8395 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8396 } else {
8397 $self->{set_nc}->($self);
8398 }
8399
8400 redo A;
8401 } elsif ($self->{nc} == 0x003E) { # >
8402 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8403 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8404 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8405
8406 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8407 $self->{line_prev} = $self->{line};
8408 $self->{column_prev} = $self->{column};
8409 $self->{column}++;
8410 $self->{nc}
8411 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8412 } else {
8413 $self->{set_nc}->($self);
8414 }
8415
8416 return ($self->{ct}); # ELEMENT
8417 redo A;
8418 } elsif ($self->{nc} == -1) {
8419 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8420 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8421 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8422
8423 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8424 $self->{line_prev} = $self->{line};
8425 $self->{column_prev} = $self->{column};
8426 $self->{column}++;
8427 $self->{nc}
8428 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8429 } else {
8430 $self->{set_nc}->($self);
8431 }
8432
8433 return ($self->{ct}); # ELEMENT
8434 redo A;
8435 } else {
8436 push @{$self->{ct}->{content}}, chr $self->{nc};
8437 $self->{state} = CM_ELEMENT_NAME_STATE;
8438
8439 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8440 $self->{line_prev} = $self->{line};
8441 $self->{column_prev} = $self->{column};
8442 $self->{column}++;
8443 $self->{nc}
8444 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8445 } else {
8446 $self->{set_nc}->($self);
8447 }
8448
8449 redo A;
8450 }
8451 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8452 if ($is_space->{$self->{nc}}) {
8453 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8454
8455 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8456 $self->{line_prev} = $self->{line};
8457 $self->{column_prev} = $self->{column};
8458 $self->{column}++;
8459 $self->{nc}
8460 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8461 } else {
8462 $self->{set_nc}->($self);
8463 }
8464
8465 redo A;
8466 } elsif ($self->{nc} == 0x002A or # *
8467 $self->{nc} == 0x002B or # +
8468 $self->{nc} == 0x003F) { # ?
8469 push @{$self->{ct}->{content}}, chr $self->{nc};
8470 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8471
8472 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8473 $self->{line_prev} = $self->{line};
8474 $self->{column_prev} = $self->{column};
8475 $self->{column}++;
8476 $self->{nc}
8477 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8478 } else {
8479 $self->{set_nc}->($self);
8480 }
8481
8482 redo A;
8483 } elsif ($self->{nc} == 0x007C or # |
8484 $self->{nc} == 0x002C) { # ,
8485 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8486 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8487
8488 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8489 $self->{line_prev} = $self->{line};
8490 $self->{column_prev} = $self->{column};
8491 $self->{column}++;
8492 $self->{nc}
8493 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8494 } else {
8495 $self->{set_nc}->($self);
8496 }
8497
8498 redo A;
8499 } elsif ($self->{nc} == 0x0029) { # )
8500 $self->{group_depth}--;
8501 push @{$self->{ct}->{content}}, chr $self->{nc};
8502 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8503
8504 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8505 $self->{line_prev} = $self->{line};
8506 $self->{column_prev} = $self->{column};
8507 $self->{column}++;
8508 $self->{nc}
8509 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8510 } else {
8511 $self->{set_nc}->($self);
8512 }
8513
8514 redo A;
8515 } elsif ($self->{nc} == 0x003E) { # >
8516 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8517 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8518 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8519
8520 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8521 $self->{line_prev} = $self->{line};
8522 $self->{column_prev} = $self->{column};
8523 $self->{column}++;
8524 $self->{nc}
8525 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8526 } else {
8527 $self->{set_nc}->($self);
8528 }
8529
8530 return ($self->{ct}); # ELEMENT
8531 redo A;
8532 } elsif ($self->{nc} == -1) {
8533 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8534 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8535 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8536
8537 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8538 $self->{line_prev} = $self->{line};
8539 $self->{column_prev} = $self->{column};
8540 $self->{column}++;
8541 $self->{nc}
8542 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8543 } else {
8544 $self->{set_nc}->($self);
8545 }
8546
8547 return ($self->{ct}); # ELEMENT
8548 redo A;
8549 } else {
8550 $self->{ct}->{content}->[-1] .= chr $self->{nc};
8551 ## Stay in the state.
8552
8553 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8554 $self->{line_prev} = $self->{line};
8555 $self->{column_prev} = $self->{column};
8556 $self->{column}++;
8557 $self->{nc}
8558 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8559 } else {
8560 $self->{set_nc}->($self);
8561 }
8562
8563 redo A;
8564 }
8565 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8566 if ($is_space->{$self->{nc}}) {
8567 ## Stay in the state.
8568
8569 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8570 $self->{line_prev} = $self->{line};
8571 $self->{column_prev} = $self->{column};
8572 $self->{column}++;
8573 $self->{nc}
8574 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8575 } else {
8576 $self->{set_nc}->($self);
8577 }
8578
8579 redo A;
8580 } elsif ($self->{nc} == 0x007C or # |
8581 $self->{nc} == 0x002C) { # ,
8582 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8583 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8584
8585 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8586 $self->{line_prev} = $self->{line};
8587 $self->{column_prev} = $self->{column};
8588 $self->{column}++;
8589 $self->{nc}
8590 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8591 } else {
8592 $self->{set_nc}->($self);
8593 }
8594
8595 redo A;
8596 } elsif ($self->{nc} == 0x0029) { # )
8597 $self->{group_depth}--;
8598 push @{$self->{ct}->{content}}, chr $self->{nc};
8599 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8600
8601 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8602 $self->{line_prev} = $self->{line};
8603 $self->{column_prev} = $self->{column};
8604 $self->{column}++;
8605 $self->{nc}
8606 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8607 } else {
8608 $self->{set_nc}->($self);
8609 }
8610
8611 redo A;
8612 } elsif ($self->{nc} == 0x003E) { # >
8613 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8614 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8615 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8616
8617 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8618 $self->{line_prev} = $self->{line};
8619 $self->{column_prev} = $self->{column};
8620 $self->{column}++;
8621 $self->{nc}
8622 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8623 } else {
8624 $self->{set_nc}->($self);
8625 }
8626
8627 return ($self->{ct}); # ELEMENT
8628 redo A;
8629 } elsif ($self->{nc} == -1) {
8630 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8631 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8632 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8633
8634 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8635 $self->{line_prev} = $self->{line};
8636 $self->{column_prev} = $self->{column};
8637 $self->{column}++;
8638 $self->{nc}
8639 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8640 } else {
8641 $self->{set_nc}->($self);
8642 }
8643
8644 return ($self->{ct}); # ELEMENT
8645 redo A;
8646 } else {
8647 $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8648 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8649 $self->{state} = BOGUS_MD_STATE;
8650
8651 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8652 $self->{line_prev} = $self->{line};
8653 $self->{column_prev} = $self->{column};
8654 $self->{column}++;
8655 $self->{nc}
8656 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8657 } else {
8658 $self->{set_nc}->($self);
8659 }
8660
8661 redo A;
8662 }
8663 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8664 if ($is_space->{$self->{nc}}) {
8665 if ($self->{group_depth}) {
8666 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8667 } else {
8668 $self->{state} = AFTER_MD_DEF_STATE;
8669 }
8670
8671 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8672 $self->{line_prev} = $self->{line};
8673 $self->{column_prev} = $self->{column};
8674 $self->{column}++;
8675 $self->{nc}
8676 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8677 } else {
8678 $self->{set_nc}->($self);
8679 }
8680
8681 redo A;
8682 } elsif ($self->{nc} == 0x002A or # *
8683 $self->{nc} == 0x002B or # +
8684 $self->{nc} == 0x003F) { # ?
8685 push @{$self->{ct}->{content}}, chr $self->{nc};
8686 if ($self->{group_depth}) {
8687 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8688 } else {
8689 $self->{state} = AFTER_MD_DEF_STATE;
8690 }
8691
8692 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8693 $self->{line_prev} = $self->{line};
8694 $self->{column_prev} = $self->{column};
8695 $self->{column}++;
8696 $self->{nc}
8697 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8698 } else {
8699 $self->{set_nc}->($self);
8700 }
8701
8702 redo A;
8703 } elsif ($self->{nc} == 0x0029) { # )
8704 if ($self->{group_depth}) {
8705 $self->{group_depth}--;
8706 push @{$self->{ct}->{content}}, chr $self->{nc};
8707 ## Stay in the state.
8708
8709 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8710 $self->{line_prev} = $self->{line};
8711 $self->{column_prev} = $self->{column};
8712 $self->{column}++;
8713 $self->{nc}
8714 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8715 } else {
8716 $self->{set_nc}->($self);
8717 }
8718
8719 redo A;
8720 } else {
8721 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8722 $self->{state} = BOGUS_MD_STATE;
8723 ## Reconsume.
8724 redo A;
8725 }
8726 } elsif ($self->{nc} == 0x003E) { # >
8727 if ($self->{group_depth}) {
8728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8729 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8730 }
8731 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8732
8733 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8734 $self->{line_prev} = $self->{line};
8735 $self->{column_prev} = $self->{column};
8736 $self->{column}++;
8737 $self->{nc}
8738 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8739 } else {
8740 $self->{set_nc}->($self);
8741 }
8742
8743 return ($self->{ct}); # ELEMENT
8744 redo A;
8745 } elsif ($self->{nc} == -1) {
8746 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8747 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8748 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8749
8750 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8751 $self->{line_prev} = $self->{line};
8752 $self->{column_prev} = $self->{column};
8753 $self->{column}++;
8754 $self->{nc}
8755 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8756 } else {
8757 $self->{set_nc}->($self);
8758 }
8759
8760 return ($self->{ct}); # ELEMENT
8761 redo A;
8762 } else {
8763 if ($self->{group_depth}) {
8764 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8765 } else {
8766 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8767 $self->{state} = BOGUS_MD_STATE;
8768 }
8769 ## Reconsume.
8770 redo A;
8771 }
8772 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8773 if ($is_space->{$self->{nc}}) {
8774 ## Stay in the state.
8775
8776 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8777 $self->{line_prev} = $self->{line};
8778 $self->{column_prev} = $self->{column};
8779 $self->{column}++;
8780 $self->{nc}
8781 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8782 } else {
8783 $self->{set_nc}->($self);
8784 }
8785
8786 redo A;
8787 } elsif ($self->{nc} == 0x003E) { # >
8788 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8789
8790 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8791 $self->{line_prev} = $self->{line};
8792 $self->{column_prev} = $self->{column};
8793 $self->{column}++;
8794 $self->{nc}
8795 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8796 } else {
8797 $self->{set_nc}->($self);
8798 }
8799
8800 return ($self->{ct}); # ENTITY/ELEMENT
8801 redo A;
8802 } elsif ($self->{nc} == -1) {
8803 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8804 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8805
8806 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8807 $self->{line_prev} = $self->{line};
8808 $self->{column_prev} = $self->{column};
8809 $self->{column}++;
8810 $self->{nc}
8811 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8812 } else {
8813 $self->{set_nc}->($self);
8814 }
8815
8816 return ($self->{ct}); # ENTITY/ELEMENT
8817 redo A;
8818 } else {
8819 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8820 $self->{state} = BOGUS_MD_STATE;
8821 ## Reconsume.
8822 redo A;
8823 }
8824 } elsif ($self->{state} == BOGUS_MD_STATE) {
8825 if ($self->{nc} == 0x003E) { # >
8826 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8827
8828 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8829 $self->{line_prev} = $self->{line};
8830 $self->{column_prev} = $self->{column};
8831 $self->{column}++;
8832 $self->{nc}
8833 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8834 } else {
8835 $self->{set_nc}->($self);
8836 }
8837
8838 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8839 redo A;
8840 } elsif ($self->{nc} == -1) {
8841 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8842 ## Reconsume.
8843 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8844 redo A;
8845 } else {
8846 ## Stay in the state.
8847
8848 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8849 $self->{line_prev} = $self->{line};
8850 $self->{column_prev} = $self->{column};
8851 $self->{column}++;
8852 $self->{nc}
8853 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8854 } else {
8855 $self->{set_nc}->($self);
8856 }
8857
8858 redo A;
8859 }
8860 } else {
8861 die "$0: $self->{state}: Unknown state";
8862 }
8863 } # A
8864
8865 die "$0: _get_next_token: unexpected case";
8866 } # _get_next_token
8867
8868 1;
8869 ## $Date: 2009/09/05 10:41:07 $
8870

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24