/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18 - (show annotations) (download)
Sun Oct 19 06:14:57 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.17: +406 -6 lines
++ whatpm/t/ChangeLog	19 Oct 2008 06:14:42 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/entities-1.dat" and "xml/entities-2.dat"
	added.  Support for the "#entities" directive.

++ whatpm/t/xml/ChangeLog	19 Oct 2008 06:11:59 -0000
	* entities-1.dat, entities-2.dat: New test data files.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	19 Oct 2008 06:12:27 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (notation_name): New attribute.

	* NanoDOM.pm (public_id, system_id): New attributes.
++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 06:13:03 -0000
	* Dumper.pm: Dump text content of Entity nodes.

	* Tokenizer.pm.src: Support for <!ENTITY ... NDATA>.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 06:14:05 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src (_tree_in_subset): General and parameter entities
	implemented.

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.17 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
## Exporter setup.  Done inside |BEGIN| so that the export lists are
## already populated while the remainder of the file is being compiled.
BEGIN {
  require Exporter;
  our @ISA;
  push @ISA, 'Exporter';

  ## Names of the token type constants defined by this package.
  my @token_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  ## Every constant may be requested individually ...
  our @EXPORT_OK = @token_names;

  ## ... or all of them at once through the |:token| tag.  The tag gets
  ## its own copy of the list rather than sharing one array.
  our %EXPORT_TAGS = (
    token => [@token_names],
  );
}
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49
## Numeric codes stored in the |type| field of a token.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} flag set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78
## Individual capability bits of a content model.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## The content models themselves, each a combination of the bits above.
## (A separate group, because the bits have to exist before they can be
## combined.)
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
87
88 ## Tokenizer states
89
## Numeric identifiers of the tokenizer states, compared against
## |$self->{state}|.  The values 1 and 12 are not assigned to any
## active constant.
use constant {
  DATA_STATE                                     => 0,
  #ENTITY_DATA_STATE                             => 1,
  TAG_OPEN_STATE                                 => 2,
  CLOSE_TAG_OPEN_STATE                           => 3,
  TAG_NAME_STATE                                 => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                    => 5,
  ATTRIBUTE_NAME_STATE                           => 6,
  AFTER_ATTRIBUTE_NAME_STATE                     => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                   => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE            => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE            => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                 => 11,
  #ENTITY_IN_ATTRIBUTE_VALUE_STATE               => 12,
  MARKUP_DECLARATION_OPEN_STATE                  => 13,
  COMMENT_START_STATE                            => 14,
  COMMENT_START_DASH_STATE                       => 15,
  COMMENT_STATE                                  => 16,
  COMMENT_END_STATE                              => 17,
  COMMENT_END_DASH_STATE                         => 18,
  BOGUS_COMMENT_STATE                            => 19,
  DOCTYPE_STATE                                  => 20,
  BEFORE_DOCTYPE_NAME_STATE                      => 21,
  DOCTYPE_NAME_STATE                             => 22,
  AFTER_DOCTYPE_NAME_STATE                       => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE  => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE  => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE          => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE  => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE  => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE          => 31,
  BOGUS_DOCTYPE_STATE                            => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE             => 33,
  SELF_CLOSING_START_TAG_STATE                   => 34,
  CDATA_SECTION_STATE                            => 35,
  MD_HYPHEN_STATE                                => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE                               => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE                                 => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE                   => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE                       => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE                       => 41, # "CDATA section state" in the spec
  PUBLIC_STATE                                   => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE                                   => 43, # "after DOCTYPE name state" in the spec

  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE                                   => 44,
  ENTITY_HASH_STATE                              => 45,
  NCR_NUM_STATE                                  => 46,
  HEXREF_X_STATE                                 => 47,
  HEXREF_HEX_STATE                               => 48,
  ENTITY_NAME_STATE                              => 49,

  PCDATA_STATE                                   => 50, # "data state" in the spec

  ## XML-only states
  PI_STATE                                       => 51,
  PI_TARGET_STATE                                => 52,
  PI_TARGET_AFTER_STATE                          => 53,
  PI_DATA_STATE                                  => 54,
  PI_AFTER_STATE                                 => 55,
  PI_DATA_AFTER_STATE                            => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE                  => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE            => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE      => 59,
  DOCTYPE_TAG_STATE                              => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE          => 61,
  MD_ATTLIST_STATE                               => 62,
  MD_E_STATE                                     => 63,
  MD_ELEMENT_STATE                               => 64,
  MD_ENTITY_STATE                                => 65,
  MD_NOTATION_STATE                              => 66,
  DOCTYPE_MD_STATE                               => 67,
  BEFORE_MD_NAME_STATE                           => 68,
  MD_NAME_STATE                                  => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE          => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE               => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE           => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE     => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE           => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE     => 75,
  BEFORE_ALLOWED_TOKEN_STATE                     => 76,
  ALLOWED_TOKEN_STATE                            => 77,
  AFTER_ALLOWED_TOKEN_STATE                      => 78,
  AFTER_ALLOWED_TOKENS_STATE                     => 79,
  BEFORE_ATTR_DEFAULT_STATE                      => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE    => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE          => 84,
  BEFORE_NDATA_STATE                             => 85,
  NDATA_STATE                                    => 86,
  AFTER_NDATA_STATE                              => 87,
  BEFORE_NOTATION_NAME_STATE                     => 88,
  NOTATION_NAME_STATE                            => 89,
  AFTER_NOTATION_NAME_STATE                      => 90,
  BOGUS_MD_STATE                                 => 91,
};
187
188 ## Tree constructor state constants (see Whatpm::HTML for the full
189 ## list and descriptions)
190
## NOTE(review): both constants evaluate to the same number (bit 11,
## i.e. 2048); whether that is intended cannot be told from this file.
use constant {
  IN_FOREIGN_CONTENT_IM => 0b100000000000,
  FOREIGN_EL            => 0b1_00000000000,
};
193
194 ## Character reference mappings
195
## Table keyed by a code point number; each value is the code point
## number it is mapped to.  Code points that are absent from the table
## have no mapping.
my $charref_map = {
  0x0D => 0x000A, # CARRIAGE RETURN -> LINE FEED
};

## 0x80..0x9F, one row per eight code points.  (The values look like the
## Windows-1252 assignments, with 0xFFFD in the unassigned slots.)
@{$charref_map}{0x80 .. 0x9F} = (
  0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
  0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
);

## Everything below is mapped to U+FFFD.
foreach my $code (
  0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
  0xD800 .. 0xDFFF,
  0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
  ## The last two code points of each plane 0x00..0x10:
  ## 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, ..., 0x10FFFE, 0x10FFFF.
  (map { (($_ << 16) | 0xFFFE, ($_ << 16) | 0xFFFF) } 0x00 .. 0x10),
) {
  $charref_map->{$code} = 0xFFFD;
}
239
240 ## Implementations MUST act as if they used the state machine in the spec.
241
## Put the tokenizer fields of |$self| into their starting condition and
## ask the |set_nc| callback for the first input character.
##
## Argument: |$self|, the parser object (a hash reference).
## Returns: the value of the last assignment, i.e. the new, empty
## |$self->{token}| array reference.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## |char_buffer| has just been emptied, so there can be no buffered
  ## character to take at this point; the first character always comes
  ## from the |set_nc| callback (presumably it stores it in
  ## |$self->{nc}| -- the callback is defined outside this file).
  $self->{set_nc}->($self);

  ## Queue of tokens waiting to be handed out by |_get_next_token|.
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
280
281 ## A token has:
282 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284 ## ->{name} (DOCTYPE_TOKEN)
285 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 ## ->{target} (PI_TOKEN)
287 ## ->{pubid} (DOCTYPE_TOKEN)
288 ## ->{sysid} (DOCTYPE_TOKEN)
289 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291 ## ->{name}
292 ## ->{value}
293 ## ->{has_reference} == 1 or 0
294 ## ->{index}: Index of the attribute in a tag.
295 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299
300 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301 ## |$token->{self_closing}| is used to save the value of |$self->{self_closing}|
302 ## while the token is pushed back to the stack.
303
304 ## Emitted token MUST immediately be handled by the tree construction state.
305
306 ## Before each step, UA MAY check to see if either one of the scripts in
307 ## "list of scripts that will execute as soon as possible" or the first
308 ## script in the "list of scripts that will execute asynchronously",
309 ## has completed loading. If one has, then it MUST be executed
310 ## and removed from the list.
311
312 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313 ## (This requirement was dropped from HTML5 spec, unfortunately.)
314
## Lookup table: code point number => 1 for each space character.
my $is_space = +{
  map { ($_ => 1) } (
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    ## 0x000B LINE TABULATION (VT) is not in the table.
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    ## 0x000D CARRIAGE RETURN (CR) is not in the table.
    0x0020, # SPACE (SP)
  )
};
323
324 sub _get_next_token ($) {
325 my $self = shift;
326
327 if ($self->{self_closing}) {
328 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
329 ## NOTE: The |self_closing| flag is only set by start tag token.
330 ## In addition, when a start tag token is emitted, it is always set to
331 ## |ct|.
332 delete $self->{self_closing};
333 }
334
335 if (@{$self->{token}}) {
336 $self->{self_closing} = $self->{token}->[0]->{self_closing};
337 return shift @{$self->{token}};
338 }
339
340 A: {
341 if ($self->{state} == PCDATA_STATE) {
342 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343
344 if ($self->{nc} == 0x0026) { # &
345
346 ## NOTE: In the spec, the tokenizer is switched to the
347 ## "entity data state". In this implementation, the tokenizer
348 ## is switched to the |ENTITY_STATE|, which is an implementation
349 ## of the "consume a character reference" algorithm.
350 $self->{entity_add} = -1;
351 $self->{prev_state} = DATA_STATE;
352 $self->{state} = ENTITY_STATE;
353
354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
355 $self->{line_prev} = $self->{line};
356 $self->{column_prev} = $self->{column};
357 $self->{column}++;
358 $self->{nc}
359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
360 } else {
361 $self->{set_nc}->($self);
362 }
363
364 redo A;
365 } elsif ($self->{nc} == 0x003C) { # <
366
367 $self->{state} = TAG_OPEN_STATE;
368
369 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
370 $self->{line_prev} = $self->{line};
371 $self->{column_prev} = $self->{column};
372 $self->{column}++;
373 $self->{nc}
374 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
375 } else {
376 $self->{set_nc}->($self);
377 }
378
379 redo A;
380 } elsif ($self->{nc} == -1) {
381
382 return ({type => END_OF_FILE_TOKEN,
383 line => $self->{line}, column => $self->{column}});
384 last A; ## TODO: ok?
385 } else {
386
387 #
388 }
389
390 # Anything else
391 my $token = {type => CHARACTER_TOKEN,
392 data => chr $self->{nc},
393 line => $self->{line}, column => $self->{column},
394 };
395 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
396
397 ## Stay in the state.
398
399 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
400 $self->{line_prev} = $self->{line};
401 $self->{column_prev} = $self->{column};
402 $self->{column}++;
403 $self->{nc}
404 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
405 } else {
406 $self->{set_nc}->($self);
407 }
408
409 return ($token);
410 redo A;
411 } elsif ($self->{state} == DATA_STATE) {
412 $self->{s_kwd} = '' unless defined $self->{s_kwd};
413 if ($self->{nc} == 0x0026) { # &
414 $self->{s_kwd} = '';
415 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
416 not $self->{escape}) {
417
418 ## NOTE: In the spec, the tokenizer is switched to the
419 ## "entity data state". In this implementation, the tokenizer
420 ## is switched to the |ENTITY_STATE|, which is an implementation
421 ## of the "consume a character reference" algorithm.
422 $self->{entity_add} = -1;
423 $self->{prev_state} = DATA_STATE;
424 $self->{state} = ENTITY_STATE;
425
426 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
427 $self->{line_prev} = $self->{line};
428 $self->{column_prev} = $self->{column};
429 $self->{column}++;
430 $self->{nc}
431 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
432 } else {
433 $self->{set_nc}->($self);
434 }
435
436 redo A;
437 } else {
438
439 #
440 }
441 } elsif ($self->{nc} == 0x002D) { # -
442 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
443 if ($self->{s_kwd} eq '<!-') {
444
445 $self->{escape} = 1; # unless $self->{escape};
446 $self->{s_kwd} = '--';
447 #
448 } elsif ($self->{s_kwd} eq '-') {
449
450 $self->{s_kwd} = '--';
451 #
452 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
453
454 $self->{s_kwd} .= '-';
455 #
456 } else {
457
458 $self->{s_kwd} = '-';
459 #
460 }
461 }
462
463 #
464 } elsif ($self->{nc} == 0x0021) { # !
465 if (length $self->{s_kwd}) {
466
467 $self->{s_kwd} .= '!';
468 #
469 } else {
470
471 #$self->{s_kwd} = '';
472 #
473 }
474 #
475 } elsif ($self->{nc} == 0x003C) { # <
476 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
477 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
478 not $self->{escape})) {
479
480 $self->{state} = TAG_OPEN_STATE;
481
482 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
483 $self->{line_prev} = $self->{line};
484 $self->{column_prev} = $self->{column};
485 $self->{column}++;
486 $self->{nc}
487 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
488 } else {
489 $self->{set_nc}->($self);
490 }
491
492 redo A;
493 } else {
494
495 $self->{s_kwd} = '';
496 #
497 }
498 } elsif ($self->{nc} == 0x003E) { # >
499 if ($self->{escape} and
500 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
501 if ($self->{s_kwd} eq '--') {
502
503 delete $self->{escape};
504 #
505 } else {
506
507 #
508 }
509 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
510
511 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
512 line => $self->{line_prev},
513 column => $self->{column_prev} - 1);
514 #
515 } else {
516
517 #
518 }
519
520 $self->{s_kwd} = '';
521 #
522 } elsif ($self->{nc} == 0x005D) { # ]
523 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
524
525 $self->{s_kwd} .= ']';
526 } elsif ($self->{s_kwd} eq ']]') {
527
528 #
529 } else {
530
531 $self->{s_kwd} = '';
532 }
533 #
534 } elsif ($self->{nc} == -1) {
535
536 $self->{s_kwd} = '';
537 return ({type => END_OF_FILE_TOKEN,
538 line => $self->{line}, column => $self->{column}});
539 last A; ## TODO: ok?
540 } else {
541
542 $self->{s_kwd} = '';
543 #
544 }
545
546 # Anything else
547 my $token = {type => CHARACTER_TOKEN,
548 data => chr $self->{nc},
549 line => $self->{line}, column => $self->{column},
550 };
551 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
552 length $token->{data})) {
553 $self->{s_kwd} = '';
554 }
555
556 ## Stay in the data state.
557 if (not $self->{is_xml} and
558 $self->{content_model} == PCDATA_CONTENT_MODEL) {
559
560 $self->{state} = PCDATA_STATE;
561 } else {
562
563 ## Stay in the state.
564 }
565
566 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
567 $self->{line_prev} = $self->{line};
568 $self->{column_prev} = $self->{column};
569 $self->{column}++;
570 $self->{nc}
571 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
572 } else {
573 $self->{set_nc}->($self);
574 }
575
576 return ($token);
577 redo A;
578 } elsif ($self->{state} == TAG_OPEN_STATE) {
579 ## XML5: "tag state".
580
581 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
582 if ($self->{nc} == 0x002F) { # /
583
584
585 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
586 $self->{line_prev} = $self->{line};
587 $self->{column_prev} = $self->{column};
588 $self->{column}++;
589 $self->{nc}
590 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
591 } else {
592 $self->{set_nc}->($self);
593 }
594
595 $self->{state} = CLOSE_TAG_OPEN_STATE;
596 redo A;
597 } elsif ($self->{nc} == 0x0021) { # !
598
599 $self->{s_kwd} = $self->{escaped} ? '' : '<';
600 #
601 } else {
602
603 $self->{s_kwd} = '';
604 #
605 }
606
607 ## reconsume
608 $self->{state} = DATA_STATE;
609 return ({type => CHARACTER_TOKEN, data => '<',
610 line => $self->{line_prev},
611 column => $self->{column_prev},
612 });
613 redo A;
614 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
615 if ($self->{nc} == 0x0021) { # !
616
617 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
618
619 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
620 $self->{line_prev} = $self->{line};
621 $self->{column_prev} = $self->{column};
622 $self->{column}++;
623 $self->{nc}
624 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
625 } else {
626 $self->{set_nc}->($self);
627 }
628
629 redo A;
630 } elsif ($self->{nc} == 0x002F) { # /
631
632 $self->{state} = CLOSE_TAG_OPEN_STATE;
633
634 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
635 $self->{line_prev} = $self->{line};
636 $self->{column_prev} = $self->{column};
637 $self->{column}++;
638 $self->{nc}
639 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
640 } else {
641 $self->{set_nc}->($self);
642 }
643
644 redo A;
645 } elsif (0x0041 <= $self->{nc} and
646 $self->{nc} <= 0x005A) { # A..Z
647
648 $self->{ct}
649 = {type => START_TAG_TOKEN,
650 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
651 line => $self->{line_prev},
652 column => $self->{column_prev}};
653 $self->{state} = TAG_NAME_STATE;
654
655 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
656 $self->{line_prev} = $self->{line};
657 $self->{column_prev} = $self->{column};
658 $self->{column}++;
659 $self->{nc}
660 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
661 } else {
662 $self->{set_nc}->($self);
663 }
664
665 redo A;
666 } elsif (0x0061 <= $self->{nc} and
667 $self->{nc} <= 0x007A) { # a..z
668
669 $self->{ct} = {type => START_TAG_TOKEN,
670 tag_name => chr ($self->{nc}),
671 line => $self->{line_prev},
672 column => $self->{column_prev}};
673 $self->{state} = TAG_NAME_STATE;
674
675 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
676 $self->{line_prev} = $self->{line};
677 $self->{column_prev} = $self->{column};
678 $self->{column}++;
679 $self->{nc}
680 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
681 } else {
682 $self->{set_nc}->($self);
683 }
684
685 redo A;
686 } elsif ($self->{nc} == 0x003E) { # >
687
688 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
689 line => $self->{line_prev},
690 column => $self->{column_prev});
691 $self->{state} = DATA_STATE;
692 $self->{s_kwd} = '';
693
694 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
695 $self->{line_prev} = $self->{line};
696 $self->{column_prev} = $self->{column};
697 $self->{column}++;
698 $self->{nc}
699 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
700 } else {
701 $self->{set_nc}->($self);
702 }
703
704
705 return ({type => CHARACTER_TOKEN, data => '<>',
706 line => $self->{line_prev},
707 column => $self->{column_prev},
708 });
709
710 redo A;
711 } elsif ($self->{nc} == 0x003F) { # ?
712 if ($self->{is_xml}) {
713
714 $self->{state} = PI_STATE;
715
716 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
717 $self->{line_prev} = $self->{line};
718 $self->{column_prev} = $self->{column};
719 $self->{column}++;
720 $self->{nc}
721 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
722 } else {
723 $self->{set_nc}->($self);
724 }
725
726 redo A;
727 } else {
728
729 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
730 line => $self->{line_prev},
731 column => $self->{column_prev});
732 $self->{state} = BOGUS_COMMENT_STATE;
733 $self->{ct} = {type => COMMENT_TOKEN, data => '',
734 line => $self->{line_prev},
735 column => $self->{column_prev},
736 };
737 ## $self->{nc} is intentionally left as is
738 redo A;
739 }
740 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
741
742 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
743 line => $self->{line_prev},
744 column => $self->{column_prev});
745 $self->{state} = DATA_STATE;
746 $self->{s_kwd} = '';
747 ## reconsume
748
749 return ({type => CHARACTER_TOKEN, data => '<',
750 line => $self->{line_prev},
751 column => $self->{column_prev},
752 });
753
754 redo A;
755 } else {
756 ## XML5: "<:" is a parse error.
757
758 $self->{ct} = {type => START_TAG_TOKEN,
759 tag_name => chr ($self->{nc}),
760 line => $self->{line_prev},
761 column => $self->{column_prev}};
762 $self->{state} = TAG_NAME_STATE;
763
764 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
765 $self->{line_prev} = $self->{line};
766 $self->{column_prev} = $self->{column};
767 $self->{column}++;
768 $self->{nc}
769 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
770 } else {
771 $self->{set_nc}->($self);
772 }
773
774 redo A;
775 }
776 } else {
777 die "$0: $self->{content_model} in tag open";
778 }
779 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
780 ## NOTE: The "close tag open state" in the spec is implemented as
781 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
782
783 ## XML5: "end tag state".
784
785 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
786 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
787 if (defined $self->{last_stag_name}) {
788 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
789 $self->{kwd} = '';
790 ## Reconsume.
791 redo A;
792 } else {
793 ## No start tag token has ever been emitted
794 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
795
796 $self->{state} = DATA_STATE;
797 $self->{s_kwd} = '';
798 ## Reconsume.
799 return ({type => CHARACTER_TOKEN, data => '</',
800 line => $l, column => $c,
801 });
802 redo A;
803 }
804 }
805
806 if (0x0041 <= $self->{nc} and
807 $self->{nc} <= 0x005A) { # A..Z
808
809 $self->{ct}
810 = {type => END_TAG_TOKEN,
811 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
812 line => $l, column => $c};
813 $self->{state} = TAG_NAME_STATE;
814
815 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
816 $self->{line_prev} = $self->{line};
817 $self->{column_prev} = $self->{column};
818 $self->{column}++;
819 $self->{nc}
820 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
821 } else {
822 $self->{set_nc}->($self);
823 }
824
825 redo A;
826 } elsif (0x0061 <= $self->{nc} and
827 $self->{nc} <= 0x007A) { # a..z
828
829 $self->{ct} = {type => END_TAG_TOKEN,
830 tag_name => chr ($self->{nc}),
831 line => $l, column => $c};
832 $self->{state} = TAG_NAME_STATE;
833
834 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
835 $self->{line_prev} = $self->{line};
836 $self->{column_prev} = $self->{column};
837 $self->{column}++;
838 $self->{nc}
839 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
840 } else {
841 $self->{set_nc}->($self);
842 }
843
844 redo A;
845 } elsif ($self->{nc} == 0x003E) { # >
846 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
847 line => $self->{line_prev}, ## "<" in "</>"
848 column => $self->{column_prev} - 1);
849 $self->{state} = DATA_STATE;
850 $self->{s_kwd} = '';
851 if ($self->{is_xml}) {
852
853 ## XML5: No parse error.
854
855 ## NOTE: This parser raises a parse error, since it supports
856 ## XML1, not XML5.
857
858 ## NOTE: A short end tag token.
859 my $ct = {type => END_TAG_TOKEN,
860 tag_name => '',
861 line => $self->{line_prev},
862 column => $self->{column_prev} - 1,
863 };
864
865 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
866 $self->{line_prev} = $self->{line};
867 $self->{column_prev} = $self->{column};
868 $self->{column}++;
869 $self->{nc}
870 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
871 } else {
872 $self->{set_nc}->($self);
873 }
874
875 return ($ct);
876 } else {
877
878
879 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
880 $self->{line_prev} = $self->{line};
881 $self->{column_prev} = $self->{column};
882 $self->{column}++;
883 $self->{nc}
884 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
885 } else {
886 $self->{set_nc}->($self);
887 }
888
889 }
890 redo A;
891 } elsif ($self->{nc} == -1) {
892
893 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
894 $self->{s_kwd} = '';
895 $self->{state} = DATA_STATE;
896 # reconsume
897
898 return ({type => CHARACTER_TOKEN, data => '</',
899 line => $l, column => $c,
900 });
901
902 redo A;
903 } elsif (not $self->{is_xml} or
904 $is_space->{$self->{nc}}) {
905
906 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
907 line => $self->{line_prev}, # "<" of "</"
908 column => $self->{column_prev} - 1);
909 $self->{state} = BOGUS_COMMENT_STATE;
910 $self->{ct} = {type => COMMENT_TOKEN, data => '',
911 line => $self->{line_prev}, # "<" of "</"
912 column => $self->{column_prev} - 1,
913 };
914 ## NOTE: $self->{nc} is intentionally left as is.
915 ## Although the "anything else" case of the spec not explicitly
916 ## states that the next input character is to be reconsumed,
917 ## it will be included to the |data| of the comment token
918 ## generated from the bogus end tag, as defined in the
919 ## "bogus comment state" entry.
920 redo A;
921 } else {
922 ## XML5: "</:" is a parse error.
923
924 $self->{ct} = {type => END_TAG_TOKEN,
925 tag_name => chr ($self->{nc}),
926 line => $l, column => $c};
927 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
928
929 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
930 $self->{line_prev} = $self->{line};
931 $self->{column_prev} = $self->{column};
932 $self->{column}++;
933 $self->{nc}
934 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
935 } else {
936 $self->{set_nc}->($self);
937 }
938
939 redo A;
940 }
941 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
942 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
943 if (length $ch) {
944 my $CH = $ch;
945 $ch =~ tr/a-z/A-Z/;
946 my $nch = chr $self->{nc};
947 if ($nch eq $ch or $nch eq $CH) {
948
949 ## Stay in the state.
950 $self->{kwd} .= $nch;
951
952 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
953 $self->{line_prev} = $self->{line};
954 $self->{column_prev} = $self->{column};
955 $self->{column}++;
956 $self->{nc}
957 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
958 } else {
959 $self->{set_nc}->($self);
960 }
961
962 redo A;
963 } else {
964
965 $self->{state} = DATA_STATE;
966 $self->{s_kwd} = '';
967 ## Reconsume.
968 return ({type => CHARACTER_TOKEN,
969 data => '</' . $self->{kwd},
970 line => $self->{line_prev},
971 column => $self->{column_prev} - 1 - length $self->{kwd},
972 });
973 redo A;
974 }
975 } else { # after "<{tag-name}"
976 unless ($is_space->{$self->{nc}} or
977 {
978 0x003E => 1, # >
979 0x002F => 1, # /
980 -1 => 1, # EOF
981 }->{$self->{nc}}) {
982
983 ## Reconsume.
984 $self->{state} = DATA_STATE;
985 $self->{s_kwd} = '';
986 return ({type => CHARACTER_TOKEN,
987 data => '</' . $self->{kwd},
988 line => $self->{line_prev},
989 column => $self->{column_prev} - 1 - length $self->{kwd},
990 });
991 redo A;
992 } else {
993
994 $self->{ct}
995 = {type => END_TAG_TOKEN,
996 tag_name => $self->{last_stag_name},
997 line => $self->{line_prev},
998 column => $self->{column_prev} - 1 - length $self->{kwd}};
999 $self->{state} = TAG_NAME_STATE;
1000 ## Reconsume.
1001 redo A;
1002 }
1003 }
1004 } elsif ($self->{state} == TAG_NAME_STATE) {
1005 if ($is_space->{$self->{nc}}) {
1006
1007 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1008
1009 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1010 $self->{line_prev} = $self->{line};
1011 $self->{column_prev} = $self->{column};
1012 $self->{column}++;
1013 $self->{nc}
1014 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1015 } else {
1016 $self->{set_nc}->($self);
1017 }
1018
1019 redo A;
1020 } elsif ($self->{nc} == 0x003E) { # >
1021 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1022
1023 $self->{last_stag_name} = $self->{ct}->{tag_name};
1024 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1025 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1026 #if ($self->{ct}->{attributes}) {
1027 # ## NOTE: This should never be reached.
1028 # !!! cp (36);
1029 # !!! parse-error (type => 'end tag attribute');
1030 #} else {
1031
1032 #}
1033 } else {
1034 die "$0: $self->{ct}->{type}: Unknown token type";
1035 }
1036 $self->{state} = DATA_STATE;
1037 $self->{s_kwd} = '';
1038
1039 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1040 $self->{line_prev} = $self->{line};
1041 $self->{column_prev} = $self->{column};
1042 $self->{column}++;
1043 $self->{nc}
1044 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1045 } else {
1046 $self->{set_nc}->($self);
1047 }
1048
1049
1050 return ($self->{ct}); # start tag or end tag
1051
1052 redo A;
1053 } elsif (0x0041 <= $self->{nc} and
1054 $self->{nc} <= 0x005A) { # A..Z
1055
1056 $self->{ct}->{tag_name}
1057 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1058 # start tag or end tag
1059 ## Stay in this state
1060
1061 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1062 $self->{line_prev} = $self->{line};
1063 $self->{column_prev} = $self->{column};
1064 $self->{column}++;
1065 $self->{nc}
1066 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1067 } else {
1068 $self->{set_nc}->($self);
1069 }
1070
1071 redo A;
1072 } elsif ($self->{nc} == -1) {
1073 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1074 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1075
1076 $self->{last_stag_name} = $self->{ct}->{tag_name};
1077 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1078 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1079 #if ($self->{ct}->{attributes}) {
1080 # ## NOTE: This state should never be reached.
1081 # !!! cp (40);
1082 # !!! parse-error (type => 'end tag attribute');
1083 #} else {
1084
1085 #}
1086 } else {
1087 die "$0: $self->{ct}->{type}: Unknown token type";
1088 }
1089 $self->{state} = DATA_STATE;
1090 $self->{s_kwd} = '';
1091 # reconsume
1092
1093 return ($self->{ct}); # start tag or end tag
1094
1095 redo A;
1096 } elsif ($self->{nc} == 0x002F) { # /
1097
1098 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1099
1100 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1101 $self->{line_prev} = $self->{line};
1102 $self->{column_prev} = $self->{column};
1103 $self->{column}++;
1104 $self->{nc}
1105 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1106 } else {
1107 $self->{set_nc}->($self);
1108 }
1109
1110 redo A;
1111 } else {
1112
1113 $self->{ct}->{tag_name} .= chr $self->{nc};
1114 # start tag or end tag
1115 ## Stay in the state
1116
1117 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1118 $self->{line_prev} = $self->{line};
1119 $self->{column_prev} = $self->{column};
1120 $self->{column}++;
1121 $self->{nc}
1122 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1123 } else {
1124 $self->{set_nc}->($self);
1125 }
1126
1127 redo A;
1128 }
1129 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1130 ## XML5: "Tag attribute name before state".
1131
1132 if ($is_space->{$self->{nc}}) {
1133
1134 ## Stay in the state
1135
1136 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1137 $self->{line_prev} = $self->{line};
1138 $self->{column_prev} = $self->{column};
1139 $self->{column}++;
1140 $self->{nc}
1141 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1142 } else {
1143 $self->{set_nc}->($self);
1144 }
1145
1146 redo A;
1147 } elsif ($self->{nc} == 0x003E) { # >
1148 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1149
1150 $self->{last_stag_name} = $self->{ct}->{tag_name};
1151 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1152 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1153 if ($self->{ct}->{attributes}) {
1154
1155 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1156 } else {
1157
1158 }
1159 } else {
1160 die "$0: $self->{ct}->{type}: Unknown token type";
1161 }
1162 $self->{state} = DATA_STATE;
1163 $self->{s_kwd} = '';
1164
1165 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1166 $self->{line_prev} = $self->{line};
1167 $self->{column_prev} = $self->{column};
1168 $self->{column}++;
1169 $self->{nc}
1170 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1171 } else {
1172 $self->{set_nc}->($self);
1173 }
1174
1175
1176 return ($self->{ct}); # start tag or end tag
1177
1178 redo A;
1179 } elsif (0x0041 <= $self->{nc} and
1180 $self->{nc} <= 0x005A) { # A..Z
1181
1182 $self->{ca}
1183 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1184 value => '',
1185 line => $self->{line}, column => $self->{column}};
1186 $self->{state} = ATTRIBUTE_NAME_STATE;
1187
1188 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1189 $self->{line_prev} = $self->{line};
1190 $self->{column_prev} = $self->{column};
1191 $self->{column}++;
1192 $self->{nc}
1193 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1194 } else {
1195 $self->{set_nc}->($self);
1196 }
1197
1198 redo A;
1199 } elsif ($self->{nc} == 0x002F) { # /
1200
1201 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1202
1203 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1204 $self->{line_prev} = $self->{line};
1205 $self->{column_prev} = $self->{column};
1206 $self->{column}++;
1207 $self->{nc}
1208 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1209 } else {
1210 $self->{set_nc}->($self);
1211 }
1212
1213 redo A;
1214 } elsif ($self->{nc} == -1) {
1215 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1216 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1217
1218 $self->{last_stag_name} = $self->{ct}->{tag_name};
1219 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1220 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1221 if ($self->{ct}->{attributes}) {
1222
1223 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1224 } else {
1225
1226 }
1227 } else {
1228 die "$0: $self->{ct}->{type}: Unknown token type";
1229 }
1230 $self->{state} = DATA_STATE;
1231 $self->{s_kwd} = '';
1232 # reconsume
1233
1234 return ($self->{ct}); # start tag or end tag
1235
1236 redo A;
1237 } else {
1238 if ({
1239 0x0022 => 1, # "
1240 0x0027 => 1, # '
1241 0x003D => 1, # =
1242 }->{$self->{nc}}) {
1243
1244 ## XML5: Not a parse error.
1245 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1246 } else {
1247
1248 ## XML5: ":" raises a parse error and is ignored.
1249 }
1250 $self->{ca}
1251 = {name => chr ($self->{nc}),
1252 value => '',
1253 line => $self->{line}, column => $self->{column}};
1254 $self->{state} = ATTRIBUTE_NAME_STATE;
1255
1256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1257 $self->{line_prev} = $self->{line};
1258 $self->{column_prev} = $self->{column};
1259 $self->{column}++;
1260 $self->{nc}
1261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1262 } else {
1263 $self->{set_nc}->($self);
1264 }
1265
1266 redo A;
1267 }
1268 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1269 ## XML5: "Tag attribute name state".
1270
my $before_leave = sub {
  ## Finish the attribute whose name has just been read: attach
  ## |$self->{ca}| to the current token |$self->{ct}|, or report it
  ## as a duplicate and leave the token's attributes untouched.
  my $token = $self->{ct}; # start tag or end tag
  my $attr = $self->{ca};
  my $attr_name = $attr->{name};

  if (exists $token->{attributes}->{$attr_name}) { # MUST
    ## An attribute with this name is already stored on the token,
    ## so it is kept; the error is reported at the position recorded
    ## for the attribute that is being dropped.
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'duplicate attribute',
                           text => $attr_name,
                           line => $attr->{line},
                           column => $attr->{column});
    ## Discard $self->{ca} # MUST
  } else {
    $token->{attributes}->{$attr_name} = $attr;
    ## |index| is taken from a per-token counter, so it reflects the
    ## order in which attributes were accepted.
    $attr->{index} = ++$token->{last_index};
  }
}; # $before_leave
1284
1285 if ($is_space->{$self->{nc}}) {
1286
1287 $before_leave->();
1288 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1289
1290 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1291 $self->{line_prev} = $self->{line};
1292 $self->{column_prev} = $self->{column};
1293 $self->{column}++;
1294 $self->{nc}
1295 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1296 } else {
1297 $self->{set_nc}->($self);
1298 }
1299
1300 redo A;
1301 } elsif ($self->{nc} == 0x003D) { # =
1302
1303 $before_leave->();
1304 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1305
1306 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1307 $self->{line_prev} = $self->{line};
1308 $self->{column_prev} = $self->{column};
1309 $self->{column}++;
1310 $self->{nc}
1311 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1312 } else {
1313 $self->{set_nc}->($self);
1314 }
1315
1316 redo A;
1317 } elsif ($self->{nc} == 0x003E) { # >
1318 if ($self->{is_xml}) {
1319
1320 ## XML5: Not a parse error.
1321 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1322 } else {
1323
1324 }
1325
1326 $before_leave->();
1327 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1328
1329 $self->{last_stag_name} = $self->{ct}->{tag_name};
1330 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1331
1332 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1333 if ($self->{ct}->{attributes}) {
1334 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1335 }
1336 } else {
1337 die "$0: $self->{ct}->{type}: Unknown token type";
1338 }
1339 $self->{state} = DATA_STATE;
1340 $self->{s_kwd} = '';
1341
1342 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1343 $self->{line_prev} = $self->{line};
1344 $self->{column_prev} = $self->{column};
1345 $self->{column}++;
1346 $self->{nc}
1347 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1348 } else {
1349 $self->{set_nc}->($self);
1350 }
1351
1352
1353 return ($self->{ct}); # start tag or end tag
1354
1355 redo A;
1356 } elsif (0x0041 <= $self->{nc} and
1357 $self->{nc} <= 0x005A) { # A..Z
1358
1359 $self->{ca}->{name}
1360 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1361 ## Stay in the state
1362
1363 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1364 $self->{line_prev} = $self->{line};
1365 $self->{column_prev} = $self->{column};
1366 $self->{column}++;
1367 $self->{nc}
1368 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1369 } else {
1370 $self->{set_nc}->($self);
1371 }
1372
1373 redo A;
1374 } elsif ($self->{nc} == 0x002F) { # /
1375 if ($self->{is_xml}) {
1376
1377 ## XML5: Not a parse error.
1378 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1379 } else {
1380
1381 }
1382
1383 $before_leave->();
1384 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1385
1386 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1387 $self->{line_prev} = $self->{line};
1388 $self->{column_prev} = $self->{column};
1389 $self->{column}++;
1390 $self->{nc}
1391 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1392 } else {
1393 $self->{set_nc}->($self);
1394 }
1395
1396 redo A;
1397 } elsif ($self->{nc} == -1) {
1398 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1399 $before_leave->();
1400 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1401
1402 $self->{last_stag_name} = $self->{ct}->{tag_name};
1403 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1404 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1405 if ($self->{ct}->{attributes}) {
1406
1407 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1408 } else {
1409 ## NOTE: This state should never be reached.
1410
1411 }
1412 } else {
1413 die "$0: $self->{ct}->{type}: Unknown token type";
1414 }
1415 $self->{state} = DATA_STATE;
1416 $self->{s_kwd} = '';
1417 # reconsume
1418
1419 return ($self->{ct}); # start tag or end tag
1420
1421 redo A;
1422 } else {
1423 if ($self->{nc} == 0x0022 or # "
1424 $self->{nc} == 0x0027) { # '
1425
1426 ## XML5: Not a parse error.
1427 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1428 } else {
1429
1430 }
1431 $self->{ca}->{name} .= chr ($self->{nc});
1432 ## Stay in the state
1433
1434 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1435 $self->{line_prev} = $self->{line};
1436 $self->{column_prev} = $self->{column};
1437 $self->{column}++;
1438 $self->{nc}
1439 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1440 } else {
1441 $self->{set_nc}->($self);
1442 }
1443
1444 redo A;
1445 }
1446 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1447 ## XML5: "Tag attribute name after state".
1448
1449 if ($is_space->{$self->{nc}}) {
1450
1451 ## Stay in the state
1452
1453 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1454 $self->{line_prev} = $self->{line};
1455 $self->{column_prev} = $self->{column};
1456 $self->{column}++;
1457 $self->{nc}
1458 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1459 } else {
1460 $self->{set_nc}->($self);
1461 }
1462
1463 redo A;
1464 } elsif ($self->{nc} == 0x003D) { # =
1465
1466 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1467
1468 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1469 $self->{line_prev} = $self->{line};
1470 $self->{column_prev} = $self->{column};
1471 $self->{column}++;
1472 $self->{nc}
1473 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1474 } else {
1475 $self->{set_nc}->($self);
1476 }
1477
1478 redo A;
1479 } elsif ($self->{nc} == 0x003E) { # >
1480 if ($self->{is_xml}) {
1481
1482 ## XML5: Not a parse error.
1483 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1484 } else {
1485
1486 }
1487
1488 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1489
1490 $self->{last_stag_name} = $self->{ct}->{tag_name};
1491 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1492 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1493 if ($self->{ct}->{attributes}) {
1494
1495 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1496 } else {
1497 ## NOTE: This state should never be reached.
1498
1499 }
1500 } else {
1501 die "$0: $self->{ct}->{type}: Unknown token type";
1502 }
1503 $self->{state} = DATA_STATE;
1504 $self->{s_kwd} = '';
1505
1506 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1507 $self->{line_prev} = $self->{line};
1508 $self->{column_prev} = $self->{column};
1509 $self->{column}++;
1510 $self->{nc}
1511 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1512 } else {
1513 $self->{set_nc}->($self);
1514 }
1515
1516
1517 return ($self->{ct}); # start tag or end tag
1518
1519 redo A;
1520 } elsif (0x0041 <= $self->{nc} and
1521 $self->{nc} <= 0x005A) { # A..Z
1522
1523 $self->{ca}
1524 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1525 value => '',
1526 line => $self->{line}, column => $self->{column}};
1527 $self->{state} = ATTRIBUTE_NAME_STATE;
1528
1529 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1530 $self->{line_prev} = $self->{line};
1531 $self->{column_prev} = $self->{column};
1532 $self->{column}++;
1533 $self->{nc}
1534 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1535 } else {
1536 $self->{set_nc}->($self);
1537 }
1538
1539 redo A;
1540 } elsif ($self->{nc} == 0x002F) { # /
1541 if ($self->{is_xml}) {
1542
1543 ## XML5: Not a parse error.
1544 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1545 } else {
1546
1547 }
1548
1549 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1550
1551 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1552 $self->{line_prev} = $self->{line};
1553 $self->{column_prev} = $self->{column};
1554 $self->{column}++;
1555 $self->{nc}
1556 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1557 } else {
1558 $self->{set_nc}->($self);
1559 }
1560
1561 redo A;
1562 } elsif ($self->{nc} == -1) {
1563 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1564 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1565
1566 $self->{last_stag_name} = $self->{ct}->{tag_name};
1567 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1568 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1569 if ($self->{ct}->{attributes}) {
1570
1571 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1572 } else {
1573 ## NOTE: This state should never be reached.
1574
1575 }
1576 } else {
1577 die "$0: $self->{ct}->{type}: Unknown token type";
1578 }
1579 $self->{s_kwd} = '';
1580 $self->{state} = DATA_STATE;
1581 # reconsume
1582
1583 return ($self->{ct}); # start tag or end tag
1584
1585 redo A;
1586 } else {
1587 if ($self->{is_xml}) {
1588
1589 ## XML5: Not a parse error.
1590 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1591 } else {
1592
1593 }
1594
1595 if ($self->{nc} == 0x0022 or # "
1596 $self->{nc} == 0x0027) { # '
1597
1598 ## XML5: Not a parse error.
1599 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1600 } else {
1601
1602 }
1603 $self->{ca}
1604 = {name => chr ($self->{nc}),
1605 value => '',
1606 line => $self->{line}, column => $self->{column}};
1607 $self->{state} = ATTRIBUTE_NAME_STATE;
1608
1609 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1610 $self->{line_prev} = $self->{line};
1611 $self->{column_prev} = $self->{column};
1612 $self->{column}++;
1613 $self->{nc}
1614 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1615 } else {
1616 $self->{set_nc}->($self);
1617 }
1618
1619 redo A;
1620 }
1621 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1622 ## XML5: "Tag attribute value before state".
1623
1624 if ($is_space->{$self->{nc}}) {
1625
1626 ## Stay in the state
1627
1628 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1629 $self->{line_prev} = $self->{line};
1630 $self->{column_prev} = $self->{column};
1631 $self->{column}++;
1632 $self->{nc}
1633 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1634 } else {
1635 $self->{set_nc}->($self);
1636 }
1637
1638 redo A;
1639 } elsif ($self->{nc} == 0x0022) { # "
1640
1641 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1642
1643 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1644 $self->{line_prev} = $self->{line};
1645 $self->{column_prev} = $self->{column};
1646 $self->{column}++;
1647 $self->{nc}
1648 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1649 } else {
1650 $self->{set_nc}->($self);
1651 }
1652
1653 redo A;
1654 } elsif ($self->{nc} == 0x0026) { # &
1655
1656 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1657 ## reconsume
1658 redo A;
1659 } elsif ($self->{nc} == 0x0027) { # '
1660
1661 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1662
1663 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1664 $self->{line_prev} = $self->{line};
1665 $self->{column_prev} = $self->{column};
1666 $self->{column}++;
1667 $self->{nc}
1668 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1669 } else {
1670 $self->{set_nc}->($self);
1671 }
1672
1673 redo A;
1674 } elsif ($self->{nc} == 0x003E) { # >
1675 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1676 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1677
1678 $self->{last_stag_name} = $self->{ct}->{tag_name};
1679 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1680 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1681 if ($self->{ct}->{attributes}) {
1682
1683 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1684 } else {
1685 ## NOTE: This state should never be reached.
1686
1687 }
1688 } else {
1689 die "$0: $self->{ct}->{type}: Unknown token type";
1690 }
1691 $self->{state} = DATA_STATE;
1692 $self->{s_kwd} = '';
1693
1694 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1695 $self->{line_prev} = $self->{line};
1696 $self->{column_prev} = $self->{column};
1697 $self->{column}++;
1698 $self->{nc}
1699 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1700 } else {
1701 $self->{set_nc}->($self);
1702 }
1703
1704
1705 return ($self->{ct}); # start tag or end tag
1706
1707 redo A;
1708 } elsif ($self->{nc} == -1) {
1709 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1710 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1711
1712 $self->{last_stag_name} = $self->{ct}->{tag_name};
1713 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1714 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1715 if ($self->{ct}->{attributes}) {
1716
1717 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1718 } else {
1719 ## NOTE: This state should never be reached.
1720
1721 }
1722 } else {
1723 die "$0: $self->{ct}->{type}: Unknown token type";
1724 }
1725 $self->{state} = DATA_STATE;
1726 $self->{s_kwd} = '';
1727 ## reconsume
1728
1729 return ($self->{ct}); # start tag or end tag
1730
1731 redo A;
1732 } else {
1733 if ($self->{nc} == 0x003D) { # =
1734
1735 ## XML5: Not a parse error.
1736 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1737 } elsif ($self->{is_xml}) {
1738
1739 ## XML5: No parse error.
1740 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1741 } else {
1742
1743 }
1744 $self->{ca}->{value} .= chr ($self->{nc});
1745 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1746
1747 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1748 $self->{line_prev} = $self->{line};
1749 $self->{column_prev} = $self->{column};
1750 $self->{column}++;
1751 $self->{nc}
1752 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1753 } else {
1754 $self->{set_nc}->($self);
1755 }
1756
1757 redo A;
1758 }
1759 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1760 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1761 ## ATTLIST attribute value double quoted state".
1762
1763 if ($self->{nc} == 0x0022) { # "
1764 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1765
1766 ## XML5: "DOCTYPE ATTLIST name after state".
1767 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1768 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1769 } else {
1770
1771 ## XML5: "Tag attribute name before state".
1772 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1773 }
1774
1775 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1776 $self->{line_prev} = $self->{line};
1777 $self->{column_prev} = $self->{column};
1778 $self->{column}++;
1779 $self->{nc}
1780 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1781 } else {
1782 $self->{set_nc}->($self);
1783 }
1784
1785 redo A;
1786 } elsif ($self->{nc} == 0x0026) { # &
1787
1788 ## XML5: Not defined yet.
1789
1790 ## NOTE: In the spec, the tokenizer is switched to the
1791 ## "entity in attribute value state". In this implementation, the
1792 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1793 ## implementation of the "consume a character reference" algorithm.
1794 $self->{prev_state} = $self->{state};
1795 $self->{entity_add} = 0x0022; # "
1796 $self->{state} = ENTITY_STATE;
1797
1798 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1799 $self->{line_prev} = $self->{line};
1800 $self->{column_prev} = $self->{column};
1801 $self->{column}++;
1802 $self->{nc}
1803 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1804 } else {
1805 $self->{set_nc}->($self);
1806 }
1807
1808 redo A;
1809 } elsif ($self->{nc} == -1) {
1810 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1811 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1812
1813 $self->{last_stag_name} = $self->{ct}->{tag_name};
1814
1815 $self->{state} = DATA_STATE;
1816 $self->{s_kwd} = '';
1817 ## reconsume
1818 return ($self->{ct}); # start tag
1819 redo A;
1820 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1821 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1822 if ($self->{ct}->{attributes}) {
1823
1824 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1825 } else {
1826 ## NOTE: This state should never be reached.
1827
1828 }
1829
1830 $self->{state} = DATA_STATE;
1831 $self->{s_kwd} = '';
1832 ## reconsume
1833 return ($self->{ct}); # end tag
1834 redo A;
1835 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1836 ## XML5: No parse error above; not defined yet.
1837 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1838 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1839 ## Reconsume.
1840 return ($self->{ct}); # ATTLIST
1841 redo A;
1842 } else {
1843 die "$0: $self->{ct}->{type}: Unknown token type";
1844 }
1845 } else {
1846 ## XML5 [ATTLIST]: Not defined yet.
1847 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1848
1849 ## XML5: Not a parse error.
1850 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1851 } else {
1852
1853 }
1854 $self->{ca}->{value} .= chr ($self->{nc});
1855 $self->{read_until}->($self->{ca}->{value},
1856 q["&<],
1857 length $self->{ca}->{value});
1858
1859 ## Stay in the state
1860
1861 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1862 $self->{line_prev} = $self->{line};
1863 $self->{column_prev} = $self->{column};
1864 $self->{column}++;
1865 $self->{nc}
1866 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1867 } else {
1868 $self->{set_nc}->($self);
1869 }
1870
1871 redo A;
1872 }
1873 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1874 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1875 ## ATTLIST attribute value single quoted state".
1876
1877 if ($self->{nc} == 0x0027) { # '
1878 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1879
1880 ## XML5: "DOCTYPE ATTLIST name after state".
1881 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1882 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1883 } else {
1884
1885 ## XML5: "Before attribute name state" (sic).
1886 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1887 }
1888
1889 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1890 $self->{line_prev} = $self->{line};
1891 $self->{column_prev} = $self->{column};
1892 $self->{column}++;
1893 $self->{nc}
1894 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1895 } else {
1896 $self->{set_nc}->($self);
1897 }
1898
1899 redo A;
1900 } elsif ($self->{nc} == 0x0026) { # &
1901
1902 ## XML5: Not defined yet.
1903
1904 ## NOTE: In the spec, the tokenizer is switched to the
1905 ## "entity in attribute value state". In this implementation, the
1906 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1907 ## implementation of the "consume a character reference" algorithm.
1908 $self->{entity_add} = 0x0027; # '
1909 $self->{prev_state} = $self->{state};
1910 $self->{state} = ENTITY_STATE;
1911
1912 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1913 $self->{line_prev} = $self->{line};
1914 $self->{column_prev} = $self->{column};
1915 $self->{column}++;
1916 $self->{nc}
1917 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1918 } else {
1919 $self->{set_nc}->($self);
1920 }
1921
1922 redo A;
1923 } elsif ($self->{nc} == -1) {
1924 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1925 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1926
1927 $self->{last_stag_name} = $self->{ct}->{tag_name};
1928
1929 $self->{state} = DATA_STATE;
1930 $self->{s_kwd} = '';
1931 ## reconsume
1932 return ($self->{ct}); # start tag
1933 redo A;
1934 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1935 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1936 if ($self->{ct}->{attributes}) {
1937
1938 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1939 } else {
1940 ## NOTE: This state should never be reached.
1941
1942 }
1943
1944 $self->{state} = DATA_STATE;
1945 $self->{s_kwd} = '';
1946 ## reconsume
1947 return ($self->{ct}); # end tag
1948 redo A;
1949 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1950 ## XML5: No parse error above; not defined yet.
1951 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1952 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1953 ## Reconsume.
1954 return ($self->{ct}); # ATTLIST
1955 redo A;
1956 } else {
1957 die "$0: $self->{ct}->{type}: Unknown token type";
1958 }
1959 } else {
1960 ## XML5 [ATTLIST]: Not defined yet.
1961 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1962
1963 ## XML5: Not a parse error.
1964 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1965 } else {
1966
1967 }
1968 $self->{ca}->{value} .= chr ($self->{nc});
1969 $self->{read_until}->($self->{ca}->{value},
1970 q['&<],
1971 length $self->{ca}->{value});
1972
1973 ## Stay in the state
1974
1975 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1976 $self->{line_prev} = $self->{line};
1977 $self->{column_prev} = $self->{column};
1978 $self->{column}++;
1979 $self->{nc}
1980 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1981 } else {
1982 $self->{set_nc}->($self);
1983 }
1984
1985 redo A;
1986 }
1987 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1988 ## XML5: "Tag attribute value unquoted state".
1989
1990 if ($is_space->{$self->{nc}}) {
1991 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1992
1993 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1994 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1995 } else {
1996
1997 ## XML5: "Tag attribute name before state".
1998 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1999 }
2000
2001 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2002 $self->{line_prev} = $self->{line};
2003 $self->{column_prev} = $self->{column};
2004 $self->{column}++;
2005 $self->{nc}
2006 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2007 } else {
2008 $self->{set_nc}->($self);
2009 }
2010
2011 redo A;
2012 } elsif ($self->{nc} == 0x0026) { # &
2013
2014
2015 ## XML5: Not defined yet.
2016
2017 ## NOTE: In the spec, the tokenizer is switched to the
2018 ## "entity in attribute value state". In this implementation, the
2019 ## tokenizer is switched to the |ENTITY_STATE|, which is an
2020 ## implementation of the "consume a character reference" algorithm.
2021 $self->{entity_add} = -1;
2022 $self->{prev_state} = $self->{state};
2023 $self->{state} = ENTITY_STATE;
2024
2025 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2026 $self->{line_prev} = $self->{line};
2027 $self->{column_prev} = $self->{column};
2028 $self->{column}++;
2029 $self->{nc}
2030 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2031 } else {
2032 $self->{set_nc}->($self);
2033 }
2034
2035 redo A;
2036 } elsif ($self->{nc} == 0x003E) { # >
2037 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2038
2039 $self->{last_stag_name} = $self->{ct}->{tag_name};
2040
2041 $self->{state} = DATA_STATE;
2042 $self->{s_kwd} = '';
2043
2044 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2045 $self->{line_prev} = $self->{line};
2046 $self->{column_prev} = $self->{column};
2047 $self->{column}++;
2048 $self->{nc}
2049 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2050 } else {
2051 $self->{set_nc}->($self);
2052 }
2053
2054 return ($self->{ct}); # start tag
2055 redo A;
2056 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2057 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2058 if ($self->{ct}->{attributes}) {
2059
2060 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2061 } else {
2062 ## NOTE: This state should never be reached.
2063
2064 }
2065
2066 $self->{state} = DATA_STATE;
2067 $self->{s_kwd} = '';
2068
2069 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2070 $self->{line_prev} = $self->{line};
2071 $self->{column_prev} = $self->{column};
2072 $self->{column}++;
2073 $self->{nc}
2074 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2075 } else {
2076 $self->{set_nc}->($self);
2077 }
2078
2079 return ($self->{ct}); # end tag
2080 redo A;
2081 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2082 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2083 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2084
2085 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2086 $self->{line_prev} = $self->{line};
2087 $self->{column_prev} = $self->{column};
2088 $self->{column}++;
2089 $self->{nc}
2090 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2091 } else {
2092 $self->{set_nc}->($self);
2093 }
2094
2095 return ($self->{ct}); # ATTLIST
2096 redo A;
2097 } else {
2098 die "$0: $self->{ct}->{type}: Unknown token type";
2099 }
2100 } elsif ($self->{nc} == -1) {
2101 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2102
2103 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2104 $self->{last_stag_name} = $self->{ct}->{tag_name};
2105
2106 $self->{state} = DATA_STATE;
2107 $self->{s_kwd} = '';
2108 ## reconsume
2109 return ($self->{ct}); # start tag
2110 redo A;
2111 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2112 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2113 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2114 if ($self->{ct}->{attributes}) {
2115
2116 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2117 } else {
2118 ## NOTE: This state should never be reached.
2119
2120 }
2121
2122 $self->{state} = DATA_STATE;
2123 $self->{s_kwd} = '';
2124 ## reconsume
2125 return ($self->{ct}); # end tag
2126 redo A;
2127 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2128 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2129 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2130 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2131 ## Reconsume.
2132 return ($self->{ct}); # ATTLIST
2133 redo A;
2134 } else {
2135 die "$0: $self->{ct}->{type}: Unknown token type";
2136 }
2137 } else {
2138 if ({
2139 0x0022 => 1, # "
2140 0x0027 => 1, # '
2141 0x003D => 1, # =
2142 }->{$self->{nc}}) {
2143
2144 ## XML5: Not a parse error.
2145 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2146 } else {
2147
2148 }
2149 $self->{ca}->{value} .= chr ($self->{nc});
2150 $self->{read_until}->($self->{ca}->{value},
2151 q["'=& >],
2152 length $self->{ca}->{value});
2153
2154 ## Stay in the state
2155
2156 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2157 $self->{line_prev} = $self->{line};
2158 $self->{column_prev} = $self->{column};
2159 $self->{column}++;
2160 $self->{nc}
2161 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2162 } else {
2163 $self->{set_nc}->($self);
2164 }
2165
2166 redo A;
2167 }
2168 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2169 if ($is_space->{$self->{nc}}) {
2170
2171 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2172
2173 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2174 $self->{line_prev} = $self->{line};
2175 $self->{column_prev} = $self->{column};
2176 $self->{column}++;
2177 $self->{nc}
2178 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2179 } else {
2180 $self->{set_nc}->($self);
2181 }
2182
2183 redo A;
2184 } elsif ($self->{nc} == 0x003E) { # >
2185 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2186
2187 $self->{last_stag_name} = $self->{ct}->{tag_name};
2188 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2189 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2190 if ($self->{ct}->{attributes}) {
2191
2192 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2193 } else {
2194 ## NOTE: This state should never be reached.
2195
2196 }
2197 } else {
2198 die "$0: $self->{ct}->{type}: Unknown token type";
2199 }
2200 $self->{state} = DATA_STATE;
2201 $self->{s_kwd} = '';
2202
2203 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2204 $self->{line_prev} = $self->{line};
2205 $self->{column_prev} = $self->{column};
2206 $self->{column}++;
2207 $self->{nc}
2208 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2209 } else {
2210 $self->{set_nc}->($self);
2211 }
2212
2213
2214 return ($self->{ct}); # start tag or end tag
2215
2216 redo A;
2217 } elsif ($self->{nc} == 0x002F) { # /
2218
2219 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2220
2221 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2222 $self->{line_prev} = $self->{line};
2223 $self->{column_prev} = $self->{column};
2224 $self->{column}++;
2225 $self->{nc}
2226 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2227 } else {
2228 $self->{set_nc}->($self);
2229 }
2230
2231 redo A;
2232 } elsif ($self->{nc} == -1) {
2233 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2234 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2235
2236 $self->{last_stag_name} = $self->{ct}->{tag_name};
2237 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2238 if ($self->{ct}->{attributes}) {
2239
2240 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2241 } else {
2242 ## NOTE: This state should never be reached.
2243
2244 }
2245 } else {
2246 die "$0: $self->{ct}->{type}: Unknown token type";
2247 }
2248 $self->{state} = DATA_STATE;
2249 $self->{s_kwd} = '';
2250 ## Reconsume.
2251 return ($self->{ct}); # start tag or end tag
2252 redo A;
2253 } else {
2254
2255 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2256 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2257 ## reconsume
2258 redo A;
2259 }
2260 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2261 ## XML5: "Empty tag state".
2262
2263 if ($self->{nc} == 0x003E) { # >
2264 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2265
2266 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2267 ## TODO: Different type than slash in start tag
2268 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2269 if ($self->{ct}->{attributes}) {
2270
2271 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2272 } else {
2273
2274 }
2275 ## TODO: Test |<title></title/>|
2276 } else {
2277
2278 $self->{self_closing} = 1;
2279 }
2280
2281 $self->{state} = DATA_STATE;
2282 $self->{s_kwd} = '';
2283
2284 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2285 $self->{line_prev} = $self->{line};
2286 $self->{column_prev} = $self->{column};
2287 $self->{column}++;
2288 $self->{nc}
2289 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2290 } else {
2291 $self->{set_nc}->($self);
2292 }
2293
2294
2295 return ($self->{ct}); # start tag or end tag
2296
2297 redo A;
2298 } elsif ($self->{nc} == -1) {
2299 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2300 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2301
2302 $self->{last_stag_name} = $self->{ct}->{tag_name};
2303 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2304 if ($self->{ct}->{attributes}) {
2305
2306 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2307 } else {
2308 ## NOTE: This state should never be reached.
2309
2310 }
2311 } else {
2312 die "$0: $self->{ct}->{type}: Unknown token type";
2313 }
2314 ## XML5: "Tag attribute name before state".
2315 $self->{state} = DATA_STATE;
2316 $self->{s_kwd} = '';
2317 ## Reconsume.
2318 return ($self->{ct}); # start tag or end tag
2319 redo A;
2320 } else {
2321
2322 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2323 ## TODO: This error type is wrong.
2324 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2325 ## Reconsume.
2326 redo A;
2327 }
2328 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2329 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2330
2331 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2332 ## consumes characters on a one-by-one basis.
2333
2334 if ($self->{nc} == 0x003E) { # >
2335 if ($self->{in_subset}) {
2336
2337 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2338 } else {
2339
2340 $self->{state} = DATA_STATE;
2341 $self->{s_kwd} = '';
2342 }
2343
2344 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2345 $self->{line_prev} = $self->{line};
2346 $self->{column_prev} = $self->{column};
2347 $self->{column}++;
2348 $self->{nc}
2349 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2350 } else {
2351 $self->{set_nc}->($self);
2352 }
2353
2354
2355 return ($self->{ct}); # comment
2356 redo A;
2357 } elsif ($self->{nc} == -1) {
2358 if ($self->{in_subset}) {
2359
2360 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2361 } else {
2362
2363 $self->{state} = DATA_STATE;
2364 $self->{s_kwd} = '';
2365 }
2366 ## reconsume
2367
2368 return ($self->{ct}); # comment
2369 redo A;
2370 } else {
2371
2372 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2373 $self->{read_until}->($self->{ct}->{data},
2374 q[>],
2375 length $self->{ct}->{data});
2376
2377 ## Stay in the state.
2378
2379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2380 $self->{line_prev} = $self->{line};
2381 $self->{column_prev} = $self->{column};
2382 $self->{column}++;
2383 $self->{nc}
2384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2385 } else {
2386 $self->{set_nc}->($self);
2387 }
2388
2389 redo A;
2390 }
2391 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2392 ## XML5: "Markup declaration state".
2393
2394 if ($self->{nc} == 0x002D) { # -
2395
2396 $self->{state} = MD_HYPHEN_STATE;
2397
2398 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2399 $self->{line_prev} = $self->{line};
2400 $self->{column_prev} = $self->{column};
2401 $self->{column}++;
2402 $self->{nc}
2403 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2404 } else {
2405 $self->{set_nc}->($self);
2406 }
2407
2408 redo A;
2409 } elsif ($self->{nc} == 0x0044 or # D
2410 $self->{nc} == 0x0064) { # d
2411 ## ASCII case-insensitive.
2412
2413 $self->{state} = MD_DOCTYPE_STATE;
2414 $self->{kwd} = chr $self->{nc};
2415
2416 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2417 $self->{line_prev} = $self->{line};
2418 $self->{column_prev} = $self->{column};
2419 $self->{column}++;
2420 $self->{nc}
2421 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2422 } else {
2423 $self->{set_nc}->($self);
2424 }
2425
2426 redo A;
2427 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2428 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2429 $self->{is_xml}) and
2430 $self->{nc} == 0x005B) { # [
2431
2432 $self->{state} = MD_CDATA_STATE;
2433 $self->{kwd} = '[';
2434
2435 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2436 $self->{line_prev} = $self->{line};
2437 $self->{column_prev} = $self->{column};
2438 $self->{column}++;
2439 $self->{nc}
2440 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2441 } else {
2442 $self->{set_nc}->($self);
2443 }
2444
2445 redo A;
2446 } else {
2447
2448 }
2449
2450 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2451 line => $self->{line_prev},
2452 column => $self->{column_prev} - 1);
2453 ## Reconsume.
2454 $self->{state} = BOGUS_COMMENT_STATE;
2455 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2456 line => $self->{line_prev},
2457 column => $self->{column_prev} - 1,
2458 };
2459 redo A;
2460 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2461 if ($self->{nc} == 0x002D) { # -
2462
2463 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2464 line => $self->{line_prev},
2465 column => $self->{column_prev} - 2,
2466 };
2467 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2468
2469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2470 $self->{line_prev} = $self->{line};
2471 $self->{column_prev} = $self->{column};
2472 $self->{column}++;
2473 $self->{nc}
2474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2475 } else {
2476 $self->{set_nc}->($self);
2477 }
2478
2479 redo A;
2480 } else {
2481
2482 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2483 line => $self->{line_prev},
2484 column => $self->{column_prev} - 2);
2485 $self->{state} = BOGUS_COMMENT_STATE;
2486 ## Reconsume.
2487 $self->{ct} = {type => COMMENT_TOKEN,
2488 data => '-',
2489 line => $self->{line_prev},
2490 column => $self->{column_prev} - 2,
2491 };
2492 redo A;
2493 }
2494 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2495 ## ASCII case-insensitive.
2496 if ($self->{nc} == [
2497 undef,
2498 0x004F, # O
2499 0x0043, # C
2500 0x0054, # T
2501 0x0059, # Y
2502 0x0050, # P
2503 ]->[length $self->{kwd}] or
2504 $self->{nc} == [
2505 undef,
2506 0x006F, # o
2507 0x0063, # c
2508 0x0074, # t
2509 0x0079, # y
2510 0x0070, # p
2511 ]->[length $self->{kwd}]) {
2512
2513 ## Stay in the state.
2514 $self->{kwd} .= chr $self->{nc};
2515
2516 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2517 $self->{line_prev} = $self->{line};
2518 $self->{column_prev} = $self->{column};
2519 $self->{column}++;
2520 $self->{nc}
2521 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2522 } else {
2523 $self->{set_nc}->($self);
2524 }
2525
2526 redo A;
2527 } elsif ((length $self->{kwd}) == 6 and
2528 ($self->{nc} == 0x0045 or # E
2529 $self->{nc} == 0x0065)) { # e
2530 if ($self->{is_xml} and
2531 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2532
2533 ## XML5: case-sensitive.
2534 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2535 text => 'DOCTYPE',
2536 line => $self->{line_prev},
2537 column => $self->{column_prev} - 5);
2538 } else {
2539
2540 }
2541 $self->{state} = DOCTYPE_STATE;
2542 $self->{ct} = {type => DOCTYPE_TOKEN,
2543 quirks => 1,
2544 line => $self->{line_prev},
2545 column => $self->{column_prev} - 7,
2546 };
2547
2548 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2549 $self->{line_prev} = $self->{line};
2550 $self->{column_prev} = $self->{column};
2551 $self->{column}++;
2552 $self->{nc}
2553 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2554 } else {
2555 $self->{set_nc}->($self);
2556 }
2557
2558 redo A;
2559 } else {
2560
2561 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2562 line => $self->{line_prev},
2563 column => $self->{column_prev} - 1 - length $self->{kwd});
2564 $self->{state} = BOGUS_COMMENT_STATE;
2565 ## Reconsume.
2566 $self->{ct} = {type => COMMENT_TOKEN,
2567 data => $self->{kwd},
2568 line => $self->{line_prev},
2569 column => $self->{column_prev} - 1 - length $self->{kwd},
2570 };
2571 redo A;
2572 }
2573 } elsif ($self->{state} == MD_CDATA_STATE) {
2574 if ($self->{nc} == {
2575 '[' => 0x0043, # C
2576 '[C' => 0x0044, # D
2577 '[CD' => 0x0041, # A
2578 '[CDA' => 0x0054, # T
2579 '[CDAT' => 0x0041, # A
2580 }->{$self->{kwd}}) {
2581
2582 ## Stay in the state.
2583 $self->{kwd} .= chr $self->{nc};
2584
2585 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2586 $self->{line_prev} = $self->{line};
2587 $self->{column_prev} = $self->{column};
2588 $self->{column}++;
2589 $self->{nc}
2590 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2591 } else {
2592 $self->{set_nc}->($self);
2593 }
2594
2595 redo A;
2596 } elsif ($self->{kwd} eq '[CDATA' and
2597 $self->{nc} == 0x005B) { # [
2598 if ($self->{is_xml} and
2599 not $self->{tainted} and
2600 @{$self->{open_elements} or []} == 0) {
2601
2602 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2603 line => $self->{line_prev},
2604 column => $self->{column_prev} - 7);
2605 $self->{tainted} = 1;
2606 } else {
2607
2608 }
2609
2610 $self->{ct} = {type => CHARACTER_TOKEN,
2611 data => '',
2612 line => $self->{line_prev},
2613 column => $self->{column_prev} - 7};
2614 $self->{state} = CDATA_SECTION_STATE;
2615
2616 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2617 $self->{line_prev} = $self->{line};
2618 $self->{column_prev} = $self->{column};
2619 $self->{column}++;
2620 $self->{nc}
2621 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2622 } else {
2623 $self->{set_nc}->($self);
2624 }
2625
2626 redo A;
2627 } else {
2628
2629 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2630 line => $self->{line_prev},
2631 column => $self->{column_prev} - 1 - length $self->{kwd});
2632 $self->{state} = BOGUS_COMMENT_STATE;
2633 ## Reconsume.
2634 $self->{ct} = {type => COMMENT_TOKEN,
2635 data => $self->{kwd},
2636 line => $self->{line_prev},
2637 column => $self->{column_prev} - 1 - length $self->{kwd},
2638 };
2639 redo A;
2640 }
2641 } elsif ($self->{state} == COMMENT_START_STATE) {
2642 if ($self->{nc} == 0x002D) { # -
2643
2644 $self->{state} = COMMENT_START_DASH_STATE;
2645
2646 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2647 $self->{line_prev} = $self->{line};
2648 $self->{column_prev} = $self->{column};
2649 $self->{column}++;
2650 $self->{nc}
2651 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2652 } else {
2653 $self->{set_nc}->($self);
2654 }
2655
2656 redo A;
2657 } elsif ($self->{nc} == 0x003E) { # >
2658 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2659 if ($self->{in_subset}) {
2660
2661 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2662 } else {
2663
2664 $self->{state} = DATA_STATE;
2665 $self->{s_kwd} = '';
2666 }
2667
2668 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2669 $self->{line_prev} = $self->{line};
2670 $self->{column_prev} = $self->{column};
2671 $self->{column}++;
2672 $self->{nc}
2673 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2674 } else {
2675 $self->{set_nc}->($self);
2676 }
2677
2678
2679 return ($self->{ct}); # comment
2680
2681 redo A;
2682 } elsif ($self->{nc} == -1) {
2683 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2684 if ($self->{in_subset}) {
2685
2686 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2687 } else {
2688
2689 $self->{state} = DATA_STATE;
2690 $self->{s_kwd} = '';
2691 }
2692 ## reconsume
2693
2694 return ($self->{ct}); # comment
2695
2696 redo A;
2697 } else {
2698
2699 $self->{ct}->{data} # comment
2700 .= chr ($self->{nc});
2701 $self->{state} = COMMENT_STATE;
2702
2703 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2704 $self->{line_prev} = $self->{line};
2705 $self->{column_prev} = $self->{column};
2706 $self->{column}++;
2707 $self->{nc}
2708 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2709 } else {
2710 $self->{set_nc}->($self);
2711 }
2712
2713 redo A;
2714 }
2715 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2716 if ($self->{nc} == 0x002D) { # -
2717
2718 $self->{state} = COMMENT_END_STATE;
2719
2720 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2721 $self->{line_prev} = $self->{line};
2722 $self->{column_prev} = $self->{column};
2723 $self->{column}++;
2724 $self->{nc}
2725 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2726 } else {
2727 $self->{set_nc}->($self);
2728 }
2729
2730 redo A;
2731 } elsif ($self->{nc} == 0x003E) { # >
2732 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2733 if ($self->{in_subset}) {
2734
2735 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2736 } else {
2737
2738 $self->{state} = DATA_STATE;
2739 $self->{s_kwd} = '';
2740 }
2741
2742 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2743 $self->{line_prev} = $self->{line};
2744 $self->{column_prev} = $self->{column};
2745 $self->{column}++;
2746 $self->{nc}
2747 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2748 } else {
2749 $self->{set_nc}->($self);
2750 }
2751
2752
2753 return ($self->{ct}); # comment
2754
2755 redo A;
2756 } elsif ($self->{nc} == -1) {
2757 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2758 if ($self->{in_subset}) {
2759
2760 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2761 } else {
2762
2763 $self->{state} = DATA_STATE;
2764 $self->{s_kwd} = '';
2765 }
2766 ## reconsume
2767
2768 return ($self->{ct}); # comment
2769
2770 redo A;
2771 } else {
2772
2773 $self->{ct}->{data} # comment
2774 .= '-' . chr ($self->{nc});
2775 $self->{state} = COMMENT_STATE;
2776
2777 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2778 $self->{line_prev} = $self->{line};
2779 $self->{column_prev} = $self->{column};
2780 $self->{column}++;
2781 $self->{nc}
2782 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2783 } else {
2784 $self->{set_nc}->($self);
2785 }
2786
2787 redo A;
2788 }
2789 } elsif ($self->{state} == COMMENT_STATE) {
2790 ## XML5: "Comment state" and "DOCTYPE comment state".
2791
2792 if ($self->{nc} == 0x002D) { # -
2793
2794 $self->{state} = COMMENT_END_DASH_STATE;
2795
2796 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2797 $self->{line_prev} = $self->{line};
2798 $self->{column_prev} = $self->{column};
2799 $self->{column}++;
2800 $self->{nc}
2801 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2802 } else {
2803 $self->{set_nc}->($self);
2804 }
2805
2806 redo A;
2807 } elsif ($self->{nc} == -1) {
2808 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2809 if ($self->{in_subset}) {
2810
2811 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2812 } else {
2813
2814 $self->{state} = DATA_STATE;
2815 $self->{s_kwd} = '';
2816 }
2817 ## reconsume
2818
2819 return ($self->{ct}); # comment
2820
2821 redo A;
2822 } else {
2823
2824 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2825 $self->{read_until}->($self->{ct}->{data},
2826 q[-],
2827 length $self->{ct}->{data});
2828
2829 ## Stay in the state
2830
2831 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2832 $self->{line_prev} = $self->{line};
2833 $self->{column_prev} = $self->{column};
2834 $self->{column}++;
2835 $self->{nc}
2836 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2837 } else {
2838 $self->{set_nc}->($self);
2839 }
2840
2841 redo A;
2842 }
2843 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2844 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2845
2846 if ($self->{nc} == 0x002D) { # -
2847
2848 $self->{state} = COMMENT_END_STATE;
2849
2850 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2851 $self->{line_prev} = $self->{line};
2852 $self->{column_prev} = $self->{column};
2853 $self->{column}++;
2854 $self->{nc}
2855 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2856 } else {
2857 $self->{set_nc}->($self);
2858 }
2859
2860 redo A;
2861 } elsif ($self->{nc} == -1) {
2862 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2863 if ($self->{in_subset}) {
2864
2865 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2866 } else {
2867
2868 $self->{state} = DATA_STATE;
2869 $self->{s_kwd} = '';
2870 }
2871 ## reconsume
2872
2873 return ($self->{ct}); # comment
2874
2875 redo A;
2876 } else {
2877
2878 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2879 $self->{state} = COMMENT_STATE;
2880
2881 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2882 $self->{line_prev} = $self->{line};
2883 $self->{column_prev} = $self->{column};
2884 $self->{column}++;
2885 $self->{nc}
2886 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2887 } else {
2888 $self->{set_nc}->($self);
2889 }
2890
2891 redo A;
2892 }
2893 } elsif ($self->{state} == COMMENT_END_STATE) {
2894 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2895
2896 if ($self->{nc} == 0x003E) { # >
2897 if ($self->{in_subset}) {
2898
2899 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2900 } else {
2901
2902 $self->{state} = DATA_STATE;
2903 $self->{s_kwd} = '';
2904 }
2905
2906 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2907 $self->{line_prev} = $self->{line};
2908 $self->{column_prev} = $self->{column};
2909 $self->{column}++;
2910 $self->{nc}
2911 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2912 } else {
2913 $self->{set_nc}->($self);
2914 }
2915
2916
2917 return ($self->{ct}); # comment
2918
2919 redo A;
2920 } elsif ($self->{nc} == 0x002D) { # -
2921
2922 ## XML5: Not a parse error.
2923 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2924 line => $self->{line_prev},
2925 column => $self->{column_prev});
2926 $self->{ct}->{data} .= '-'; # comment
2927 ## Stay in the state
2928
2929 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2930 $self->{line_prev} = $self->{line};
2931 $self->{column_prev} = $self->{column};
2932 $self->{column}++;
2933 $self->{nc}
2934 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2935 } else {
2936 $self->{set_nc}->($self);
2937 }
2938
2939 redo A;
2940 } elsif ($self->{nc} == -1) {
2941 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2942 if ($self->{in_subset}) {
2943
2944 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2945 } else {
2946
2947 $self->{state} = DATA_STATE;
2948 $self->{s_kwd} = '';
2949 }
2950 ## reconsume
2951
2952 return ($self->{ct}); # comment
2953
2954 redo A;
2955 } else {
2956
2957 ## XML5: Not a parse error.
2958 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2959 line => $self->{line_prev},
2960 column => $self->{column_prev});
2961 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2962 $self->{state} = COMMENT_STATE;
2963
2964 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2965 $self->{line_prev} = $self->{line};
2966 $self->{column_prev} = $self->{column};
2967 $self->{column}++;
2968 $self->{nc}
2969 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2970 } else {
2971 $self->{set_nc}->($self);
2972 }
2973
2974 redo A;
2975 }
2976 } elsif ($self->{state} == DOCTYPE_STATE) {
2977 if ($is_space->{$self->{nc}}) {
2978
2979 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2980
2981 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2982 $self->{line_prev} = $self->{line};
2983 $self->{column_prev} = $self->{column};
2984 $self->{column}++;
2985 $self->{nc}
2986 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2987 } else {
2988 $self->{set_nc}->($self);
2989 }
2990
2991 redo A;
2992 } else {
2993
2994 ## XML5: Unless EOF, switch to the bogus comment state.
2995 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2996 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2997 ## reconsume
2998 redo A;
2999 }
3000 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3001 ## XML5: "DOCTYPE root name before state".
3002
3003 if ($is_space->{$self->{nc}}) {
3004
3005 ## Stay in the state
3006
3007 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3008 $self->{line_prev} = $self->{line};
3009 $self->{column_prev} = $self->{column};
3010 $self->{column}++;
3011 $self->{nc}
3012 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3013 } else {
3014 $self->{set_nc}->($self);
3015 }
3016
3017 redo A;
3018 } elsif ($self->{nc} == 0x003E) { # >
3019
3020 ## XML5: No parse error.
3021 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3022 $self->{state} = DATA_STATE;
3023 $self->{s_kwd} = '';
3024
3025 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3026 $self->{line_prev} = $self->{line};
3027 $self->{column_prev} = $self->{column};
3028 $self->{column}++;
3029 $self->{nc}
3030 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3031 } else {
3032 $self->{set_nc}->($self);
3033 }
3034
3035
3036 return ($self->{ct}); # DOCTYPE (quirks)
3037
3038 redo A;
3039 } elsif ($self->{nc} == -1) {
3040
3041 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3042 $self->{state} = DATA_STATE;
3043 $self->{s_kwd} = '';
3044 ## reconsume
3045
3046 return ($self->{ct}); # DOCTYPE (quirks)
3047
3048 redo A;
3049 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3050
3051 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3052 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3053 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3054 $self->{in_subset} = 1;
3055
3056 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3057 $self->{line_prev} = $self->{line};
3058 $self->{column_prev} = $self->{column};
3059 $self->{column}++;
3060 $self->{nc}
3061 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3062 } else {
3063 $self->{set_nc}->($self);
3064 }
3065
3066 return ($self->{ct}); # DOCTYPE
3067 redo A;
3068 } else {
3069
3070 $self->{ct}->{name} = chr $self->{nc};
3071 delete $self->{ct}->{quirks};
3072 $self->{state} = DOCTYPE_NAME_STATE;
3073
3074 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3075 $self->{line_prev} = $self->{line};
3076 $self->{column_prev} = $self->{column};
3077 $self->{column}++;
3078 $self->{nc}
3079 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3080 } else {
3081 $self->{set_nc}->($self);
3082 }
3083
3084 redo A;
3085 }
3086 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
## DOCTYPE name state.  Appends characters to $self->{ct}->{name} until
## one of the following is seen in $self->{nc}:
##   whitespace  -> AFTER_DOCTYPE_NAME_STATE
##   ">"         -> DATA_STATE, the current token is returned
##   -1 (EOF)    -> parse error, quirks flag set, token returned
##   "[" (XML)   -> DOCTYPE_INTERNAL_SUBSET_STATE, token returned
## Each "if char_buffer_pos < length char_buffer ... else set_nc" block
## below consumes one input character: it is taken from char_buffer when
## one is buffered, otherwise the set_nc callback supplies it.
3087 ## XML5: "DOCTYPE root name state".
3088
3089 ## ISSUE: Redundant "First," in the spec.
3090
3091 if ($is_space->{$self->{nc}}) {
3092
## Whitespace ends the name.
3093 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3094
3095 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3096 $self->{line_prev} = $self->{line};
3097 $self->{column_prev} = $self->{column};
3098 $self->{column}++;
3099 $self->{nc}
3100 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3101 } else {
3102 $self->{set_nc}->($self);
3103 }
3104
3105 redo A;
3106 } elsif ($self->{nc} == 0x003E) { # >
3107
## End of the DOCTYPE: switch back to the data state and emit the token.
3108 $self->{state} = DATA_STATE;
3109 $self->{s_kwd} = '';
3110
3111 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3112 $self->{line_prev} = $self->{line};
3113 $self->{column_prev} = $self->{column};
3114 $self->{column}++;
3115 $self->{nc}
3116 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3117 } else {
3118 $self->{set_nc}->($self);
3119 }
3120
3121
3122 return ($self->{ct}); # DOCTYPE
3123
## Not reached: the return above leaves the subroutine first.
3124 redo A;
3125 } elsif ($self->{nc} == -1) {
3126
## End of input inside the name: report it and emit a quirks DOCTYPE.
## The EOF marker is not consumed, so the next call sees it again.
3127 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3128 $self->{state} = DATA_STATE;
3129 $self->{s_kwd} = '';
3130 ## reconsume
3131
3132 $self->{ct}->{quirks} = 1;
3133 return ($self->{ct}); # DOCTYPE
3134
3135 redo A;
3136 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3137
## XML only: "[" opens the internal subset.  The DOCTYPE token is
## returned now, flagged as having an internal subset.
3138 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3139 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3140 $self->{in_subset} = 1;
3141
3142 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3143 $self->{line_prev} = $self->{line};
3144 $self->{column_prev} = $self->{column};
3145 $self->{column}++;
3146 $self->{nc}
3147 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3148 } else {
3149 $self->{set_nc}->($self);
3150 }
3151
3152 return ($self->{ct}); # DOCTYPE
3153 redo A;
3154 } else {
3155
## Any other character is part of the name.
3156 $self->{ct}->{name}
3157 .= chr ($self->{nc}); # DOCTYPE
3158 ## Stay in the state
3159
3160 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3161 $self->{line_prev} = $self->{line};
3162 $self->{column_prev} = $self->{column};
3163 $self->{column}++;
3164 $self->{nc}
3165 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3166 } else {
3167 $self->{set_nc}->($self);
3168 }
3169
3170 redo A;
3171 }
3172 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
## After-name state, shared by DOCTYPE and markup declaration tokens
## (the branches below distinguish them by $self->{ct}->{type}).
## Skips whitespace, then dispatches on the next character:
##   ">"       -> token is complete and returned
##   -1 (EOF)  -> parse error, token returned, EOF left unconsumed
##   P/p, S/s  -> start matching the PUBLIC / SYSTEM keyword
##   "[" (XML DOCTYPE only) -> internal subset
##   other     -> parse error, bogus DOCTYPE / bogus declaration state
## Each "if char_buffer_pos < length char_buffer ... else set_nc" block
## consumes one input character (buffered, otherwise via set_nc).
3173 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3174 ## state", but implemented differently.
3175
3176 if ($is_space->{$self->{nc}}) {
3177
3178 ## Stay in the state
3179
3180 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3181 $self->{line_prev} = $self->{line};
3182 $self->{column_prev} = $self->{column};
3183 $self->{column}++;
3184 $self->{nc}
3185 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3186 } else {
3187 $self->{set_nc}->($self);
3188 }
3189
3190 redo A;
3191 } elsif ($self->{nc} == 0x003E) { # >
3192 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3193
## A DOCTYPE with only a name is complete.
3194 $self->{state} = DATA_STATE;
3195 $self->{s_kwd} = '';
3196 } else {
3197
## A non-DOCTYPE token closed right after its name is reported as
## an error; tokenizing resumes in the internal subset.
3198 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3199 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3200 }
3201
3202
3203 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3204 $self->{line_prev} = $self->{line};
3205 $self->{column_prev} = $self->{column};
3206 $self->{column}++;
3207 $self->{nc}
3208 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3209 } else {
3210 $self->{set_nc}->($self);
3211 }
3212
3213 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
## Not reached: the return above leaves the subroutine first.
3214 redo A;
3215 } elsif ($self->{nc} == -1) {
## End of input: report it, pick the follow-up state by token type and
## return the token without consuming the EOF marker.
3216 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3217
3218 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3219 $self->{state} = DATA_STATE;
3220 $self->{s_kwd} = '';
3221 $self->{ct}->{quirks} = 1;
3222 } else {
3223
3224 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3225 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3226 }
3227
3228 ## Reconsume.
3229 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3230 redo A;
3231 } elsif ($self->{nc} == 0x0050 or # P
3232 $self->{nc} == 0x0070) { # p
3233
## Possible PUBLIC keyword: remember the first letter as typed so the
## keyword state can check the remaining letters and their case.
3234 $self->{state} = PUBLIC_STATE;
3235 $self->{kwd} = chr $self->{nc};
3236
3237 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238 $self->{line_prev} = $self->{line};
3239 $self->{column_prev} = $self->{column};
3240 $self->{column}++;
3241 $self->{nc}
3242 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3243 } else {
3244 $self->{set_nc}->($self);
3245 }
3246
3247 redo A;
3248 } elsif ($self->{nc} == 0x0053 or # S
3249 $self->{nc} == 0x0073) { # s
3250
## Possible SYSTEM keyword: same approach as for PUBLIC above.
3251 $self->{state} = SYSTEM_STATE;
3252 $self->{kwd} = chr $self->{nc};
3253
3254 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3255 $self->{line_prev} = $self->{line};
3256 $self->{column_prev} = $self->{column};
3257 $self->{column}++;
3258 $self->{nc}
3259 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3260 } else {
3261 $self->{set_nc}->($self);
3262 }
3263
3264 redo A;
3265 ## TODO: " and ' for ENTITY
3266 } elsif ($self->{is_xml} and
3267 $self->{ct}->{type} == DOCTYPE_TOKEN and
3268 $self->{nc} == 0x005B) { # [
3269
## XML DOCTYPE only: "[" opens the internal subset; the DOCTYPE token
## is returned now, flagged as having an internal subset.
3270 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3271 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3272 $self->{in_subset} = 1;
3273
3274 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3275 $self->{line_prev} = $self->{line};
3276 $self->{column_prev} = $self->{column};
3277 $self->{column}++;
3278 $self->{nc}
3279 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3280 } else {
3281 $self->{set_nc}->($self);
3282 }
3283
3284 return ($self->{ct}); # DOCTYPE
3285 redo A;
3286 } else {
## Anything else is an error; the rest of the construct is handled by
## the bogus state that matches the token type.
3287 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3288
3289 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3290
3291 $self->{ct}->{quirks} = 1;
3292 $self->{state} = BOGUS_DOCTYPE_STATE;
3293 } else {
3294
3295 $self->{state} = BOGUS_MD_STATE;
3296 }
3297
3298
3299 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3300 $self->{line_prev} = $self->{line};
3301 $self->{column_prev} = $self->{column};
3302 $self->{column}++;
3303 $self->{nc}
3304 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3305 } else {
3306 $self->{set_nc}->($self);
3307 }
3308
3309 redo A;
3310 }
3311 } elsif ($self->{state} == PUBLIC_STATE) {
## Matches the rest of the keyword "PUBLIC", one character per pass.
## $self->{kwd} holds the letters matched so far exactly as typed, so
## its length selects the next expected letter from the two tables
## below (index 0 is unused because kwd is never empty here).
3312 ## ASCII case-insensitive
3313 if ($self->{nc} == [
3314 undef,
3315 0x0055, # U
3316 0x0042, # B
3317 0x004C, # L
3318 0x0049, # I
3319 ]->[length $self->{kwd}] or
3320 $self->{nc} == [
3321 undef,
3322 0x0075, # u
3323 0x0062, # b
3324 0x006C, # l
3325 0x0069, # i
3326 ]->[length $self->{kwd}]) {
3327
3328 ## Stay in the state.
3329 $self->{kwd} .= chr $self->{nc};
3330
## Consume the matched letter (buffered, otherwise via set_nc).
3331 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3332 $self->{line_prev} = $self->{line};
3333 $self->{column_prev} = $self->{column};
3334 $self->{column}++;
3335 $self->{nc}
3336 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3337 } else {
3338 $self->{set_nc}->($self);
3339 }
3340
3341 redo A;
3342 } elsif ((length $self->{kwd}) == 5 and
3343 ($self->{nc} == 0x0043 or # C
3344 $self->{nc} == 0x0063)) { # c
## Final letter seen.  In XML mode the keyword must be spelled exactly
## "PUBLIC"; any other casing is reported, with the error column
## computed back to the start of the keyword.
3345 if ($self->{is_xml} and
3346 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3347
3348 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3349 text => 'PUBLIC',
3350 line => $self->{line_prev},
3351 column => $self->{column_prev} - 4);
3352 } else {
3353
3354 }
3355 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3356
3357 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3358 $self->{line_prev} = $self->{line};
3359 $self->{column_prev} = $self->{column};
3360 $self->{column}++;
3361 $self->{nc}
3362 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3363 } else {
3364 $self->{set_nc}->($self);
3365 }
3366
3367 redo A;
3368 } else {
## Keyword mismatch: report it at the position where the partial
## keyword began and let the bogus state see the current character
## again (it is not consumed here).
3369 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3370 line => $self->{line_prev},
3371 column => $self->{column_prev} + 1 - length $self->{kwd});
3372 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3373
3374 $self->{ct}->{quirks} = 1;
3375 $self->{state} = BOGUS_DOCTYPE_STATE;
3376 } else {
3377
3378 $self->{state} = BOGUS_MD_STATE;
3379 }
3380 ## Reconsume.
3381 redo A;
3382 }
3383 } elsif ($self->{state} == SYSTEM_STATE) {
## Matches the rest of the keyword "SYSTEM", one character per pass.
## $self->{kwd} holds the letters matched so far exactly as typed, so
## its length selects the next expected letter from the two tables
## below (index 0 is unused because kwd is never empty here).
3384 ## ASCII case-insensitive
3385 if ($self->{nc} == [
3386 undef,
3387 0x0059, # Y
3388 0x0053, # S
3389 0x0054, # T
3390 0x0045, # E
3391 ]->[length $self->{kwd}] or
3392 $self->{nc} == [
3393 undef,
3394 0x0079, # y
3395 0x0073, # s
3396 0x0074, # t
3397 0x0065, # e
3398 ]->[length $self->{kwd}]) {
3399
3400 ## Stay in the state.
3401 $self->{kwd} .= chr $self->{nc};
3402
## Consume the matched letter (buffered, otherwise via set_nc).
3403 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3404 $self->{line_prev} = $self->{line};
3405 $self->{column_prev} = $self->{column};
3406 $self->{column}++;
3407 $self->{nc}
3408 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3409 } else {
3410 $self->{set_nc}->($self);
3411 }
3412
3413 redo A;
3414 } elsif ((length $self->{kwd}) == 5 and
3415 ($self->{nc} == 0x004D or # M
3416 $self->{nc} == 0x006D)) { # m
## Final letter seen.  In XML mode the keyword must be spelled exactly
## "SYSTEM"; any other casing is reported, with the error column
## computed back to the start of the keyword.
3417 if ($self->{is_xml} and
3418 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3419
3420 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3421 text => 'SYSTEM',
3422 line => $self->{line_prev},
3423 column => $self->{column_prev} - 4);
3424 } else {
3425
3426 }
3427 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3428
3429 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3430 $self->{line_prev} = $self->{line};
3431 $self->{column_prev} = $self->{column};
3432 $self->{column}++;
3433 $self->{nc}
3434 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3435 } else {
3436 $self->{set_nc}->($self);
3437 }
3438
3439 redo A;
3440 } else {
## Keyword mismatch: report it at the position where the partial
## keyword began and let the bogus state see the current character
## again (it is not consumed here).
3441 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3442 line => $self->{line_prev},
3443 column => $self->{column_prev} + 1 - length $self->{kwd});
3444 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3445
3446 $self->{ct}->{quirks} = 1;
3447 $self->{state} = BOGUS_DOCTYPE_STATE;
3448 } else {
3449
3450 $self->{state} = BOGUS_MD_STATE;
3451 }
3452 ## Reconsume.
3453 redo A;
3454 }
3455 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
## Reached after the PUBLIC keyword.  Skips whitespace, then expects the
## opening quote of the public identifier:
##   '"' or "'" -> start an empty pubid and enter the quoted state
##   ">"        -> parse error (no literal), token returned
##   -1 (EOF)   -> parse error, token returned, EOF left unconsumed
##   "[" (XML DOCTYPE only) -> parse error, internal subset
##   other      -> parse error, bogus DOCTYPE / bogus declaration state
## $self->{nc} is a numeric code point, so it is compared with "=="
## throughout, like the other state handlers.
## Each "if char_buffer_pos < length char_buffer ... else set_nc" block
## consumes one input character (buffered, otherwise via set_nc).
3456 if ($is_space->{$self->{nc}}) {
3457
3458 ## Stay in the state
3459
3460 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3461 $self->{line_prev} = $self->{line};
3462 $self->{column_prev} = $self->{column};
3463 $self->{column}++;
3464 $self->{nc}
3465 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3466 } else {
3467 $self->{set_nc}->($self);
3468 }
3469
3470 redo A;
3471 } elsif ($self->{nc} == 0x0022) { # "
3472
3473 $self->{ct}->{pubid} = ''; # DOCTYPE
3474 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3475
3476 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3477 $self->{line_prev} = $self->{line};
3478 $self->{column_prev} = $self->{column};
3479 $self->{column}++;
3480 $self->{nc}
3481 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3482 } else {
3483 $self->{set_nc}->($self);
3484 }
3485
3486 redo A;
3487 } elsif ($self->{nc} == 0x0027) { # '
3488
3489 $self->{ct}->{pubid} = ''; # DOCTYPE
3490 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3491
3492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3493 $self->{line_prev} = $self->{line};
3494 $self->{column_prev} = $self->{column};
3495 $self->{column}++;
3496 $self->{nc}
3497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3498 } else {
3499 $self->{set_nc}->($self);
3500 }
3501
3502 redo A;
3503 } elsif ($self->{nc} == 0x003E) { # >
## Closed without any public identifier literal.
3504 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3505
3506 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3507
3508 $self->{state} = DATA_STATE;
3509 $self->{s_kwd} = '';
3510 $self->{ct}->{quirks} = 1;
3511 } else {
3512
3513 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3514 }
3515
3516
3517 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3518 $self->{line_prev} = $self->{line};
3519 $self->{column_prev} = $self->{column};
3520 $self->{column}++;
3521 $self->{nc}
3522 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3523 } else {
3524 $self->{set_nc}->($self);
3525 }
3526
3527 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
## Not reached: the return above leaves the subroutine first.
3528 redo A;
3529 } elsif ($self->{nc} == -1) {
## End of input: report it, pick the follow-up state by token type and
## return the token without consuming the EOF marker.
3530 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3531
3532 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3533 $self->{state} = DATA_STATE;
3534 $self->{s_kwd} = '';
3535 $self->{ct}->{quirks} = 1;
3536 } else {
3537
3538 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3539 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3540 }
3541
3542 ## reconsume
3543 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3544 redo A;
3545 } elsif ($self->{is_xml} and
3546 $self->{ct}->{type} == DOCTYPE_TOKEN and
3547 $self->{nc} == 0x005B) { # [
3548
## XML DOCTYPE only: internal subset opened before any literal.
3549 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3551 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3552 $self->{in_subset} = 1;
3553
3554 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3555 $self->{line_prev} = $self->{line};
3556 $self->{column_prev} = $self->{column};
3557 $self->{column}++;
3558 $self->{nc}
3559 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3560 } else {
3561 $self->{set_nc}->($self);
3562 }
3563
3564 return ($self->{ct}); # DOCTYPE
3565 redo A;
3566 } else {
## Anything else is an error; the rest of the construct is handled by
## the bogus state that matches the token type.
3567 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3568
3569 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3570
3571 $self->{ct}->{quirks} = 1;
3572 $self->{state} = BOGUS_DOCTYPE_STATE;
3573 } else {
3574
3575 $self->{state} = BOGUS_MD_STATE;
3576 }
3577
3578
3579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3580 $self->{line_prev} = $self->{line};
3581 $self->{column_prev} = $self->{column};
3582 $self->{column}++;
3583 $self->{nc}
3584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3585 } else {
3586 $self->{set_nc}->($self);
3587 }
3588
3589 redo A;
3590 }
3591 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
## Inside a double-quoted public identifier.  Characters are appended to
## $self->{ct}->{pubid} until the closing '"'.  A ">" or end of input
## before the closing quote is an error and ends the token early.
## Each "if char_buffer_pos < length char_buffer ... else set_nc" block
## consumes one input character (buffered, otherwise via set_nc).
3592 if ($self->{nc} == 0x0022) { # "
3593
## Closing quote: the identifier is complete.
3594 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3595
3596 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3597 $self->{line_prev} = $self->{line};
3598 $self->{column_prev} = $self->{column};
3599 $self->{column}++;
3600 $self->{nc}
3601 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3602 } else {
3603 $self->{set_nc}->($self);
3604 }
3605
3606 redo A;
3607 } elsif ($self->{nc} == 0x003E) { # >
3608 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3609
3610 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3611
3612 $self->{state} = DATA_STATE;
3613 $self->{s_kwd} = '';
3614 $self->{ct}->{quirks} = 1;
3615 } else {
3616
3617 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3618 }
3619
3620
3621 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3622 $self->{line_prev} = $self->{line};
3623 $self->{column_prev} = $self->{column};
3624 $self->{column}++;
3625 $self->{nc}
3626 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3627 } else {
3628 $self->{set_nc}->($self);
3629 }
3630
3631 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
## Not reached: the return above leaves the subroutine first.
3632 redo A;
3633 } elsif ($self->{nc} == -1) {
## End of input inside the literal; the EOF marker is not consumed.
3634 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3635
3636 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3637
3638 $self->{state} = DATA_STATE;
3639 $self->{s_kwd} = '';
3640 $self->{ct}->{quirks} = 1;
3641 } else {
3642
3643 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3644 }
3645
3646 ## Reconsume.
3647 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3648 redo A;
3649 } else {
3650
## Ordinary character of the identifier.
## NOTE(review): read_until is defined elsewhere; it is called with
## the target string, the delimiter set '">' and the current length,
## presumably to append a run of following non-delimiter characters
## in one step -- confirm against its definition.
3651 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3652 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3653 length $self->{ct}->{pubid});
3654
3655 ## Stay in the state
3656
3657 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3658 $self->{line_prev} = $self->{line};
3659 $self->{column_prev} = $self->{column};
3660 $self->{column}++;
3661 $self->{nc}
3662 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3663 } else {
3664 $self->{set_nc}->($self);
3665 }
3666
3667 redo A;
3668 }
3669 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
## Inside a single-quoted public identifier.  Characters are appended to
## $self->{ct}->{pubid} until the closing "'".  A ">" or end of input
## before the closing quote is an error and ends the token early.
## Each "if char_buffer_pos < length char_buffer ... else set_nc" block
## consumes one input character (buffered, otherwise via set_nc).
3670 if ($self->{nc} == 0x0027) { # '
3671
## Closing quote: the identifier is complete.
3672 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3673
3674 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3675 $self->{line_prev} = $self->{line};
3676 $self->{column_prev} = $self->{column};
3677 $self->{column}++;
3678 $self->{nc}
3679 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3680 } else {
3681 $self->{set_nc}->($self);
3682 }
3683
3684 redo A;
3685 } elsif ($self->{nc} == 0x003E) { # >
3686 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3687
3688 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3689
3690 $self->{state} = DATA_STATE;
3691 $self->{s_kwd} = '';
3692 $self->{ct}->{quirks} = 1;
3693 } else {
3694
3695 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3696 }
3697
3698
3699 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3700 $self->{line_prev} = $self->{line};
3701 $self->{column_prev} = $self->{column};
3702 $self->{column}++;
3703 $self->{nc}
3704 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3705 } else {
3706 $self->{set_nc}->($self);
3707 }
3708
3709 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
## Not reached: the return above leaves the subroutine first.
3710 redo A;
3711 } elsif ($self->{nc} == -1) {
## End of input inside the literal; the EOF marker is not consumed.
3712 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3713
3714 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3715
3716 $self->{state} = DATA_STATE;
3717 $self->{s_kwd} = '';
3718 $self->{ct}->{quirks} = 1;
3719 } else {
3720
3721 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3722 }
3723
3724 ## reconsume
3725 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3726 redo A;
3727 } else {
3728
## Ordinary character of the identifier.
## NOTE(review): read_until is defined elsewhere; it is called with
## the target string, the delimiter set "'>" and the current length,
## presumably to append a run of following non-delimiter characters
## in one step -- confirm against its definition.
3729 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3730 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3731 length $self->{ct}->{pubid});
3732
3733 ## Stay in the state
3734
3735 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3736 $self->{line_prev} = $self->{line};
3737 $self->{column_prev} = $self->{column};
3738 $self->{column}++;
3739 $self->{nc}
3740 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3741 } else {
3742 $self->{set_nc}->($self);
3743 }
3744
3745 redo A;
3746 }
3747 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
## Reached after the closing quote of the public identifier.  Skips
## whitespace, then expects either the opening quote of a system
## identifier or the end of the declaration:
##   '"' or "'" -> start an empty sysid and enter the quoted state
##   ">"        -> token returned (a missing system literal is reported
##                 for XML DOCTYPEs and for non-NOTATION declarations)
##   -1 (EOF)   -> parse error, token returned, EOF left unconsumed
##   "[" (XML DOCTYPE only) -> parse error, internal subset
##   other      -> parse error, bogus DOCTYPE / bogus declaration state
## Each "if char_buffer_pos < length char_buffer ... else set_nc" block
## consumes one input character (buffered, otherwise via set_nc).
3748 if ($is_space->{$self->{nc}}) {
3749
3750 ## Stay in the state
3751
3752 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3753 $self->{line_prev} = $self->{line};
3754 $self->{column_prev} = $self->{column};
3755 $self->{column}++;
3756 $self->{nc}
3757 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3758 } else {
3759 $self->{set_nc}->($self);
3760 }
3761
3762 redo A;
3763 } elsif ($self->{nc} == 0x0022) { # "
3764
3765 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3766 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3767
3768 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3769 $self->{line_prev} = $self->{line};
3770 $self->{column_prev} = $self->{column};
3771 $self->{column}++;
3772 $self->{nc}
3773 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3774 } else {
3775 $self->{set_nc}->($self);
3776 }
3777
3778 redo A;
3779 } elsif ($self->{nc} == 0x0027) { # '
3780
3781 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3782 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3783
3784 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3785 $self->{line_prev} = $self->{line};
3786 $self->{column_prev} = $self->{column};
3787 $self->{column}++;
3788 $self->{nc}
3789 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3790 } else {
3791 $self->{set_nc}->($self);
3792 }
3793
3794 redo A;
3795 } elsif ($self->{nc} == 0x003E) { # >
## Declaration closed with a public identifier only.
3796 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
## For a DOCTYPE this is reported only in XML mode.
3797 if ($self->{is_xml}) {
3798
3799 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3800 } else {
3801
3802 }
3803 $self->{state} = DATA_STATE;
3804 $self->{s_kwd} = '';
3805 } else {
## NOTATION tokens may omit the system literal; other declaration
## tokens are reported.
3806 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3807
3808 } else {
3809
3810 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3811 }
3812 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3813 }
3814
3815
3816 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3817 $self->{line_prev} = $self->{line};
3818 $self->{column_prev} = $self->{column};
3819 $self->{column}++;
3820 $self->{nc}
3821 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3822 } else {
3823 $self->{set_nc}->($self);
3824 }
3825
3826 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
## Not reached: the return above leaves the subroutine first.
3827 redo A;
3828 } elsif ($self->{nc} == -1) {
## End of input: report it, pick the follow-up state by token type and
## return the token without consuming the EOF marker.
3829 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3830
3831 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3832
3833 $self->{state} = DATA_STATE;
3834 $self->{s_kwd} = '';
3835 $self->{ct}->{quirks} = 1;
3836 } else {
3837 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3838 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3839 }
3840
3841 ## reconsume
3842 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3843 redo A;
3844 } elsif ($self->{is_xml} and
3845 $self->{ct}->{type} == DOCTYPE_TOKEN and
3846 $self->{nc} == 0x005B) { # [
3847
## XML DOCTYPE only: internal subset opened without a system literal.
3848 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3849 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3850 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3851 $self->{in_subset} = 1;
3852
3853 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3854 $self->{line_prev} = $self->{line};
3855 $self->{column_prev} = $self->{column};
3856 $self->{column}++;
3857 $self->{nc}
3858 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3859 } else {
3860 $self->{set_nc}->($self);
3861 }
3862
3863 return ($self->{ct}); # DOCTYPE
3864 redo A;
3865 } else {
## Anything else is an error; the rest of the construct is handled by
## the bogus state that matches the token type.
3866 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3867
3868 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3869
3870 $self->{ct}->{quirks} = 1;
3871 $self->{state} = BOGUS_DOCTYPE_STATE;
3872 } else {
3873
3874 $self->{state} = BOGUS_MD_STATE;
3875 }
3876
3877
3878 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3879 $self->{line_prev} = $self->{line};
3880 $self->{column_prev} = $self->{column};
3881 $self->{column}++;
3882 $self->{nc}
3883 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3884 } else {
3885 $self->{set_nc}->($self);
3886 }
3887
3888 redo A;
3889 }
3890 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
## Reached after the SYSTEM keyword.  Skips whitespace, then expects the
## opening quote of the system identifier:
##   '"' or "'" -> start an empty sysid and enter the quoted state
##   ">"        -> parse error (no literal), token returned
##   -1 (EOF)   -> parse error, token returned, EOF left unconsumed
##   "[" (XML DOCTYPE only) -> parse error, internal subset
##   other      -> parse error, bogus DOCTYPE / bogus declaration state
## Each "if char_buffer_pos < length char_buffer ... else set_nc" block
## consumes one input character (buffered, otherwise via set_nc).
3891 if ($is_space->{$self->{nc}}) {
3892
3893 ## Stay in the state
3894
3895 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3896 $self->{line_prev} = $self->{line};
3897 $self->{column_prev} = $self->{column};
3898 $self->{column}++;
3899 $self->{nc}
3900 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3901 } else {
3902 $self->{set_nc}->($self);
3903 }
3904
3905 redo A;
3906 } elsif ($self->{nc} == 0x0022) { # "
3907
3908 $self->{ct}->{sysid} = ''; # DOCTYPE
3909 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3910
3911 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3912 $self->{line_prev} = $self->{line};
3913 $self->{column_prev} = $self->{column};
3914 $self->{column}++;
3915 $self->{nc}
3916 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3917 } else {
3918 $self->{set_nc}->($self);
3919 }
3920
3921 redo A;
3922 } elsif ($self->{nc} == 0x0027) { # '
3923
3924 $self->{ct}->{sysid} = ''; # DOCTYPE
3925 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3926
3927 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3928 $self->{line_prev} = $self->{line};
3929 $self->{column_prev} = $self->{column};
3930 $self->{column}++;
3931 $self->{nc}
3932 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3933 } else {
3934 $self->{set_nc}->($self);
3935 }
3936
3937 redo A;
3938 } elsif ($self->{nc} == 0x003E) { # >
## Closed without any system identifier literal.  Here the ">" is
## consumed before the follow-up state is chosen; the two steps do not
## depend on each other.
3939 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3940
3941 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3942 $self->{line_prev} = $self->{line};
3943 $self->{column_prev} = $self->{column};
3944 $self->{column}++;
3945 $self->{nc}
3946 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3947 } else {
3948 $self->{set_nc}->($self);
3949 }
3950
3951
3952 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3953
3954 $self->{state} = DATA_STATE;
3955 $self->{s_kwd} = '';
3956 $self->{ct}->{quirks} = 1;
3957 } else {
3958
3959 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3960 }
3961
3962 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
## Not reached: the return above leaves the subroutine first.
3963 redo A;
3964 } elsif ($self->{nc} == -1) {
## End of input: report it, pick the follow-up state by token type and
## return the token without consuming the EOF marker.
3965 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3966
3967 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3968 $self->{state} = DATA_STATE;
3969 $self->{s_kwd} = '';
3970 $self->{ct}->{quirks} = 1;
3971 } else {
3972
3973 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3974 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3975 }
3976
3977 ## reconsume
3978 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3979 redo A;
3980 } elsif ($self->{is_xml} and
3981 $self->{ct}->{type} == DOCTYPE_TOKEN and
3982 $self->{nc} == 0x005B) { # [
3983
## XML DOCTYPE only: internal subset opened before any literal.
3984 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3985
3986 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3987 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3988 $self->{in_subset} = 1;
3989
3990 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3991 $self->{line_prev} = $self->{line};
3992 $self->{column_prev} = $self->{column};
3993 $self->{column}++;
3994 $self->{nc}
3995 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3996 } else {
3997 $self->{set_nc}->($self);
3998 }
3999
4000 return ($self->{ct}); # DOCTYPE
4001 redo A;
4002 } else {
## Anything else is an error; the rest of the construct is handled by
## the bogus state that matches the token type.
4003 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4004
4005 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4006
4007 $self->{ct}->{quirks} = 1;
4008 $self->{state} = BOGUS_DOCTYPE_STATE;
4009 } else {
4010
4011 $self->{state} = BOGUS_MD_STATE;
4012 }
4013
4014
4015 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4016 $self->{line_prev} = $self->{line};
4017 $self->{column_prev} = $self->{column};
4018 $self->{column}++;
4019 $self->{nc}
4020 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4021 } else {
4022 $self->{set_nc}->($self);
4023 }
4024
4025 redo A;
4026 }
4027 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
## Inside a double-quoted system identifier.  Characters are appended to
## $self->{ct}->{sysid} until the closing '"'.  Outside XML mode a ">"
## ends the token early with an error; in XML mode ">" falls through to
## the last branch and is kept as part of the identifier.  End of input
## is always an error.
## Each "if char_buffer_pos < length char_buffer ... else set_nc" block
## consumes one input character (buffered, otherwise via set_nc).
4028 if ($self->{nc} == 0x0022) { # "
4029
## Closing quote: the identifier is complete.
4030 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4031
4032 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4033 $self->{line_prev} = $self->{line};
4034 $self->{column_prev} = $self->{column};
4035 $self->{column}++;
4036 $self->{nc}
4037 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4038 } else {
4039 $self->{set_nc}->($self);
4040 }
4041
4042 redo A;
4043 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4044 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4045
4046 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4047
4048 $self->{state} = DATA_STATE;
4049 $self->{s_kwd} = '';
4050 $self->{ct}->{quirks} = 1;
4051 } else {
4052
4053 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4054 }
4055
4056
4057 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4058 $self->{line_prev} = $self->{line};
4059 $self->{column_prev} = $self->{column};
4060 $self->{column}++;
4061 $self->{nc}
4062 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4063 } else {
4064 $self->{set_nc}->($self);
4065 }
4066
4067 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
## Not reached: the return above leaves the subroutine first.
4068 redo A;
4069 } elsif ($self->{nc} == -1) {
## End of input inside the literal; the EOF marker is not consumed.
4070 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4071
4072 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4073
4074 $self->{state} = DATA_STATE;
4075 $self->{s_kwd} = '';
4076 $self->{ct}->{quirks} = 1;
4077 } else {
4078
4079 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4080 }
4081
4082 ## reconsume
4083 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4084 redo A;
4085 } else {
4086
## Ordinary character of the identifier.
## NOTE(review): read_until is defined elsewhere; it is called with
## the target string, the delimiter set '">' and the current length,
## presumably to append a run of following non-delimiter characters
## in one step -- confirm against its definition.
4087 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4088 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4089 length $self->{ct}->{sysid});
4090
4091 ## Stay in the state
4092
4093 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4094 $self->{line_prev} = $self->{line};
4095 $self->{column_prev} = $self->{column};
4096 $self->{column}++;
4097 $self->{nc}
4098 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4099 } else {
4100 $self->{set_nc}->($self);
4101 }
4102
4103 redo A;
4104 }
4105 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
## Inside a single-quoted system identifier.  Characters are appended to
## $self->{ct}->{sysid} until the closing "'".  Outside XML mode a ">"
## ends the token early with an error; in XML mode ">" falls through to
## the last branch and is kept as part of the identifier.  End of input
## is always an error.
## Each "if char_buffer_pos < length char_buffer ... else set_nc" block
## consumes one input character (buffered, otherwise via set_nc).
4106 if ($self->{nc} == 0x0027) { # '
4107
## Closing quote: the identifier is complete.
4108 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4109
4110 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4111 $self->{line_prev} = $self->{line};
4112 $self->{column_prev} = $self->{column};
4113 $self->{column}++;
4114 $self->{nc}
4115 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4116 } else {
4117 $self->{set_nc}->($self);
4118 }
4119
4120 redo A;
4121 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4122
## NOTE(review): unlike the double-quoted counterpart, this branch
## does not test $self->{ct}->{type}; it always goes to DATA_STATE and
## sets quirks.  Presumably harmless because the branch only runs
## outside XML mode -- confirm that non-DOCTYPE tokens cannot occur
## there.
4123 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4124
4125 $self->{state} = DATA_STATE;
4126 $self->{s_kwd} = '';
4127
4128 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4129 $self->{line_prev} = $self->{line};
4130 $self->{column_prev} = $self->{column};
4131 $self->{column}++;
4132 $self->{nc}
4133 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4134 } else {
4135 $self->{set_nc}->($self);
4136 }
4137
4138
4139 $self->{ct}->{quirks} = 1;
4140 return ($self->{ct}); # DOCTYPE
4141
## Not reached: the return above leaves the subroutine first.
4142 redo A;
4143 } elsif ($self->{nc} == -1) {
## End of input inside the literal; the EOF marker is not consumed.
4144 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4145
4146 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4147
4148 $self->{state} = DATA_STATE;
4149 $self->{s_kwd} = '';
4150 $self->{ct}->{quirks} = 1;
4151 } else {
4152
4153 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4154 }
4155
4156 ## reconsume
4157 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4158 redo A;
4159 } else {
4160
## Ordinary character of the identifier.
## NOTE(review): read_until is defined elsewhere; it is called with
## the target string, the delimiter set "'>" and the current length,
## presumably to append a run of following non-delimiter characters
## in one step -- confirm against its definition.
4161 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4162 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4163 length $self->{ct}->{sysid});
4164
4165 ## Stay in the state
4166
4167 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4168 $self->{line_prev} = $self->{line};
4169 $self->{column_prev} = $self->{column};
4170 $self->{column}++;
4171 $self->{nc}
4172 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4173 } else {
4174 $self->{set_nc}->($self);
4175 }
4176
4177 redo A;
4178 }
4179 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4180 if ($is_space->{$self->{nc}}) {
4181 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4182
4183 $self->{state} = BEFORE_NDATA_STATE;
4184 } else {
4185
4186 ## Stay in the state
4187 }
4188
4189 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4190 $self->{line_prev} = $self->{line};
4191 $self->{column_prev} = $self->{column};
4192 $self->{column}++;
4193 $self->{nc}
4194 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4195 } else {
4196 $self->{set_nc}->($self);
4197 }
4198
4199 redo A;
4200 } elsif ($self->{nc} == 0x003E) { # >
4201 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4202
4203 $self->{state} = DATA_STATE;
4204 $self->{s_kwd} = '';
4205 } else {
4206
4207 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4208 }
4209
4210
4211 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4212 $self->{line_prev} = $self->{line};
4213 $self->{column_prev} = $self->{column};
4214 $self->{column}++;
4215 $self->{nc}
4216 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4217 } else {
4218 $self->{set_nc}->($self);
4219 }
4220
4221 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4222 redo A;
4223 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4224 ($self->{nc} == 0x004E or # N
4225 $self->{nc} == 0x006E)) { # n
4226
4227 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4228 $self->{state} = NDATA_STATE;
4229 $self->{kwd} = chr $self->{nc};
4230
4231 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4232 $self->{line_prev} = $self->{line};
4233 $self->{column_prev} = $self->{column};
4234 $self->{column}++;
4235 $self->{nc}
4236 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4237 } else {
4238 $self->{set_nc}->($self);
4239 }
4240
4241 redo A;
4242 } elsif ($self->{nc} == -1) {
4243 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4244
4245 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4246 $self->{state} = DATA_STATE;
4247 $self->{s_kwd} = '';
4248 $self->{ct}->{quirks} = 1;
4249 } else {
4250
4251 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4252 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253 }
4254
4255 ## reconsume
4256 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4257 redo A;
4258 } elsif ($self->{is_xml} and
4259 $self->{ct}->{type} == DOCTYPE_TOKEN and
4260 $self->{nc} == 0x005B) { # [
4261
4262 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4263 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4264 $self->{in_subset} = 1;
4265
4266 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4267 $self->{line_prev} = $self->{line};
4268 $self->{column_prev} = $self->{column};
4269 $self->{column}++;
4270 $self->{nc}
4271 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4272 } else {
4273 $self->{set_nc}->($self);
4274 }
4275
4276 return ($self->{ct}); # DOCTYPE
4277 redo A;
4278 } else {
4279 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4280
4281 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4282
4283 #$self->{ct}->{quirks} = 1;
4284 $self->{state} = BOGUS_DOCTYPE_STATE;
4285 } else {
4286
4287 $self->{state} = BOGUS_MD_STATE;
4288 }
4289
4290
4291 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4292 $self->{line_prev} = $self->{line};
4293 $self->{column_prev} = $self->{column};
4294 $self->{column}++;
4295 $self->{nc}
4296 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4297 } else {
4298 $self->{set_nc}->($self);
4299 }
4300
4301 redo A;
4302 }
4303 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4304 if ($is_space->{$self->{nc}}) {
4305
4306 ## Stay in the state.
4307
4308 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4309 $self->{line_prev} = $self->{line};
4310 $self->{column_prev} = $self->{column};
4311 $self->{column}++;
4312 $self->{nc}
4313 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4314 } else {
4315 $self->{set_nc}->($self);
4316 }
4317
4318 redo A;
4319 } elsif ($self->{nc} == 0x003E) { # >
4320
4321 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4322
4323 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4324 $self->{line_prev} = $self->{line};
4325 $self->{column_prev} = $self->{column};
4326 $self->{column}++;
4327 $self->{nc}
4328 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4329 } else {
4330 $self->{set_nc}->($self);
4331 }
4332
4333 return ($self->{ct}); # ENTITY
4334 redo A;
4335 } elsif ($self->{nc} == 0x004E or # N
4336 $self->{nc} == 0x006E) { # n
4337
4338 $self->{state} = NDATA_STATE;
4339 $self->{kwd} = chr $self->{nc};
4340
4341 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4342 $self->{line_prev} = $self->{line};
4343 $self->{column_prev} = $self->{column};
4344 $self->{column}++;
4345 $self->{nc}
4346 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4347 } else {
4348 $self->{set_nc}->($self);
4349 }
4350
4351 redo A;
4352 } elsif ($self->{nc} == -1) {
4353
4354 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4355 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4356 ## reconsume
4357 return ($self->{ct}); # ENTITY
4358 redo A;
4359 } else {
4360
4361 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4362 $self->{state} = BOGUS_MD_STATE;
4363
4364 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4365 $self->{line_prev} = $self->{line};
4366 $self->{column_prev} = $self->{column};
4367 $self->{column}++;
4368 $self->{nc}
4369 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4370 } else {
4371 $self->{set_nc}->($self);
4372 }
4373
4374 redo A;
4375 }
4376 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4377 if ($self->{nc} == 0x003E) { # >
4378
4379 $self->{state} = DATA_STATE;
4380 $self->{s_kwd} = '';
4381
4382 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4383 $self->{line_prev} = $self->{line};
4384 $self->{column_prev} = $self->{column};
4385 $self->{column}++;
4386 $self->{nc}
4387 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4388 } else {
4389 $self->{set_nc}->($self);
4390 }
4391
4392
4393 return ($self->{ct}); # DOCTYPE
4394
4395 redo A;
4396 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4397
4398 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4399 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4400 $self->{in_subset} = 1;
4401
4402 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4403 $self->{line_prev} = $self->{line};
4404 $self->{column_prev} = $self->{column};
4405 $self->{column}++;
4406 $self->{nc}
4407 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4408 } else {
4409 $self->{set_nc}->($self);
4410 }
4411
4412 return ($self->{ct}); # DOCTYPE
4413 redo A;
4414 } elsif ($self->{nc} == -1) {
4415
4416 $self->{state} = DATA_STATE;
4417 $self->{s_kwd} = '';
4418 ## reconsume
4419
4420 return ($self->{ct}); # DOCTYPE
4421
4422 redo A;
4423 } else {
4424
4425 my $s = '';
4426 $self->{read_until}->($s, q{>[}, 0);
4427
4428 ## Stay in the state
4429
4430 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4431 $self->{line_prev} = $self->{line};
4432 $self->{column_prev} = $self->{column};
4433 $self->{column}++;
4434 $self->{nc}
4435 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4436 } else {
4437 $self->{set_nc}->($self);
4438 }
4439
4440 redo A;
4441 }
4442 } elsif ($self->{state} == CDATA_SECTION_STATE) {
4443 ## NOTE: "CDATA section state" in the spec is jointly implemented
4444 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4445 ## and |CDATA_SECTION_MSE2_STATE|.
4446
4447 ## XML5: "CDATA state".
4448
4449 if ($self->{nc} == 0x005D) { # ]
4450
4451 $self->{state} = CDATA_SECTION_MSE1_STATE;
4452
4453 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4454 $self->{line_prev} = $self->{line};
4455 $self->{column_prev} = $self->{column};
4456 $self->{column}++;
4457 $self->{nc}
4458 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4459 } else {
4460 $self->{set_nc}->($self);
4461 }
4462
4463 redo A;
4464 } elsif ($self->{nc} == -1) {
4465 if ($self->{is_xml}) {
4466
4467 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4468 } else {
4469
4470 }
4471
4472 $self->{state} = DATA_STATE;
4473 $self->{s_kwd} = '';
4474 ## Reconsume.
4475 if (length $self->{ct}->{data}) { # character
4476
4477 return ($self->{ct}); # character
4478 } else {
4479
4480 ## No token to emit. $self->{ct} is discarded.
4481 }
4482 redo A;
4483 } else {
4484
4485 $self->{ct}->{data} .= chr $self->{nc};
4486 $self->{read_until}->($self->{ct}->{data},
4487 q<]>,
4488 length $self->{ct}->{data});
4489
4490 ## Stay in the state.
4491
4492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4493 $self->{line_prev} = $self->{line};
4494 $self->{column_prev} = $self->{column};
4495 $self->{column}++;
4496 $self->{nc}
4497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4498 } else {
4499 $self->{set_nc}->($self);
4500 }
4501
4502 redo A;
4503 }
4504
4505 ## ISSUE: "text tokens" in spec.
4506 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4507 ## XML5: "CDATA bracket state".
4508
4509 if ($self->{nc} == 0x005D) { # ]
4510
4511 $self->{state} = CDATA_SECTION_MSE2_STATE;
4512
4513 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4514 $self->{line_prev} = $self->{line};
4515 $self->{column_prev} = $self->{column};
4516 $self->{column}++;
4517 $self->{nc}
4518 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4519 } else {
4520 $self->{set_nc}->($self);
4521 }
4522
4523 redo A;
4524 } else {
4525
4526 ## XML5: If EOF, "]" is not appended and changed to the data state.
4527 $self->{ct}->{data} .= ']';
4528 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4529 ## Reconsume.
4530 redo A;
4531 }
4532 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4533 ## XML5: "CDATA end state".
4534
4535 if ($self->{nc} == 0x003E) { # >
4536 $self->{state} = DATA_STATE;
4537 $self->{s_kwd} = '';
4538
4539 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4540 $self->{line_prev} = $self->{line};
4541 $self->{column_prev} = $self->{column};
4542 $self->{column}++;
4543 $self->{nc}
4544 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4545 } else {
4546 $self->{set_nc}->($self);
4547 }
4548
4549 if (length $self->{ct}->{data}) { # character
4550
4551 return ($self->{ct}); # character
4552 } else {
4553
4554 ## No token to emit. $self->{ct} is discarded.
4555 }
4556 redo A;
4557 } elsif ($self->{nc} == 0x005D) { # ]
4558 # character
4559 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4560 ## Stay in the state.
4561
4562 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4563 $self->{line_prev} = $self->{line};
4564 $self->{column_prev} = $self->{column};
4565 $self->{column}++;
4566 $self->{nc}
4567 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4568 } else {
4569 $self->{set_nc}->($self);
4570 }
4571
4572 redo A;
4573 } else {
4574
4575 $self->{ct}->{data} .= ']]'; # character
4576 $self->{state} = CDATA_SECTION_STATE;
4577 ## Reconsume. ## XML5: Emit.
4578 redo A;
4579 }
4580 } elsif ($self->{state} == ENTITY_STATE) {
4581 if ($is_space->{$self->{nc}} or
4582 {
4583 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4584 $self->{entity_add} => 1,
4585 }->{$self->{nc}}) {
4586
4587 ## Don't consume
4588 ## No error
4589 ## Return nothing.
4590 #
4591 } elsif ($self->{nc} == 0x0023) { # #
4592
4593 $self->{state} = ENTITY_HASH_STATE;
4594 $self->{kwd} = '#';
4595
4596 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4597 $self->{line_prev} = $self->{line};
4598 $self->{column_prev} = $self->{column};
4599 $self->{column}++;
4600 $self->{nc}
4601 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4602 } else {
4603 $self->{set_nc}->($self);
4604 }
4605
4606 redo A;
4607 } elsif ((0x0041 <= $self->{nc} and
4608 $self->{nc} <= 0x005A) or # A..Z
4609 (0x0061 <= $self->{nc} and
4610 $self->{nc} <= 0x007A)) { # a..z
4611
4612 require Whatpm::_NamedEntityList;
4613 $self->{state} = ENTITY_NAME_STATE;
4614 $self->{kwd} = chr $self->{nc};
4615 $self->{entity__value} = $self->{kwd};
4616 $self->{entity__match} = 0;
4617
4618 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4619 $self->{line_prev} = $self->{line};
4620 $self->{column_prev} = $self->{column};
4621 $self->{column}++;
4622 $self->{nc}
4623 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4624 } else {
4625 $self->{set_nc}->($self);
4626 }
4627
4628 redo A;
4629 } else {
4630
4631 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4632 ## Return nothing.
4633 #
4634 }
4635
4636 ## NOTE: No character is consumed by the "consume a character
4637 ## reference" algorithm. In other words, there is an "&" character
4638 ## that does not introduce a character reference, which would be
4639 ## appended to the parent element or the attribute value in later
4640 ## process of the tokenizer.
4641
4642 if ($self->{prev_state} == DATA_STATE) {
4643
4644 $self->{state} = $self->{prev_state};
4645 $self->{s_kwd} = '';
4646 ## Reconsume.
4647 return ({type => CHARACTER_TOKEN, data => '&',
4648 line => $self->{line_prev},
4649 column => $self->{column_prev},
4650 });
4651 redo A;
4652 } else {
4653
4654 $self->{ca}->{value} .= '&';
4655 $self->{state} = $self->{prev_state};
4656 $self->{s_kwd} = '';
4657 ## Reconsume.
4658 redo A;
4659 }
4660 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4661 if ($self->{nc} == 0x0078 or # x
4662 $self->{nc} == 0x0058) { # X
4663
4664 $self->{state} = HEXREF_X_STATE;
4665 $self->{kwd} .= chr $self->{nc};
4666
4667 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4668 $self->{line_prev} = $self->{line};
4669 $self->{column_prev} = $self->{column};
4670 $self->{column}++;
4671 $self->{nc}
4672 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4673 } else {
4674 $self->{set_nc}->($self);
4675 }
4676
4677 redo A;
4678 } elsif (0x0030 <= $self->{nc} and
4679 $self->{nc} <= 0x0039) { # 0..9
4680
4681 $self->{state} = NCR_NUM_STATE;
4682 $self->{kwd} = $self->{nc} - 0x0030;
4683
4684 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4685 $self->{line_prev} = $self->{line};
4686 $self->{column_prev} = $self->{column};
4687 $self->{column}++;
4688 $self->{nc}
4689 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4690 } else {
4691 $self->{set_nc}->($self);
4692 }
4693
4694 redo A;
4695 } else {
4696 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4697 line => $self->{line_prev},
4698 column => $self->{column_prev} - 1);
4699
4700 ## NOTE: According to the spec algorithm, nothing is returned,
4701 ## and then "&#" is appended to the parent element or the attribute
4702 ## value in the later processing.
4703
4704 if ($self->{prev_state} == DATA_STATE) {
4705
4706 $self->{state} = $self->{prev_state};
4707 $self->{s_kwd} = '';
4708 ## Reconsume.
4709 return ({type => CHARACTER_TOKEN,
4710 data => '&#',
4711 line => $self->{line_prev},
4712 column => $self->{column_prev} - 1,
4713 });
4714 redo A;
4715 } else {
4716
4717 $self->{ca}->{value} .= '&#';
4718 $self->{state} = $self->{prev_state};
4719 $self->{s_kwd} = '';
4720 ## Reconsume.
4721 redo A;
4722 }
4723 }
4724 } elsif ($self->{state} == NCR_NUM_STATE) {
4725 if (0x0030 <= $self->{nc} and
4726 $self->{nc} <= 0x0039) { # 0..9
4727
4728 $self->{kwd} *= 10;
4729 $self->{kwd} += $self->{nc} - 0x0030;
4730
4731 ## Stay in the state.
4732
4733 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4734 $self->{line_prev} = $self->{line};
4735 $self->{column_prev} = $self->{column};
4736 $self->{column}++;
4737 $self->{nc}
4738 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4739 } else {
4740 $self->{set_nc}->($self);
4741 }
4742
4743 redo A;
4744 } elsif ($self->{nc} == 0x003B) { # ;
4745
4746
4747 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4748 $self->{line_prev} = $self->{line};
4749 $self->{column_prev} = $self->{column};
4750 $self->{column}++;
4751 $self->{nc}
4752 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4753 } else {
4754 $self->{set_nc}->($self);
4755 }
4756
4757 #
4758 } else {
4759
4760 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4761 ## Reconsume.
4762 #
4763 }
4764
4765 my $code = $self->{kwd};
4766 my $l = $self->{line_prev};
4767 my $c = $self->{column_prev};
4768 if ($charref_map->{$code}) {
4769
4770 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4771 text => (sprintf 'U+%04X', $code),
4772 line => $l, column => $c);
4773 $code = $charref_map->{$code};
4774 } elsif ($code > 0x10FFFF) {
4775
4776 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4777 text => (sprintf 'U-%08X', $code),
4778 line => $l, column => $c);
4779 $code = 0xFFFD;
4780 }
4781
4782 if ($self->{prev_state} == DATA_STATE) {
4783
4784 $self->{state} = $self->{prev_state};
4785 $self->{s_kwd} = '';
4786 ## Reconsume.
4787 return ({type => CHARACTER_TOKEN, data => chr $code,
4788 has_reference => 1,
4789 line => $l, column => $c,
4790 });
4791 redo A;
4792 } else {
4793
4794 $self->{ca}->{value} .= chr $code;
4795 $self->{ca}->{has_reference} = 1;
4796 $self->{state} = $self->{prev_state};
4797 $self->{s_kwd} = '';
4798 ## Reconsume.
4799 redo A;
4800 }
4801 } elsif ($self->{state} == HEXREF_X_STATE) {
4802 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4803 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4804 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4805 # 0..9, A..F, a..f
4806
4807 $self->{state} = HEXREF_HEX_STATE;
4808 $self->{kwd} = 0;
4809 ## Reconsume.
4810 redo A;
4811 } else {
4812 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4813 line => $self->{line_prev},
4814 column => $self->{column_prev} - 2);
4815
4816 ## NOTE: According to the spec algorithm, nothing is returned,
4817 ## and then "&#" followed by "X" or "x" is appended to the parent
4818 ## element or the attribute value in the later processing.
4819
4820 if ($self->{prev_state} == DATA_STATE) {
4821
4822 $self->{state} = $self->{prev_state};
4823 $self->{s_kwd} = '';
4824 ## Reconsume.
4825 return ({type => CHARACTER_TOKEN,
4826 data => '&' . $self->{kwd},
4827 line => $self->{line_prev},
4828 column => $self->{column_prev} - length $self->{kwd},
4829 });
4830 redo A;
4831 } else {
4832
4833 $self->{ca}->{value} .= '&' . $self->{kwd};
4834 $self->{state} = $self->{prev_state};
4835 $self->{s_kwd} = '';
4836 ## Reconsume.
4837 redo A;
4838 }
4839 }
4840 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4841 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4842 # 0..9
4843
4844 $self->{kwd} *= 0x10;
4845 $self->{kwd} += $self->{nc} - 0x0030;
4846 ## Stay in the state.
4847
4848 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4849 $self->{line_prev} = $self->{line};
4850 $self->{column_prev} = $self->{column};
4851 $self->{column}++;
4852 $self->{nc}
4853 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4854 } else {
4855 $self->{set_nc}->($self);
4856 }
4857
4858 redo A;
4859 } elsif (0x0061 <= $self->{nc} and
4860 $self->{nc} <= 0x0066) { # a..f
4861
4862 $self->{kwd} *= 0x10;
4863 $self->{kwd} += $self->{nc} - 0x0060 + 9;
4864 ## Stay in the state.
4865
4866 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4867 $self->{line_prev} = $self->{line};
4868 $self->{column_prev} = $self->{column};
4869 $self->{column}++;
4870 $self->{nc}
4871 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4872 } else {
4873 $self->{set_nc}->($self);
4874 }
4875
4876 redo A;
4877 } elsif (0x0041 <= $self->{nc} and
4878 $self->{nc} <= 0x0046) { # A..F
4879
4880 $self->{kwd} *= 0x10;
4881 $self->{kwd} += $self->{nc} - 0x0040 + 9;
4882 ## Stay in the state.
4883
4884 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4885 $self->{line_prev} = $self->{line};
4886 $self->{column_prev} = $self->{column};
4887 $self->{column}++;
4888 $self->{nc}
4889 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4890 } else {
4891 $self->{set_nc}->($self);
4892 }
4893
4894 redo A;
4895 } elsif ($self->{nc} == 0x003B) { # ;
4896
4897
4898 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4899 $self->{line_prev} = $self->{line};
4900 $self->{column_prev} = $self->{column};
4901 $self->{column}++;
4902 $self->{nc}
4903 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4904 } else {
4905 $self->{set_nc}->($self);
4906 }
4907
4908 #
4909 } else {
4910
4911 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4912 line => $self->{line},
4913 column => $self->{column});
4914 ## Reconsume.
4915 #
4916 }
4917
4918 my $code = $self->{kwd};
4919 my $l = $self->{line_prev};
4920 my $c = $self->{column_prev};
4921 if ($charref_map->{$code}) {
4922
4923 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4924 text => (sprintf 'U+%04X', $code),
4925 line => $l, column => $c);
4926 $code = $charref_map->{$code};
4927 } elsif ($code > 0x10FFFF) {
4928
4929 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4930 text => (sprintf 'U-%08X', $code),
4931 line => $l, column => $c);
4932 $code = 0xFFFD;
4933 }
4934
4935 if ($self->{prev_state} == DATA_STATE) {
4936
4937 $self->{state} = $self->{prev_state};
4938 $self->{s_kwd} = '';
4939 ## Reconsume.
4940 return ({type => CHARACTER_TOKEN, data => chr $code,
4941 has_reference => 1,
4942 line => $l, column => $c,
4943 });
4944 redo A;
4945 } else {
4946
4947 $self->{ca}->{value} .= chr $code;
4948 $self->{ca}->{has_reference} = 1;
4949 $self->{state} = $self->{prev_state};
4950 $self->{s_kwd} = '';
4951 ## Reconsume.
4952 redo A;
4953 }
4954 } elsif ($self->{state} == ENTITY_NAME_STATE) {
4955 if (length $self->{kwd} < 30 and
4956 ## NOTE: Some number greater than the maximum length of entity name
4957 ((0x0041 <= $self->{nc} and # A
4958 $self->{nc} <= 0x005A) or # Z
4959 (0x0061 <= $self->{nc} and # a
4960 $self->{nc} <= 0x007A) or # z
4961 (0x0030 <= $self->{nc} and # 0
4962 $self->{nc} <= 0x0039) or # 9
4963 $self->{nc} == 0x003B)) { # ;
4964 our $EntityChar;
4965 $self->{kwd} .= chr $self->{nc};
4966 if (defined $EntityChar->{$self->{kwd}}) {
4967 if ($self->{nc} == 0x003B) { # ;
4968
4969 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4970 $self->{entity__match} = 1;
4971
4972 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4973 $self->{line_prev} = $self->{line};
4974 $self->{column_prev} = $self->{column};
4975 $self->{column}++;
4976 $self->{nc}
4977 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4978 } else {
4979 $self->{set_nc}->($self);
4980 }
4981
4982 #
4983 } else {
4984
4985 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4986 $self->{entity__match} = -1;
4987 ## Stay in the state.
4988
4989 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4990 $self->{line_prev} = $self->{line};
4991 $self->{column_prev} = $self->{column};
4992 $self->{column}++;
4993 $self->{nc}
4994 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4995 } else {
4996 $self->{set_nc}->($self);
4997 }
4998
4999 redo A;
5000 }
5001 } else {
5002
5003 $self->{entity__value} .= chr $self->{nc};
5004 $self->{entity__match} *= 2;
5005 ## Stay in the state.
5006
5007 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5008 $self->{line_prev} = $self->{line};
5009 $self->{column_prev} = $self->{column};
5010 $self->{column}++;
5011 $self->{nc}
5012 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5013 } else {
5014 $self->{set_nc}->($self);
5015 }
5016
5017 redo A;
5018 }
5019 }
5020
5021 my $data;
5022 my $has_ref;
5023 if ($self->{entity__match} > 0) {
5024
5025 $data = $self->{entity__value};
5026 $has_ref = 1;
5027 #
5028 } elsif ($self->{entity__match} < 0) {
5029 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5030 if ($self->{prev_state} != DATA_STATE and # in attribute
5031 $self->{entity__match} < -1) {
5032
5033 $data = '&' . $self->{kwd};
5034 #
5035 } else {
5036
5037 $data = $self->{entity__value};
5038 $has_ref = 1;
5039 #
5040 }
5041 } else {
5042
5043 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5044 line => $self->{line_prev},
5045 column => $self->{column_prev} - length $self->{kwd});
5046 $data = '&' . $self->{kwd};
5047 #
5048 }
5049
5050 ## NOTE: In these cases, when a character reference is found,
5051 ## it is consumed and a character token is returned, or, otherwise,
5052 ## nothing is consumed and returned, according to the spec algorithm.
5053 ## In this implementation, anything that has been examined by the
5054 ## tokenizer is appended to the parent element or the attribute value
5055 ## as string, either literal string when no character reference or
5056 ## entity-replaced string otherwise, in this stage, since any characters
5057 ## that would not be consumed are appended in the data state or in an
5058 ## appropriate attribute value state anyway.
5059
5060 if ($self->{prev_state} == DATA_STATE) {
5061
5062 $self->{state} = $self->{prev_state};
5063 $self->{s_kwd} = '';
5064 ## Reconsume.
5065 return ({type => CHARACTER_TOKEN,
5066 data => $data,
5067 has_reference => $has_ref,
5068 line => $self->{line_prev},
5069 column => $self->{column_prev} + 1 - length $self->{kwd},
5070 });
5071 redo A;
5072 } else {
5073
5074 $self->{ca}->{value} .= $data;
5075 $self->{ca}->{has_reference} = 1 if $has_ref;
5076 $self->{state} = $self->{prev_state};
5077 $self->{s_kwd} = '';
5078 ## Reconsume.
5079 redo A;
5080 }
5081
5082 ## XML-only states
5083
5084 } elsif ($self->{state} == PI_STATE) {
5085 ## XML5: "Pi state" and "DOCTYPE pi state".
5086
5087 if ($is_space->{$self->{nc}} or
5088 $self->{nc} == 0x003F or # ?
5089 $self->{nc} == -1) {
5090 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5091 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5092 ## "DOCTYPE pi state": Parse error, switch to the "data
5093 ## state".
5094 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5095 line => $self->{line_prev},
5096 column => $self->{column_prev}
5097 - 1 * ($self->{nc} != -1));
5098 $self->{state} = BOGUS_COMMENT_STATE;
5099 ## Reconsume.
5100 $self->{ct} = {type => COMMENT_TOKEN,
5101 data => '?',
5102 line => $self->{line_prev},
5103 column => $self->{column_prev}
5104 - 1 * ($self->{nc} != -1),
5105 };
5106 redo A;
5107 } else {
5108 ## XML5: "DOCTYPE pi state": Stay in the state.
5109 $self->{ct} = {type => PI_TOKEN,
5110 target => chr $self->{nc},
5111 data => '',
5112 line => $self->{line_prev},
5113 column => $self->{column_prev} - 1,
5114 };
5115 $self->{state} = PI_TARGET_STATE;
5116
5117 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5118 $self->{line_prev} = $self->{line};
5119 $self->{column_prev} = $self->{column};
5120 $self->{column}++;
5121 $self->{nc}
5122 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5123 } else {
5124 $self->{set_nc}->($self);
5125 }
5126
5127 redo A;
5128 }
5129 } elsif ($self->{state} == PI_TARGET_STATE) {
5130 if ($is_space->{$self->{nc}}) {
5131 $self->{state} = PI_TARGET_AFTER_STATE;
5132
5133 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5134 $self->{line_prev} = $self->{line};
5135 $self->{column_prev} = $self->{column};
5136 $self->{column}++;
5137 $self->{nc}
5138 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5139 } else {
5140 $self->{set_nc}->($self);
5141 }
5142
5143 redo A;
5144 } elsif ($self->{nc} == -1) {
5145 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5146 if ($self->{in_subset}) {
5147 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5148 } else {
5149 $self->{state} = DATA_STATE;
5150 $self->{s_kwd} = '';
5151 }
5152 ## Reconsume.
5153 return ($self->{ct}); # pi
5154 redo A;
5155 } elsif ($self->{nc} == 0x003F) { # ?
5156 $self->{state} = PI_AFTER_STATE;
5157
5158 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5159 $self->{line_prev} = $self->{line};
5160 $self->{column_prev} = $self->{column};
5161 $self->{column}++;
5162 $self->{nc}
5163 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5164 } else {
5165 $self->{set_nc}->($self);
5166 }
5167
5168 redo A;
5169 } else {
5170 ## XML5: typo ("tag name" -> "target")
5171 $self->{ct}->{target} .= chr $self->{nc}; # pi
5172
5173 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5174 $self->{line_prev} = $self->{line};
5175 $self->{column_prev} = $self->{column};
5176 $self->{column}++;
5177 $self->{nc}
5178 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5179 } else {
5180 $self->{set_nc}->($self);
5181 }
5182
5183 redo A;
5184 }
5185 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5186 if ($is_space->{$self->{nc}}) {
5187 ## Stay in the state.
5188
5189 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5190 $self->{line_prev} = $self->{line};
5191 $self->{column_prev} = $self->{column};
5192 $self->{column}++;
5193 $self->{nc}
5194 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5195 } else {
5196 $self->{set_nc}->($self);
5197 }
5198
5199 redo A;
5200 } else {
5201 $self->{state} = PI_DATA_STATE;
5202 ## Reprocess.
5203 redo A;
5204 }
5205 } elsif ($self->{state} == PI_DATA_STATE) {
5206 if ($self->{nc} == 0x003F) { # ?
5207 $self->{state} = PI_DATA_AFTER_STATE;
5208
5209 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5210 $self->{line_prev} = $self->{line};
5211 $self->{column_prev} = $self->{column};
5212 $self->{column}++;
5213 $self->{nc}
5214 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5215 } else {
5216 $self->{set_nc}->($self);
5217 }
5218
5219 redo A;
5220 } elsif ($self->{nc} == -1) {
5221 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5222 if ($self->{in_subset}) {
5223 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5224 } else {
5225 $self->{state} = DATA_STATE;
5226 $self->{s_kwd} = '';
5227 }
5228 ## Reprocess.
5229 return ($self->{ct}); # pi
5230 redo A;
5231 } else {
5232 $self->{ct}->{data} .= chr $self->{nc}; # pi
5233 $self->{read_until}->($self->{ct}->{data}, q[?],
5234 length $self->{ct}->{data});
5235 ## Stay in the state.
5236
5237 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5238 $self->{line_prev} = $self->{line};
5239 $self->{column_prev} = $self->{column};
5240 $self->{column}++;
5241 $self->{nc}
5242 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5243 } else {
5244 $self->{set_nc}->($self);
5245 }
5246
5247 ## Reprocess.
5248 redo A;
5249 }
5250 } elsif ($self->{state} == PI_AFTER_STATE) {
5251 ## XML5: Part of "Pi after state".
5252
5253 if ($self->{nc} == 0x003E) { # >
5254 if ($self->{in_subset}) {
5255 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5256 } else {
5257 $self->{state} = DATA_STATE;
5258 $self->{s_kwd} = '';
5259 }
5260
5261 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5262 $self->{line_prev} = $self->{line};
5263 $self->{column_prev} = $self->{column};
5264 $self->{column}++;
5265 $self->{nc}
5266 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5267 } else {
5268 $self->{set_nc}->($self);
5269 }
5270
5271 return ($self->{ct}); # pi
5272 redo A;
5273 } elsif ($self->{nc} == 0x003F) { # ?
5274 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5275 line => $self->{line_prev},
5276 column => $self->{column_prev}); ## XML5: no error
5277 $self->{ct}->{data} .= '?';
5278 $self->{state} = PI_DATA_AFTER_STATE;
5279
5280 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5281 $self->{line_prev} = $self->{line};
5282 $self->{column_prev} = $self->{column};
5283 $self->{column}++;
5284 $self->{nc}
5285 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5286 } else {
5287 $self->{set_nc}->($self);
5288 }
5289
5290 redo A;
5291 } else {
5292 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5293 line => $self->{line_prev},
5294 column => $self->{column_prev}
5295 + 1 * ($self->{nc} == -1)); ## XML5: no error
5296 $self->{ct}->{data} .= '?'; ## XML5: not appended
5297 $self->{state} = PI_DATA_STATE;
5298 ## Reprocess.
5299 redo A;
5300 }
5301 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5302 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5303
5304 if ($self->{nc} == 0x003E) { # >
5305 if ($self->{in_subset}) {
5306 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5307 } else {
5308 $self->{state} = DATA_STATE;
5309 $self->{s_kwd} = '';
5310 }
5311
5312 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5313 $self->{line_prev} = $self->{line};
5314 $self->{column_prev} = $self->{column};
5315 $self->{column}++;
5316 $self->{nc}
5317 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5318 } else {
5319 $self->{set_nc}->($self);
5320 }
5321
5322 return ($self->{ct}); # pi
5323 redo A;
5324 } elsif ($self->{nc} == 0x003F) { # ?
5325 $self->{ct}->{data} .= '?';
5326 ## Stay in the state.
5327
5328 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5329 $self->{line_prev} = $self->{line};
5330 $self->{column_prev} = $self->{column};
5331 $self->{column}++;
5332 $self->{nc}
5333 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5334 } else {
5335 $self->{set_nc}->($self);
5336 }
5337
5338 redo A;
5339 } else {
5340 $self->{ct}->{data} .= '?'; ## XML5: not appended
5341 $self->{state} = PI_DATA_STATE;
5342 ## Reprocess.
5343 redo A;
5344 }
5345
5346 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5347 if ($self->{nc} == 0x003C) { # <
5348 $self->{state} = DOCTYPE_TAG_STATE;
5349
5350 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5351 $self->{line_prev} = $self->{line};
5352 $self->{column_prev} = $self->{column};
5353 $self->{column}++;
5354 $self->{nc}
5355 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5356 } else {
5357 $self->{set_nc}->($self);
5358 }
5359
5360 redo A;
5361 } elsif ($self->{nc} == 0x0025) { # %
5362 ## XML5: Not defined yet.
5363
5364 ## TODO:
5365
5366 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5367 $self->{line_prev} = $self->{line};
5368 $self->{column_prev} = $self->{column};
5369 $self->{column}++;
5370 $self->{nc}
5371 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5372 } else {
5373 $self->{set_nc}->($self);
5374 }
5375
5376 redo A;
5377 } elsif ($self->{nc} == 0x005D) { # ]
5378 delete $self->{in_subset};
5379 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5380
5381 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5382 $self->{line_prev} = $self->{line};
5383 $self->{column_prev} = $self->{column};
5384 $self->{column}++;
5385 $self->{nc}
5386 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5387 } else {
5388 $self->{set_nc}->($self);
5389 }
5390
5391 redo A;
5392 } elsif ($is_space->{$self->{nc}}) {
5393 ## Stay in the state.
5394
5395 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5396 $self->{line_prev} = $self->{line};
5397 $self->{column_prev} = $self->{column};
5398 $self->{column}++;
5399 $self->{nc}
5400 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5401 } else {
5402 $self->{set_nc}->($self);
5403 }
5404
5405 redo A;
5406 } elsif ($self->{nc} == -1) {
5407 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5408 delete $self->{in_subset};
5409 $self->{state} = DATA_STATE;
5410 $self->{s_kwd} = '';
5411 ## Reconsume.
5412 return ({type => END_OF_DOCTYPE_TOKEN});
5413 redo A;
5414 } else {
5415 unless ($self->{internal_subset_tainted}) {
5416 ## XML5: No parse error.
5417 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5418 $self->{internal_subset_tainted} = 1;
5419 }
5420 ## Stay in the state.
5421
5422 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5423 $self->{line_prev} = $self->{line};
5424 $self->{column_prev} = $self->{column};
5425 $self->{column}++;
5426 $self->{nc}
5427 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5428 } else {
5429 $self->{set_nc}->($self);
5430 }
5431
5432 redo A;
5433 }
5434 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5435 if ($self->{nc} == 0x003E) { # >
5436 $self->{state} = DATA_STATE;
5437 $self->{s_kwd} = '';
5438
5439 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5440 $self->{line_prev} = $self->{line};
5441 $self->{column_prev} = $self->{column};
5442 $self->{column}++;
5443 $self->{nc}
5444 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5445 } else {
5446 $self->{set_nc}->($self);
5447 }
5448
5449 return ({type => END_OF_DOCTYPE_TOKEN});
5450 redo A;
5451 } elsif ($self->{nc} == -1) {
5452 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5453 $self->{state} = DATA_STATE;
5454 $self->{s_kwd} = '';
5455 ## Reconsume.
5456 return ({type => END_OF_DOCTYPE_TOKEN});
5457 redo A;
5458 } else {
5459 ## XML5: No parse error and stay in the state.
5460 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5461
5462 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5463
5464 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5465 $self->{line_prev} = $self->{line};
5466 $self->{column_prev} = $self->{column};
5467 $self->{column}++;
5468 $self->{nc}
5469 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5470 } else {
5471 $self->{set_nc}->($self);
5472 }
5473
5474 redo A;
5475 }
5476 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5477 if ($self->{nc} == 0x003E) { # >
5478 $self->{state} = DATA_STATE;
5479 $self->{s_kwd} = '';
5480
5481 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5482 $self->{line_prev} = $self->{line};
5483 $self->{column_prev} = $self->{column};
5484 $self->{column}++;
5485 $self->{nc}
5486 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5487 } else {
5488 $self->{set_nc}->($self);
5489 }
5490
5491 return ({type => END_OF_DOCTYPE_TOKEN});
5492 redo A;
5493 } elsif ($self->{nc} == -1) {
5494 $self->{state} = DATA_STATE;
5495 $self->{s_kwd} = '';
5496 ## Reconsume.
5497 return ({type => END_OF_DOCTYPE_TOKEN});
5498 redo A;
5499 } else {
5500 ## Stay in the state.
5501
5502 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5503 $self->{line_prev} = $self->{line};
5504 $self->{column_prev} = $self->{column};
5505 $self->{column}++;
5506 $self->{nc}
5507 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5508 } else {
5509 $self->{set_nc}->($self);
5510 }
5511
5512 redo A;
5513 }
5514 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5515 if ($self->{nc} == 0x0021) { # !
5516 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5517
5518 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5519 $self->{line_prev} = $self->{line};
5520 $self->{column_prev} = $self->{column};
5521 $self->{column}++;
5522 $self->{nc}
5523 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5524 } else {
5525 $self->{set_nc}->($self);
5526 }
5527
5528 redo A;
5529 } elsif ($self->{nc} == 0x003F) { # ?
5530 $self->{state} = PI_STATE;
5531
5532 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5533 $self->{line_prev} = $self->{line};
5534 $self->{column_prev} = $self->{column};
5535 $self->{column}++;
5536 $self->{nc}
5537 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5538 } else {
5539 $self->{set_nc}->($self);
5540 }
5541
5542 redo A;
5543 } elsif ($self->{nc} == -1) {
5544 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5545 $self->{state} = DATA_STATE;
5546 $self->{s_kwd} = '';
5547 ## Reconsume.
5548 redo A;
5549 } else {
5550 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5551 line => $self->{line_prev},
5552 column => $self->{column_prev});
5553 $self->{state} = BOGUS_COMMENT_STATE;
5554 $self->{ct} = {type => COMMENT_TOKEN,
5555 data => '',
5556 }; ## NOTE: Will be discarded.
5557
5558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5559 $self->{line_prev} = $self->{line};
5560 $self->{column_prev} = $self->{column};
5561 $self->{column}++;
5562 $self->{nc}
5563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5564 } else {
5565 $self->{set_nc}->($self);
5566 }
5567
5568 redo A;
5569 }
5570 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5571 ## XML5: "DOCTYPE markup declaration state".
5572
5573 if ($self->{nc} == 0x002D) { # -
5574 $self->{state} = MD_HYPHEN_STATE;
5575
5576 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5577 $self->{line_prev} = $self->{line};
5578 $self->{column_prev} = $self->{column};
5579 $self->{column}++;
5580 $self->{nc}
5581 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5582 } else {
5583 $self->{set_nc}->($self);
5584 }
5585
5586 redo A;
5587 } elsif ($self->{nc} == 0x0045 or # E
5588 $self->{nc} == 0x0065) { # e
5589 $self->{state} = MD_E_STATE;
5590 $self->{kwd} = chr $self->{nc};
5591
5592 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5593 $self->{line_prev} = $self->{line};
5594 $self->{column_prev} = $self->{column};
5595 $self->{column}++;
5596 $self->{nc}
5597 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5598 } else {
5599 $self->{set_nc}->($self);
5600 }
5601
5602 redo A;
5603 } elsif ($self->{nc} == 0x0041 or # A
5604 $self->{nc} == 0x0061) { # a
5605 $self->{state} = MD_ATTLIST_STATE;
5606 $self->{kwd} = chr $self->{nc};
5607
5608 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5609 $self->{line_prev} = $self->{line};
5610 $self->{column_prev} = $self->{column};
5611 $self->{column}++;
5612 $self->{nc}
5613 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5614 } else {
5615 $self->{set_nc}->($self);
5616 }
5617
5618 redo A;
5619 } elsif ($self->{nc} == 0x004E or # N
5620 $self->{nc} == 0x006E) { # n
5621 $self->{state} = MD_NOTATION_STATE;
5622 $self->{kwd} = chr $self->{nc};
5623
5624 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5625 $self->{line_prev} = $self->{line};
5626 $self->{column_prev} = $self->{column};
5627 $self->{column}++;
5628 $self->{nc}
5629 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5630 } else {
5631 $self->{set_nc}->($self);
5632 }
5633
5634 redo A;
5635 } else {
5636 #
5637 }
5638
5639 ## XML5: No parse error.
5640 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5641 line => $self->{line_prev},
5642 column => $self->{column_prev} - 1);
5643 ## Reconsume.
5644 $self->{state} = BOGUS_COMMENT_STATE;
5645 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5646 redo A;
5647 } elsif ($self->{state} == MD_E_STATE) {
5648 if ($self->{nc} == 0x004E or # N
5649 $self->{nc} == 0x006E) { # n
5650 $self->{state} = MD_ENTITY_STATE;
5651 $self->{kwd} .= chr $self->{nc};
5652
5653 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5654 $self->{line_prev} = $self->{line};
5655 $self->{column_prev} = $self->{column};
5656 $self->{column}++;
5657 $self->{nc}
5658 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5659 } else {
5660 $self->{set_nc}->($self);
5661 }
5662
5663 redo A;
5664 } elsif ($self->{nc} == 0x004C or # L
5665 $self->{nc} == 0x006C) { # l
5666 ## XML5: <!ELEMENT> not supported.
5667 $self->{state} = MD_ELEMENT_STATE;
5668 $self->{kwd} .= chr $self->{nc};
5669
5670 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5671 $self->{line_prev} = $self->{line};
5672 $self->{column_prev} = $self->{column};
5673 $self->{column}++;
5674 $self->{nc}
5675 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5676 } else {
5677 $self->{set_nc}->($self);
5678 }
5679
5680 redo A;
5681 } else {
5682 ## XML5: No parse error.
5683 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5684 line => $self->{line_prev},
5685 column => $self->{column_prev} - 2
5686 + 1 * ($self->{nc} == -1));
5687 ## Reconsume.
5688 $self->{state} = BOGUS_COMMENT_STATE;
5689 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5690 redo A;
5691 }
5692 } elsif ($self->{state} == MD_ENTITY_STATE) {
5693 if ($self->{nc} == [
5694 undef,
5695 undef,
5696 0x0054, # T
5697 0x0049, # I
5698 0x0054, # T
5699 ]->[length $self->{kwd}] or
5700 $self->{nc} == [
5701 undef,
5702 undef,
5703 0x0074, # t
5704 0x0069, # i
5705 0x0074, # t
5706 ]->[length $self->{kwd}]) {
5707 ## Stay in the state.
5708 $self->{kwd} .= chr $self->{nc};
5709
5710 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5711 $self->{line_prev} = $self->{line};
5712 $self->{column_prev} = $self->{column};
5713 $self->{column}++;
5714 $self->{nc}
5715 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5716 } else {
5717 $self->{set_nc}->($self);
5718 }
5719
5720 redo A;
5721 } elsif ((length $self->{kwd}) == 5 and
5722 ($self->{nc} == 0x0059 or # Y
5723 $self->{nc} == 0x0079)) { # y
5724 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5725 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5726 text => 'ENTITY',
5727 line => $self->{line_prev},
5728 column => $self->{column_prev} - 4);
5729 }
5730 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5731 line => $self->{line_prev},
5732 column => $self->{column_prev} - 6};
5733 $self->{state} = DOCTYPE_MD_STATE;
5734
5735 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5736 $self->{line_prev} = $self->{line};
5737 $self->{column_prev} = $self->{column};
5738 $self->{column}++;
5739 $self->{nc}
5740 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5741 } else {
5742 $self->{set_nc}->($self);
5743 }
5744
5745 redo A;
5746 } else {
5747 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5748 line => $self->{line_prev},
5749 column => $self->{column_prev} - 1
5750 - (length $self->{kwd})
5751 + 1 * ($self->{nc} == -1));
5752 $self->{state} = BOGUS_COMMENT_STATE;
5753 ## Reconsume.
5754 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5755 redo A;
5756 }
5757 } elsif ($self->{state} == MD_ELEMENT_STATE) {
## MD_ELEMENT_STATE: match the remainder of the "ELEMENT" keyword.
##
## |$self->{kwd}| holds the keyword characters accepted so far ("EL",
## in either case, when this state is first entered), so its length is
## used as the index of the next expected character in the two lookup
## tables below (upper-case and lower-case variants).
if ($self->{nc} == [
      undef,
      undef,
      0x0045, # E
      0x004D, # M
      0x0045, # E
      0x004E, # N
    ]->[length $self->{kwd}] or
    $self->{nc} == [
      undef,
      undef,
      0x0065, # e
      0x006D, # m
      0x0065, # e
      0x006E, # n
    ]->[length $self->{kwd}]) {
  ## Stay in the state.
  $self->{kwd} .= chr $self->{nc};

  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ((length $self->{kwd}) == 6 and
         ($self->{nc} == 0x0054 or # T
          $self->{nc} == 0x0074)) { # t
  ## Final letter of the keyword.  Anything other than the exact
  ## upper-case spelling is reported, but the declaration is still
  ## accepted.
  if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
                           text => 'ELEMENT',
                           line => $self->{line_prev},
                           column => $self->{column_prev} - 5);
  }

  ## Token position: start of the declaration.  |kwd| is six
  ## characters long here, so the offset from |column_prev| is
  ## 1 + 6 = 7, the same formula the "bogus comment" error below uses
  ## (|column_prev - 1 - length kwd|).  The previous value of 6 was
  ## only correct for the six-letter "ENTITY" keyword.
  $self->{ct} = {type => ELEMENT_TOKEN, name => '',
                 line => $self->{line_prev},
                 column => $self->{column_prev} - 7};
  $self->{state} = DOCTYPE_MD_STATE;

  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} else {
  ## Not the "ELEMENT" keyword: treat the construct as a bogus comment.
  ## The column is shifted by one at end of input (|nc == -1|).
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 1
                             - (length $self->{kwd})
                             + 1 * ($self->{nc} == -1));
  $self->{state} = BOGUS_COMMENT_STATE;
  ## Reconsume.
  $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
  redo A;
}
5824 } elsif ($self->{state} == MD_ATTLIST_STATE) {
## MD_ATTLIST_STATE: match the remainder of the "ATTLIST" keyword.
##
## |$self->{kwd}| holds the keyword characters accepted so far (a
## single "A" or "a" when this state is first entered), so its length
## is used as the index of the next expected character in the two
## lookup tables below (upper-case and lower-case variants).
if ($self->{nc} == [
      undef,
      0x0054, # T
      0x0054, # T
      0x004C, # L
      0x0049, # I
      0x0053, # S
    ]->[length $self->{kwd}] or
    $self->{nc} == [
      undef,
      0x0074, # t
      0x0074, # t
      0x006C, # l
      0x0069, # i
      0x0073, # s
    ]->[length $self->{kwd}]) {
  ## Stay in the state.
  $self->{kwd} .= chr $self->{nc};

  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ((length $self->{kwd}) == 6 and
         ($self->{nc} == 0x0054 or # T
          $self->{nc} == 0x0074)) { # t
  ## Final letter of the keyword.  Anything other than the exact
  ## upper-case spelling is reported, but the declaration is still
  ## accepted.
  if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
                           text => 'ATTLIST',
                           line => $self->{line_prev},
                           column => $self->{column_prev} - 5);
  }

  ## Token position: start of the declaration.  |kwd| is six
  ## characters long here, so the offset from |column_prev| is
  ## 1 + 6 = 7, the same formula the "bogus comment" error below uses
  ## (|column_prev - 1 - length kwd|).  The previous value of 6 was
  ## only correct for the six-letter "ENTITY" keyword.
  $self->{ct} = {type => ATTLIST_TOKEN, name => '',
                 attrdefs => [],
                 line => $self->{line_prev},
                 column => $self->{column_prev} - 7};
  $self->{state} = DOCTYPE_MD_STATE;

  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} else {
  ## Not the "ATTLIST" keyword: treat the construct as a bogus comment.
  ## The column is shifted by one at end of input (|nc == -1|).
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 1
                             - (length $self->{kwd})
                             + 1 * ($self->{nc} == -1));
  $self->{state} = BOGUS_COMMENT_STATE;
  ## Reconsume.
  $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
  redo A;
}
5892 } elsif ($self->{state} == MD_NOTATION_STATE) {
## MD_NOTATION_STATE: match the remainder of the "NOTATION" keyword.
##
## |$self->{kwd}| holds the keyword characters accepted so far (a
## single "N" or "n" when this state is first entered), so its length
## is used as the index of the next expected character in the two
## lookup tables below (upper-case and lower-case variants).
if ($self->{nc} == [
      undef,
      0x004F, # O
      0x0054, # T
      0x0041, # A
      0x0054, # T
      0x0049, # I
      0x004F, # O
    ]->[length $self->{kwd}] or
    $self->{nc} == [
      undef,
      0x006F, # o
      0x0074, # t
      0x0061, # a
      0x0074, # t
      0x0069, # i
      0x006F, # o
    ]->[length $self->{kwd}]) {
  ## Stay in the state.
  $self->{kwd} .= chr $self->{nc};

  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} elsif ((length $self->{kwd}) == 7 and
         ($self->{nc} == 0x004E or # N
          $self->{nc} == 0x006E)) { # n
  ## Final letter of the keyword.  Anything other than the exact
  ## upper-case spelling is reported, but the declaration is still
  ## accepted.
  if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
                           text => 'NOTATION',
                           line => $self->{line_prev},
                           column => $self->{column_prev} - 6);
  }

  ## Token position: start of the declaration.  |kwd| is seven
  ## characters long here, so the offset from |column_prev| is
  ## 1 + 7 = 8, the same formula the "bogus comment" error below uses
  ## (|column_prev - 1 - length kwd|).  The previous value of 6 was
  ## only correct for the six-letter "ENTITY" keyword.
  $self->{ct} = {type => NOTATION_TOKEN, name => '',
                 line => $self->{line_prev},
                 column => $self->{column_prev} - 8};
  $self->{state} = DOCTYPE_MD_STATE;

  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }

  redo A;
} else {
  ## Not the "NOTATION" keyword: treat the construct as a bogus comment.
  ## The column is shifted by one at end of input (|nc == -1|).
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 1
                             - (length $self->{kwd})
                             + 1 * ($self->{nc} == -1));
  $self->{state} = BOGUS_COMMENT_STATE;
  ## Reconsume.
  $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
  redo A;
}
5961 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
5962 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
5963 ## "DOCTYPE NOTATION state".
5964
5965 if ($is_space->{$self->{nc}}) {
5966 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
5967 $self->{state} = BEFORE_MD_NAME_STATE;
5968
5969 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5970 $self->{line_prev} = $self->{line};
5971 $self->{column_prev} = $self->{column};
5972 $self->{column}++;
5973 $self->{nc}
5974 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5975 } else {
5976 $self->{set_nc}->($self);
5977 }
5978
5979 redo A;
5980 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5981 $self->{nc} == 0x0025) { # %
5982 ## XML5: Switch to the "DOCTYPE bogus comment state".
5983 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5984 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5985
5986 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5987 $self->{line_prev} = $self->{line};
5988 $self->{column_prev} = $self->{column};
5989 $self->{column}++;
5990 $self->{nc}
5991 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5992 } else {
5993 $self->{set_nc}->($self);
5994 }
5995
5996 redo A;
5997 } elsif ($self->{nc} == -1) {
5998 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5999 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6000 ## Reconsume.
6001 redo A;
6002 } elsif ($self->{nc} == 0x003E) { # >
6003 ## XML5: Switch to the "DOCTYPE bogus comment state".
6004 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6005 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6006
6007 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6008 $self->{line_prev} = $self->{line};
6009 $self->{column_prev} = $self->{column};
6010 $self->{column}++;
6011 $self->{nc}
6012 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6013 } else {
6014 $self->{set_nc}->($self);
6015 }
6016
6017 redo A;
6018 } else {
6019 ## XML5: Switch to the "DOCTYPE bogus comment state".
6020 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6021 $self->{state} = BEFORE_MD_NAME_STATE;
6022 redo A;
6023 }
6024 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6025 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6026 ## before state", "DOCTYPE ATTLIST name before state".
6027
6028 if ($is_space->{$self->{nc}}) {
6029 ## Stay in the state.
6030
6031 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6032 $self->{line_prev} = $self->{line};
6033 $self->{column_prev} = $self->{column};
6034 $self->{column}++;
6035 $self->{nc}
6036 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6037 } else {
6038 $self->{set_nc}->($self);
6039 }
6040
6041 redo A;
6042 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6043 $self->{nc} == 0x0025) { # %
6044 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6045
6046 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6047 $self->{line_prev} = $self->{line};
6048 $self->{column_prev} = $self->{column};
6049 $self->{column}++;
6050 $self->{nc}
6051 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6052 } else {
6053 $self->{set_nc}->($self);
6054 }
6055
6056 redo A;
6057 } elsif ($self->{nc} == 0x003E) { # >
6058 ## XML5: Same as "Anything else".
6059 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6060 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6061
6062 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6063 $self->{line_prev} = $self->{line};
6064 $self->{column_prev} = $self->{column};
6065 $self->{column}++;
6066 $self->{nc}
6067 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6068 } else {
6069 $self->{set_nc}->($self);
6070 }
6071
6072 redo A;
6073 } elsif ($self->{nc} == -1) {
6074 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6075 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6076 ## Reconsume.
6077 redo A;
6078 } else {
6079 ## XML5: [ATTLIST] Not defined yet.
6080 $self->{ct}->{name} .= chr $self->{nc};
6081 $self->{state} = MD_NAME_STATE;
6082
6083 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6084 $self->{line_prev} = $self->{line};
6085 $self->{column_prev} = $self->{column};
6086 $self->{column}++;
6087 $self->{nc}
6088 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6089 } else {
6090 $self->{set_nc}->($self);
6091 }
6092
6093 redo A;
6094 }
6095 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6096 if ($is_space->{$self->{nc}}) {
6097 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6098 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6099 $self->{state} = BEFORE_MD_NAME_STATE;
6100
6101 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6102 $self->{line_prev} = $self->{line};
6103 $self->{column_prev} = $self->{column};
6104 $self->{column}++;
6105 $self->{nc}
6106 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6107 } else {
6108 $self->{set_nc}->($self);
6109 }
6110
6111 redo A;
6112 } elsif ($self->{nc} == 0x003E) { # >
6113 ## XML5: Same as "Anything else".
6114 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6115 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6116
6117 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6118 $self->{line_prev} = $self->{line};
6119 $self->{column_prev} = $self->{column};
6120 $self->{column}++;
6121 $self->{nc}
6122 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6123 } else {
6124 $self->{set_nc}->($self);
6125 }
6126
6127 redo A;
6128 } elsif ($self->{nc} == -1) {
6129 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6130 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6131 ## Reconsume.
6132 redo A;
6133 } else {
6134 ## XML5: No parse error.
6135 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6136 $self->{state} = BOGUS_COMMENT_STATE;
6137 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6138 ## Reconsume.
6139 redo A;
6140 }
6141 } elsif ($self->{state} == MD_NAME_STATE) {
6142 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6143
6144 if ($is_space->{$self->{nc}}) {
6145 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6146 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6147 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6148 ## TODO: ...
6149 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6150 } else { # ENTITY/NOTATION
6151 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6152 }
6153
6154 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6155 $self->{line_prev} = $self->{line};
6156 $self->{column_prev} = $self->{column};
6157 $self->{column}++;
6158 $self->{nc}
6159 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6160 } else {
6161 $self->{set_nc}->($self);
6162 }
6163
6164 redo A;
6165 } elsif ($self->{nc} == 0x003E) { # >
6166 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6167 #
6168 } else {
6169 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6170 }
6171 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6172
6173 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6174 $self->{line_prev} = $self->{line};
6175 $self->{column_prev} = $self->{column};
6176 $self->{column}++;
6177 $self->{nc}
6178 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6179 } else {
6180 $self->{set_nc}->($self);
6181 }
6182
6183 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6184 redo A;
6185 } elsif ($self->{nc} == -1) {
6186 ## XML5: [ATTLIST] No parse error.
6187 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6188 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6189 ## Reconsume.
6190 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6191 redo A;
6192 } else {
6193 ## XML5: [ATTLIST] Not defined yet.
6194 $self->{ct}->{name} .= chr $self->{nc};
6195 ## Stay in the state.
6196
6197 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6198 $self->{line_prev} = $self->{line};
6199 $self->{column_prev} = $self->{column};
6200 $self->{column}++;
6201 $self->{nc}
6202 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6203 } else {
6204 $self->{set_nc}->($self);
6205 }
6206
6207 redo A;
6208 }
6209 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6210 if ($is_space->{$self->{nc}}) {
6211 ## Stay in the state.
6212
6213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6214 $self->{line_prev} = $self->{line};
6215 $self->{column_prev} = $self->{column};
6216 $self->{column}++;
6217 $self->{nc}
6218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6219 } else {
6220 $self->{set_nc}->($self);
6221 }
6222
6223 redo A;
6224 } elsif ($self->{nc} == 0x003E) { # >
6225 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6226
6227 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6228 $self->{line_prev} = $self->{line};
6229 $self->{column_prev} = $self->{column};
6230 $self->{column}++;
6231 $self->{nc}
6232 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6233 } else {
6234 $self->{set_nc}->($self);
6235 }
6236
6237 return ($self->{ct}); # ATTLIST
6238 redo A;
6239 } elsif ($self->{nc} == -1) {
6240 ## XML5: No parse error.
6241 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6242 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6243 return ($self->{ct});
6244 redo A;
6245 } else {
6246 ## XML5: Not defined yet.
6247 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6248 tokens => [],
6249 line => $self->{line}, column => $self->{column}};
6250 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6251
6252 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6253 $self->{line_prev} = $self->{line};
6254 $self->{column_prev} = $self->{column};
6255 $self->{column}++;
6256 $self->{nc}
6257 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6258 } else {
6259 $self->{set_nc}->($self);
6260 }
6261
6262 redo A;
6263 }
6264 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6265 if ($is_space->{$self->{nc}}) {
6266 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6267
6268 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6269 $self->{line_prev} = $self->{line};
6270 $self->{column_prev} = $self->{column};
6271 $self->{column}++;
6272 $self->{nc}
6273 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6274 } else {
6275 $self->{set_nc}->($self);
6276 }
6277
6278 redo A;
6279 } elsif ($self->{nc} == 0x003E) { # >
6280 ## XML5: Same as "anything else".
6281 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6282 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6283
6284 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6285 $self->{line_prev} = $self->{line};
6286 $self->{column_prev} = $self->{column};
6287 $self->{column}++;
6288 $self->{nc}
6289 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6290 } else {
6291 $self->{set_nc}->($self);
6292 }
6293
6294 return ($self->{ct}); # ATTLIST
6295 redo A;
6296 } elsif ($self->{nc} == 0x0028) { # (
6297 ## XML5: Same as "anything else".
6298 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6299 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6300
6301 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6302 $self->{line_prev} = $self->{line};
6303 $self->{column_prev} = $self->{column};
6304 $self->{column}++;
6305 $self->{nc}
6306 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6307 } else {
6308 $self->{set_nc}->($self);
6309 }
6310
6311 redo A;
6312 } elsif ($self->{nc} == -1) {
6313 ## XML5: No parse error.
6314 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6315 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6316
6317 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6318 $self->{line_prev} = $self->{line};
6319 $self->{column_prev} = $self->{column};
6320 $self->{column}++;
6321 $self->{nc}
6322 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6323 } else {
6324 $self->{set_nc}->($self);
6325 }
6326
6327 return ($self->{ct}); # ATTLIST
6328 redo A;
6329 } else {
6330 ## XML5: Not defined yet.
6331 $self->{ca}->{name} .= chr $self->{nc};
6332 ## Stay in the state.
6333
6334 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6335 $self->{line_prev} = $self->{line};
6336 $self->{column_prev} = $self->{column};
6337 $self->{column}++;
6338 $self->{nc}
6339 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6340 } else {
6341 $self->{set_nc}->($self);
6342 }
6343
6344 redo A;
6345 }
6346 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6347 if ($is_space->{$self->{nc}}) {
6348 ## Stay in the state.
6349
6350 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6351 $self->{line_prev} = $self->{line};
6352 $self->{column_prev} = $self->{column};
6353 $self->{column}++;
6354 $self->{nc}
6355 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6356 } else {
6357 $self->{set_nc}->($self);
6358 }
6359
6360 redo A;
6361 } elsif ($self->{nc} == 0x003E) { # >
6362 ## XML5: Same as "anything else".
6363 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6364 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6365
6366 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6367 $self->{line_prev} = $self->{line};
6368 $self->{column_prev} = $self->{column};
6369 $self->{column}++;
6370 $self->{nc}
6371 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6372 } else {
6373 $self->{set_nc}->($self);
6374 }
6375
6376 return ($self->{ct}); # ATTLIST
6377 redo A;
6378 } elsif ($self->{nc} == 0x0028) { # (
6379 ## XML5: Same as "anything else".
6380 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6381
6382 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6383 $self->{line_prev} = $self->{line};
6384 $self->{column_prev} = $self->{column};
6385 $self->{column}++;
6386 $self->{nc}
6387 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6388 } else {
6389 $self->{set_nc}->($self);
6390 }
6391
6392 redo A;
6393 } elsif ($self->{nc} == -1) {
6394 ## XML5: No parse error.
6395 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6396 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6397
6398 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6399 $self->{line_prev} = $self->{line};
6400 $self->{column_prev} = $self->{column};
6401 $self->{column}++;
6402 $self->{nc}
6403 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6404 } else {
6405 $self->{set_nc}->($self);
6406 }
6407
6408 return ($self->{ct});
6409 redo A;
6410 } else {
6411 ## XML5: Not defined yet.
6412 $self->{ca}->{type} = chr $self->{nc};
6413 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6414
6415 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6416 $self->{line_prev} = $self->{line};
6417 $self->{column_prev} = $self->{column};
6418 $self->{column}++;
6419 $self->{nc}
6420 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6421 } else {
6422 $self->{set_nc}->($self);
6423 }
6424
6425 redo A;
6426 }
6427 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6428 if ($is_space->{$self->{nc}}) {
6429 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6430
6431 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6432 $self->{line_prev} = $self->{line};
6433 $self->{column_prev} = $self->{column};
6434 $self->{column}++;
6435 $self->{nc}
6436 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6437 } else {
6438 $self->{set_nc}->($self);
6439 }
6440
6441 redo A;
6442 } elsif ($self->{nc} == 0x0023) { # #
6443 ## XML5: Same as "anything else".
6444 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6445 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6446
6447 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6448 $self->{line_prev} = $self->{line};
6449 $self->{column_prev} = $self->{column};
6450 $self->{column}++;
6451 $self->{nc}
6452 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6453 } else {
6454 $self->{set_nc}->($self);
6455 }
6456
6457 redo A;
6458 } elsif ($self->{nc} == 0x0022) { # "
6459 ## XML5: Same as "anything else".
6460 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6461 $self->{ca}->{value} = '';
6462 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6463
6464 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6465 $self->{line_prev} = $self->{line};
6466 $self->{column_prev} = $self->{column};
6467 $self->{column}++;
6468 $self->{nc}
6469 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6470 } else {
6471 $self->{set_nc}->($self);
6472 }
6473
6474 redo A;
6475 } elsif ($self->{nc} == 0x0027) { # '
6476 ## XML5: Same as "anything else".
6477 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6478 $self->{ca}->{value} = '';
6479 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6480
6481 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6482 $self->{line_prev} = $self->{line};
6483 $self->{column_prev} = $self->{column};
6484 $self->{column}++;
6485 $self->{nc}
6486 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6487 } else {
6488 $self->{set_nc}->($self);
6489 }
6490
6491 redo A;
6492 } elsif ($self->{nc} == 0x003E) { # >
6493 ## XML5: Same as "anything else".
6494 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6495 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6496
6497 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6498 $self->{line_prev} = $self->{line};
6499 $self->{column_prev} = $self->{column};
6500 $self->{column}++;
6501 $self->{nc}
6502 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6503 } else {
6504 $self->{set_nc}->($self);
6505 }
6506
6507 return ($self->{ct}); # ATTLIST
6508 redo A;
6509 } elsif ($self->{nc} == 0x0028) { # (
6510 ## XML5: Same as "anything else".
6511 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6512 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6513
6514 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6515 $self->{line_prev} = $self->{line};
6516 $self->{column_prev} = $self->{column};
6517 $self->{column}++;
6518 $self->{nc}
6519 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6520 } else {
6521 $self->{set_nc}->($self);
6522 }
6523
6524 redo A;
6525 } elsif ($self->{nc} == -1) {
6526 ## XML5: No parse error.
6527 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6528 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6529
6530 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6531 $self->{line_prev} = $self->{line};
6532 $self->{column_prev} = $self->{column};
6533 $self->{column}++;
6534 $self->{nc}
6535 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6536 } else {
6537 $self->{set_nc}->($self);
6538 }
6539
6540 return ($self->{ct});
6541 redo A;
6542 } else {
6543 ## XML5: Not defined yet.
6544 $self->{ca}->{type} .= chr $self->{nc};
6545 ## Stay in the state.
6546
6547 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6548 $self->{line_prev} = $self->{line};
6549 $self->{column_prev} = $self->{column};
6550 $self->{column}++;
6551 $self->{nc}
6552 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6553 } else {
6554 $self->{set_nc}->($self);
6555 }
6556
6557 redo A;
6558 }
6559 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6560 if ($is_space->{$self->{nc}}) {
6561 ## Stay in the state.
6562
6563 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6564 $self->{line_prev} = $self->{line};
6565 $self->{column_prev} = $self->{column};
6566 $self->{column}++;
6567 $self->{nc}
6568 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6569 } else {
6570 $self->{set_nc}->($self);
6571 }
6572
6573 redo A;
6574 } elsif ($self->{nc} == 0x0028) { # (
6575 ## XML5: Same as "anything else".
6576 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6577
6578 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6579 $self->{line_prev} = $self->{line};
6580 $self->{column_prev} = $self->{column};
6581 $self->{column}++;
6582 $self->{nc}
6583 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6584 } else {
6585 $self->{set_nc}->($self);
6586 }
6587
6588 redo A;
6589 } elsif ($self->{nc} == 0x0023) { # #
6590 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6591
6592 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6593 $self->{line_prev} = $self->{line};
6594 $self->{column_prev} = $self->{column};
6595 $self->{column}++;
6596 $self->{nc}
6597 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6598 } else {
6599 $self->{set_nc}->($self);
6600 }
6601
6602 redo A;
6603 } elsif ($self->{nc} == 0x0022) { # "
6604 ## XML5: Same as "anything else".
6605 $self->{ca}->{value} = '';
6606 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6607
6608 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6609 $self->{line_prev} = $self->{line};
6610 $self->{column_prev} = $self->{column};
6611 $self->{column}++;
6612 $self->{nc}
6613 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6614 } else {
6615 $self->{set_nc}->($self);
6616 }
6617
6618 redo A;
6619 } elsif ($self->{nc} == 0x0027) { # '
6620 ## XML5: Same as "anything else".
6621 $self->{ca}->{value} = '';
6622 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6623
6624 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6625 $self->{line_prev} = $self->{line};
6626 $self->{column_prev} = $self->{column};
6627 $self->{column}++;
6628 $self->{nc}
6629 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6630 } else {
6631 $self->{set_nc}->($self);
6632 }
6633
6634 redo A;
6635 } elsif ($self->{nc} == 0x003E) { # >
6636 ## XML5: Same as "anything else".
6637 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6638 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6639
6640 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6641 $self->{line_prev} = $self->{line};
6642 $self->{column_prev} = $self->{column};
6643 $self->{column}++;
6644 $self->{nc}
6645 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6646 } else {
6647 $self->{set_nc}->($self);
6648 }
6649
6650 return ($self->{ct}); # ATTLIST
6651 redo A;
6652 } elsif ($self->{nc} == -1) {
6653 ## XML5: No parse error.
6654 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6655 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6656
6657 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6658 $self->{line_prev} = $self->{line};
6659 $self->{column_prev} = $self->{column};
6660 $self->{column}++;
6661 $self->{nc}
6662 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6663 } else {
6664 $self->{set_nc}->($self);
6665 }
6666
6667 return ($self->{ct});
6668 redo A;
6669 } else {
6670 ## XML5: Switch to the "DOCTYPE bogus comment state".
6671 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6672 $self->{ca}->{value} = '';
6673 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6674 ## Reconsume.
6675 redo A;
6676 }
6677 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6678 if ($is_space->{$self->{nc}}) {
6679 ## Stay in the state.
6680
6681 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6682 $self->{line_prev} = $self->{line};
6683 $self->{column_prev} = $self->{column};
6684 $self->{column}++;
6685 $self->{nc}
6686 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6687 } else {
6688 $self->{set_nc}->($self);
6689 }
6690
6691 redo A;
6692 } elsif ($self->{nc} == 0x007C) { # |
6693 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6694 ## Stay in the state.
6695
6696 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6697 $self->{line_prev} = $self->{line};
6698 $self->{column_prev} = $self->{column};
6699 $self->{column}++;
6700 $self->{nc}
6701 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6702 } else {
6703 $self->{set_nc}->($self);
6704 }
6705
6706 redo A;
6707 } elsif ($self->{nc} == 0x0029) { # )
6708 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6709 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6710
6711 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6712 $self->{line_prev} = $self->{line};
6713 $self->{column_prev} = $self->{column};
6714 $self->{column}++;
6715 $self->{nc}
6716 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6717 } else {
6718 $self->{set_nc}->($self);
6719 }
6720
6721 redo A;
6722 } elsif ($self->{nc} == 0x003E) { # >
6723 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6724 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6725
6726 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6727 $self->{line_prev} = $self->{line};
6728 $self->{column_prev} = $self->{column};
6729 $self->{column}++;
6730 $self->{nc}
6731 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6732 } else {
6733 $self->{set_nc}->($self);
6734 }
6735
6736 return ($self->{ct}); # ATTLIST
6737 redo A;
6738 } elsif ($self->{nc} == -1) {
6739 ## XML5: No parse error.
6740 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6741 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6742
6743 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6744 $self->{line_prev} = $self->{line};
6745 $self->{column_prev} = $self->{column};
6746 $self->{column}++;
6747 $self->{nc}
6748 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6749 } else {
6750 $self->{set_nc}->($self);
6751 }
6752
6753 return ($self->{ct});
6754 redo A;
6755 } else {
6756 push @{$self->{ca}->{tokens}}, chr $self->{nc};
6757 $self->{state} = ALLOWED_TOKEN_STATE;
6758
6759 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6760 $self->{line_prev} = $self->{line};
6761 $self->{column_prev} = $self->{column};
6762 $self->{column}++;
6763 $self->{nc}
6764 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6765 } else {
6766 $self->{set_nc}->($self);
6767 }
6768
6769 redo A;
6770 }
6771 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6772 if ($is_space->{$self->{nc}}) {
6773 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6774
6775 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6776 $self->{line_prev} = $self->{line};
6777 $self->{column_prev} = $self->{column};
6778 $self->{column}++;
6779 $self->{nc}
6780 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6781 } else {
6782 $self->{set_nc}->($self);
6783 }
6784
6785 redo A;
6786 } elsif ($self->{nc} == 0x007C) { # |
6787 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6788
6789 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6790 $self->{line_prev} = $self->{line};
6791 $self->{column_prev} = $self->{column};
6792 $self->{column}++;
6793 $self->{nc}
6794 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6795 } else {
6796 $self->{set_nc}->($self);
6797 }
6798
6799 redo A;
6800 } elsif ($self->{nc} == 0x0029) { # )
6801 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6802
6803 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6804 $self->{line_prev} = $self->{line};
6805 $self->{column_prev} = $self->{column};
6806 $self->{column}++;
6807 $self->{nc}
6808 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6809 } else {
6810 $self->{set_nc}->($self);
6811 }
6812
6813 redo A;
6814 } elsif ($self->{nc} == 0x003E) { # >
6815 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6816 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6817
6818 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6819 $self->{line_prev} = $self->{line};
6820 $self->{column_prev} = $self->{column};
6821 $self->{column}++;
6822 $self->{nc}
6823 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6824 } else {
6825 $self->{set_nc}->($self);
6826 }
6827
6828 return ($self->{ct}); # ATTLIST
6829 redo A;
6830 } elsif ($self->{nc} == -1) {
6831 ## XML5: No parse error.
6832 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6833 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6834
6835 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6836 $self->{line_prev} = $self->{line};
6837 $self->{column_prev} = $self->{column};
6838 $self->{column}++;
6839 $self->{nc}
6840 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6841 } else {
6842 $self->{set_nc}->($self);
6843 }
6844
6845 return ($self->{ct});
6846 redo A;
6847 } else {
6848 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6849 ## Stay in the state.
6850
6851 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6852 $self->{line_prev} = $self->{line};
6853 $self->{column_prev} = $self->{column};
6854 $self->{column}++;
6855 $self->{nc}
6856 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6857 } else {
6858 $self->{set_nc}->($self);
6859 }
6860
6861 redo A;
6862 }
6863 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6864 if ($is_space->{$self->{nc}}) {
6865 ## Stay in the state.
6866
6867 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6868 $self->{line_prev} = $self->{line};
6869 $self->{column_prev} = $self->{column};
6870 $self->{column}++;
6871 $self->{nc}
6872 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6873 } else {
6874 $self->{set_nc}->($self);
6875 }
6876
6877 redo A;
6878 } elsif ($self->{nc} == 0x007C) { # |
6879 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6880
6881 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6882 $self->{line_prev} = $self->{line};
6883 $self->{column_prev} = $self->{column};
6884 $self->{column}++;
6885 $self->{nc}
6886 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6887 } else {
6888 $self->{set_nc}->($self);
6889 }
6890
6891 redo A;
6892 } elsif ($self->{nc} == 0x0029) { # )
6893 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6894
6895 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6896 $self->{line_prev} = $self->{line};
6897 $self->{column_prev} = $self->{column};
6898 $self->{column}++;
6899 $self->{nc}
6900 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6901 } else {
6902 $self->{set_nc}->($self);
6903 }
6904
6905 redo A;
6906 } elsif ($self->{nc} == 0x003E) { # >
6907 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6908 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6909
6910 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6911 $self->{line_prev} = $self->{line};
6912 $self->{column_prev} = $self->{column};
6913 $self->{column}++;
6914 $self->{nc}
6915 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6916 } else {
6917 $self->{set_nc}->($self);
6918 }
6919
6920 return ($self->{ct}); # ATTLIST
6921 redo A;
6922 } elsif ($self->{nc} == -1) {
6923 ## XML5: No parse error.
6924 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6925 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6926
6927 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6928 $self->{line_prev} = $self->{line};
6929 $self->{column_prev} = $self->{column};
6930 $self->{column}++;
6931 $self->{nc}
6932 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6933 } else {
6934 $self->{set_nc}->($self);
6935 }
6936
6937 return ($self->{ct});
6938 redo A;
6939 } else {
6940 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
6941 line => $self->{line_prev},
6942 column => $self->{column_prev});
6943 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
6944 $self->{state} = ALLOWED_TOKEN_STATE;
6945
6946 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6947 $self->{line_prev} = $self->{line};
6948 $self->{column_prev} = $self->{column};
6949 $self->{column}++;
6950 $self->{nc}
6951 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6952 } else {
6953 $self->{set_nc}->($self);
6954 }
6955
6956 redo A;
6957 }
6958 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
6959 if ($is_space->{$self->{nc}}) {
6960 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
6961
6962 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6963 $self->{line_prev} = $self->{line};
6964 $self->{column_prev} = $self->{column};
6965 $self->{column}++;
6966 $self->{nc}
6967 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6968 } else {
6969 $self->{set_nc}->($self);
6970 }
6971
6972 redo A;
6973 } elsif ($self->{nc} == 0x0023) { # #
6974 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6975 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6976
6977 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6978 $self->{line_prev} = $self->{line};
6979 $self->{column_prev} = $self->{column};
6980 $self->{column}++;
6981 $self->{nc}
6982 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6983 } else {
6984 $self->{set_nc}->($self);
6985 }
6986
6987 redo A;
6988 } elsif ($self->{nc} == 0x0022) { # "
6989 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6990 $self->{ca}->{value} = '';
6991 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6992
6993 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6994 $self->{line_prev} = $self->{line};
6995 $self->{column_prev} = $self->{column};
6996 $self->{column}++;
6997 $self->{nc}
6998 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6999 } else {
7000 $self->{set_nc}->($self);
7001 }
7002
7003 redo A;
7004 } elsif ($self->{nc} == 0x0027) { # '
7005 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7006 $self->{ca}->{value} = '';
7007 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7008
7009 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7010 $self->{line_prev} = $self->{line};
7011 $self->{column_prev} = $self->{column};
7012 $self->{column}++;
7013 $self->{nc}
7014 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7015 } else {
7016 $self->{set_nc}->($self);
7017 }
7018
7019 redo A;
7020 } elsif ($self->{nc} == 0x003E) { # >
7021 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7022 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7023
7024 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7025 $self->{line_prev} = $self->{line};
7026 $self->{column_prev} = $self->{column};
7027 $self->{column}++;
7028 $self->{nc}
7029 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7030 } else {
7031 $self->{set_nc}->($self);
7032 }
7033
7034 return ($self->{ct}); # ATTLIST
7035 redo A;
7036 } elsif ($self->{nc} == -1) {
7037 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7038 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7039
7040 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7041 $self->{line_prev} = $self->{line};
7042 $self->{column_prev} = $self->{column};
7043 $self->{column}++;
7044 $self->{nc}
7045 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7046 } else {
7047 $self->{set_nc}->($self);
7048 }
7049
7050 return ($self->{ct});
7051 redo A;
7052 } else {
7053 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7054 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7055 ## Reconsume.
7056 redo A;
7057 }
7058 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7059 if ($is_space->{$self->{nc}}) {
7060 ## Stay in the state.
7061
7062 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7063 $self->{line_prev} = $self->{line};
7064 $self->{column_prev} = $self->{column};
7065 $self->{column}++;
7066 $self->{nc}
7067 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7068 } else {
7069 $self->{set_nc}->($self);
7070 }
7071
7072 redo A;
7073 } elsif ($self->{nc} == 0x0023) { # #
7074 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7075
7076 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7077 $self->{line_prev} = $self->{line};
7078 $self->{column_prev} = $self->{column};
7079 $self->{column}++;
7080 $self->{nc}
7081 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7082 } else {
7083 $self->{set_nc}->($self);
7084 }
7085
7086 redo A;
7087 } elsif ($self->{nc} == 0x0022) { # "
7088 $self->{ca}->{value} = '';
7089 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7090
7091 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7092 $self->{line_prev} = $self->{line};
7093 $self->{column_prev} = $self->{column};
7094 $self->{column}++;
7095 $self->{nc}
7096 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7097 } else {
7098 $self->{set_nc}->($self);
7099 }
7100
7101 redo A;
7102 } elsif ($self->{nc} == 0x0027) { # '
7103 $self->{ca}->{value} = '';
7104 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7105
7106 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7107 $self->{line_prev} = $self->{line};
7108 $self->{column_prev} = $self->{column};
7109 $self->{column}++;
7110 $self->{nc}
7111 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7112 } else {
7113 $self->{set_nc}->($self);
7114 }
7115
7116 redo A;
7117 } elsif ($self->{nc} == 0x003E) { # >
7118 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7119 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7120
7121 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7122 $self->{line_prev} = $self->{line};
7123 $self->{column_prev} = $self->{column};
7124 $self->{column}++;
7125 $self->{nc}
7126 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7127 } else {
7128 $self->{set_nc}->($self);
7129 }
7130
7131 return ($self->{ct}); # ATTLIST
7132 redo A;
7133 } elsif ($self->{nc} == -1) {
7134 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7135 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7136
7137 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7138 $self->{line_prev} = $self->{line};
7139 $self->{column_prev} = $self->{column};
7140 $self->{column}++;
7141 $self->{nc}
7142 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7143 } else {
7144 $self->{set_nc}->($self);
7145 }
7146
7147 return ($self->{ct});
7148 redo A;
7149 } else {
7150 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7151 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7152 ## Reconsume.
7153 redo A;
7154 }
7155 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7156 if ($is_space->{$self->{nc}}) {
7157 ## XML5: No parse error.
7158 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7159 $self->{state} = BOGUS_MD_STATE;
7160 ## Reconsume.
7161 redo A;
7162 } elsif ($self->{nc} == 0x0022) { # "
7163 ## XML5: Same as "anything else".
7164 $self->{ca}->{value} = '';
7165 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7166
7167 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7168 $self->{line_prev} = $self->{line};
7169 $self->{column_prev} = $self->{column};
7170 $self->{column}++;
7171 $self->{nc}
7172 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7173 } else {
7174 $self->{set_nc}->($self);
7175 }
7176
7177 redo A;
7178 } elsif ($self->{nc} == 0x0027) { # '
7179 ## XML5: Same as "anything else".
7180 $self->{ca}->{value} = '';
7181 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7182
7183 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7184 $self->{line_prev} = $self->{line};
7185 $self->{column_prev} = $self->{column};
7186 $self->{column}++;
7187 $self->{nc}
7188 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7189 } else {
7190 $self->{set_nc}->($self);
7191 }
7192
7193 redo A;
7194 } elsif ($self->{nc} == 0x003E) { # >
7195 ## XML5: Same as "anything else".
7196 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7197 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7198
7199 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7200 $self->{line_prev} = $self->{line};
7201 $self->{column_prev} = $self->{column};
7202 $self->{column}++;
7203 $self->{nc}
7204 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7205 } else {
7206 $self->{set_nc}->($self);
7207 }
7208
7209 return ($self->{ct}); # ATTLIST
7210 redo A;
7211 } elsif ($self->{nc} == -1) {
7212 ## XML5: No parse error.
7213 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7214 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7215
7216 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7217 $self->{line_prev} = $self->{line};
7218 $self->{column_prev} = $self->{column};
7219 $self->{column}++;
7220 $self->{nc}
7221 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7222 } else {
7223 $self->{set_nc}->($self);
7224 }
7225
7226 return ($self->{ct});
7227 redo A;
7228 } else {
7229 $self->{ca}->{default} = chr $self->{nc};
7230 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7231
7232 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7233 $self->{line_prev} = $self->{line};
7234 $self->{column_prev} = $self->{column};
7235 $self->{column}++;
7236 $self->{nc}
7237 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7238 } else {
7239 $self->{set_nc}->($self);
7240 }
7241
7242 redo A;
7243 }
7244 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7245 if ($is_space->{$self->{nc}}) {
7246 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7247
7248 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7249 $self->{line_prev} = $self->{line};
7250 $self->{column_prev} = $self->{column};
7251 $self->{column}++;
7252 $self->{nc}
7253 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7254 } else {
7255 $self->{set_nc}->($self);
7256 }
7257
7258 redo A;
7259 } elsif ($self->{nc} == 0x0022) { # "
7260 ## XML5: Same as "anything else".
7261 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7262 $self->{ca}->{value} = '';
7263 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7264
7265 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7266 $self->{line_prev} = $self->{line};
7267 $self->{column_prev} = $self->{column};
7268 $self->{column}++;
7269 $self->{nc}
7270 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7271 } else {
7272 $self->{set_nc}->($self);
7273 }
7274
7275 redo A;
7276 } elsif ($self->{nc} == 0x0027) { # '
7277 ## XML5: Same as "anything else".
7278 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7279 $self->{ca}->{value} = '';
7280 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7281
7282 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7283 $self->{line_prev} = $self->{line};
7284 $self->{column_prev} = $self->{column};
7285 $self->{column}++;
7286 $self->{nc}
7287 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7288 } else {
7289 $self->{set_nc}->($self);
7290 }
7291
7292 redo A;
7293 } elsif ($self->{nc} == 0x003E) { # >
7294 ## XML5: Same as "anything else".
7295 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7296 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7297
7298 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7299 $self->{line_prev} = $self->{line};
7300 $self->{column_prev} = $self->{column};
7301 $self->{column}++;
7302 $self->{nc}
7303 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7304 } else {
7305 $self->{set_nc}->($self);
7306 }
7307
7308 return ($self->{ct}); # ATTLIST
7309 redo A;
7310 } elsif ($self->{nc} == -1) {
7311 ## XML5: No parse error.
7312 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7313 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7314 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7315
7316 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7317 $self->{line_prev} = $self->{line};
7318 $self->{column_prev} = $self->{column};
7319 $self->{column}++;
7320 $self->{nc}
7321 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7322 } else {
7323 $self->{set_nc}->($self);
7324 }
7325
7326 return ($self->{ct});
7327 redo A;
7328 } else {
7329 $self->{ca}->{default} .= chr $self->{nc};
7330 ## Stay in the state.
7331
7332 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7333 $self->{line_prev} = $self->{line};
7334 $self->{column_prev} = $self->{column};
7335 $self->{column}++;
7336 $self->{nc}
7337 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7338 } else {
7339 $self->{set_nc}->($self);
7340 }
7341
7342 redo A;
7343 }
7344 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7345 if ($is_space->{$self->{nc}}) {
7346 ## Stay in the state.
7347
7348 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7349 $self->{line_prev} = $self->{line};
7350 $self->{column_prev} = $self->{column};
7351 $self->{column}++;
7352 $self->{nc}
7353 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7354 } else {
7355 $self->{set_nc}->($self);
7356 }
7357
7358 redo A;
7359 } elsif ($self->{nc} == 0x0022) { # "
7360 $self->{ca}->{value} = '';
7361 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7362
7363 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7364 $self->{line_prev} = $self->{line};
7365 $self->{column_prev} = $self->{column};
7366 $self->{column}++;
7367 $self->{nc}
7368 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7369 } else {
7370 $self->{set_nc}->($self);
7371 }
7372
7373 redo A;
7374 } elsif ($self->{nc} == 0x0027) { # '
7375 $self->{ca}->{value} = '';
7376 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7377
7378 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7379 $self->{line_prev} = $self->{line};
7380 $self->{column_prev} = $self->{column};
7381 $self->{column}++;
7382 $self->{nc}
7383 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7384 } else {
7385 $self->{set_nc}->($self);
7386 }
7387
7388 redo A;
7389 } elsif ($self->{nc} == 0x003E) { # >
7390 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7391 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7392
7393 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7394 $self->{line_prev} = $self->{line};
7395 $self->{column_prev} = $self->{column};
7396 $self->{column}++;
7397 $self->{nc}
7398 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7399 } else {
7400 $self->{set_nc}->($self);
7401 }
7402
7403 return ($self->{ct}); # ATTLIST
7404 redo A;
7405 } elsif ($self->{nc} == -1) {
7406 ## XML5: No parse error.
7407 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7408 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7409 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7410
7411 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7412 $self->{line_prev} = $self->{line};
7413 $self->{column_prev} = $self->{column};
7414 $self->{column}++;
7415 $self->{nc}
7416 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7417 } else {
7418 $self->{set_nc}->($self);
7419 }
7420
7421 return ($self->{ct});
7422 redo A;
7423 } else {
7424 ## XML5: Not defined yet.
7425 if ($self->{ca}->{default} eq 'FIXED') {
7426 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7427 } else {
7428 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7429 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7430 }
7431 ## Reconsume.
7432 redo A;
7433 }
7434 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7435 if ($is_space->{$self->{nc}} or
7436 $self->{nc} == -1 or
7437 $self->{nc} == 0x003E) { # >
7438 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7439 ## Reconsume.
7440 redo A;
7441 } else {
7442 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7443 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7444 ## Reconsume.
7445 redo A;
7446 }
7447 } elsif ($self->{state} == NDATA_STATE) {
7448 ## ASCII case-insensitive
7449 if ($self->{nc} == [
7450 undef,
7451 0x0044, # D
7452 0x0041, # A
7453 0x0054, # T
7454 ]->[length $self->{kwd}] or
7455 $self->{nc} == [
7456 undef,
7457 0x0064, # d
7458 0x0061, # a
7459 0x0074, # t
7460 ]->[length $self->{kwd}]) {
7461
7462 ## Stay in the state.
7463 $self->{kwd} .= chr $self->{nc};
7464
7465 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7466 $self->{line_prev} = $self->{line};
7467 $self->{column_prev} = $self->{column};
7468 $self->{column}++;
7469 $self->{nc}
7470 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7471 } else {
7472 $self->{set_nc}->($self);
7473 }
7474
7475 redo A;
7476 } elsif ((length $self->{kwd}) == 4 and
7477 ($self->{nc} == 0x0041 or # A
7478 $self->{nc} == 0x0061)) { # a
7479 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7480
7481 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7482 text => 'NDATA',
7483 line => $self->{line_prev},
7484 column => $self->{column_prev} - 4);
7485 } else {
7486
7487 }
7488 $self->{state} = AFTER_NDATA_STATE;
7489
7490 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7491 $self->{line_prev} = $self->{line};
7492 $self->{column_prev} = $self->{column};
7493 $self->{column}++;
7494 $self->{nc}
7495 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7496 } else {
7497 $self->{set_nc}->($self);
7498 }
7499
7500 redo A;
7501 } else {
7502 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7503 line => $self->{line_prev},
7504 column => $self->{column_prev} + 1
7505 - length $self->{kwd});
7506
7507 $self->{state} = BOGUS_MD_STATE;
7508 ## Reconsume.
7509 redo A;
7510 }
7511 } elsif ($self->{state} == AFTER_NDATA_STATE) {
7512 if ($is_space->{$self->{nc}}) {
7513 $self->{state} = BEFORE_NOTATION_NAME_STATE;
7514
7515 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7516 $self->{line_prev} = $self->{line};
7517 $self->{column_prev} = $self->{column};
7518 $self->{column}++;
7519 $self->{nc}
7520 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7521 } else {
7522 $self->{set_nc}->($self);
7523 }
7524
7525 redo A;
7526 } elsif ($self->{nc} == 0x003E) { # >
7527 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7528 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7529
7530 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7531 $self->{line_prev} = $self->{line};
7532 $self->{column_prev} = $self->{column};
7533 $self->{column}++;
7534 $self->{nc}
7535 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7536 } else {
7537 $self->{set_nc}->($self);
7538 }
7539
7540 return ($self->{ct}); # ENTITY
7541 redo A;
7542 } elsif ($self->{nc} == -1) {
7543 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7544 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7545
7546 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7547 $self->{line_prev} = $self->{line};
7548 $self->{column_prev} = $self->{column};
7549 $self->{column}++;
7550 $self->{nc}
7551 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7552 } else {
7553 $self->{set_nc}->($self);
7554 }
7555
7556 return ($self->{ct}); # ENTITY
7557 redo A;
7558 } else {
7559 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7560 line => $self->{line_prev},
7561 column => $self->{column_prev} + 1
7562 - length $self->{kwd});
7563 $self->{state} = BOGUS_MD_STATE;
7564 ## Reconsume.
7565 redo A;
7566 }
7567 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7568 if ($is_space->{$self->{nc}}) {
7569 ## Stay in the state.
7570
7571 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7572 $self->{line_prev} = $self->{line};
7573 $self->{column_prev} = $self->{column};
7574 $self->{column}++;
7575 $self->{nc}
7576 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7577 } else {
7578 $self->{set_nc}->($self);
7579 }
7580
7581 redo A;
7582 } elsif ($self->{nc} == 0x003E) { # >
7583 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7584 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7585
7586 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7587 $self->{line_prev} = $self->{line};
7588 $self->{column_prev} = $self->{column};
7589 $self->{column}++;
7590 $self->{nc}
7591 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7592 } else {
7593 $self->{set_nc}->($self);
7594 }
7595
7596 return ($self->{ct}); # ENTITY
7597 redo A;
7598 } elsif ($self->{nc} == -1) {
7599 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7600 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7601
7602 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7603 $self->{line_prev} = $self->{line};
7604 $self->{column_prev} = $self->{column};
7605 $self->{column}++;
7606 $self->{nc}
7607 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7608 } else {
7609 $self->{set_nc}->($self);
7610 }
7611
7612 return ($self->{ct}); # ENTITY
7613 redo A;
7614 } else {
7615 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7616 $self->{state} = NOTATION_NAME_STATE;
7617
7618 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7619 $self->{line_prev} = $self->{line};
7620 $self->{column_prev} = $self->{column};
7621 $self->{column}++;
7622 $self->{nc}
7623 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7624 } else {
7625 $self->{set_nc}->($self);
7626 }
7627
7628 redo A;
7629 }
7630 } elsif ($self->{state} == NOTATION_NAME_STATE) {
7631 if ($is_space->{$self->{nc}}) {
7632 $self->{state} = AFTER_NOTATION_NAME_STATE;
7633
7634 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7635 $self->{line_prev} = $self->{line};
7636 $self->{column_prev} = $self->{column};
7637 $self->{column}++;
7638 $self->{nc}
7639 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7640 } else {
7641 $self->{set_nc}->($self);
7642 }
7643
7644 redo A;
7645 } elsif ($self->{nc} == 0x003E) { # >
7646 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7647
7648 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7649 $self->{line_prev} = $self->{line};
7650 $self->{column_prev} = $self->{column};
7651 $self->{column}++;
7652 $self->{nc}
7653 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7654 } else {
7655 $self->{set_nc}->($self);
7656 }
7657
7658 return ($self->{ct}); # ENTITY
7659 redo A;
7660 } elsif ($self->{nc} == -1) {
7661 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7662 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7663
7664 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7665 $self->{line_prev} = $self->{line};
7666 $self->{column_prev} = $self->{column};
7667 $self->{column}++;
7668 $self->{nc}
7669 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7670 } else {
7671 $self->{set_nc}->($self);
7672 }
7673
7674 return ($self->{ct}); # ENTITY
7675 redo A;
7676 } else {
7677 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7678 ## Stay in the state.
7679
7680 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7681 $self->{line_prev} = $self->{line};
7682 $self->{column_prev} = $self->{column};
7683 $self->{column}++;
7684 $self->{nc}
7685 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7686 } else {
7687 $self->{set_nc}->($self);
7688 }
7689
7690 redo A;
7691 }
7692 } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
7693 if ($is_space->{$self->{nc}}) {
7694 ## Stay in the state.
7695
7696 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7697 $self->{line_prev} = $self->{line};
7698 $self->{column_prev} = $self->{column};
7699 $self->{column}++;
7700 $self->{nc}
7701 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7702 } else {
7703 $self->{set_nc}->($self);
7704 }
7705
7706 redo A;
7707 } elsif ($self->{nc} == 0x003E) { # >
7708 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7709
7710 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7711 $self->{line_prev} = $self->{line};
7712 $self->{column_prev} = $self->{column};
7713 $self->{column}++;
7714 $self->{nc}
7715 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7716 } else {
7717 $self->{set_nc}->($self);
7718 }
7719
7720 return ($self->{ct}); # ENTITY
7721 redo A;
7722 } elsif ($self->{nc} == -1) {
7723 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7724 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7725
7726 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7727 $self->{line_prev} = $self->{line};
7728 $self->{column_prev} = $self->{column};
7729 $self->{column}++;
7730 $self->{nc}
7731 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7732 } else {
7733 $self->{set_nc}->($self);
7734 }
7735
7736 return ($self->{ct}); # ENTITY
7737 redo A;
7738 } else {
7739 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after notation name'); ## TODO: type
7740 $self->{state} = BOGUS_MD_STATE;
7741 ## Reconsume.
7742 redo A;
7743 }
7744
7745
7746 } elsif ($self->{state} == BOGUS_MD_STATE) {
7747 if ($self->{nc} == 0x003E) { # >
7748 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7749
7750 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7751 $self->{line_prev} = $self->{line};
7752 $self->{column_prev} = $self->{column};
7753 $self->{column}++;
7754 $self->{nc}
7755 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7756 } else {
7757 $self->{set_nc}->($self);
7758 }
7759
7760 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7761 redo A;
7762 } elsif ($self->{nc} == -1) {
7763 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7764 ## Reconsume.
7765 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7766 redo A;
7767 } else {
7768 ## Stay in the state.
7769
7770 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7771 $self->{line_prev} = $self->{line};
7772 $self->{column_prev} = $self->{column};
7773 $self->{column}++;
7774 $self->{nc}
7775 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7776 } else {
7777 $self->{set_nc}->($self);
7778 }
7779
7780 redo A;
7781 }
7782 } else {
7783 die "$0: $self->{state}: Unknown state";
7784 }
7785 } # A
7786
7787 die "$0: _get_next_token: unexpected case";
7788 } # _get_next_token
7789
7790 1;
7791 ## $Date: 2008/10/19 04:39:25 $
7792

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24