/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (show annotations) (download)
Thu Oct 16 03:39:57 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.12: +237 -70 lines
++ whatpm/t/ChangeLog	16 Oct 2008 03:39:39 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/pis-2.dat" and "xml/comments-2.dat" are added.

++ whatpm/t/xml/ChangeLog	16 Oct 2008 03:39:53 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* doctypes-2.dat: New test added.

	* comments-2.dat, pis-2.dat: New test data files.

++ whatpm/Whatpm/HTML/ChangeLog	16 Oct 2008 03:36:51 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New token type END_OF_DOCTYPE_TOKEN added.
	New states DOCTYPE_TAG_STATE and
	BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE are added.  (Bogus
	string after the internal subset, which was handled by the state
	BOGUS_DOCTYPE_STATE, are now handled by the new state.)  Support
	for comments, bogus comments, and processing instructions in the
	internal subset.  If there is the internal subset, then emit the
	doctype token before the internal subset (with its
	$token->{has_internal_subset} flag set) and an
	END_OF_DOCTYPE_TOKEN after the internal subset.

++ whatpm/Whatpm/XML/ChangeLog	16 Oct 2008 03:39:19 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Insertion mode IN_SUBSET_IM added.  In the
	"initial" insertion mode, if the DOCTYPE token's "has internal
	subset" flag is set, then switch to the "in subset" insertion
	mode.

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.12 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 END_OF_DOCTYPE_TOKEN
19 );
20
21 our %EXPORT_TAGS = (
22 token => [qw(
23 DOCTYPE_TOKEN
24 COMMENT_TOKEN
25 START_TAG_TOKEN
26 END_TAG_TOKEN
27 END_OF_FILE_TOKEN
28 CHARACTER_TOKEN
29 PI_TOKEN
30 ABORT_TOKEN
31 END_OF_DOCTYPE_TOKEN
32 )],
33 );
34 }
35
36 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
37
38 ## Token types
39
40 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
41 sub COMMENT_TOKEN () { 2 }
42 sub START_TAG_TOKEN () { 3 }
43 sub END_TAG_TOKEN () { 4 }
44 sub END_OF_FILE_TOKEN () { 5 }
45 sub CHARACTER_TOKEN () { 6 }
46 sub PI_TOKEN () { 7 } ## NOTE: XML only.
47 sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
48 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only
49
50 ## XML5: XML5 has "empty tag token". In this implementation, it is
51 ## represented as a start tag token with $self->{self_closing} flag
52 ## set to true.
53
54 ## XML5: XML5 has "short end tag token". In this implementation, it
55 ## is represented as an end tag token with $token->{tag_name} flag set
56 ## to an empty string.
57
58 package Whatpm::HTML;
59
60 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
61
62 ## Content model flags
63
64 sub CM_ENTITY () { 0b001 } # & markup in data
65 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
66 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
67
68 sub PLAINTEXT_CONTENT_MODEL () { 0 }
69 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
70 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
71 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
72
73 ## Tokenizer states
74
75 sub DATA_STATE () { 0 }
76 #sub ENTITY_DATA_STATE () { 1 }
77 sub TAG_OPEN_STATE () { 2 }
78 sub CLOSE_TAG_OPEN_STATE () { 3 }
79 sub TAG_NAME_STATE () { 4 }
80 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
81 sub ATTRIBUTE_NAME_STATE () { 6 }
82 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
83 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
84 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
85 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
86 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
87 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
88 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
89 sub COMMENT_START_STATE () { 14 }
90 sub COMMENT_START_DASH_STATE () { 15 }
91 sub COMMENT_STATE () { 16 }
92 sub COMMENT_END_STATE () { 17 }
93 sub COMMENT_END_DASH_STATE () { 18 }
94 sub BOGUS_COMMENT_STATE () { 19 }
95 sub DOCTYPE_STATE () { 20 }
96 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
97 sub DOCTYPE_NAME_STATE () { 22 }
98 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
99 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
100 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
101 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
102 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
103 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
104 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
105 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
106 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
107 sub BOGUS_DOCTYPE_STATE () { 32 }
108 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
109 sub SELF_CLOSING_START_TAG_STATE () { 34 }
110 sub CDATA_SECTION_STATE () { 35 }
111 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
112 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
113 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
114 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
115 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
116 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
117 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
118 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
119 ## NOTE: "Entity data state", "entity in attribute value state", and
120 ## "consume a character reference" algorithm are jointly implemented
121 ## using the following six states:
122 sub ENTITY_STATE () { 44 }
123 sub ENTITY_HASH_STATE () { 45 }
124 sub NCR_NUM_STATE () { 46 }
125 sub HEXREF_X_STATE () { 47 }
126 sub HEXREF_HEX_STATE () { 48 }
127 sub ENTITY_NAME_STATE () { 49 }
128 sub PCDATA_STATE () { 50 } # "data state" in the spec
129
130 ## XML-only states
131 sub PI_STATE () { 51 }
132 sub PI_TARGET_STATE () { 52 }
133 sub PI_TARGET_AFTER_STATE () { 53 }
134 sub PI_DATA_STATE () { 54 }
135 sub PI_AFTER_STATE () { 55 }
136 sub PI_DATA_AFTER_STATE () { 56 }
137 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
138 sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
139 sub DOCTYPE_TAG_STATE () { 59 }
140 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 60 }
141
142 ## Tree constructor state constants (see Whatpm::HTML for the full
143 ## list and descriptions)
144
145 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
146 sub FOREIGN_EL () { 0b1_00000000000 }
147
148 ## Character reference mappings
149
150 my $charref_map = {
151 0x0D => 0x000A,
152 0x80 => 0x20AC,
153 0x81 => 0xFFFD,
154 0x82 => 0x201A,
155 0x83 => 0x0192,
156 0x84 => 0x201E,
157 0x85 => 0x2026,
158 0x86 => 0x2020,
159 0x87 => 0x2021,
160 0x88 => 0x02C6,
161 0x89 => 0x2030,
162 0x8A => 0x0160,
163 0x8B => 0x2039,
164 0x8C => 0x0152,
165 0x8D => 0xFFFD,
166 0x8E => 0x017D,
167 0x8F => 0xFFFD,
168 0x90 => 0xFFFD,
169 0x91 => 0x2018,
170 0x92 => 0x2019,
171 0x93 => 0x201C,
172 0x94 => 0x201D,
173 0x95 => 0x2022,
174 0x96 => 0x2013,
175 0x97 => 0x2014,
176 0x98 => 0x02DC,
177 0x99 => 0x2122,
178 0x9A => 0x0161,
179 0x9B => 0x203A,
180 0x9C => 0x0153,
181 0x9D => 0xFFFD,
182 0x9E => 0x017E,
183 0x9F => 0x0178,
184 }; # $charref_map
185 $charref_map->{$_} = 0xFFFD
186 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
187 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
188 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
189 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
190 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
191 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
192 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
193
194 ## Implementations MUST act as if state machine in the spec
195
196 sub _initialize_tokenizer ($) {
197 my $self = shift;
198
199 ## NOTE: Fields set by |new| constructor:
200 #$self->{level}
201 #$self->{set_nc}
202 #$self->{parse_error}
203 #$self->{is_xml} (if XML)
204
205 $self->{state} = DATA_STATE; # MUST
206 $self->{s_kwd} = ''; # Data state keyword
207 #$self->{kwd} = ''; # State-dependent keyword; initialized when used
208 #$self->{entity__value}; # initialized when used
209 #$self->{entity__match}; # initialized when used
210 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
211 undef $self->{ct}; # current token
212 undef $self->{ca}; # current attribute
213 undef $self->{last_stag_name}; # last emitted start tag name
214 #$self->{prev_state}; # initialized when used
215 delete $self->{self_closing};
216 $self->{char_buffer} = '';
217 $self->{char_buffer_pos} = 0;
218 $self->{nc} = -1; # next input character
219 #$self->{next_nc}
220
221 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
222 $self->{line_prev} = $self->{line};
223 $self->{column_prev} = $self->{column};
224 $self->{column}++;
225 $self->{nc}
226 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
227 } else {
228 $self->{set_nc}->($self);
229 }
230
231 $self->{token} = [];
232 # $self->{escape}
233 } # _initialize_tokenizer
234
235 ## A token has:
236 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
237 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
238 ## ->{name} (DOCTYPE_TOKEN)
239 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
240 ## ->{target} (PI_TOKEN)
241 ## ->{pubid} (DOCTYPE_TOKEN)
242 ## ->{sysid} (DOCTYPE_TOKEN)
243 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
244 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
245 ## ->{name}
246 ## ->{value}
247 ## ->{has_reference} == 1 or 0
248 ## ->{index}: Index of the attribute in a tag.
249 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
250 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
251 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
252 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
253
254 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
255 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
256 ## while the token is pushed back to the stack.
257
258 ## Emitted token MUST immediately be handled by the tree construction state.
259
260 ## Before each step, UA MAY check to see if either one of the scripts in
261 ## "list of scripts that will execute as soon as possible" or the first
262 ## script in the "list of scripts that will execute asynchronously",
263 ## has completed loading. If one has, then it MUST be executed
264 ## and removed from the list.
265
266 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
267 ## (This requirement was dropped from HTML5 spec, unfortunately.)
268
269 my $is_space = {
270 0x0009 => 1, # CHARACTER TABULATION (HT)
271 0x000A => 1, # LINE FEED (LF)
272 #0x000B => 0, # LINE TABULATION (VT)
273 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
274 #0x000D => 1, # CARRIAGE RETURN (CR)
275 0x0020 => 1, # SPACE (SP)
276 };
277
278 sub _get_next_token ($) {
279 my $self = shift;
280
281 if ($self->{self_closing}) {
282 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
283 ## NOTE: The |self_closing| flag is only set by start tag token.
284 ## In addition, when a start tag token is emitted, it is always set to
285 ## |ct|.
286 delete $self->{self_closing};
287 }
288
289 if (@{$self->{token}}) {
290 $self->{self_closing} = $self->{token}->[0]->{self_closing};
291 return shift @{$self->{token}};
292 }
293
294 A: {
295 if ($self->{state} == PCDATA_STATE) {
296 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
297
298 if ($self->{nc} == 0x0026) { # &
299
300 ## NOTE: In the spec, the tokenizer is switched to the
301 ## "entity data state". In this implementation, the tokenizer
302 ## is switched to the |ENTITY_STATE|, which is an implementation
303 ## of the "consume a character reference" algorithm.
304 $self->{entity_add} = -1;
305 $self->{prev_state} = DATA_STATE;
306 $self->{state} = ENTITY_STATE;
307
308 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
309 $self->{line_prev} = $self->{line};
310 $self->{column_prev} = $self->{column};
311 $self->{column}++;
312 $self->{nc}
313 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
314 } else {
315 $self->{set_nc}->($self);
316 }
317
318 redo A;
319 } elsif ($self->{nc} == 0x003C) { # <
320
321 $self->{state} = TAG_OPEN_STATE;
322
323 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
324 $self->{line_prev} = $self->{line};
325 $self->{column_prev} = $self->{column};
326 $self->{column}++;
327 $self->{nc}
328 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
329 } else {
330 $self->{set_nc}->($self);
331 }
332
333 redo A;
334 } elsif ($self->{nc} == -1) {
335
336 return ({type => END_OF_FILE_TOKEN,
337 line => $self->{line}, column => $self->{column}});
338 last A; ## TODO: ok?
339 } else {
340
341 #
342 }
343
344 # Anything else
345 my $token = {type => CHARACTER_TOKEN,
346 data => chr $self->{nc},
347 line => $self->{line}, column => $self->{column},
348 };
349 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
350
351 ## Stay in the state.
352
353 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
354 $self->{line_prev} = $self->{line};
355 $self->{column_prev} = $self->{column};
356 $self->{column}++;
357 $self->{nc}
358 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
359 } else {
360 $self->{set_nc}->($self);
361 }
362
363 return ($token);
364 redo A;
365 } elsif ($self->{state} == DATA_STATE) {
366 $self->{s_kwd} = '' unless defined $self->{s_kwd};
367 if ($self->{nc} == 0x0026) { # &
368 $self->{s_kwd} = '';
369 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
370 not $self->{escape}) {
371
372 ## NOTE: In the spec, the tokenizer is switched to the
373 ## "entity data state". In this implementation, the tokenizer
374 ## is switched to the |ENTITY_STATE|, which is an implementation
375 ## of the "consume a character reference" algorithm.
376 $self->{entity_add} = -1;
377 $self->{prev_state} = DATA_STATE;
378 $self->{state} = ENTITY_STATE;
379
380 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
381 $self->{line_prev} = $self->{line};
382 $self->{column_prev} = $self->{column};
383 $self->{column}++;
384 $self->{nc}
385 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
386 } else {
387 $self->{set_nc}->($self);
388 }
389
390 redo A;
391 } else {
392
393 #
394 }
395 } elsif ($self->{nc} == 0x002D) { # -
396 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
397 if ($self->{s_kwd} eq '<!-') {
398
399 $self->{escape} = 1; # unless $self->{escape};
400 $self->{s_kwd} = '--';
401 #
402 } elsif ($self->{s_kwd} eq '-') {
403
404 $self->{s_kwd} = '--';
405 #
406 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
407
408 $self->{s_kwd} .= '-';
409 #
410 } else {
411
412 $self->{s_kwd} = '-';
413 #
414 }
415 }
416
417 #
418 } elsif ($self->{nc} == 0x0021) { # !
419 if (length $self->{s_kwd}) {
420
421 $self->{s_kwd} .= '!';
422 #
423 } else {
424
425 #$self->{s_kwd} = '';
426 #
427 }
428 #
429 } elsif ($self->{nc} == 0x003C) { # <
430 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
431 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
432 not $self->{escape})) {
433
434 $self->{state} = TAG_OPEN_STATE;
435
436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437 $self->{line_prev} = $self->{line};
438 $self->{column_prev} = $self->{column};
439 $self->{column}++;
440 $self->{nc}
441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442 } else {
443 $self->{set_nc}->($self);
444 }
445
446 redo A;
447 } else {
448
449 $self->{s_kwd} = '';
450 #
451 }
452 } elsif ($self->{nc} == 0x003E) { # >
453 if ($self->{escape} and
454 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
455 if ($self->{s_kwd} eq '--') {
456
457 delete $self->{escape};
458 #
459 } else {
460
461 #
462 }
463 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
464
465 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
466 line => $self->{line_prev},
467 column => $self->{column_prev} - 1);
468 #
469 } else {
470
471 #
472 }
473
474 $self->{s_kwd} = '';
475 #
476 } elsif ($self->{nc} == 0x005D) { # ]
477 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
478
479 $self->{s_kwd} .= ']';
480 } elsif ($self->{s_kwd} eq ']]') {
481
482 #
483 } else {
484
485 $self->{s_kwd} = '';
486 }
487 #
488 } elsif ($self->{nc} == -1) {
489
490 $self->{s_kwd} = '';
491 return ({type => END_OF_FILE_TOKEN,
492 line => $self->{line}, column => $self->{column}});
493 last A; ## TODO: ok?
494 } else {
495
496 $self->{s_kwd} = '';
497 #
498 }
499
500 # Anything else
501 my $token = {type => CHARACTER_TOKEN,
502 data => chr $self->{nc},
503 line => $self->{line}, column => $self->{column},
504 };
505 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
506 length $token->{data})) {
507 $self->{s_kwd} = '';
508 }
509
510 ## Stay in the data state.
511 if (not $self->{is_xml} and
512 $self->{content_model} == PCDATA_CONTENT_MODEL) {
513
514 $self->{state} = PCDATA_STATE;
515 } else {
516
517 ## Stay in the state.
518 }
519
520 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
521 $self->{line_prev} = $self->{line};
522 $self->{column_prev} = $self->{column};
523 $self->{column}++;
524 $self->{nc}
525 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
526 } else {
527 $self->{set_nc}->($self);
528 }
529
530 return ($token);
531 redo A;
532 } elsif ($self->{state} == TAG_OPEN_STATE) {
533 ## XML5: "tag state".
534
535 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
536 if ($self->{nc} == 0x002F) { # /
537
538
539 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
540 $self->{line_prev} = $self->{line};
541 $self->{column_prev} = $self->{column};
542 $self->{column}++;
543 $self->{nc}
544 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
545 } else {
546 $self->{set_nc}->($self);
547 }
548
549 $self->{state} = CLOSE_TAG_OPEN_STATE;
550 redo A;
551 } elsif ($self->{nc} == 0x0021) { # !
552
553 $self->{s_kwd} = $self->{escaped} ? '' : '<';
554 #
555 } else {
556
557 $self->{s_kwd} = '';
558 #
559 }
560
561 ## reconsume
562 $self->{state} = DATA_STATE;
563 return ({type => CHARACTER_TOKEN, data => '<',
564 line => $self->{line_prev},
565 column => $self->{column_prev},
566 });
567 redo A;
568 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
569 if ($self->{nc} == 0x0021) { # !
570
571 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
572
573 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
574 $self->{line_prev} = $self->{line};
575 $self->{column_prev} = $self->{column};
576 $self->{column}++;
577 $self->{nc}
578 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
579 } else {
580 $self->{set_nc}->($self);
581 }
582
583 redo A;
584 } elsif ($self->{nc} == 0x002F) { # /
585
586 $self->{state} = CLOSE_TAG_OPEN_STATE;
587
588 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
589 $self->{line_prev} = $self->{line};
590 $self->{column_prev} = $self->{column};
591 $self->{column}++;
592 $self->{nc}
593 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
594 } else {
595 $self->{set_nc}->($self);
596 }
597
598 redo A;
599 } elsif (0x0041 <= $self->{nc} and
600 $self->{nc} <= 0x005A) { # A..Z
601
602 $self->{ct}
603 = {type => START_TAG_TOKEN,
604 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
605 line => $self->{line_prev},
606 column => $self->{column_prev}};
607 $self->{state} = TAG_NAME_STATE;
608
609 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
610 $self->{line_prev} = $self->{line};
611 $self->{column_prev} = $self->{column};
612 $self->{column}++;
613 $self->{nc}
614 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
615 } else {
616 $self->{set_nc}->($self);
617 }
618
619 redo A;
620 } elsif (0x0061 <= $self->{nc} and
621 $self->{nc} <= 0x007A) { # a..z
622
623 $self->{ct} = {type => START_TAG_TOKEN,
624 tag_name => chr ($self->{nc}),
625 line => $self->{line_prev},
626 column => $self->{column_prev}};
627 $self->{state} = TAG_NAME_STATE;
628
629 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630 $self->{line_prev} = $self->{line};
631 $self->{column_prev} = $self->{column};
632 $self->{column}++;
633 $self->{nc}
634 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635 } else {
636 $self->{set_nc}->($self);
637 }
638
639 redo A;
640 } elsif ($self->{nc} == 0x003E) { # >
641
642 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
643 line => $self->{line_prev},
644 column => $self->{column_prev});
645 $self->{state} = DATA_STATE;
646 $self->{s_kwd} = '';
647
648 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
649 $self->{line_prev} = $self->{line};
650 $self->{column_prev} = $self->{column};
651 $self->{column}++;
652 $self->{nc}
653 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
654 } else {
655 $self->{set_nc}->($self);
656 }
657
658
659 return ({type => CHARACTER_TOKEN, data => '<>',
660 line => $self->{line_prev},
661 column => $self->{column_prev},
662 });
663
664 redo A;
665 } elsif ($self->{nc} == 0x003F) { # ?
666 if ($self->{is_xml}) {
667
668 $self->{state} = PI_STATE;
669
670 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
671 $self->{line_prev} = $self->{line};
672 $self->{column_prev} = $self->{column};
673 $self->{column}++;
674 $self->{nc}
675 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
676 } else {
677 $self->{set_nc}->($self);
678 }
679
680 redo A;
681 } else {
682
683 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
684 line => $self->{line_prev},
685 column => $self->{column_prev});
686 $self->{state} = BOGUS_COMMENT_STATE;
687 $self->{ct} = {type => COMMENT_TOKEN, data => '',
688 line => $self->{line_prev},
689 column => $self->{column_prev},
690 };
691 ## $self->{nc} is intentionally left as is
692 redo A;
693 }
694 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
695
696 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
697 line => $self->{line_prev},
698 column => $self->{column_prev});
699 $self->{state} = DATA_STATE;
700 $self->{s_kwd} = '';
701 ## reconsume
702
703 return ({type => CHARACTER_TOKEN, data => '<',
704 line => $self->{line_prev},
705 column => $self->{column_prev},
706 });
707
708 redo A;
709 } else {
710 ## XML5: "<:" is a parse error.
711
712 $self->{ct} = {type => START_TAG_TOKEN,
713 tag_name => chr ($self->{nc}),
714 line => $self->{line_prev},
715 column => $self->{column_prev}};
716 $self->{state} = TAG_NAME_STATE;
717
718 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
719 $self->{line_prev} = $self->{line};
720 $self->{column_prev} = $self->{column};
721 $self->{column}++;
722 $self->{nc}
723 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
724 } else {
725 $self->{set_nc}->($self);
726 }
727
728 redo A;
729 }
730 } else {
731 die "$0: $self->{content_model} in tag open";
732 }
733 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
734 ## NOTE: The "close tag open state" in the spec is implemented as
735 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
736
737 ## XML5: "end tag state".
738
739 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
740 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
741 if (defined $self->{last_stag_name}) {
742 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
743 $self->{kwd} = '';
744 ## Reconsume.
745 redo A;
746 } else {
747 ## No start tag token has ever been emitted
748 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
749
750 $self->{state} = DATA_STATE;
751 $self->{s_kwd} = '';
752 ## Reconsume.
753 return ({type => CHARACTER_TOKEN, data => '</',
754 line => $l, column => $c,
755 });
756 redo A;
757 }
758 }
759
760 if (0x0041 <= $self->{nc} and
761 $self->{nc} <= 0x005A) { # A..Z
762
763 $self->{ct}
764 = {type => END_TAG_TOKEN,
765 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
766 line => $l, column => $c};
767 $self->{state} = TAG_NAME_STATE;
768
769 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
770 $self->{line_prev} = $self->{line};
771 $self->{column_prev} = $self->{column};
772 $self->{column}++;
773 $self->{nc}
774 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
775 } else {
776 $self->{set_nc}->($self);
777 }
778
779 redo A;
780 } elsif (0x0061 <= $self->{nc} and
781 $self->{nc} <= 0x007A) { # a..z
782
783 $self->{ct} = {type => END_TAG_TOKEN,
784 tag_name => chr ($self->{nc}),
785 line => $l, column => $c};
786 $self->{state} = TAG_NAME_STATE;
787
788 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
789 $self->{line_prev} = $self->{line};
790 $self->{column_prev} = $self->{column};
791 $self->{column}++;
792 $self->{nc}
793 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
794 } else {
795 $self->{set_nc}->($self);
796 }
797
798 redo A;
799 } elsif ($self->{nc} == 0x003E) { # >
800 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
801 line => $self->{line_prev}, ## "<" in "</>"
802 column => $self->{column_prev} - 1);
803 $self->{state} = DATA_STATE;
804 $self->{s_kwd} = '';
805 if ($self->{is_xml}) {
806
807 ## XML5: No parse error.
808
809 ## NOTE: This parser raises a parse error, since it supports
810 ## XML1, not XML5.
811
812 ## NOTE: A short end tag token.
813 my $ct = {type => END_TAG_TOKEN,
814 tag_name => '',
815 line => $self->{line_prev},
816 column => $self->{column_prev} - 1,
817 };
818
819 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
820 $self->{line_prev} = $self->{line};
821 $self->{column_prev} = $self->{column};
822 $self->{column}++;
823 $self->{nc}
824 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
825 } else {
826 $self->{set_nc}->($self);
827 }
828
829 return ($ct);
830 } else {
831
832
833 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
834 $self->{line_prev} = $self->{line};
835 $self->{column_prev} = $self->{column};
836 $self->{column}++;
837 $self->{nc}
838 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
839 } else {
840 $self->{set_nc}->($self);
841 }
842
843 }
844 redo A;
845 } elsif ($self->{nc} == -1) {
846
847 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
848 $self->{s_kwd} = '';
849 $self->{state} = DATA_STATE;
850 # reconsume
851
852 return ({type => CHARACTER_TOKEN, data => '</',
853 line => $l, column => $c,
854 });
855
856 redo A;
857 } elsif (not $self->{is_xml} or
858 $is_space->{$self->{nc}}) {
859
860 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
861 line => $self->{line_prev}, # "<" of "</"
862 column => $self->{column_prev} - 1);
863 $self->{state} = BOGUS_COMMENT_STATE;
864 $self->{ct} = {type => COMMENT_TOKEN, data => '',
865 line => $self->{line_prev}, # "<" of "</"
866 column => $self->{column_prev} - 1,
867 };
868 ## NOTE: $self->{nc} is intentionally left as is.
869 ## Although the "anything else" case of the spec not explicitly
870 ## states that the next input character is to be reconsumed,
871 ## it will be included to the |data| of the comment token
872 ## generated from the bogus end tag, as defined in the
873 ## "bogus comment state" entry.
874 redo A;
875 } else {
876 ## XML5: "</:" is a parse error.
877
878 $self->{ct} = {type => END_TAG_TOKEN,
879 tag_name => chr ($self->{nc}),
880 line => $l, column => $c};
881 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
882
883 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
884 $self->{line_prev} = $self->{line};
885 $self->{column_prev} = $self->{column};
886 $self->{column}++;
887 $self->{nc}
888 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
889 } else {
890 $self->{set_nc}->($self);
891 }
892
893 redo A;
894 }
895 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
896 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
897 if (length $ch) {
898 my $CH = $ch;
899 $ch =~ tr/a-z/A-Z/;
900 my $nch = chr $self->{nc};
901 if ($nch eq $ch or $nch eq $CH) {
902
903 ## Stay in the state.
904 $self->{kwd} .= $nch;
905
906 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
907 $self->{line_prev} = $self->{line};
908 $self->{column_prev} = $self->{column};
909 $self->{column}++;
910 $self->{nc}
911 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
912 } else {
913 $self->{set_nc}->($self);
914 }
915
916 redo A;
917 } else {
918
919 $self->{state} = DATA_STATE;
920 $self->{s_kwd} = '';
921 ## Reconsume.
922 return ({type => CHARACTER_TOKEN,
923 data => '</' . $self->{kwd},
924 line => $self->{line_prev},
925 column => $self->{column_prev} - 1 - length $self->{kwd},
926 });
927 redo A;
928 }
929 } else { # after "<{tag-name}"
930 unless ($is_space->{$self->{nc}} or
931 {
932 0x003E => 1, # >
933 0x002F => 1, # /
934 -1 => 1, # EOF
935 }->{$self->{nc}}) {
936
937 ## Reconsume.
938 $self->{state} = DATA_STATE;
939 $self->{s_kwd} = '';
940 return ({type => CHARACTER_TOKEN,
941 data => '</' . $self->{kwd},
942 line => $self->{line_prev},
943 column => $self->{column_prev} - 1 - length $self->{kwd},
944 });
945 redo A;
946 } else {
947
948 $self->{ct}
949 = {type => END_TAG_TOKEN,
950 tag_name => $self->{last_stag_name},
951 line => $self->{line_prev},
952 column => $self->{column_prev} - 1 - length $self->{kwd}};
953 $self->{state} = TAG_NAME_STATE;
954 ## Reconsume.
955 redo A;
956 }
957 }
958 } elsif ($self->{state} == TAG_NAME_STATE) {
959 if ($is_space->{$self->{nc}}) {
960
961 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
962
963 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
964 $self->{line_prev} = $self->{line};
965 $self->{column_prev} = $self->{column};
966 $self->{column}++;
967 $self->{nc}
968 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
969 } else {
970 $self->{set_nc}->($self);
971 }
972
973 redo A;
974 } elsif ($self->{nc} == 0x003E) { # >
975 if ($self->{ct}->{type} == START_TAG_TOKEN) {
976
977 $self->{last_stag_name} = $self->{ct}->{tag_name};
978 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
979 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
980 #if ($self->{ct}->{attributes}) {
981 # ## NOTE: This should never be reached.
982 # !!! cp (36);
983 # !!! parse-error (type => 'end tag attribute');
984 #} else {
985
986 #}
987 } else {
988 die "$0: $self->{ct}->{type}: Unknown token type";
989 }
990 $self->{state} = DATA_STATE;
991 $self->{s_kwd} = '';
992
993 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
994 $self->{line_prev} = $self->{line};
995 $self->{column_prev} = $self->{column};
996 $self->{column}++;
997 $self->{nc}
998 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
999 } else {
1000 $self->{set_nc}->($self);
1001 }
1002
1003
1004 return ($self->{ct}); # start tag or end tag
1005
1006 redo A;
1007 } elsif (0x0041 <= $self->{nc} and
1008 $self->{nc} <= 0x005A) { # A..Z
1009
1010 $self->{ct}->{tag_name}
1011 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1012 # start tag or end tag
1013 ## Stay in this state
1014
1015 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1016 $self->{line_prev} = $self->{line};
1017 $self->{column_prev} = $self->{column};
1018 $self->{column}++;
1019 $self->{nc}
1020 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1021 } else {
1022 $self->{set_nc}->($self);
1023 }
1024
1025 redo A;
1026 } elsif ($self->{nc} == -1) {
1027 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1028 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1029
1030 $self->{last_stag_name} = $self->{ct}->{tag_name};
1031 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1032 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1033 #if ($self->{ct}->{attributes}) {
1034 # ## NOTE: This state should never be reached.
1035 # !!! cp (40);
1036 # !!! parse-error (type => 'end tag attribute');
1037 #} else {
1038
1039 #}
1040 } else {
1041 die "$0: $self->{ct}->{type}: Unknown token type";
1042 }
1043 $self->{state} = DATA_STATE;
1044 $self->{s_kwd} = '';
1045 # reconsume
1046
1047 return ($self->{ct}); # start tag or end tag
1048
1049 redo A;
1050 } elsif ($self->{nc} == 0x002F) { # /
1051
1052 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1053
1054 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1055 $self->{line_prev} = $self->{line};
1056 $self->{column_prev} = $self->{column};
1057 $self->{column}++;
1058 $self->{nc}
1059 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1060 } else {
1061 $self->{set_nc}->($self);
1062 }
1063
1064 redo A;
1065 } else {
1066
1067 $self->{ct}->{tag_name} .= chr $self->{nc};
1068 # start tag or end tag
1069 ## Stay in the state
1070
1071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072 $self->{line_prev} = $self->{line};
1073 $self->{column_prev} = $self->{column};
1074 $self->{column}++;
1075 $self->{nc}
1076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077 } else {
1078 $self->{set_nc}->($self);
1079 }
1080
1081 redo A;
1082 }
1083 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1084 ## XML5: "Tag attribute name before state".
1085
1086 if ($is_space->{$self->{nc}}) {
1087
1088 ## Stay in the state
1089
1090 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1091 $self->{line_prev} = $self->{line};
1092 $self->{column_prev} = $self->{column};
1093 $self->{column}++;
1094 $self->{nc}
1095 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1096 } else {
1097 $self->{set_nc}->($self);
1098 }
1099
1100 redo A;
1101 } elsif ($self->{nc} == 0x003E) { # >
1102 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1103
1104 $self->{last_stag_name} = $self->{ct}->{tag_name};
1105 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1106 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1107 if ($self->{ct}->{attributes}) {
1108
1109 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1110 } else {
1111
1112 }
1113 } else {
1114 die "$0: $self->{ct}->{type}: Unknown token type";
1115 }
1116 $self->{state} = DATA_STATE;
1117 $self->{s_kwd} = '';
1118
1119 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1120 $self->{line_prev} = $self->{line};
1121 $self->{column_prev} = $self->{column};
1122 $self->{column}++;
1123 $self->{nc}
1124 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1125 } else {
1126 $self->{set_nc}->($self);
1127 }
1128
1129
1130 return ($self->{ct}); # start tag or end tag
1131
1132 redo A;
1133 } elsif (0x0041 <= $self->{nc} and
1134 $self->{nc} <= 0x005A) { # A..Z
1135
1136 $self->{ca}
1137 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1138 value => '',
1139 line => $self->{line}, column => $self->{column}};
1140 $self->{state} = ATTRIBUTE_NAME_STATE;
1141
1142 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1143 $self->{line_prev} = $self->{line};
1144 $self->{column_prev} = $self->{column};
1145 $self->{column}++;
1146 $self->{nc}
1147 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1148 } else {
1149 $self->{set_nc}->($self);
1150 }
1151
1152 redo A;
1153 } elsif ($self->{nc} == 0x002F) { # /
1154
1155 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1156
1157 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1158 $self->{line_prev} = $self->{line};
1159 $self->{column_prev} = $self->{column};
1160 $self->{column}++;
1161 $self->{nc}
1162 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1163 } else {
1164 $self->{set_nc}->($self);
1165 }
1166
1167 redo A;
1168 } elsif ($self->{nc} == -1) {
1169 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1170 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1171
1172 $self->{last_stag_name} = $self->{ct}->{tag_name};
1173 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1174 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1175 if ($self->{ct}->{attributes}) {
1176
1177 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1178 } else {
1179
1180 }
1181 } else {
1182 die "$0: $self->{ct}->{type}: Unknown token type";
1183 }
1184 $self->{state} = DATA_STATE;
1185 $self->{s_kwd} = '';
1186 # reconsume
1187
1188 return ($self->{ct}); # start tag or end tag
1189
1190 redo A;
1191 } else {
1192 if ({
1193 0x0022 => 1, # "
1194 0x0027 => 1, # '
1195 0x003D => 1, # =
1196 }->{$self->{nc}}) {
1197
1198 ## XML5: Not a parse error.
1199 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1200 } else {
1201
1202 ## XML5: ":" raises a parse error and is ignored.
1203 }
1204 $self->{ca}
1205 = {name => chr ($self->{nc}),
1206 value => '',
1207 line => $self->{line}, column => $self->{column}};
1208 $self->{state} = ATTRIBUTE_NAME_STATE;
1209
1210 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1211 $self->{line_prev} = $self->{line};
1212 $self->{column_prev} = $self->{column};
1213 $self->{column}++;
1214 $self->{nc}
1215 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1216 } else {
1217 $self->{set_nc}->($self);
1218 }
1219
1220 redo A;
1221 }
1222 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1223 ## XML5: "Tag attribute name state".
1224
1225 my $before_leave = sub {
1226 if (exists $self->{ct}->{attributes} # start tag or end tag
1227 ->{$self->{ca}->{name}}) { # MUST
1228
1229 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1230 ## Discard $self->{ca} # MUST
1231 } else {
1232
1233 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1234 = $self->{ca};
1235 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1236 }
1237 }; # $before_leave
1238
1239 if ($is_space->{$self->{nc}}) {
1240
1241 $before_leave->();
1242 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1243
1244 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1245 $self->{line_prev} = $self->{line};
1246 $self->{column_prev} = $self->{column};
1247 $self->{column}++;
1248 $self->{nc}
1249 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1250 } else {
1251 $self->{set_nc}->($self);
1252 }
1253
1254 redo A;
1255 } elsif ($self->{nc} == 0x003D) { # =
1256
1257 $before_leave->();
1258 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1259
1260 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1261 $self->{line_prev} = $self->{line};
1262 $self->{column_prev} = $self->{column};
1263 $self->{column}++;
1264 $self->{nc}
1265 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1266 } else {
1267 $self->{set_nc}->($self);
1268 }
1269
1270 redo A;
1271 } elsif ($self->{nc} == 0x003E) { # >
1272 if ($self->{is_xml}) {
1273
1274 ## XML5: Not a parse error.
1275 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1276 } else {
1277
1278 }
1279
1280 $before_leave->();
1281 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1282
1283 $self->{last_stag_name} = $self->{ct}->{tag_name};
1284 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1285
1286 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1287 if ($self->{ct}->{attributes}) {
1288 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1289 }
1290 } else {
1291 die "$0: $self->{ct}->{type}: Unknown token type";
1292 }
1293 $self->{state} = DATA_STATE;
1294 $self->{s_kwd} = '';
1295
1296 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1297 $self->{line_prev} = $self->{line};
1298 $self->{column_prev} = $self->{column};
1299 $self->{column}++;
1300 $self->{nc}
1301 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1302 } else {
1303 $self->{set_nc}->($self);
1304 }
1305
1306
1307 return ($self->{ct}); # start tag or end tag
1308
1309 redo A;
1310 } elsif (0x0041 <= $self->{nc} and
1311 $self->{nc} <= 0x005A) { # A..Z
1312
1313 $self->{ca}->{name}
1314 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1315 ## Stay in the state
1316
1317 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1318 $self->{line_prev} = $self->{line};
1319 $self->{column_prev} = $self->{column};
1320 $self->{column}++;
1321 $self->{nc}
1322 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1323 } else {
1324 $self->{set_nc}->($self);
1325 }
1326
1327 redo A;
1328 } elsif ($self->{nc} == 0x002F) { # /
1329 if ($self->{is_xml}) {
1330
1331 ## XML5: Not a parse error.
1332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1333 } else {
1334
1335 }
1336
1337 $before_leave->();
1338 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1339
1340 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1341 $self->{line_prev} = $self->{line};
1342 $self->{column_prev} = $self->{column};
1343 $self->{column}++;
1344 $self->{nc}
1345 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1346 } else {
1347 $self->{set_nc}->($self);
1348 }
1349
1350 redo A;
1351 } elsif ($self->{nc} == -1) {
1352 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1353 $before_leave->();
1354 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1355
1356 $self->{last_stag_name} = $self->{ct}->{tag_name};
1357 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1358 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1359 if ($self->{ct}->{attributes}) {
1360
1361 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1362 } else {
1363 ## NOTE: This state should never be reached.
1364
1365 }
1366 } else {
1367 die "$0: $self->{ct}->{type}: Unknown token type";
1368 }
1369 $self->{state} = DATA_STATE;
1370 $self->{s_kwd} = '';
1371 # reconsume
1372
1373 return ($self->{ct}); # start tag or end tag
1374
1375 redo A;
1376 } else {
1377 if ($self->{nc} == 0x0022 or # "
1378 $self->{nc} == 0x0027) { # '
1379
1380 ## XML5: Not a parse error.
1381 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1382 } else {
1383
1384 }
1385 $self->{ca}->{name} .= chr ($self->{nc});
1386 ## Stay in the state
1387
1388 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1389 $self->{line_prev} = $self->{line};
1390 $self->{column_prev} = $self->{column};
1391 $self->{column}++;
1392 $self->{nc}
1393 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1394 } else {
1395 $self->{set_nc}->($self);
1396 }
1397
1398 redo A;
1399 }
1400 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1401 ## XML5: "Tag attribute name after state".
1402
1403 if ($is_space->{$self->{nc}}) {
1404
1405 ## Stay in the state
1406
1407 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1408 $self->{line_prev} = $self->{line};
1409 $self->{column_prev} = $self->{column};
1410 $self->{column}++;
1411 $self->{nc}
1412 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1413 } else {
1414 $self->{set_nc}->($self);
1415 }
1416
1417 redo A;
1418 } elsif ($self->{nc} == 0x003D) { # =
1419
1420 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1421
1422 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1423 $self->{line_prev} = $self->{line};
1424 $self->{column_prev} = $self->{column};
1425 $self->{column}++;
1426 $self->{nc}
1427 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1428 } else {
1429 $self->{set_nc}->($self);
1430 }
1431
1432 redo A;
1433 } elsif ($self->{nc} == 0x003E) { # >
1434 if ($self->{is_xml}) {
1435
1436 ## XML5: Not a parse error.
1437 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1438 } else {
1439
1440 }
1441
1442 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1443
1444 $self->{last_stag_name} = $self->{ct}->{tag_name};
1445 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1446 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1447 if ($self->{ct}->{attributes}) {
1448
1449 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1450 } else {
1451 ## NOTE: This state should never be reached.
1452
1453 }
1454 } else {
1455 die "$0: $self->{ct}->{type}: Unknown token type";
1456 }
1457 $self->{state} = DATA_STATE;
1458 $self->{s_kwd} = '';
1459
1460 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1461 $self->{line_prev} = $self->{line};
1462 $self->{column_prev} = $self->{column};
1463 $self->{column}++;
1464 $self->{nc}
1465 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1466 } else {
1467 $self->{set_nc}->($self);
1468 }
1469
1470
1471 return ($self->{ct}); # start tag or end tag
1472
1473 redo A;
1474 } elsif (0x0041 <= $self->{nc} and
1475 $self->{nc} <= 0x005A) { # A..Z
1476
1477 $self->{ca}
1478 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1479 value => '',
1480 line => $self->{line}, column => $self->{column}};
1481 $self->{state} = ATTRIBUTE_NAME_STATE;
1482
1483 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1484 $self->{line_prev} = $self->{line};
1485 $self->{column_prev} = $self->{column};
1486 $self->{column}++;
1487 $self->{nc}
1488 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1489 } else {
1490 $self->{set_nc}->($self);
1491 }
1492
1493 redo A;
1494 } elsif ($self->{nc} == 0x002F) { # /
1495 if ($self->{is_xml}) {
1496
1497 ## XML5: Not a parse error.
1498 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1499 } else {
1500
1501 }
1502
1503 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1504
1505 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1506 $self->{line_prev} = $self->{line};
1507 $self->{column_prev} = $self->{column};
1508 $self->{column}++;
1509 $self->{nc}
1510 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1511 } else {
1512 $self->{set_nc}->($self);
1513 }
1514
1515 redo A;
1516 } elsif ($self->{nc} == -1) {
1517 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1518 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1519
1520 $self->{last_stag_name} = $self->{ct}->{tag_name};
1521 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1522 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1523 if ($self->{ct}->{attributes}) {
1524
1525 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1526 } else {
1527 ## NOTE: This state should never be reached.
1528
1529 }
1530 } else {
1531 die "$0: $self->{ct}->{type}: Unknown token type";
1532 }
1533 $self->{s_kwd} = '';
1534 $self->{state} = DATA_STATE;
1535 # reconsume
1536
1537 return ($self->{ct}); # start tag or end tag
1538
1539 redo A;
1540 } else {
1541 if ($self->{is_xml}) {
1542
1543 ## XML5: Not a parse error.
1544 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1545 } else {
1546
1547 }
1548
1549 if ($self->{nc} == 0x0022 or # "
1550 $self->{nc} == 0x0027) { # '
1551
1552 ## XML5: Not a parse error.
1553 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1554 } else {
1555
1556 }
1557 $self->{ca}
1558 = {name => chr ($self->{nc}),
1559 value => '',
1560 line => $self->{line}, column => $self->{column}};
1561 $self->{state} = ATTRIBUTE_NAME_STATE;
1562
1563 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1564 $self->{line_prev} = $self->{line};
1565 $self->{column_prev} = $self->{column};
1566 $self->{column}++;
1567 $self->{nc}
1568 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1569 } else {
1570 $self->{set_nc}->($self);
1571 }
1572
1573 redo A;
1574 }
1575 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1576 ## XML5: "Tag attribute value before state".
1577
1578 if ($is_space->{$self->{nc}}) {
1579
1580 ## Stay in the state
1581
1582 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1583 $self->{line_prev} = $self->{line};
1584 $self->{column_prev} = $self->{column};
1585 $self->{column}++;
1586 $self->{nc}
1587 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1588 } else {
1589 $self->{set_nc}->($self);
1590 }
1591
1592 redo A;
1593 } elsif ($self->{nc} == 0x0022) { # "
1594
1595 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1596
1597 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1598 $self->{line_prev} = $self->{line};
1599 $self->{column_prev} = $self->{column};
1600 $self->{column}++;
1601 $self->{nc}
1602 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1603 } else {
1604 $self->{set_nc}->($self);
1605 }
1606
1607 redo A;
1608 } elsif ($self->{nc} == 0x0026) { # &
1609
1610 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1611 ## reconsume
1612 redo A;
1613 } elsif ($self->{nc} == 0x0027) { # '
1614
1615 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1616
1617 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1618 $self->{line_prev} = $self->{line};
1619 $self->{column_prev} = $self->{column};
1620 $self->{column}++;
1621 $self->{nc}
1622 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1623 } else {
1624 $self->{set_nc}->($self);
1625 }
1626
1627 redo A;
1628 } elsif ($self->{nc} == 0x003E) { # >
1629 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1630 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1631
1632 $self->{last_stag_name} = $self->{ct}->{tag_name};
1633 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1634 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1635 if ($self->{ct}->{attributes}) {
1636
1637 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1638 } else {
1639 ## NOTE: This state should never be reached.
1640
1641 }
1642 } else {
1643 die "$0: $self->{ct}->{type}: Unknown token type";
1644 }
1645 $self->{state} = DATA_STATE;
1646 $self->{s_kwd} = '';
1647
1648 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1649 $self->{line_prev} = $self->{line};
1650 $self->{column_prev} = $self->{column};
1651 $self->{column}++;
1652 $self->{nc}
1653 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1654 } else {
1655 $self->{set_nc}->($self);
1656 }
1657
1658
1659 return ($self->{ct}); # start tag or end tag
1660
1661 redo A;
1662 } elsif ($self->{nc} == -1) {
1663 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1664 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1665
1666 $self->{last_stag_name} = $self->{ct}->{tag_name};
1667 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1668 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1669 if ($self->{ct}->{attributes}) {
1670
1671 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1672 } else {
1673 ## NOTE: This state should never be reached.
1674
1675 }
1676 } else {
1677 die "$0: $self->{ct}->{type}: Unknown token type";
1678 }
1679 $self->{state} = DATA_STATE;
1680 $self->{s_kwd} = '';
1681 ## reconsume
1682
1683 return ($self->{ct}); # start tag or end tag
1684
1685 redo A;
1686 } else {
1687 if ($self->{nc} == 0x003D) { # =
1688
1689 ## XML5: Not a parse error.
1690 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1691 } elsif ($self->{is_xml}) {
1692
1693 ## XML5: No parse error.
1694 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1695 } else {
1696
1697 }
1698 $self->{ca}->{value} .= chr ($self->{nc});
1699 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1700
1701 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1702 $self->{line_prev} = $self->{line};
1703 $self->{column_prev} = $self->{column};
1704 $self->{column}++;
1705 $self->{nc}
1706 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1707 } else {
1708 $self->{set_nc}->($self);
1709 }
1710
1711 redo A;
1712 }
1713 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1714 ## XML5: "Tag attribute value double quoted state".
1715
1716 if ($self->{nc} == 0x0022) { # "
1717
1718 ## XML5: "Tag attribute name before state".
1719 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1720
1721 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1722 $self->{line_prev} = $self->{line};
1723 $self->{column_prev} = $self->{column};
1724 $self->{column}++;
1725 $self->{nc}
1726 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1727 } else {
1728 $self->{set_nc}->($self);
1729 }
1730
1731 redo A;
1732 } elsif ($self->{nc} == 0x0026) { # &
1733
1734 ## XML5: Not defined yet.
1735
1736 ## NOTE: In the spec, the tokenizer is switched to the
1737 ## "entity in attribute value state". In this implementation, the
1738 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1739 ## implementation of the "consume a character reference" algorithm.
1740 $self->{prev_state} = $self->{state};
1741 $self->{entity_add} = 0x0022; # "
1742 $self->{state} = ENTITY_STATE;
1743
1744 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1745 $self->{line_prev} = $self->{line};
1746 $self->{column_prev} = $self->{column};
1747 $self->{column}++;
1748 $self->{nc}
1749 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1750 } else {
1751 $self->{set_nc}->($self);
1752 }
1753
1754 redo A;
1755 } elsif ($self->{nc} == -1) {
1756 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1757 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1758
1759 $self->{last_stag_name} = $self->{ct}->{tag_name};
1760 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1761 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1762 if ($self->{ct}->{attributes}) {
1763
1764 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1765 } else {
1766 ## NOTE: This state should never be reached.
1767
1768 }
1769 } else {
1770 die "$0: $self->{ct}->{type}: Unknown token type";
1771 }
1772 $self->{state} = DATA_STATE;
1773 $self->{s_kwd} = '';
1774 ## reconsume
1775
1776 return ($self->{ct}); # start tag or end tag
1777
1778 redo A;
1779 } else {
1780 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1781
1782 ## XML5: Not a parse error.
1783 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1784 } else {
1785
1786 }
1787 $self->{ca}->{value} .= chr ($self->{nc});
1788 $self->{read_until}->($self->{ca}->{value},
1789 q["&<],
1790 length $self->{ca}->{value});
1791
1792 ## Stay in the state
1793
1794 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1795 $self->{line_prev} = $self->{line};
1796 $self->{column_prev} = $self->{column};
1797 $self->{column}++;
1798 $self->{nc}
1799 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1800 } else {
1801 $self->{set_nc}->($self);
1802 }
1803
1804 redo A;
1805 }
1806 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1807 ## XML5: "Tag attribute value single quoted state".
1808
1809 if ($self->{nc} == 0x0027) { # '
1810
1811 ## XML5: "Before attribute name state" (sic).
1812 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1813
1814 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1815 $self->{line_prev} = $self->{line};
1816 $self->{column_prev} = $self->{column};
1817 $self->{column}++;
1818 $self->{nc}
1819 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1820 } else {
1821 $self->{set_nc}->($self);
1822 }
1823
1824 redo A;
1825 } elsif ($self->{nc} == 0x0026) { # &
1826
1827 ## XML5: Not defined yet.
1828
1829 ## NOTE: In the spec, the tokenizer is switched to the
1830 ## "entity in attribute value state". In this implementation, the
1831 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1832 ## implementation of the "consume a character reference" algorithm.
1833 $self->{entity_add} = 0x0027; # '
1834 $self->{prev_state} = $self->{state};
1835 $self->{state} = ENTITY_STATE;
1836
1837 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1838 $self->{line_prev} = $self->{line};
1839 $self->{column_prev} = $self->{column};
1840 $self->{column}++;
1841 $self->{nc}
1842 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1843 } else {
1844 $self->{set_nc}->($self);
1845 }
1846
1847 redo A;
1848 } elsif ($self->{nc} == -1) {
1849 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1850 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1851
1852 $self->{last_stag_name} = $self->{ct}->{tag_name};
1853 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1854 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1855 if ($self->{ct}->{attributes}) {
1856
1857 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1858 } else {
1859 ## NOTE: This state should never be reached.
1860
1861 }
1862 } else {
1863 die "$0: $self->{ct}->{type}: Unknown token type";
1864 }
1865 $self->{state} = DATA_STATE;
1866 $self->{s_kwd} = '';
1867 ## reconsume
1868
1869 return ($self->{ct}); # start tag or end tag
1870
1871 redo A;
1872 } else {
1873 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1874
1875 ## XML5: Not a parse error.
1876 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1877 } else {
1878
1879 }
1880 $self->{ca}->{value} .= chr ($self->{nc});
1881 $self->{read_until}->($self->{ca}->{value},
1882 q['&<],
1883 length $self->{ca}->{value});
1884
1885 ## Stay in the state
1886
1887 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1888 $self->{line_prev} = $self->{line};
1889 $self->{column_prev} = $self->{column};
1890 $self->{column}++;
1891 $self->{nc}
1892 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1893 } else {
1894 $self->{set_nc}->($self);
1895 }
1896
1897 redo A;
1898 }
1899 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1900 ## XML5: "Tag attribute value unquoted state".
1901
1902 if ($is_space->{$self->{nc}}) {
1903
1904 ## XML5: "Tag attribute name before state".
1905 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1906
1907 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1908 $self->{line_prev} = $self->{line};
1909 $self->{column_prev} = $self->{column};
1910 $self->{column}++;
1911 $self->{nc}
1912 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1913 } else {
1914 $self->{set_nc}->($self);
1915 }
1916
1917 redo A;
1918 } elsif ($self->{nc} == 0x0026) { # &
1919
1920
1921 ## XML5: Not defined yet.
1922
1923 ## NOTE: In the spec, the tokenizer is switched to the
1924 ## "entity in attribute value state". In this implementation, the
1925 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1926 ## implementation of the "consume a character reference" algorithm.
1927 $self->{entity_add} = -1;
1928 $self->{prev_state} = $self->{state};
1929 $self->{state} = ENTITY_STATE;
1930
1931 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1932 $self->{line_prev} = $self->{line};
1933 $self->{column_prev} = $self->{column};
1934 $self->{column}++;
1935 $self->{nc}
1936 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1937 } else {
1938 $self->{set_nc}->($self);
1939 }
1940
1941 redo A;
1942 } elsif ($self->{nc} == 0x003E) { # >
1943 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1944
1945 $self->{last_stag_name} = $self->{ct}->{tag_name};
1946 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1947 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1948 if ($self->{ct}->{attributes}) {
1949
1950 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1951 } else {
1952 ## NOTE: This state should never be reached.
1953
1954 }
1955 } else {
1956 die "$0: $self->{ct}->{type}: Unknown token type";
1957 }
1958 $self->{state} = DATA_STATE;
1959 $self->{s_kwd} = '';
1960
1961 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1962 $self->{line_prev} = $self->{line};
1963 $self->{column_prev} = $self->{column};
1964 $self->{column}++;
1965 $self->{nc}
1966 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1967 } else {
1968 $self->{set_nc}->($self);
1969 }
1970
1971
1972 return ($self->{ct}); # start tag or end tag
1973
1974 redo A;
1975 } elsif ($self->{nc} == -1) {
1976 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1977 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1978
1979 $self->{last_stag_name} = $self->{ct}->{tag_name};
1980 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1981 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1982 if ($self->{ct}->{attributes}) {
1983
1984 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1985 } else {
1986 ## NOTE: This state should never be reached.
1987
1988 }
1989 } else {
1990 die "$0: $self->{ct}->{type}: Unknown token type";
1991 }
1992 $self->{state} = DATA_STATE;
1993 $self->{s_kwd} = '';
1994 ## reconsume
1995
1996 return ($self->{ct}); # start tag or end tag
1997
1998 redo A;
1999 } else {
2000 if ({
2001 0x0022 => 1, # "
2002 0x0027 => 1, # '
2003 0x003D => 1, # =
2004 }->{$self->{nc}}) {
2005
2006 ## XML5: Not a parse error.
2007 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2008 } else {
2009
2010 }
2011 $self->{ca}->{value} .= chr ($self->{nc});
2012 $self->{read_until}->($self->{ca}->{value},
2013 q["'=& >],
2014 length $self->{ca}->{value});
2015
2016 ## Stay in the state
2017
2018 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2019 $self->{line_prev} = $self->{line};
2020 $self->{column_prev} = $self->{column};
2021 $self->{column}++;
2022 $self->{nc}
2023 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2024 } else {
2025 $self->{set_nc}->($self);
2026 }
2027
2028 redo A;
2029 }
2030 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2031 if ($is_space->{$self->{nc}}) {
2032
2033 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2034
2035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2036 $self->{line_prev} = $self->{line};
2037 $self->{column_prev} = $self->{column};
2038 $self->{column}++;
2039 $self->{nc}
2040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2041 } else {
2042 $self->{set_nc}->($self);
2043 }
2044
2045 redo A;
2046 } elsif ($self->{nc} == 0x003E) { # >
2047 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2048
2049 $self->{last_stag_name} = $self->{ct}->{tag_name};
2050 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2051 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2052 if ($self->{ct}->{attributes}) {
2053
2054 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2055 } else {
2056 ## NOTE: This state should never be reached.
2057
2058 }
2059 } else {
2060 die "$0: $self->{ct}->{type}: Unknown token type";
2061 }
2062 $self->{state} = DATA_STATE;
2063 $self->{s_kwd} = '';
2064
2065 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2066 $self->{line_prev} = $self->{line};
2067 $self->{column_prev} = $self->{column};
2068 $self->{column}++;
2069 $self->{nc}
2070 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2071 } else {
2072 $self->{set_nc}->($self);
2073 }
2074
2075
2076 return ($self->{ct}); # start tag or end tag
2077
2078 redo A;
2079 } elsif ($self->{nc} == 0x002F) { # /
2080
2081 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2082
2083 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2084 $self->{line_prev} = $self->{line};
2085 $self->{column_prev} = $self->{column};
2086 $self->{column}++;
2087 $self->{nc}
2088 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2089 } else {
2090 $self->{set_nc}->($self);
2091 }
2092
2093 redo A;
2094 } elsif ($self->{nc} == -1) {
2095 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2096 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2097
2098 $self->{last_stag_name} = $self->{ct}->{tag_name};
2099 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2100 if ($self->{ct}->{attributes}) {
2101
2102 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2103 } else {
2104 ## NOTE: This state should never be reached.
2105
2106 }
2107 } else {
2108 die "$0: $self->{ct}->{type}: Unknown token type";
2109 }
2110 $self->{state} = DATA_STATE;
2111 $self->{s_kwd} = '';
2112 ## Reconsume.
2113 return ($self->{ct}); # start tag or end tag
2114 redo A;
2115 } else {
2116
2117 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2118 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2119 ## reconsume
2120 redo A;
2121 }
2122 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2123 ## XML5: "Empty tag state".
2124
2125 if ($self->{nc} == 0x003E) { # >
2126 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2127
2128 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2129 ## TODO: Different type than slash in start tag
2130 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2131 if ($self->{ct}->{attributes}) {
2132
2133 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2134 } else {
2135
2136 }
2137 ## TODO: Test |<title></title/>|
2138 } else {
2139
2140 $self->{self_closing} = 1;
2141 }
2142
2143 $self->{state} = DATA_STATE;
2144 $self->{s_kwd} = '';
2145
2146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2147 $self->{line_prev} = $self->{line};
2148 $self->{column_prev} = $self->{column};
2149 $self->{column}++;
2150 $self->{nc}
2151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2152 } else {
2153 $self->{set_nc}->($self);
2154 }
2155
2156
2157 return ($self->{ct}); # start tag or end tag
2158
2159 redo A;
2160 } elsif ($self->{nc} == -1) {
2161 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2162 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2163
2164 $self->{last_stag_name} = $self->{ct}->{tag_name};
2165 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2166 if ($self->{ct}->{attributes}) {
2167
2168 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2169 } else {
2170 ## NOTE: This state should never be reached.
2171
2172 }
2173 } else {
2174 die "$0: $self->{ct}->{type}: Unknown token type";
2175 }
2176 ## XML5: "Tag attribute name before state".
2177 $self->{state} = DATA_STATE;
2178 $self->{s_kwd} = '';
2179 ## Reconsume.
2180 return ($self->{ct}); # start tag or end tag
2181 redo A;
2182 } else {
2183
2184 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2185 ## TODO: This error type is wrong.
2186 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2187 ## Reconsume.
2188 redo A;
2189 }
2190 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2191 ## NOTE: Unlike spec's "bogus comment state", this implementation
2192 ## consumes characters one-by-one basis.
2193
2194 if ($self->{nc} == 0x003E) { # >
2195 if ($self->{in_subset}) {
2196
2197 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2198 } else {
2199
2200 $self->{state} = DATA_STATE;
2201 $self->{s_kwd} = '';
2202 }
2203
2204 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2205 $self->{line_prev} = $self->{line};
2206 $self->{column_prev} = $self->{column};
2207 $self->{column}++;
2208 $self->{nc}
2209 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2210 } else {
2211 $self->{set_nc}->($self);
2212 }
2213
2214
2215 return ($self->{ct}); # comment
2216 redo A;
2217 } elsif ($self->{nc} == -1) {
2218 if ($self->{in_subset}) {
2219
2220 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2221 } else {
2222
2223 $self->{state} = DATA_STATE;
2224 $self->{s_kwd} = '';
2225 }
2226 ## reconsume
2227
2228 return ($self->{ct}); # comment
2229 redo A;
2230 } else {
2231
2232 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2233 $self->{read_until}->($self->{ct}->{data},
2234 q[>],
2235 length $self->{ct}->{data});
2236
2237 ## Stay in the state.
2238
2239 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2240 $self->{line_prev} = $self->{line};
2241 $self->{column_prev} = $self->{column};
2242 $self->{column}++;
2243 $self->{nc}
2244 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2245 } else {
2246 $self->{set_nc}->($self);
2247 }
2248
2249 redo A;
2250 }
2251 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2252 ## XML5: "Markup declaration state" and "DOCTYPE markup
2253 ## declaration state".
2254
2255 if ($self->{nc} == 0x002D) { # -
2256
2257 $self->{state} = MD_HYPHEN_STATE;
2258
2259 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2260 $self->{line_prev} = $self->{line};
2261 $self->{column_prev} = $self->{column};
2262 $self->{column}++;
2263 $self->{nc}
2264 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2265 } else {
2266 $self->{set_nc}->($self);
2267 }
2268
2269 redo A;
2270 } elsif ($self->{nc} == 0x0044 or # D
2271 $self->{nc} == 0x0064) { # d
2272 ## ASCII case-insensitive.
2273
2274 $self->{state} = MD_DOCTYPE_STATE;
2275 $self->{kwd} = chr $self->{nc};
2276
2277 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2278 $self->{line_prev} = $self->{line};
2279 $self->{column_prev} = $self->{column};
2280 $self->{column}++;
2281 $self->{nc}
2282 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2283 } else {
2284 $self->{set_nc}->($self);
2285 }
2286
2287 redo A;
2288 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2289 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2290 $self->{is_xml}) and
2291 $self->{nc} == 0x005B) { # [
2292
2293 $self->{state} = MD_CDATA_STATE;
2294 $self->{kwd} = '[';
2295
2296 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2297 $self->{line_prev} = $self->{line};
2298 $self->{column_prev} = $self->{column};
2299 $self->{column}++;
2300 $self->{nc}
2301 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2302 } else {
2303 $self->{set_nc}->($self);
2304 }
2305
2306 redo A;
2307 } else {
2308
2309 }
2310
2311 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2312 line => $self->{line_prev},
2313 column => $self->{column_prev} - 1);
2314 ## Reconsume.
2315 $self->{state} = BOGUS_COMMENT_STATE;
2316 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2317 line => $self->{line_prev},
2318 column => $self->{column_prev} - 1,
2319 };
2320 redo A;
2321 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2322 if ($self->{nc} == 0x002D) { # -
2323
2324 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2325 line => $self->{line_prev},
2326 column => $self->{column_prev} - 2,
2327 };
2328 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2329
2330 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2331 $self->{line_prev} = $self->{line};
2332 $self->{column_prev} = $self->{column};
2333 $self->{column}++;
2334 $self->{nc}
2335 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2336 } else {
2337 $self->{set_nc}->($self);
2338 }
2339
2340 redo A;
2341 } else {
2342
2343 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2344 line => $self->{line_prev},
2345 column => $self->{column_prev} - 2);
2346 $self->{state} = BOGUS_COMMENT_STATE;
2347 ## Reconsume.
2348 $self->{ct} = {type => COMMENT_TOKEN,
2349 data => '-',
2350 line => $self->{line_prev},
2351 column => $self->{column_prev} - 2,
2352 };
2353 redo A;
2354 }
2355 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2356 ## ASCII case-insensitive.
2357 if ($self->{nc} == [
2358 undef,
2359 0x004F, # O
2360 0x0043, # C
2361 0x0054, # T
2362 0x0059, # Y
2363 0x0050, # P
2364 ]->[length $self->{kwd}] or
2365 $self->{nc} == [
2366 undef,
2367 0x006F, # o
2368 0x0063, # c
2369 0x0074, # t
2370 0x0079, # y
2371 0x0070, # p
2372 ]->[length $self->{kwd}]) {
2373
2374 ## Stay in the state.
2375 $self->{kwd} .= chr $self->{nc};
2376
2377 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2378 $self->{line_prev} = $self->{line};
2379 $self->{column_prev} = $self->{column};
2380 $self->{column}++;
2381 $self->{nc}
2382 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2383 } else {
2384 $self->{set_nc}->($self);
2385 }
2386
2387 redo A;
2388 } elsif ((length $self->{kwd}) == 6 and
2389 ($self->{nc} == 0x0045 or # E
2390 $self->{nc} == 0x0065)) { # e
2391 if ($self->{is_xml} and
2392 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2393
2394 ## XML5: case-sensitive.
2395 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2396 text => 'DOCTYPE',
2397 line => $self->{line_prev},
2398 column => $self->{column_prev} - 5);
2399 } else {
2400
2401 }
2402 $self->{state} = DOCTYPE_STATE;
2403 $self->{ct} = {type => DOCTYPE_TOKEN,
2404 quirks => 1,
2405 line => $self->{line_prev},
2406 column => $self->{column_prev} - 7,
2407 };
2408
2409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2410 $self->{line_prev} = $self->{line};
2411 $self->{column_prev} = $self->{column};
2412 $self->{column}++;
2413 $self->{nc}
2414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2415 } else {
2416 $self->{set_nc}->($self);
2417 }
2418
2419 redo A;
2420 } else {
2421
2422 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2423 line => $self->{line_prev},
2424 column => $self->{column_prev} - 1 - length $self->{kwd});
2425 $self->{state} = BOGUS_COMMENT_STATE;
2426 ## Reconsume.
2427 $self->{ct} = {type => COMMENT_TOKEN,
2428 data => $self->{kwd},
2429 line => $self->{line_prev},
2430 column => $self->{column_prev} - 1 - length $self->{kwd},
2431 };
2432 redo A;
2433 }
2434 } elsif ($self->{state} == MD_CDATA_STATE) {
2435 if ($self->{nc} == {
2436 '[' => 0x0043, # C
2437 '[C' => 0x0044, # D
2438 '[CD' => 0x0041, # A
2439 '[CDA' => 0x0054, # T
2440 '[CDAT' => 0x0041, # A
2441 }->{$self->{kwd}}) {
2442
2443 ## Stay in the state.
2444 $self->{kwd} .= chr $self->{nc};
2445
2446 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2447 $self->{line_prev} = $self->{line};
2448 $self->{column_prev} = $self->{column};
2449 $self->{column}++;
2450 $self->{nc}
2451 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2452 } else {
2453 $self->{set_nc}->($self);
2454 }
2455
2456 redo A;
2457 } elsif ($self->{kwd} eq '[CDATA' and
2458 $self->{nc} == 0x005B) { # [
2459 if ($self->{is_xml} and
2460 not $self->{tainted} and
2461 @{$self->{open_elements} or []} == 0) {
2462
2463 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2464 line => $self->{line_prev},
2465 column => $self->{column_prev} - 7);
2466 $self->{tainted} = 1;
2467 } else {
2468
2469 }
2470
2471 $self->{ct} = {type => CHARACTER_TOKEN,
2472 data => '',
2473 line => $self->{line_prev},
2474 column => $self->{column_prev} - 7};
2475 $self->{state} = CDATA_SECTION_STATE;
2476
2477 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2478 $self->{line_prev} = $self->{line};
2479 $self->{column_prev} = $self->{column};
2480 $self->{column}++;
2481 $self->{nc}
2482 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2483 } else {
2484 $self->{set_nc}->($self);
2485 }
2486
2487 redo A;
2488 } else {
2489
2490 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2491 line => $self->{line_prev},
2492 column => $self->{column_prev} - 1 - length $self->{kwd});
2493 $self->{state} = BOGUS_COMMENT_STATE;
2494 ## Reconsume.
2495 $self->{ct} = {type => COMMENT_TOKEN,
2496 data => $self->{kwd},
2497 line => $self->{line_prev},
2498 column => $self->{column_prev} - 1 - length $self->{kwd},
2499 };
2500 redo A;
2501 }
2502 } elsif ($self->{state} == COMMENT_START_STATE) {
2503 if ($self->{nc} == 0x002D) { # -
2504
2505 $self->{state} = COMMENT_START_DASH_STATE;
2506
2507 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2508 $self->{line_prev} = $self->{line};
2509 $self->{column_prev} = $self->{column};
2510 $self->{column}++;
2511 $self->{nc}
2512 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2513 } else {
2514 $self->{set_nc}->($self);
2515 }
2516
2517 redo A;
2518 } elsif ($self->{nc} == 0x003E) { # >
2519 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2520 if ($self->{in_subset}) {
2521
2522 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2523 } else {
2524
2525 $self->{state} = DATA_STATE;
2526 $self->{s_kwd} = '';
2527 }
2528
2529 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2530 $self->{line_prev} = $self->{line};
2531 $self->{column_prev} = $self->{column};
2532 $self->{column}++;
2533 $self->{nc}
2534 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2535 } else {
2536 $self->{set_nc}->($self);
2537 }
2538
2539
2540 return ($self->{ct}); # comment
2541
2542 redo A;
2543 } elsif ($self->{nc} == -1) {
2544 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2545 if ($self->{in_subset}) {
2546
2547 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2548 } else {
2549
2550 $self->{state} = DATA_STATE;
2551 $self->{s_kwd} = '';
2552 }
2553 ## reconsume
2554
2555 return ($self->{ct}); # comment
2556
2557 redo A;
2558 } else {
2559
2560 $self->{ct}->{data} # comment
2561 .= chr ($self->{nc});
2562 $self->{state} = COMMENT_STATE;
2563
2564 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2565 $self->{line_prev} = $self->{line};
2566 $self->{column_prev} = $self->{column};
2567 $self->{column}++;
2568 $self->{nc}
2569 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2570 } else {
2571 $self->{set_nc}->($self);
2572 }
2573
2574 redo A;
2575 }
2576 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2577 if ($self->{nc} == 0x002D) { # -
2578
2579 $self->{state} = COMMENT_END_STATE;
2580
2581 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2582 $self->{line_prev} = $self->{line};
2583 $self->{column_prev} = $self->{column};
2584 $self->{column}++;
2585 $self->{nc}
2586 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2587 } else {
2588 $self->{set_nc}->($self);
2589 }
2590
2591 redo A;
2592 } elsif ($self->{nc} == 0x003E) { # >
2593 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2594 if ($self->{in_subset}) {
2595
2596 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2597 } else {
2598
2599 $self->{state} = DATA_STATE;
2600 $self->{s_kwd} = '';
2601 }
2602
2603 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2604 $self->{line_prev} = $self->{line};
2605 $self->{column_prev} = $self->{column};
2606 $self->{column}++;
2607 $self->{nc}
2608 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2609 } else {
2610 $self->{set_nc}->($self);
2611 }
2612
2613
2614 return ($self->{ct}); # comment
2615
2616 redo A;
2617 } elsif ($self->{nc} == -1) {
2618 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2619 if ($self->{in_subset}) {
2620
2621 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2622 } else {
2623
2624 $self->{state} = DATA_STATE;
2625 $self->{s_kwd} = '';
2626 }
2627 ## reconsume
2628
2629 return ($self->{ct}); # comment
2630
2631 redo A;
2632 } else {
2633
2634 $self->{ct}->{data} # comment
2635 .= '-' . chr ($self->{nc});
2636 $self->{state} = COMMENT_STATE;
2637
2638 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2639 $self->{line_prev} = $self->{line};
2640 $self->{column_prev} = $self->{column};
2641 $self->{column}++;
2642 $self->{nc}
2643 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2644 } else {
2645 $self->{set_nc}->($self);
2646 }
2647
2648 redo A;
2649 }
2650 } elsif ($self->{state} == COMMENT_STATE) {
2651 if ($self->{nc} == 0x002D) { # -
2652
2653 $self->{state} = COMMENT_END_DASH_STATE;
2654
2655 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2656 $self->{line_prev} = $self->{line};
2657 $self->{column_prev} = $self->{column};
2658 $self->{column}++;
2659 $self->{nc}
2660 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2661 } else {
2662 $self->{set_nc}->($self);
2663 }
2664
2665 redo A;
2666 } elsif ($self->{nc} == -1) {
2667 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2668 if ($self->{in_subset}) {
2669
2670 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2671 } else {
2672
2673 $self->{state} = DATA_STATE;
2674 $self->{s_kwd} = '';
2675 }
2676 ## reconsume
2677
2678 return ($self->{ct}); # comment
2679
2680 redo A;
2681 } else {
2682
2683 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2684 $self->{read_until}->($self->{ct}->{data},
2685 q[-],
2686 length $self->{ct}->{data});
2687
2688 ## Stay in the state
2689
2690 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2691 $self->{line_prev} = $self->{line};
2692 $self->{column_prev} = $self->{column};
2693 $self->{column}++;
2694 $self->{nc}
2695 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2696 } else {
2697 $self->{set_nc}->($self);
2698 }
2699
2700 redo A;
2701 }
2702 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2703 ## XML5: "comment dash state".
2704
2705 if ($self->{nc} == 0x002D) { # -
2706
2707 $self->{state} = COMMENT_END_STATE;
2708
2709 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2710 $self->{line_prev} = $self->{line};
2711 $self->{column_prev} = $self->{column};
2712 $self->{column}++;
2713 $self->{nc}
2714 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2715 } else {
2716 $self->{set_nc}->($self);
2717 }
2718
2719 redo A;
2720 } elsif ($self->{nc} == -1) {
2721 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2722 if ($self->{in_subset}) {
2723
2724 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2725 } else {
2726
2727 $self->{state} = DATA_STATE;
2728 $self->{s_kwd} = '';
2729 }
2730 ## reconsume
2731
2732 return ($self->{ct}); # comment
2733
2734 redo A;
2735 } else {
2736
2737 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2738 $self->{state} = COMMENT_STATE;
2739
2740 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2741 $self->{line_prev} = $self->{line};
2742 $self->{column_prev} = $self->{column};
2743 $self->{column}++;
2744 $self->{nc}
2745 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2746 } else {
2747 $self->{set_nc}->($self);
2748 }
2749
2750 redo A;
2751 }
2752 } elsif ($self->{state} == COMMENT_END_STATE) {
2753 if ($self->{nc} == 0x003E) { # >
2754 if ($self->{in_subset}) {
2755
2756 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2757 } else {
2758
2759 $self->{state} = DATA_STATE;
2760 $self->{s_kwd} = '';
2761 }
2762
2763 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2764 $self->{line_prev} = $self->{line};
2765 $self->{column_prev} = $self->{column};
2766 $self->{column}++;
2767 $self->{nc}
2768 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2769 } else {
2770 $self->{set_nc}->($self);
2771 }
2772
2773
2774 return ($self->{ct}); # comment
2775
2776 redo A;
2777 } elsif ($self->{nc} == 0x002D) { # -
2778
2779 ## XML5: Not a parse error.
2780 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2781 line => $self->{line_prev},
2782 column => $self->{column_prev});
2783 $self->{ct}->{data} .= '-'; # comment
2784 ## Stay in the state
2785
2786 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2787 $self->{line_prev} = $self->{line};
2788 $self->{column_prev} = $self->{column};
2789 $self->{column}++;
2790 $self->{nc}
2791 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2792 } else {
2793 $self->{set_nc}->($self);
2794 }
2795
2796 redo A;
2797 } elsif ($self->{nc} == -1) {
2798 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2799 if ($self->{in_subset}) {
2800
2801 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2802 } else {
2803
2804 $self->{state} = DATA_STATE;
2805 $self->{s_kwd} = '';
2806 }
2807 ## reconsume
2808
2809 return ($self->{ct}); # comment
2810
2811 redo A;
2812 } else {
2813
2814 ## XML5: Not a parse error.
2815 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2816 line => $self->{line_prev},
2817 column => $self->{column_prev});
2818 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2819 $self->{state} = COMMENT_STATE;
2820
2821 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2822 $self->{line_prev} = $self->{line};
2823 $self->{column_prev} = $self->{column};
2824 $self->{column}++;
2825 $self->{nc}
2826 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2827 } else {
2828 $self->{set_nc}->($self);
2829 }
2830
2831 redo A;
2832 }
2833 } elsif ($self->{state} == DOCTYPE_STATE) {
2834 if ($is_space->{$self->{nc}}) {
2835
2836 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2837
2838 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2839 $self->{line_prev} = $self->{line};
2840 $self->{column_prev} = $self->{column};
2841 $self->{column}++;
2842 $self->{nc}
2843 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2844 } else {
2845 $self->{set_nc}->($self);
2846 }
2847
2848 redo A;
2849 } else {
2850
2851 ## XML5: Unless EOF, swith to the bogus comment state.
2852 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2853 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2854 ## reconsume
2855 redo A;
2856 }
2857 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2858 ## XML5: "DOCTYPE root name before state".
2859
2860 if ($is_space->{$self->{nc}}) {
2861
2862 ## Stay in the state
2863
2864 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2865 $self->{line_prev} = $self->{line};
2866 $self->{column_prev} = $self->{column};
2867 $self->{column}++;
2868 $self->{nc}
2869 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2870 } else {
2871 $self->{set_nc}->($self);
2872 }
2873
2874 redo A;
2875 } elsif ($self->{nc} == 0x003E) { # >
2876
2877 ## XML5: No parse error.
2878 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2879 $self->{state} = DATA_STATE;
2880 $self->{s_kwd} = '';
2881
2882 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2883 $self->{line_prev} = $self->{line};
2884 $self->{column_prev} = $self->{column};
2885 $self->{column}++;
2886 $self->{nc}
2887 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2888 } else {
2889 $self->{set_nc}->($self);
2890 }
2891
2892
2893 return ($self->{ct}); # DOCTYPE (quirks)
2894
2895 redo A;
2896 } elsif ($self->{nc} == -1) {
2897
2898 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2899 $self->{state} = DATA_STATE;
2900 $self->{s_kwd} = '';
2901 ## reconsume
2902
2903 return ($self->{ct}); # DOCTYPE (quirks)
2904
2905 redo A;
2906 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2907
2908 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2909 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2911 $self->{in_subset} = 1;
2912
2913 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2914 $self->{line_prev} = $self->{line};
2915 $self->{column_prev} = $self->{column};
2916 $self->{column}++;
2917 $self->{nc}
2918 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2919 } else {
2920 $self->{set_nc}->($self);
2921 }
2922
2923 return ($self->{ct}); # DOCTYPE
2924 redo A;
2925 } else {
2926
2927 $self->{ct}->{name} = chr $self->{nc};
2928 delete $self->{ct}->{quirks};
2929 $self->{state} = DOCTYPE_NAME_STATE;
2930
2931 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2932 $self->{line_prev} = $self->{line};
2933 $self->{column_prev} = $self->{column};
2934 $self->{column}++;
2935 $self->{nc}
2936 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2937 } else {
2938 $self->{set_nc}->($self);
2939 }
2940
2941 redo A;
2942 }
2943 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2944 ## XML5: "DOCTYPE root name state".
2945
2946 ## ISSUE: Redundant "First," in the spec.
2947
2948 if ($is_space->{$self->{nc}}) {
2949
2950 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2951
2952 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2953 $self->{line_prev} = $self->{line};
2954 $self->{column_prev} = $self->{column};
2955 $self->{column}++;
2956 $self->{nc}
2957 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2958 } else {
2959 $self->{set_nc}->($self);
2960 }
2961
2962 redo A;
2963 } elsif ($self->{nc} == 0x003E) { # >
2964
2965 $self->{state} = DATA_STATE;
2966 $self->{s_kwd} = '';
2967
2968 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2969 $self->{line_prev} = $self->{line};
2970 $self->{column_prev} = $self->{column};
2971 $self->{column}++;
2972 $self->{nc}
2973 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2974 } else {
2975 $self->{set_nc}->($self);
2976 }
2977
2978
2979 return ($self->{ct}); # DOCTYPE
2980
2981 redo A;
2982 } elsif ($self->{nc} == -1) {
2983
2984 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2985 $self->{state} = DATA_STATE;
2986 $self->{s_kwd} = '';
2987 ## reconsume
2988
2989 $self->{ct}->{quirks} = 1;
2990 return ($self->{ct}); # DOCTYPE
2991
2992 redo A;
2993 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2994
2995 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2996 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2997 $self->{in_subset} = 1;
2998
2999 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3000 $self->{line_prev} = $self->{line};
3001 $self->{column_prev} = $self->{column};
3002 $self->{column}++;
3003 $self->{nc}
3004 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3005 } else {
3006 $self->{set_nc}->($self);
3007 }
3008
3009 return ($self->{ct}); # DOCTYPE
3010 redo A;
3011 } else {
3012
3013 $self->{ct}->{name}
3014 .= chr ($self->{nc}); # DOCTYPE
3015 ## Stay in the state
3016
3017 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3018 $self->{line_prev} = $self->{line};
3019 $self->{column_prev} = $self->{column};
3020 $self->{column}++;
3021 $self->{nc}
3022 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3023 } else {
3024 $self->{set_nc}->($self);
3025 }
3026
3027 redo A;
3028 }
3029 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3030 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3031 ## state", but implemented differently.
3032
3033 if ($is_space->{$self->{nc}}) {
3034
3035 ## Stay in the state
3036
3037 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3038 $self->{line_prev} = $self->{line};
3039 $self->{column_prev} = $self->{column};
3040 $self->{column}++;
3041 $self->{nc}
3042 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3043 } else {
3044 $self->{set_nc}->($self);
3045 }
3046
3047 redo A;
3048 } elsif ($self->{nc} == 0x003E) { # >
3049
3050 $self->{state} = DATA_STATE;
3051 $self->{s_kwd} = '';
3052
3053 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3054 $self->{line_prev} = $self->{line};
3055 $self->{column_prev} = $self->{column};
3056 $self->{column}++;
3057 $self->{nc}
3058 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3059 } else {
3060 $self->{set_nc}->($self);
3061 }
3062
3063
3064 return ($self->{ct}); # DOCTYPE
3065
3066 redo A;
3067 } elsif ($self->{nc} == -1) {
3068
3069 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3070 $self->{state} = DATA_STATE;
3071 $self->{s_kwd} = '';
3072 ## reconsume
3073
3074 $self->{ct}->{quirks} = 1;
3075 return ($self->{ct}); # DOCTYPE
3076
3077 redo A;
3078 } elsif ($self->{nc} == 0x0050 or # P
3079 $self->{nc} == 0x0070) { # p
3080
3081 $self->{state} = PUBLIC_STATE;
3082 $self->{kwd} = chr $self->{nc};
3083
3084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085 $self->{line_prev} = $self->{line};
3086 $self->{column_prev} = $self->{column};
3087 $self->{column}++;
3088 $self->{nc}
3089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3090 } else {
3091 $self->{set_nc}->($self);
3092 }
3093
3094 redo A;
3095 } elsif ($self->{nc} == 0x0053 or # S
3096 $self->{nc} == 0x0073) { # s
3097
3098 $self->{state} = SYSTEM_STATE;
3099 $self->{kwd} = chr $self->{nc};
3100
3101 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3102 $self->{line_prev} = $self->{line};
3103 $self->{column_prev} = $self->{column};
3104 $self->{column}++;
3105 $self->{nc}
3106 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3107 } else {
3108 $self->{set_nc}->($self);
3109 }
3110
3111 redo A;
3112 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3113
3114 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3115 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3116 $self->{in_subset} = 1;
3117
3118 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3119 $self->{line_prev} = $self->{line};
3120 $self->{column_prev} = $self->{column};
3121 $self->{column}++;
3122 $self->{nc}
3123 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3124 } else {
3125 $self->{set_nc}->($self);
3126 }
3127
3128 return ($self->{ct}); # DOCTYPE
3129 redo A;
3130 } else {
3131
3132 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
3133 $self->{ct}->{quirks} = 1;
3134
3135 $self->{state} = BOGUS_DOCTYPE_STATE;
3136
3137 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3138 $self->{line_prev} = $self->{line};
3139 $self->{column_prev} = $self->{column};
3140 $self->{column}++;
3141 $self->{nc}
3142 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3143 } else {
3144 $self->{set_nc}->($self);
3145 }
3146
3147 redo A;
3148 }
3149 } elsif ($self->{state} == PUBLIC_STATE) {
3150 ## ASCII case-insensitive
3151 if ($self->{nc} == [
3152 undef,
3153 0x0055, # U
3154 0x0042, # B
3155 0x004C, # L
3156 0x0049, # I
3157 ]->[length $self->{kwd}] or
3158 $self->{nc} == [
3159 undef,
3160 0x0075, # u
3161 0x0062, # b
3162 0x006C, # l
3163 0x0069, # i
3164 ]->[length $self->{kwd}]) {
3165
3166 ## Stay in the state.
3167 $self->{kwd} .= chr $self->{nc};
3168
3169 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3170 $self->{line_prev} = $self->{line};
3171 $self->{column_prev} = $self->{column};
3172 $self->{column}++;
3173 $self->{nc}
3174 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3175 } else {
3176 $self->{set_nc}->($self);
3177 }
3178
3179 redo A;
3180 } elsif ((length $self->{kwd}) == 5 and
3181 ($self->{nc} == 0x0043 or # C
3182 $self->{nc} == 0x0063)) { # c
3183 if ($self->{is_xml} and
3184 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3185
3186 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3187 text => 'PUBLIC',
3188 line => $self->{line_prev},
3189 column => $self->{column_prev} - 4);
3190 } else {
3191
3192 }
3193 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3194
3195 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3196 $self->{line_prev} = $self->{line};
3197 $self->{column_prev} = $self->{column};
3198 $self->{column}++;
3199 $self->{nc}
3200 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3201 } else {
3202 $self->{set_nc}->($self);
3203 }
3204
3205 redo A;
3206 } else {
3207
3208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3209 line => $self->{line_prev},
3210 column => $self->{column_prev} + 1 - length $self->{kwd});
3211 $self->{ct}->{quirks} = 1;
3212
3213 $self->{state} = BOGUS_DOCTYPE_STATE;
3214 ## Reconsume.
3215 redo A;
3216 }
3217 } elsif ($self->{state} == SYSTEM_STATE) {
3218 ## ASCII case-insensitive
3219 if ($self->{nc} == [
3220 undef,
3221 0x0059, # Y
3222 0x0053, # S
3223 0x0054, # T
3224 0x0045, # E
3225 ]->[length $self->{kwd}] or
3226 $self->{nc} == [
3227 undef,
3228 0x0079, # y
3229 0x0073, # s
3230 0x0074, # t
3231 0x0065, # e
3232 ]->[length $self->{kwd}]) {
3233
3234 ## Stay in the state.
3235 $self->{kwd} .= chr $self->{nc};
3236
3237 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238 $self->{line_prev} = $self->{line};
3239 $self->{column_prev} = $self->{column};
3240 $self->{column}++;
3241 $self->{nc}
3242 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3243 } else {
3244 $self->{set_nc}->($self);
3245 }
3246
3247 redo A;
3248 } elsif ((length $self->{kwd}) == 5 and
3249 ($self->{nc} == 0x004D or # M
3250 $self->{nc} == 0x006D)) { # m
3251 if ($self->{is_xml} and
3252 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3253
3254 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3255 text => 'SYSTEM',
3256 line => $self->{line_prev},
3257 column => $self->{column_prev} - 4);
3258 } else {
3259
3260 }
3261 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3262
3263 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3264 $self->{line_prev} = $self->{line};
3265 $self->{column_prev} = $self->{column};
3266 $self->{column}++;
3267 $self->{nc}
3268 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3269 } else {
3270 $self->{set_nc}->($self);
3271 }
3272
3273 redo A;
3274 } else {
3275
3276 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3277 line => $self->{line_prev},
3278 column => $self->{column_prev} + 1 - length $self->{kwd});
3279 $self->{ct}->{quirks} = 1;
3280
3281 $self->{state} = BOGUS_DOCTYPE_STATE;
3282 ## Reconsume.
3283 redo A;
3284 }
3285 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3286 if ($is_space->{$self->{nc}}) {
3287
3288 ## Stay in the state
3289
3290 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3291 $self->{line_prev} = $self->{line};
3292 $self->{column_prev} = $self->{column};
3293 $self->{column}++;
3294 $self->{nc}
3295 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3296 } else {
3297 $self->{set_nc}->($self);
3298 }
3299
3300 redo A;
3301 } elsif ($self->{nc} eq 0x0022) { # "
3302
3303 $self->{ct}->{pubid} = ''; # DOCTYPE
3304 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3305
3306 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3307 $self->{line_prev} = $self->{line};
3308 $self->{column_prev} = $self->{column};
3309 $self->{column}++;
3310 $self->{nc}
3311 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3312 } else {
3313 $self->{set_nc}->($self);
3314 }
3315
3316 redo A;
3317 } elsif ($self->{nc} eq 0x0027) { # '
3318
3319 $self->{ct}->{pubid} = ''; # DOCTYPE
3320 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3321
3322 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3323 $self->{line_prev} = $self->{line};
3324 $self->{column_prev} = $self->{column};
3325 $self->{column}++;
3326 $self->{nc}
3327 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3328 } else {
3329 $self->{set_nc}->($self);
3330 }
3331
3332 redo A;
3333 } elsif ($self->{nc} eq 0x003E) { # >
3334
3335 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3336
3337 $self->{state} = DATA_STATE;
3338 $self->{s_kwd} = '';
3339
3340 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3341 $self->{line_prev} = $self->{line};
3342 $self->{column_prev} = $self->{column};
3343 $self->{column}++;
3344 $self->{nc}
3345 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3346 } else {
3347 $self->{set_nc}->($self);
3348 }
3349
3350
3351 $self->{ct}->{quirks} = 1;
3352 return ($self->{ct}); # DOCTYPE
3353
3354 redo A;
3355 } elsif ($self->{nc} == -1) {
3356
3357 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3358
3359 $self->{state} = DATA_STATE;
3360 $self->{s_kwd} = '';
3361 ## reconsume
3362
3363 $self->{ct}->{quirks} = 1;
3364 return ($self->{ct}); # DOCTYPE
3365
3366 redo A;
3367 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3368
3369 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3370 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3371 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3372 $self->{in_subset} = 1;
3373
3374 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3375 $self->{line_prev} = $self->{line};
3376 $self->{column_prev} = $self->{column};
3377 $self->{column}++;
3378 $self->{nc}
3379 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3380 } else {
3381 $self->{set_nc}->($self);
3382 }
3383
3384 return ($self->{ct}); # DOCTYPE
3385 redo A;
3386 } else {
3387
3388 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3389 $self->{ct}->{quirks} = 1;
3390
3391 $self->{state} = BOGUS_DOCTYPE_STATE;
3392
3393 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3394 $self->{line_prev} = $self->{line};
3395 $self->{column_prev} = $self->{column};
3396 $self->{column}++;
3397 $self->{nc}
3398 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3399 } else {
3400 $self->{set_nc}->($self);
3401 }
3402
3403 redo A;
3404 }
3405 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3406 if ($self->{nc} == 0x0022) { # "
3407
3408 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3409
3410 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3411 $self->{line_prev} = $self->{line};
3412 $self->{column_prev} = $self->{column};
3413 $self->{column}++;
3414 $self->{nc}
3415 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3416 } else {
3417 $self->{set_nc}->($self);
3418 }
3419
3420 redo A;
3421 } elsif ($self->{nc} == 0x003E) { # >
3422
3423 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3424
3425 $self->{state} = DATA_STATE;
3426 $self->{s_kwd} = '';
3427
3428 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3429 $self->{line_prev} = $self->{line};
3430 $self->{column_prev} = $self->{column};
3431 $self->{column}++;
3432 $self->{nc}
3433 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3434 } else {
3435 $self->{set_nc}->($self);
3436 }
3437
3438
3439 $self->{ct}->{quirks} = 1;
3440 return ($self->{ct}); # DOCTYPE
3441
3442 redo A;
3443 } elsif ($self->{nc} == -1) {
3444
3445 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3446
3447 $self->{state} = DATA_STATE;
3448 $self->{s_kwd} = '';
3449 ## reconsume
3450
3451 $self->{ct}->{quirks} = 1;
3452 return ($self->{ct}); # DOCTYPE
3453
3454 redo A;
3455 } else {
3456
3457 $self->{ct}->{pubid} # DOCTYPE
3458 .= chr $self->{nc};
3459 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3460 length $self->{ct}->{pubid});
3461
3462 ## Stay in the state
3463
3464 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3465 $self->{line_prev} = $self->{line};
3466 $self->{column_prev} = $self->{column};
3467 $self->{column}++;
3468 $self->{nc}
3469 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3470 } else {
3471 $self->{set_nc}->($self);
3472 }
3473
3474 redo A;
3475 }
3476 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3477 if ($self->{nc} == 0x0027) { # '
3478
3479 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3480
3481 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3482 $self->{line_prev} = $self->{line};
3483 $self->{column_prev} = $self->{column};
3484 $self->{column}++;
3485 $self->{nc}
3486 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3487 } else {
3488 $self->{set_nc}->($self);
3489 }
3490
3491 redo A;
3492 } elsif ($self->{nc} == 0x003E) { # >
3493
3494 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3495
3496 $self->{state} = DATA_STATE;
3497 $self->{s_kwd} = '';
3498
3499 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3500 $self->{line_prev} = $self->{line};
3501 $self->{column_prev} = $self->{column};
3502 $self->{column}++;
3503 $self->{nc}
3504 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3505 } else {
3506 $self->{set_nc}->($self);
3507 }
3508
3509
3510 $self->{ct}->{quirks} = 1;
3511 return ($self->{ct}); # DOCTYPE
3512
3513 redo A;
3514 } elsif ($self->{nc} == -1) {
3515
3516 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3517
3518 $self->{state} = DATA_STATE;
3519 $self->{s_kwd} = '';
3520 ## reconsume
3521
3522 $self->{ct}->{quirks} = 1;
3523 return ($self->{ct}); # DOCTYPE
3524
3525 redo A;
3526 } else {
3527
3528 $self->{ct}->{pubid} # DOCTYPE
3529 .= chr $self->{nc};
3530 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3531 length $self->{ct}->{pubid});
3532
3533 ## Stay in the state
3534
3535 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3536 $self->{line_prev} = $self->{line};
3537 $self->{column_prev} = $self->{column};
3538 $self->{column}++;
3539 $self->{nc}
3540 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3541 } else {
3542 $self->{set_nc}->($self);
3543 }
3544
3545 redo A;
3546 }
3547 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3548 if ($is_space->{$self->{nc}}) {
3549
3550 ## Stay in the state
3551
3552 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3553 $self->{line_prev} = $self->{line};
3554 $self->{column_prev} = $self->{column};
3555 $self->{column}++;
3556 $self->{nc}
3557 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3558 } else {
3559 $self->{set_nc}->($self);
3560 }
3561
3562 redo A;
3563 } elsif ($self->{nc} == 0x0022) { # "
3564
3565 $self->{ct}->{sysid} = ''; # DOCTYPE
3566 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3567
3568 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3569 $self->{line_prev} = $self->{line};
3570 $self->{column_prev} = $self->{column};
3571 $self->{column}++;
3572 $self->{nc}
3573 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3574 } else {
3575 $self->{set_nc}->($self);
3576 }
3577
3578 redo A;
3579 } elsif ($self->{nc} == 0x0027) { # '
3580
3581 $self->{ct}->{sysid} = ''; # DOCTYPE
3582 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3583
3584 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3585 $self->{line_prev} = $self->{line};
3586 $self->{column_prev} = $self->{column};
3587 $self->{column}++;
3588 $self->{nc}
3589 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3590 } else {
3591 $self->{set_nc}->($self);
3592 }
3593
3594 redo A;
3595 } elsif ($self->{nc} == 0x003E) { # >
3596 if ($self->{is_xml}) {
3597
3598 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3599 } else {
3600
3601 }
3602 $self->{state} = DATA_STATE;
3603 $self->{s_kwd} = '';
3604
3605 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3606 $self->{line_prev} = $self->{line};
3607 $self->{column_prev} = $self->{column};
3608 $self->{column}++;
3609 $self->{nc}
3610 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3611 } else {
3612 $self->{set_nc}->($self);
3613 }
3614
3615
3616 return ($self->{ct}); # DOCTYPE
3617
3618 redo A;
3619 } elsif ($self->{nc} == -1) {
3620
3621 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3622
3623 $self->{state} = DATA_STATE;
3624 $self->{s_kwd} = '';
3625 ## reconsume
3626
3627 $self->{ct}->{quirks} = 1;
3628 return ($self->{ct}); # DOCTYPE
3629
3630 redo A;
3631 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3632
3633 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3634 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3635 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3636 $self->{in_subset} = 1;
3637
3638 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3639 $self->{line_prev} = $self->{line};
3640 $self->{column_prev} = $self->{column};
3641 $self->{column}++;
3642 $self->{nc}
3643 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3644 } else {
3645 $self->{set_nc}->($self);
3646 }
3647
3648 return ($self->{ct}); # DOCTYPE
3649 redo A;
3650 } else {
3651
3652 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3653 $self->{ct}->{quirks} = 1;
3654
3655 $self->{state} = BOGUS_DOCTYPE_STATE;
3656
3657 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3658 $self->{line_prev} = $self->{line};
3659 $self->{column_prev} = $self->{column};
3660 $self->{column}++;
3661 $self->{nc}
3662 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3663 } else {
3664 $self->{set_nc}->($self);
3665 }
3666
3667 redo A;
3668 }
3669 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3670 if ($is_space->{$self->{nc}}) {
3671
3672 ## Stay in the state
3673
3674 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3675 $self->{line_prev} = $self->{line};
3676 $self->{column_prev} = $self->{column};
3677 $self->{column}++;
3678 $self->{nc}
3679 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3680 } else {
3681 $self->{set_nc}->($self);
3682 }
3683
3684 redo A;
3685 } elsif ($self->{nc} == 0x0022) { # "
3686
3687 $self->{ct}->{sysid} = ''; # DOCTYPE
3688 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3689
3690 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3691 $self->{line_prev} = $self->{line};
3692 $self->{column_prev} = $self->{column};
3693 $self->{column}++;
3694 $self->{nc}
3695 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3696 } else {
3697 $self->{set_nc}->($self);
3698 }
3699
3700 redo A;
3701 } elsif ($self->{nc} == 0x0027) { # '
3702
3703 $self->{ct}->{sysid} = ''; # DOCTYPE
3704 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3705
3706 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3707 $self->{line_prev} = $self->{line};
3708 $self->{column_prev} = $self->{column};
3709 $self->{column}++;
3710 $self->{nc}
3711 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3712 } else {
3713 $self->{set_nc}->($self);
3714 }
3715
3716 redo A;
3717 } elsif ($self->{nc} == 0x003E) { # >
3718
3719 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3720 $self->{state} = DATA_STATE;
3721 $self->{s_kwd} = '';
3722
3723 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3724 $self->{line_prev} = $self->{line};
3725 $self->{column_prev} = $self->{column};
3726 $self->{column}++;
3727 $self->{nc}
3728 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3729 } else {
3730 $self->{set_nc}->($self);
3731 }
3732
3733
3734 $self->{ct}->{quirks} = 1;
3735 return ($self->{ct}); # DOCTYPE
3736
3737 redo A;
3738 } elsif ($self->{nc} == -1) {
3739
3740 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3741
3742 $self->{state} = DATA_STATE;
3743 $self->{s_kwd} = '';
3744 ## reconsume
3745
3746 $self->{ct}->{quirks} = 1;
3747 return ($self->{ct}); # DOCTYPE
3748
3749 redo A;
3750 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3751
3752 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3753
3754 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3755 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3756 $self->{in_subset} = 1;
3757
3758 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3759 $self->{line_prev} = $self->{line};
3760 $self->{column_prev} = $self->{column};
3761 $self->{column}++;
3762 $self->{nc}
3763 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3764 } else {
3765 $self->{set_nc}->($self);
3766 }
3767
3768 return ($self->{ct}); # DOCTYPE
3769 redo A;
3770 } else {
3771
3772 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3773 $self->{ct}->{quirks} = 1;
3774
3775 $self->{state} = BOGUS_DOCTYPE_STATE;
3776
3777 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3778 $self->{line_prev} = $self->{line};
3779 $self->{column_prev} = $self->{column};
3780 $self->{column}++;
3781 $self->{nc}
3782 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3783 } else {
3784 $self->{set_nc}->($self);
3785 }
3786
3787 redo A;
3788 }
3789 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3790 if ($self->{nc} == 0x0022) { # "
3791
3792 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3793
3794 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3795 $self->{line_prev} = $self->{line};
3796 $self->{column_prev} = $self->{column};
3797 $self->{column}++;
3798 $self->{nc}
3799 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3800 } else {
3801 $self->{set_nc}->($self);
3802 }
3803
3804 redo A;
3805 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3806
3807 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3808
3809 $self->{state} = DATA_STATE;
3810 $self->{s_kwd} = '';
3811
3812 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3813 $self->{line_prev} = $self->{line};
3814 $self->{column_prev} = $self->{column};
3815 $self->{column}++;
3816 $self->{nc}
3817 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3818 } else {
3819 $self->{set_nc}->($self);
3820 }
3821
3822
3823 $self->{ct}->{quirks} = 1;
3824 return ($self->{ct}); # DOCTYPE
3825
3826 redo A;
3827 } elsif ($self->{nc} == -1) {
3828
3829 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3830
3831 $self->{state} = DATA_STATE;
3832 $self->{s_kwd} = '';
3833 ## reconsume
3834
3835 $self->{ct}->{quirks} = 1;
3836 return ($self->{ct}); # DOCTYPE
3837
3838 redo A;
3839 } else {
3840
3841 $self->{ct}->{sysid} # DOCTYPE
3842 .= chr $self->{nc};
3843 $self->{read_until}->($self->{ct}->{sysid}, q[">],
3844 length $self->{ct}->{sysid});
3845
3846 ## Stay in the state
3847
3848 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3849 $self->{line_prev} = $self->{line};
3850 $self->{column_prev} = $self->{column};
3851 $self->{column}++;
3852 $self->{nc}
3853 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3854 } else {
3855 $self->{set_nc}->($self);
3856 }
3857
3858 redo A;
3859 }
3860 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3861 if ($self->{nc} == 0x0027) { # '
3862
3863 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3864
3865 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3866 $self->{line_prev} = $self->{line};
3867 $self->{column_prev} = $self->{column};
3868 $self->{column}++;
3869 $self->{nc}
3870 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3871 } else {
3872 $self->{set_nc}->($self);
3873 }
3874
3875 redo A;
3876 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3877
3878 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3879
3880 $self->{state} = DATA_STATE;
3881 $self->{s_kwd} = '';
3882
3883 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3884 $self->{line_prev} = $self->{line};
3885 $self->{column_prev} = $self->{column};
3886 $self->{column}++;
3887 $self->{nc}
3888 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3889 } else {
3890 $self->{set_nc}->($self);
3891 }
3892
3893
3894 $self->{ct}->{quirks} = 1;
3895 return ($self->{ct}); # DOCTYPE
3896
3897 redo A;
3898 } elsif ($self->{nc} == -1) {
3899
3900 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3901
3902 $self->{state} = DATA_STATE;
3903 $self->{s_kwd} = '';
3904 ## reconsume
3905
3906 $self->{ct}->{quirks} = 1;
3907 return ($self->{ct}); # DOCTYPE
3908
3909 redo A;
3910 } else {
3911
3912 $self->{ct}->{sysid} # DOCTYPE
3913 .= chr $self->{nc};
3914 $self->{read_until}->($self->{ct}->{sysid}, q['>],
3915 length $self->{ct}->{sysid});
3916
3917 ## Stay in the state
3918
3919 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3920 $self->{line_prev} = $self->{line};
3921 $self->{column_prev} = $self->{column};
3922 $self->{column}++;
3923 $self->{nc}
3924 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3925 } else {
3926 $self->{set_nc}->($self);
3927 }
3928
3929 redo A;
3930 }
3931 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3932 if ($is_space->{$self->{nc}}) {
3933
3934 ## Stay in the state
3935
3936 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3937 $self->{line_prev} = $self->{line};
3938 $self->{column_prev} = $self->{column};
3939 $self->{column}++;
3940 $self->{nc}
3941 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3942 } else {
3943 $self->{set_nc}->($self);
3944 }
3945
3946 redo A;
3947 } elsif ($self->{nc} == 0x003E) { # >
3948
3949 $self->{state} = DATA_STATE;
3950 $self->{s_kwd} = '';
3951
3952 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3953 $self->{line_prev} = $self->{line};
3954 $self->{column_prev} = $self->{column};
3955 $self->{column}++;
3956 $self->{nc}
3957 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3958 } else {
3959 $self->{set_nc}->($self);
3960 }
3961
3962
3963 return ($self->{ct}); # DOCTYPE
3964
3965 redo A;
3966 } elsif ($self->{nc} == -1) {
3967
3968 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3969 $self->{state} = DATA_STATE;
3970 $self->{s_kwd} = '';
3971 ## reconsume
3972
3973 $self->{ct}->{quirks} = 1;
3974 return ($self->{ct}); # DOCTYPE
3975
3976 redo A;
3977 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3978
3979 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3980 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3981 $self->{in_subset} = 1;
3982
3983 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3984 $self->{line_prev} = $self->{line};
3985 $self->{column_prev} = $self->{column};
3986 $self->{column}++;
3987 $self->{nc}
3988 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3989 } else {
3990 $self->{set_nc}->($self);
3991 }
3992
3993 return ($self->{ct}); # DOCTYPE
3994 redo A;
3995 } else {
3996
3997 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3998 #$self->{ct}->{quirks} = 1;
3999
4000 $self->{state} = BOGUS_DOCTYPE_STATE;
4001
4002 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4003 $self->{line_prev} = $self->{line};
4004 $self->{column_prev} = $self->{column};
4005 $self->{column}++;
4006 $self->{nc}
4007 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4008 } else {
4009 $self->{set_nc}->($self);
4010 }
4011
4012 redo A;
4013 }
4014 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4015 if ($self->{nc} == 0x003E) { # >
4016
4017 $self->{state} = DATA_STATE;
4018 $self->{s_kwd} = '';
4019
4020 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4021 $self->{line_prev} = $self->{line};
4022 $self->{column_prev} = $self->{column};
4023 $self->{column}++;
4024 $self->{nc}
4025 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4026 } else {
4027 $self->{set_nc}->($self);
4028 }
4029
4030
4031 return ($self->{ct}); # DOCTYPE
4032
4033 redo A;
4034 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4035
4036 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4037 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4038 $self->{in_subset} = 1;
4039
4040 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4041 $self->{line_prev} = $self->{line};
4042 $self->{column_prev} = $self->{column};
4043 $self->{column}++;
4044 $self->{nc}
4045 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4046 } else {
4047 $self->{set_nc}->($self);
4048 }
4049
4050 return ($self->{ct}); # DOCTYPE
4051 redo A;
4052 } elsif ($self->{nc} == -1) {
4053
4054 $self->{state} = DATA_STATE;
4055 $self->{s_kwd} = '';
4056 ## reconsume
4057
4058 return ($self->{ct}); # DOCTYPE
4059
4060 redo A;
4061 } else {
4062
4063 my $s = '';
4064 $self->{read_until}->($s, q{>[}, 0);
4065
4066 ## Stay in the state
4067
4068 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4069 $self->{line_prev} = $self->{line};
4070 $self->{column_prev} = $self->{column};
4071 $self->{column}++;
4072 $self->{nc}
4073 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4074 } else {
4075 $self->{set_nc}->($self);
4076 }
4077
4078 redo A;
4079 }
4080 } elsif ($self->{state} == CDATA_SECTION_STATE) {
4081 ## NOTE: "CDATA section state" in the state is jointly implemented
4082 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4083 ## and |CDATA_SECTION_MSE2_STATE|.
4084
4085 ## XML5: "CDATA state".
4086
4087 if ($self->{nc} == 0x005D) { # ]
4088
4089 $self->{state} = CDATA_SECTION_MSE1_STATE;
4090
4091 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4092 $self->{line_prev} = $self->{line};
4093 $self->{column_prev} = $self->{column};
4094 $self->{column}++;
4095 $self->{nc}
4096 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4097 } else {
4098 $self->{set_nc}->($self);
4099 }
4100
4101 redo A;
4102 } elsif ($self->{nc} == -1) {
4103 if ($self->{is_xml}) {
4104
4105 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4106 } else {
4107
4108 }
4109
4110 $self->{state} = DATA_STATE;
4111 $self->{s_kwd} = '';
4112 ## Reconsume.
4113 if (length $self->{ct}->{data}) { # character
4114
4115 return ($self->{ct}); # character
4116 } else {
4117
4118 ## No token to emit. $self->{ct} is discarded.
4119 }
4120 redo A;
4121 } else {
4122
4123 $self->{ct}->{data} .= chr $self->{nc};
4124 $self->{read_until}->($self->{ct}->{data},
4125 q<]>,
4126 length $self->{ct}->{data});
4127
4128 ## Stay in the state.
4129
4130 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4131 $self->{line_prev} = $self->{line};
4132 $self->{column_prev} = $self->{column};
4133 $self->{column}++;
4134 $self->{nc}
4135 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4136 } else {
4137 $self->{set_nc}->($self);
4138 }
4139
4140 redo A;
4141 }
4142
4143 ## ISSUE: "text tokens" in spec.
4144 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4145 ## XML5: "CDATA bracket state".
4146
4147 if ($self->{nc} == 0x005D) { # ]
4148
4149 $self->{state} = CDATA_SECTION_MSE2_STATE;
4150
4151 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4152 $self->{line_prev} = $self->{line};
4153 $self->{column_prev} = $self->{column};
4154 $self->{column}++;
4155 $self->{nc}
4156 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4157 } else {
4158 $self->{set_nc}->($self);
4159 }
4160
4161 redo A;
4162 } else {
4163
4164 ## XML5: If EOF, "]" is not appended and changed to the data state.
4165 $self->{ct}->{data} .= ']';
4166 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4167 ## Reconsume.
4168 redo A;
4169 }
4170 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4171 ## XML5: "CDATA end state".
4172
4173 if ($self->{nc} == 0x003E) { # >
4174 $self->{state} = DATA_STATE;
4175 $self->{s_kwd} = '';
4176
4177 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4178 $self->{line_prev} = $self->{line};
4179 $self->{column_prev} = $self->{column};
4180 $self->{column}++;
4181 $self->{nc}
4182 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4183 } else {
4184 $self->{set_nc}->($self);
4185 }
4186
4187 if (length $self->{ct}->{data}) { # character
4188
4189 return ($self->{ct}); # character
4190 } else {
4191
4192 ## No token to emit. $self->{ct} is discarded.
4193 }
4194 redo A;
4195 } elsif ($self->{nc} == 0x005D) { # ]
4196 # character
4197 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4198 ## Stay in the state.
4199
4200 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4201 $self->{line_prev} = $self->{line};
4202 $self->{column_prev} = $self->{column};
4203 $self->{column}++;
4204 $self->{nc}
4205 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4206 } else {
4207 $self->{set_nc}->($self);
4208 }
4209
4210 redo A;
4211 } else {
4212
4213 $self->{ct}->{data} .= ']]'; # character
4214 $self->{state} = CDATA_SECTION_STATE;
4215 ## Reconsume. ## XML5: Emit.
4216 redo A;
4217 }
4218 } elsif ($self->{state} == ENTITY_STATE) {
4219 if ($is_space->{$self->{nc}} or
4220 {
4221 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4222 $self->{entity_add} => 1,
4223 }->{$self->{nc}}) {
4224
4225 ## Don't consume
4226 ## No error
4227 ## Return nothing.
4228 #
4229 } elsif ($self->{nc} == 0x0023) { # #
4230
4231 $self->{state} = ENTITY_HASH_STATE;
4232 $self->{kwd} = '#';
4233
4234 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235 $self->{line_prev} = $self->{line};
4236 $self->{column_prev} = $self->{column};
4237 $self->{column}++;
4238 $self->{nc}
4239 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4240 } else {
4241 $self->{set_nc}->($self);
4242 }
4243
4244 redo A;
4245 } elsif ((0x0041 <= $self->{nc} and
4246 $self->{nc} <= 0x005A) or # A..Z
4247 (0x0061 <= $self->{nc} and
4248 $self->{nc} <= 0x007A)) { # a..z
4249
4250 require Whatpm::_NamedEntityList;
4251 $self->{state} = ENTITY_NAME_STATE;
4252 $self->{kwd} = chr $self->{nc};
4253 $self->{entity__value} = $self->{kwd};
4254 $self->{entity__match} = 0;
4255
4256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257 $self->{line_prev} = $self->{line};
4258 $self->{column_prev} = $self->{column};
4259 $self->{column}++;
4260 $self->{nc}
4261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262 } else {
4263 $self->{set_nc}->($self);
4264 }
4265
4266 redo A;
4267 } else {
4268
4269 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4270 ## Return nothing.
4271 #
4272 }
4273
4274 ## NOTE: No character is consumed by the "consume a character
4275 ## reference" algorithm. In other word, there is an "&" character
4276 ## that does not introduce a character reference, which would be
4277 ## appended to the parent element or the attribute value in later
4278 ## process of the tokenizer.
4279
4280 if ($self->{prev_state} == DATA_STATE) {
4281
4282 $self->{state} = $self->{prev_state};
4283 $self->{s_kwd} = '';
4284 ## Reconsume.
4285 return ({type => CHARACTER_TOKEN, data => '&',
4286 line => $self->{line_prev},
4287 column => $self->{column_prev},
4288 });
4289 redo A;
4290 } else {
4291
4292 $self->{ca}->{value} .= '&';
4293 $self->{state} = $self->{prev_state};
4294 $self->{s_kwd} = '';
4295 ## Reconsume.
4296 redo A;
4297 }
4298 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4299 if ($self->{nc} == 0x0078 or # x
4300 $self->{nc} == 0x0058) { # X
4301
4302 $self->{state} = HEXREF_X_STATE;
4303 $self->{kwd} .= chr $self->{nc};
4304
4305 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4306 $self->{line_prev} = $self->{line};
4307 $self->{column_prev} = $self->{column};
4308 $self->{column}++;
4309 $self->{nc}
4310 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4311 } else {
4312 $self->{set_nc}->($self);
4313 }
4314
4315 redo A;
4316 } elsif (0x0030 <= $self->{nc} and
4317 $self->{nc} <= 0x0039) { # 0..9
4318
4319 $self->{state} = NCR_NUM_STATE;
4320 $self->{kwd} = $self->{nc} - 0x0030;
4321
4322 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4323 $self->{line_prev} = $self->{line};
4324 $self->{column_prev} = $self->{column};
4325 $self->{column}++;
4326 $self->{nc}
4327 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4328 } else {
4329 $self->{set_nc}->($self);
4330 }
4331
4332 redo A;
4333 } else {
4334 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4335 line => $self->{line_prev},
4336 column => $self->{column_prev} - 1);
4337
4338 ## NOTE: According to the spec algorithm, nothing is returned,
4339 ## and then "&#" is appended to the parent element or the attribute
4340 ## value in the later processing.
4341
4342 if ($self->{prev_state} == DATA_STATE) {
4343
4344 $self->{state} = $self->{prev_state};
4345 $self->{s_kwd} = '';
4346 ## Reconsume.
4347 return ({type => CHARACTER_TOKEN,
4348 data => '&#',
4349 line => $self->{line_prev},
4350 column => $self->{column_prev} - 1,
4351 });
4352 redo A;
4353 } else {
4354
4355 $self->{ca}->{value} .= '&#';
4356 $self->{state} = $self->{prev_state};
4357 $self->{s_kwd} = '';
4358 ## Reconsume.
4359 redo A;
4360 }
4361 }
4362 } elsif ($self->{state} == NCR_NUM_STATE) {
4363 if (0x0030 <= $self->{nc} and
4364 $self->{nc} <= 0x0039) { # 0..9
4365
4366 $self->{kwd} *= 10;
4367 $self->{kwd} += $self->{nc} - 0x0030;
4368
4369 ## Stay in the state.
4370
4371 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4372 $self->{line_prev} = $self->{line};
4373 $self->{column_prev} = $self->{column};
4374 $self->{column}++;
4375 $self->{nc}
4376 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4377 } else {
4378 $self->{set_nc}->($self);
4379 }
4380
4381 redo A;
4382 } elsif ($self->{nc} == 0x003B) { # ;
4383
4384
4385 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4386 $self->{line_prev} = $self->{line};
4387 $self->{column_prev} = $self->{column};
4388 $self->{column}++;
4389 $self->{nc}
4390 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4391 } else {
4392 $self->{set_nc}->($self);
4393 }
4394
4395 #
4396 } else {
4397
4398 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4399 ## Reconsume.
4400 #
4401 }
4402
4403 my $code = $self->{kwd};
4404 my $l = $self->{line_prev};
4405 my $c = $self->{column_prev};
4406 if ($charref_map->{$code}) {
4407
4408 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4409 text => (sprintf 'U+%04X', $code),
4410 line => $l, column => $c);
4411 $code = $charref_map->{$code};
4412 } elsif ($code > 0x10FFFF) {
4413
4414 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4415 text => (sprintf 'U-%08X', $code),
4416 line => $l, column => $c);
4417 $code = 0xFFFD;
4418 }
4419
4420 if ($self->{prev_state} == DATA_STATE) {
4421
4422 $self->{state} = $self->{prev_state};
4423 $self->{s_kwd} = '';
4424 ## Reconsume.
4425 return ({type => CHARACTER_TOKEN, data => chr $code,
4426 has_reference => 1,
4427 line => $l, column => $c,
4428 });
4429 redo A;
4430 } else {
4431
4432 $self->{ca}->{value} .= chr $code;
4433 $self->{ca}->{has_reference} = 1;
4434 $self->{state} = $self->{prev_state};
4435 $self->{s_kwd} = '';
4436 ## Reconsume.
4437 redo A;
4438 }
4439 } elsif ($self->{state} == HEXREF_X_STATE) {
4440 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4441 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4442 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4443 # 0..9, A..F, a..f
4444
4445 $self->{state} = HEXREF_HEX_STATE;
4446 $self->{kwd} = 0;
4447 ## Reconsume.
4448 redo A;
4449 } else {
4450 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4451 line => $self->{line_prev},
4452 column => $self->{column_prev} - 2);
4453
4454 ## NOTE: According to the spec algorithm, nothing is returned,
4455 ## and then "&#" followed by "X" or "x" is appended to the parent
4456 ## element or the attribute value in the later processing.
4457
4458 if ($self->{prev_state} == DATA_STATE) {
4459
4460 $self->{state} = $self->{prev_state};
4461 $self->{s_kwd} = '';
4462 ## Reconsume.
4463 return ({type => CHARACTER_TOKEN,
4464 data => '&' . $self->{kwd},
4465 line => $self->{line_prev},
4466 column => $self->{column_prev} - length $self->{kwd},
4467 });
4468 redo A;
4469 } else {
4470
4471 $self->{ca}->{value} .= '&' . $self->{kwd};
4472 $self->{state} = $self->{prev_state};
4473 $self->{s_kwd} = '';
4474 ## Reconsume.
4475 redo A;
4476 }
4477 }
4478 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4479 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4480 # 0..9
4481
4482 $self->{kwd} *= 0x10;
4483 $self->{kwd} += $self->{nc} - 0x0030;
4484 ## Stay in the state.
4485
4486 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4487 $self->{line_prev} = $self->{line};
4488 $self->{column_prev} = $self->{column};
4489 $self->{column}++;
4490 $self->{nc}
4491 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4492 } else {
4493 $self->{set_nc}->($self);
4494 }
4495
4496 redo A;
4497 } elsif (0x0061 <= $self->{nc} and
4498 $self->{nc} <= 0x0066) { # a..f
4499
4500 $self->{kwd} *= 0x10;
4501 $self->{kwd} += $self->{nc} - 0x0060 + 9;
4502 ## Stay in the state.
4503
4504 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4505 $self->{line_prev} = $self->{line};
4506 $self->{column_prev} = $self->{column};
4507 $self->{column}++;
4508 $self->{nc}
4509 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4510 } else {
4511 $self->{set_nc}->($self);
4512 }
4513
4514 redo A;
4515 } elsif (0x0041 <= $self->{nc} and
4516 $self->{nc} <= 0x0046) { # A..F
4517
4518 $self->{kwd} *= 0x10;
4519 $self->{kwd} += $self->{nc} - 0x0040 + 9;
4520 ## Stay in the state.
4521
4522 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4523 $self->{line_prev} = $self->{line};
4524 $self->{column_prev} = $self->{column};
4525 $self->{column}++;
4526 $self->{nc}
4527 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4528 } else {
4529 $self->{set_nc}->($self);
4530 }
4531
4532 redo A;
4533 } elsif ($self->{nc} == 0x003B) { # ;
4534
4535
4536 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4537 $self->{line_prev} = $self->{line};
4538 $self->{column_prev} = $self->{column};
4539 $self->{column}++;
4540 $self->{nc}
4541 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4542 } else {
4543 $self->{set_nc}->($self);
4544 }
4545
4546 #
4547 } else {
4548
4549 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4550 line => $self->{line},
4551 column => $self->{column});
4552 ## Reconsume.
4553 #
4554 }
4555
4556 my $code = $self->{kwd};
4557 my $l = $self->{line_prev};
4558 my $c = $self->{column_prev};
4559 if ($charref_map->{$code}) {
4560
4561 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4562 text => (sprintf 'U+%04X', $code),
4563 line => $l, column => $c);
4564 $code = $charref_map->{$code};
4565 } elsif ($code > 0x10FFFF) {
4566
4567 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4568 text => (sprintf 'U-%08X', $code),
4569 line => $l, column => $c);
4570 $code = 0xFFFD;
4571 }
4572
4573 if ($self->{prev_state} == DATA_STATE) {
4574
4575 $self->{state} = $self->{prev_state};
4576 $self->{s_kwd} = '';
4577 ## Reconsume.
4578 return ({type => CHARACTER_TOKEN, data => chr $code,
4579 has_reference => 1,
4580 line => $l, column => $c,
4581 });
4582 redo A;
4583 } else {
4584
4585 $self->{ca}->{value} .= chr $code;
4586 $self->{ca}->{has_reference} = 1;
4587 $self->{state} = $self->{prev_state};
4588 $self->{s_kwd} = '';
4589 ## Reconsume.
4590 redo A;
4591 }
4592 } elsif ($self->{state} == ENTITY_NAME_STATE) {
4593 if (length $self->{kwd} < 30 and
4594 ## NOTE: Some number greater than the maximum length of entity name
4595 ((0x0041 <= $self->{nc} and # a
4596 $self->{nc} <= 0x005A) or # x
4597 (0x0061 <= $self->{nc} and # a
4598 $self->{nc} <= 0x007A) or # z
4599 (0x0030 <= $self->{nc} and # 0
4600 $self->{nc} <= 0x0039) or # 9
4601 $self->{nc} == 0x003B)) { # ;
4602 our $EntityChar;
4603 $self->{kwd} .= chr $self->{nc};
4604 if (defined $EntityChar->{$self->{kwd}}) {
4605 if ($self->{nc} == 0x003B) { # ;
4606
4607 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4608 $self->{entity__match} = 1;
4609
4610 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4611 $self->{line_prev} = $self->{line};
4612 $self->{column_prev} = $self->{column};
4613 $self->{column}++;
4614 $self->{nc}
4615 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4616 } else {
4617 $self->{set_nc}->($self);
4618 }
4619
4620 #
4621 } else {
4622
4623 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4624 $self->{entity__match} = -1;
4625 ## Stay in the state.
4626
4627 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4628 $self->{line_prev} = $self->{line};
4629 $self->{column_prev} = $self->{column};
4630 $self->{column}++;
4631 $self->{nc}
4632 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4633 } else {
4634 $self->{set_nc}->($self);
4635 }
4636
4637 redo A;
4638 }
4639 } else {
4640
4641 $self->{entity__value} .= chr $self->{nc};
4642 $self->{entity__match} *= 2;
4643 ## Stay in the state.
4644
4645 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4646 $self->{line_prev} = $self->{line};
4647 $self->{column_prev} = $self->{column};
4648 $self->{column}++;
4649 $self->{nc}
4650 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4651 } else {
4652 $self->{set_nc}->($self);
4653 }
4654
4655 redo A;
4656 }
4657 }
4658
4659 my $data;
4660 my $has_ref;
4661 if ($self->{entity__match} > 0) {
4662
4663 $data = $self->{entity__value};
4664 $has_ref = 1;
4665 #
4666 } elsif ($self->{entity__match} < 0) {
4667 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4668 if ($self->{prev_state} != DATA_STATE and # in attribute
4669 $self->{entity__match} < -1) {
4670
4671 $data = '&' . $self->{kwd};
4672 #
4673 } else {
4674
4675 $data = $self->{entity__value};
4676 $has_ref = 1;
4677 #
4678 }
4679 } else {
4680
4681 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4682 line => $self->{line_prev},
4683 column => $self->{column_prev} - length $self->{kwd});
4684 $data = '&' . $self->{kwd};
4685 #
4686 }
4687
4688 ## NOTE: In these cases, when a character reference is found,
4689 ## it is consumed and a character token is returned, or, otherwise,
4690 ## nothing is consumed and returned, according to the spec algorithm.
4691 ## In this implementation, anything that has been examined by the
4692 ## tokenizer is appended to the parent element or the attribute value
4693 ## as string, either literal string when no character reference or
4694 ## entity-replaced string otherwise, in this stage, since any characters
4695 ## that would not be consumed are appended in the data state or in an
4696 ## appropriate attribute value state anyway.
4697
4698 if ($self->{prev_state} == DATA_STATE) {
4699
4700 $self->{state} = $self->{prev_state};
4701 $self->{s_kwd} = '';
4702 ## Reconsume.
4703 return ({type => CHARACTER_TOKEN,
4704 data => $data,
4705 has_reference => $has_ref,
4706 line => $self->{line_prev},
4707 column => $self->{column_prev} + 1 - length $self->{kwd},
4708 });
4709 redo A;
4710 } else {
4711
4712 $self->{ca}->{value} .= $data;
4713 $self->{ca}->{has_reference} = 1 if $has_ref;
4714 $self->{state} = $self->{prev_state};
4715 $self->{s_kwd} = '';
4716 ## Reconsume.
4717 redo A;
4718 }
4719
4720 ## XML-only states
4721
4722 } elsif ($self->{state} == PI_STATE) {
4723 if ($is_space->{$self->{nc}} or
4724 $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4725 $self->{nc} == -1) {
4726 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4727 line => $self->{line_prev},
4728 column => $self->{column_prev}
4729 - 1 * ($self->{nc} != -1));
4730 $self->{state} = BOGUS_COMMENT_STATE;
4731 ## Reconsume.
4732 $self->{ct} = {type => COMMENT_TOKEN,
4733 data => '?',
4734 line => $self->{line_prev},
4735 column => $self->{column_prev}
4736 - 1 * ($self->{nc} != -1),
4737 };
4738 redo A;
4739 } else {
4740 $self->{ct} = {type => PI_TOKEN,
4741 target => chr $self->{nc},
4742 data => '',
4743 line => $self->{line_prev},
4744 column => $self->{column_prev} - 1,
4745 };
4746 $self->{state} = PI_TARGET_STATE;
4747
4748 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4749 $self->{line_prev} = $self->{line};
4750 $self->{column_prev} = $self->{column};
4751 $self->{column}++;
4752 $self->{nc}
4753 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4754 } else {
4755 $self->{set_nc}->($self);
4756 }
4757
4758 redo A;
4759 }
4760 } elsif ($self->{state} == PI_TARGET_STATE) {
4761 if ($is_space->{$self->{nc}}) {
4762 $self->{state} = PI_TARGET_AFTER_STATE;
4763
4764 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4765 $self->{line_prev} = $self->{line};
4766 $self->{column_prev} = $self->{column};
4767 $self->{column}++;
4768 $self->{nc}
4769 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4770 } else {
4771 $self->{set_nc}->($self);
4772 }
4773
4774 redo A;
4775 } elsif ($self->{nc} == -1) {
4776 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4777 if ($self->{in_subset}) {
4778 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4779 } else {
4780 $self->{state} = DATA_STATE;
4781 $self->{s_kwd} = '';
4782 }
4783 ## Reconsume.
4784 return ($self->{ct}); # pi
4785 redo A;
4786 } elsif ($self->{nc} == 0x003F) { # ?
4787 $self->{state} = PI_AFTER_STATE;
4788
4789 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4790 $self->{line_prev} = $self->{line};
4791 $self->{column_prev} = $self->{column};
4792 $self->{column}++;
4793 $self->{nc}
4794 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4795 } else {
4796 $self->{set_nc}->($self);
4797 }
4798
4799 redo A;
4800 } else {
4801 ## XML5: typo ("tag name" -> "target")
4802 $self->{ct}->{target} .= chr $self->{nc}; # pi
4803
4804 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4805 $self->{line_prev} = $self->{line};
4806 $self->{column_prev} = $self->{column};
4807 $self->{column}++;
4808 $self->{nc}
4809 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4810 } else {
4811 $self->{set_nc}->($self);
4812 }
4813
4814 redo A;
4815 }
4816 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4817 if ($is_space->{$self->{nc}}) {
4818 ## Stay in the state.
4819
4820 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4821 $self->{line_prev} = $self->{line};
4822 $self->{column_prev} = $self->{column};
4823 $self->{column}++;
4824 $self->{nc}
4825 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4826 } else {
4827 $self->{set_nc}->($self);
4828 }
4829
4830 redo A;
4831 } else {
4832 $self->{state} = PI_DATA_STATE;
4833 ## Reprocess.
4834 redo A;
4835 }
4836 } elsif ($self->{state} == PI_DATA_STATE) {
4837 if ($self->{nc} == 0x003F) { # ?
4838 $self->{state} = PI_DATA_AFTER_STATE;
4839
4840 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4841 $self->{line_prev} = $self->{line};
4842 $self->{column_prev} = $self->{column};
4843 $self->{column}++;
4844 $self->{nc}
4845 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4846 } else {
4847 $self->{set_nc}->($self);
4848 }
4849
4850 redo A;
4851 } elsif ($self->{nc} == -1) {
4852 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4853 if ($self->{in_subset}) {
4854 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4855 } else {
4856 $self->{state} = DATA_STATE;
4857 $self->{s_kwd} = '';
4858 }
4859 ## Reprocess.
4860 return ($self->{ct}); # pi
4861 redo A;
4862 } else {
4863 $self->{ct}->{data} .= chr $self->{nc}; # pi
4864 $self->{read_until}->($self->{ct}->{data}, q[?],
4865 length $self->{ct}->{data});
4866 ## Stay in the state.
4867
4868 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4869 $self->{line_prev} = $self->{line};
4870 $self->{column_prev} = $self->{column};
4871 $self->{column}++;
4872 $self->{nc}
4873 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4874 } else {
4875 $self->{set_nc}->($self);
4876 }
4877
4878 ## Reprocess.
4879 redo A;
4880 }
4881 } elsif ($self->{state} == PI_AFTER_STATE) {
4882 if ($self->{nc} == 0x003E) { # >
4883 if ($self->{in_subset}) {
4884 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4885 } else {
4886 $self->{state} = DATA_STATE;
4887 $self->{s_kwd} = '';
4888 }
4889
4890 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4891 $self->{line_prev} = $self->{line};
4892 $self->{column_prev} = $self->{column};
4893 $self->{column}++;
4894 $self->{nc}
4895 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4896 } else {
4897 $self->{set_nc}->($self);
4898 }
4899
4900 return ($self->{ct}); # pi
4901 redo A;
4902 } elsif ($self->{nc} == 0x003F) { # ?
4903 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4904 line => $self->{line_prev},
4905 column => $self->{column_prev}); ## XML5: no error
4906 $self->{ct}->{data} .= '?';
4907 $self->{state} = PI_DATA_AFTER_STATE;
4908
4909 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4910 $self->{line_prev} = $self->{line};
4911 $self->{column_prev} = $self->{column};
4912 $self->{column}++;
4913 $self->{nc}
4914 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4915 } else {
4916 $self->{set_nc}->($self);
4917 }
4918
4919 redo A;
4920 } else {
4921 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4922 line => $self->{line_prev},
4923 column => $self->{column_prev}
4924 + 1 * ($self->{nc} == -1)); ## XML5: no error
4925 $self->{ct}->{data} .= '?'; ## XML5: not appended
4926 $self->{state} = PI_DATA_STATE;
4927 ## Reprocess.
4928 redo A;
4929 }
4930 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4931 ## XML5: Same as "pi after state" in XML5
4932 if ($self->{nc} == 0x003E) { # >
4933 if ($self->{in_subset}) {
4934 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4935 } else {
4936 $self->{state} = DATA_STATE;
4937 $self->{s_kwd} = '';
4938 }
4939
4940 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4941 $self->{line_prev} = $self->{line};
4942 $self->{column_prev} = $self->{column};
4943 $self->{column}++;
4944 $self->{nc}
4945 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4946 } else {
4947 $self->{set_nc}->($self);
4948 }
4949
4950 return ($self->{ct}); # pi
4951 redo A;
4952 } elsif ($self->{nc} == 0x003F) { # ?
4953 $self->{ct}->{data} .= '?';
4954 ## Stay in the state.
4955
4956 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4957 $self->{line_prev} = $self->{line};
4958 $self->{column_prev} = $self->{column};
4959 $self->{column}++;
4960 $self->{nc}
4961 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4962 } else {
4963 $self->{set_nc}->($self);
4964 }
4965
4966 redo A;
4967 } else {
4968 $self->{ct}->{data} .= '?'; ## XML5: not appended
4969 $self->{state} = PI_DATA_STATE;
4970 ## Reprocess.
4971 redo A;
4972 }
4973
4974 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
4975 if ($self->{nc} == 0x003C) { # <
4976 $self->{state} = DOCTYPE_TAG_STATE;
4977
4978 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4979 $self->{line_prev} = $self->{line};
4980 $self->{column_prev} = $self->{column};
4981 $self->{column}++;
4982 $self->{nc}
4983 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4984 } else {
4985 $self->{set_nc}->($self);
4986 }
4987
4988 redo A;
4989 } elsif ($self->{nc} == 0x0025) { # %
4990 ## XML5: Not defined yet.
4991
4992 ## TODO:
4993
4994 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4995 $self->{line_prev} = $self->{line};
4996 $self->{column_prev} = $self->{column};
4997 $self->{column}++;
4998 $self->{nc}
4999 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5000 } else {
5001 $self->{set_nc}->($self);
5002 }
5003
5004 redo A;
5005 } elsif ($self->{nc} == 0x005D) { # ]
5006 delete $self->{in_subset};
5007 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5008
5009 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5010 $self->{line_prev} = $self->{line};
5011 $self->{column_prev} = $self->{column};
5012 $self->{column}++;
5013 $self->{nc}
5014 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5015 } else {
5016 $self->{set_nc}->($self);
5017 }
5018
5019 redo A;
5020 } elsif ($is_space->{$self->{nc}}) {
5021 ## Stay in the state.
5022
5023 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5024 $self->{line_prev} = $self->{line};
5025 $self->{column_prev} = $self->{column};
5026 $self->{column}++;
5027 $self->{nc}
5028 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5029 } else {
5030 $self->{set_nc}->($self);
5031 }
5032
5033 redo A;
5034 } elsif ($self->{nc} == -1) {
5035 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5036 delete $self->{in_subset};
5037 $self->{state} = DATA_STATE;
5038 $self->{s_kwd} = '';
5039 ## Reconsume.
5040 return ({type => END_OF_DOCTYPE_TOKEN});
5041 redo A;
5042 } else {
5043 unless ($self->{internal_subset_tainted}) {
5044 ## XML5: No parse error.
5045 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5046 $self->{internal_subset_tainted} = 1;
5047 }
5048 ## Stay in the state.
5049
5050 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5051 $self->{line_prev} = $self->{line};
5052 $self->{column_prev} = $self->{column};
5053 $self->{column}++;
5054 $self->{nc}
5055 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5056 } else {
5057 $self->{set_nc}->($self);
5058 }
5059
5060 redo A;
5061 }
5062 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5063 if ($self->{nc} == 0x003E) { # >
5064 $self->{state} = DATA_STATE;
5065 $self->{s_kwd} = '';
5066
5067 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5068 $self->{line_prev} = $self->{line};
5069 $self->{column_prev} = $self->{column};
5070 $self->{column}++;
5071 $self->{nc}
5072 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5073 } else {
5074 $self->{set_nc}->($self);
5075 }
5076
5077 return ({type => END_OF_DOCTYPE_TOKEN});
5078 redo A;
5079 } elsif ($self->{nc} == -1) {
5080 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5081 $self->{state} = DATA_STATE;
5082 $self->{s_kwd} = '';
5083 ## Reconsume.
5084 return ({type => END_OF_DOCTYPE_TOKEN});
5085 redo A;
5086 } else {
5087 ## XML5: No parse error and stay in the state.
5088 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5089
5090 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5091
5092 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5093 $self->{line_prev} = $self->{line};
5094 $self->{column_prev} = $self->{column};
5095 $self->{column}++;
5096 $self->{nc}
5097 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5098 } else {
5099 $self->{set_nc}->($self);
5100 }
5101
5102 redo A;
5103 }
5104 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5105 if ($self->{nc} == 0x003E) { # >
5106 $self->{state} = DATA_STATE;
5107 $self->{s_kwd} = '';
5108
5109 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5110 $self->{line_prev} = $self->{line};
5111 $self->{column_prev} = $self->{column};
5112 $self->{column}++;
5113 $self->{nc}
5114 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5115 } else {
5116 $self->{set_nc}->($self);
5117 }
5118
5119 return ({type => END_OF_DOCTYPE_TOKEN});
5120 redo A;
5121 } elsif ($self->{nc} == -1) {
5122 $self->{state} = DATA_STATE;
5123 $self->{s_kwd} = '';
5124 ## Reconsume.
5125 return ({type => END_OF_DOCTYPE_TOKEN});
5126 redo A;
5127 } else {
5128 ## Stay in the state.
5129
5130 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5131 $self->{line_prev} = $self->{line};
5132 $self->{column_prev} = $self->{column};
5133 $self->{column}++;
5134 $self->{nc}
5135 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5136 } else {
5137 $self->{set_nc}->($self);
5138 }
5139
5140 redo A;
5141 }
5142 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5143 if ($self->{nc} == 0x0021) { # !
5144 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
5145
5146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5147 $self->{line_prev} = $self->{line};
5148 $self->{column_prev} = $self->{column};
5149 $self->{column}++;
5150 $self->{nc}
5151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5152 } else {
5153 $self->{set_nc}->($self);
5154 }
5155
5156 redo A;
5157 } elsif ($self->{nc} == 0x003F) { # ?
5158 $self->{state} = PI_STATE;
5159
5160 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5161 $self->{line_prev} = $self->{line};
5162 $self->{column_prev} = $self->{column};
5163 $self->{column}++;
5164 $self->{nc}
5165 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5166 } else {
5167 $self->{set_nc}->($self);
5168 }
5169
5170 redo A;
5171 } elsif ($self->{nc} == -1) {
5172 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5173 $self->{state} = DATA_STATE;
5174 $self->{s_kwd} = '';
5175 ## Reconsume.
5176 redo A;
5177 } else {
5178 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5179 line => $self->{line_prev},
5180 column => $self->{column_prev});
5181 $self->{state} = BOGUS_COMMENT_STATE;
5182 $self->{ct} = {type => COMMENT_TOKEN,
5183 data => '',
5184 }; ## NOTE: Will be discarded.
5185
5186 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5187 $self->{line_prev} = $self->{line};
5188 $self->{column_prev} = $self->{column};
5189 $self->{column}++;
5190 $self->{nc}
5191 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5192 } else {
5193 $self->{set_nc}->($self);
5194 }
5195
5196 redo A;
5197 }
5198
5199 } else {
5200 die "$0: $self->{state}: Unknown state";
5201 }
5202 } # A
5203
5204 die "$0: _get_next_token: unexpected case";
5205 } # _get_next_token
5206
5207 1;
5208 ## $Date: 2008/10/15 12:49:49 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24