/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (show annotations) (download)
Wed Oct 15 08:51:02 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.9: +84 -21 lines
++ whatpm/t/xml/ChangeLog	15 Oct 2008 08:50:58 -0000
	* doctypes-1.dat: Lowercase <!doctype> test added.

	* elements-1.dat: End tag tests added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 08:50:06 -0000
	* Tokenizer.pm.src: XML tag name start character support for end
	tags.  Support for the short end tag syntax of XML5.  Raise a
	parse erorr for a lowercase <!doctype> in XML.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: XML tag name start character support for start

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.9 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 );
19
20 our %EXPORT_TAGS = (
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 }
33
34 ## Token types
35
36 sub DOCTYPE_TOKEN () { 1 }
37 sub COMMENT_TOKEN () { 2 }
38 sub START_TAG_TOKEN () { 3 }
39 sub END_TAG_TOKEN () { 4 }
40 sub END_OF_FILE_TOKEN () { 5 }
41 sub CHARACTER_TOKEN () { 6 }
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
45 package Whatpm::HTML;
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48
49 ## Content model flags
50
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54
55 sub PLAINTEXT_CONTENT_MODEL () { 0 }
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59
60 ## Tokenizer states
61
62 sub DATA_STATE () { 0 }
63 #sub ENTITY_DATA_STATE () { 1 }
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## XML states
118 sub PI_STATE () { 51 }
119 sub PI_TARGET_STATE () { 52 }
120 sub PI_TARGET_AFTER_STATE () { 53 }
121 sub PI_DATA_STATE () { 54 }
122 sub PI_AFTER_STATE () { 55 }
123 sub PI_DATA_AFTER_STATE () { 56 }
124
125 ## Tree constructor state constants (see Whatpm::HTML for the full
126 ## list and descriptions)
127
128 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
129 sub FOREIGN_EL () { 0b1_00000000000 }
130
131 ## Character reference mappings
132
133 my $charref_map = {
134 0x0D => 0x000A,
135 0x80 => 0x20AC,
136 0x81 => 0xFFFD,
137 0x82 => 0x201A,
138 0x83 => 0x0192,
139 0x84 => 0x201E,
140 0x85 => 0x2026,
141 0x86 => 0x2020,
142 0x87 => 0x2021,
143 0x88 => 0x02C6,
144 0x89 => 0x2030,
145 0x8A => 0x0160,
146 0x8B => 0x2039,
147 0x8C => 0x0152,
148 0x8D => 0xFFFD,
149 0x8E => 0x017D,
150 0x8F => 0xFFFD,
151 0x90 => 0xFFFD,
152 0x91 => 0x2018,
153 0x92 => 0x2019,
154 0x93 => 0x201C,
155 0x94 => 0x201D,
156 0x95 => 0x2022,
157 0x96 => 0x2013,
158 0x97 => 0x2014,
159 0x98 => 0x02DC,
160 0x99 => 0x2122,
161 0x9A => 0x0161,
162 0x9B => 0x203A,
163 0x9C => 0x0153,
164 0x9D => 0xFFFD,
165 0x9E => 0x017E,
166 0x9F => 0x0178,
167 }; # $charref_map
168 $charref_map->{$_} = 0xFFFD
169 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
170 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
171 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
172 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
173 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
174 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
175 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
176
177 ## Implementations MUST act as if state machine in the spec
178
179 sub _initialize_tokenizer ($) {
180 my $self = shift;
181
182 ## NOTE: Fields set by |new| constructor:
183 #$self->{level}
184 #$self->{set_nc}
185 #$self->{parse_error}
186 #$self->{is_xml} (if XML)
187
188 $self->{state} = DATA_STATE; # MUST
189 $self->{s_kwd} = ''; # state keyword
190 #$self->{entity__value}; # initialized when used
191 #$self->{entity__match}; # initialized when used
192 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
193 undef $self->{ct}; # current token
194 undef $self->{ca}; # current attribute
195 undef $self->{last_stag_name}; # last emitted start tag name
196 #$self->{prev_state}; # initialized when used
197 delete $self->{self_closing};
198 $self->{char_buffer} = '';
199 $self->{char_buffer_pos} = 0;
200 $self->{nc} = -1; # next input character
201 #$self->{next_nc}
202
203 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
204 $self->{line_prev} = $self->{line};
205 $self->{column_prev} = $self->{column};
206 $self->{column}++;
207 $self->{nc}
208 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
209 } else {
210 $self->{set_nc}->($self);
211 }
212
213 $self->{token} = [];
214 # $self->{escape}
215 } # _initialize_tokenizer
216
217 ## A token has:
218 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
219 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
220 ## ->{name} (DOCTYPE_TOKEN)
221 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
222 ## ->{pubid} (DOCTYPE_TOKEN)
223 ## ->{sysid} (DOCTYPE_TOKEN)
224 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
225 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
226 ## ->{name}
227 ## ->{value}
228 ## ->{has_reference} == 1 or 0
229 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
230 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
231 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
232 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
233 ## while the token is pushed back to the stack.
234
235 ## Emitted token MUST immediately be handled by the tree construction state.
236
237 ## Before each step, UA MAY check to see if either one of the scripts in
238 ## "list of scripts that will execute as soon as possible" or the first
239 ## script in the "list of scripts that will execute asynchronously",
240 ## has completed loading. If one has, then it MUST be executed
241 ## and removed from the list.
242
243 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
244 ## (This requirement was dropped from HTML5 spec, unfortunately.)
245
246 my $is_space = {
247 0x0009 => 1, # CHARACTER TABULATION (HT)
248 0x000A => 1, # LINE FEED (LF)
249 #0x000B => 0, # LINE TABULATION (VT)
250 0x000C => 1, # FORM FEED (FF)
251 #0x000D => 1, # CARRIAGE RETURN (CR)
252 0x0020 => 1, # SPACE (SP)
253 };
254
255 sub _get_next_token ($) {
256 my $self = shift;
257
258 if ($self->{self_closing}) {
259 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
260 ## NOTE: The |self_closing| flag is only set by start tag token.
261 ## In addition, when a start tag token is emitted, it is always set to
262 ## |ct|.
263 delete $self->{self_closing};
264 }
265
266 if (@{$self->{token}}) {
267 $self->{self_closing} = $self->{token}->[0]->{self_closing};
268 return shift @{$self->{token}};
269 }
270
271 A: {
272 if ($self->{state} == PCDATA_STATE) {
273 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
274
275 if ($self->{nc} == 0x0026) { # &
276
277 ## NOTE: In the spec, the tokenizer is switched to the
278 ## "entity data state". In this implementation, the tokenizer
279 ## is switched to the |ENTITY_STATE|, which is an implementation
280 ## of the "consume a character reference" algorithm.
281 $self->{entity_add} = -1;
282 $self->{prev_state} = DATA_STATE;
283 $self->{state} = ENTITY_STATE;
284
285 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
286 $self->{line_prev} = $self->{line};
287 $self->{column_prev} = $self->{column};
288 $self->{column}++;
289 $self->{nc}
290 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
291 } else {
292 $self->{set_nc}->($self);
293 }
294
295 redo A;
296 } elsif ($self->{nc} == 0x003C) { # <
297
298 $self->{state} = TAG_OPEN_STATE;
299
300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
301 $self->{line_prev} = $self->{line};
302 $self->{column_prev} = $self->{column};
303 $self->{column}++;
304 $self->{nc}
305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
306 } else {
307 $self->{set_nc}->($self);
308 }
309
310 redo A;
311 } elsif ($self->{nc} == -1) {
312
313 return ({type => END_OF_FILE_TOKEN,
314 line => $self->{line}, column => $self->{column}});
315 last A; ## TODO: ok?
316 } else {
317
318 #
319 }
320
321 # Anything else
322 my $token = {type => CHARACTER_TOKEN,
323 data => chr $self->{nc},
324 line => $self->{line}, column => $self->{column},
325 };
326 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
327
328 ## Stay in the state.
329
330 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
331 $self->{line_prev} = $self->{line};
332 $self->{column_prev} = $self->{column};
333 $self->{column}++;
334 $self->{nc}
335 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
336 } else {
337 $self->{set_nc}->($self);
338 }
339
340 return ($token);
341 redo A;
342 } elsif ($self->{state} == DATA_STATE) {
343 $self->{s_kwd} = '' unless defined $self->{s_kwd};
344 if ($self->{nc} == 0x0026) { # &
345 $self->{s_kwd} = '';
346 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
347 not $self->{escape}) {
348
349 ## NOTE: In the spec, the tokenizer is switched to the
350 ## "entity data state". In this implementation, the tokenizer
351 ## is switched to the |ENTITY_STATE|, which is an implementation
352 ## of the "consume a character reference" algorithm.
353 $self->{entity_add} = -1;
354 $self->{prev_state} = DATA_STATE;
355 $self->{state} = ENTITY_STATE;
356
357 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
358 $self->{line_prev} = $self->{line};
359 $self->{column_prev} = $self->{column};
360 $self->{column}++;
361 $self->{nc}
362 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
363 } else {
364 $self->{set_nc}->($self);
365 }
366
367 redo A;
368 } else {
369
370 #
371 }
372 } elsif ($self->{nc} == 0x002D) { # -
373 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
374 if ($self->{s_kwd} eq '<!-') {
375
376 $self->{escape} = 1; # unless $self->{escape};
377 $self->{s_kwd} = '--';
378 #
379 } elsif ($self->{s_kwd} eq '-') {
380
381 $self->{s_kwd} = '--';
382 #
383 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
384
385 $self->{s_kwd} .= '-';
386 #
387 } else {
388
389 $self->{s_kwd} = '-';
390 #
391 }
392 }
393
394 #
395 } elsif ($self->{nc} == 0x0021) { # !
396 if (length $self->{s_kwd}) {
397
398 $self->{s_kwd} .= '!';
399 #
400 } else {
401
402 #$self->{s_kwd} = '';
403 #
404 }
405 #
406 } elsif ($self->{nc} == 0x003C) { # <
407 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
408 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
409 not $self->{escape})) {
410
411 $self->{state} = TAG_OPEN_STATE;
412
413 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
414 $self->{line_prev} = $self->{line};
415 $self->{column_prev} = $self->{column};
416 $self->{column}++;
417 $self->{nc}
418 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
419 } else {
420 $self->{set_nc}->($self);
421 }
422
423 redo A;
424 } else {
425
426 $self->{s_kwd} = '';
427 #
428 }
429 } elsif ($self->{nc} == 0x003E) { # >
430 if ($self->{escape} and
431 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
432 if ($self->{s_kwd} eq '--') {
433
434 delete $self->{escape};
435 #
436 } else {
437
438 #
439 }
440 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
441
442 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
443 line => $self->{line_prev},
444 column => $self->{column_prev} - 1);
445 #
446 } else {
447
448 #
449 }
450
451 $self->{s_kwd} = '';
452 #
453 } elsif ($self->{nc} == 0x005D) { # ]
454 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
455
456 $self->{s_kwd} .= ']';
457 } elsif ($self->{s_kwd} eq ']]') {
458
459 #
460 } else {
461
462 $self->{s_kwd} = '';
463 }
464 #
465 } elsif ($self->{nc} == -1) {
466
467 $self->{s_kwd} = '';
468 return ({type => END_OF_FILE_TOKEN,
469 line => $self->{line}, column => $self->{column}});
470 last A; ## TODO: ok?
471 } else {
472
473 $self->{s_kwd} = '';
474 #
475 }
476
477 # Anything else
478 my $token = {type => CHARACTER_TOKEN,
479 data => chr $self->{nc},
480 line => $self->{line}, column => $self->{column},
481 };
482 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
483 length $token->{data})) {
484 $self->{s_kwd} = '';
485 }
486
487 ## Stay in the data state.
488 if (not $self->{is_xml} and
489 $self->{content_model} == PCDATA_CONTENT_MODEL) {
490
491 $self->{state} = PCDATA_STATE;
492 } else {
493
494 ## Stay in the state.
495 }
496
497 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
498 $self->{line_prev} = $self->{line};
499 $self->{column_prev} = $self->{column};
500 $self->{column}++;
501 $self->{nc}
502 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
503 } else {
504 $self->{set_nc}->($self);
505 }
506
507 return ($token);
508 redo A;
509 } elsif ($self->{state} == TAG_OPEN_STATE) {
510 ## XML5: "tag state".
511
512 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
513 if ($self->{nc} == 0x002F) { # /
514
515
516 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
517 $self->{line_prev} = $self->{line};
518 $self->{column_prev} = $self->{column};
519 $self->{column}++;
520 $self->{nc}
521 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
522 } else {
523 $self->{set_nc}->($self);
524 }
525
526 $self->{state} = CLOSE_TAG_OPEN_STATE;
527 redo A;
528 } elsif ($self->{nc} == 0x0021) { # !
529
530 $self->{s_kwd} = '<' unless $self->{escape};
531 #
532 } else {
533
534 #
535 }
536
537 ## reconsume
538 $self->{state} = DATA_STATE;
539 $self->{s_kwd} = '';
540 return ({type => CHARACTER_TOKEN, data => '<',
541 line => $self->{line_prev},
542 column => $self->{column_prev},
543 });
544 redo A;
545 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
546 if ($self->{nc} == 0x0021) { # !
547
548 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
549
550 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
551 $self->{line_prev} = $self->{line};
552 $self->{column_prev} = $self->{column};
553 $self->{column}++;
554 $self->{nc}
555 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
556 } else {
557 $self->{set_nc}->($self);
558 }
559
560 redo A;
561 } elsif ($self->{nc} == 0x002F) { # /
562
563 $self->{state} = CLOSE_TAG_OPEN_STATE;
564
565 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
566 $self->{line_prev} = $self->{line};
567 $self->{column_prev} = $self->{column};
568 $self->{column}++;
569 $self->{nc}
570 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
571 } else {
572 $self->{set_nc}->($self);
573 }
574
575 redo A;
576 } elsif (0x0041 <= $self->{nc} and
577 $self->{nc} <= 0x005A) { # A..Z
578
579 $self->{ct}
580 = {type => START_TAG_TOKEN,
581 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
582 line => $self->{line_prev},
583 column => $self->{column_prev}};
584 $self->{state} = TAG_NAME_STATE;
585
586 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
587 $self->{line_prev} = $self->{line};
588 $self->{column_prev} = $self->{column};
589 $self->{column}++;
590 $self->{nc}
591 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
592 } else {
593 $self->{set_nc}->($self);
594 }
595
596 redo A;
597 } elsif (0x0061 <= $self->{nc} and
598 $self->{nc} <= 0x007A) { # a..z
599
600 $self->{ct} = {type => START_TAG_TOKEN,
601 tag_name => chr ($self->{nc}),
602 line => $self->{line_prev},
603 column => $self->{column_prev}};
604 $self->{state} = TAG_NAME_STATE;
605
606 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
607 $self->{line_prev} = $self->{line};
608 $self->{column_prev} = $self->{column};
609 $self->{column}++;
610 $self->{nc}
611 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
612 } else {
613 $self->{set_nc}->($self);
614 }
615
616 redo A;
617 } elsif ($self->{nc} == 0x003E) { # >
618
619 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
620 line => $self->{line_prev},
621 column => $self->{column_prev});
622 $self->{state} = DATA_STATE;
623 $self->{s_kwd} = '';
624
625 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
626 $self->{line_prev} = $self->{line};
627 $self->{column_prev} = $self->{column};
628 $self->{column}++;
629 $self->{nc}
630 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
631 } else {
632 $self->{set_nc}->($self);
633 }
634
635
636 return ({type => CHARACTER_TOKEN, data => '<>',
637 line => $self->{line_prev},
638 column => $self->{column_prev},
639 });
640
641 redo A;
642 } elsif ($self->{nc} == 0x003F) { # ?
643 if ($self->{is_xml}) {
644
645 $self->{state} = PI_STATE;
646
647 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
648 $self->{line_prev} = $self->{line};
649 $self->{column_prev} = $self->{column};
650 $self->{column}++;
651 $self->{nc}
652 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
653 } else {
654 $self->{set_nc}->($self);
655 }
656
657 redo A;
658 } else {
659
660 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
661 line => $self->{line_prev},
662 column => $self->{column_prev});
663 $self->{state} = BOGUS_COMMENT_STATE;
664 $self->{ct} = {type => COMMENT_TOKEN, data => '',
665 line => $self->{line_prev},
666 column => $self->{column_prev},
667 };
668 ## $self->{nc} is intentionally left as is
669 redo A;
670 }
671 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
672
673 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
674 line => $self->{line_prev},
675 column => $self->{column_prev});
676 $self->{state} = DATA_STATE;
677 $self->{s_kwd} = '';
678 ## reconsume
679
680 return ({type => CHARACTER_TOKEN, data => '<',
681 line => $self->{line_prev},
682 column => $self->{column_prev},
683 });
684
685 redo A;
686 } else {
687 ## XML5: "<:" is a parse error.
688
689 $self->{ct} = {type => START_TAG_TOKEN,
690 tag_name => chr ($self->{nc}),
691 line => $self->{line_prev},
692 column => $self->{column_prev}};
693 $self->{state} = TAG_NAME_STATE;
694
695 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
696 $self->{line_prev} = $self->{line};
697 $self->{column_prev} = $self->{column};
698 $self->{column}++;
699 $self->{nc}
700 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
701 } else {
702 $self->{set_nc}->($self);
703 }
704
705 redo A;
706 }
707 } else {
708 die "$0: $self->{content_model} in tag open";
709 }
710 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
711 ## NOTE: The "close tag open state" in the spec is implemented as
712 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
713
714 ## XML5: "end tag state".
715
716 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
717 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
718 if (defined $self->{last_stag_name}) {
719 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
720 $self->{s_kwd} = '';
721 ## Reconsume.
722 redo A;
723 } else {
724 ## No start tag token has ever been emitted
725 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
726
727 $self->{state} = DATA_STATE;
728 $self->{s_kwd} = '';
729 ## Reconsume.
730 return ({type => CHARACTER_TOKEN, data => '</',
731 line => $l, column => $c,
732 });
733 redo A;
734 }
735 }
736
737 if (0x0041 <= $self->{nc} and
738 $self->{nc} <= 0x005A) { # A..Z
739
740 $self->{ct}
741 = {type => END_TAG_TOKEN,
742 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
743 line => $l, column => $c};
744 $self->{state} = TAG_NAME_STATE;
745
746 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
747 $self->{line_prev} = $self->{line};
748 $self->{column_prev} = $self->{column};
749 $self->{column}++;
750 $self->{nc}
751 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
752 } else {
753 $self->{set_nc}->($self);
754 }
755
756 redo A;
757 } elsif (0x0061 <= $self->{nc} and
758 $self->{nc} <= 0x007A) { # a..z
759
760 $self->{ct} = {type => END_TAG_TOKEN,
761 tag_name => chr ($self->{nc}),
762 line => $l, column => $c};
763 $self->{state} = TAG_NAME_STATE;
764
765 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
766 $self->{line_prev} = $self->{line};
767 $self->{column_prev} = $self->{column};
768 $self->{column}++;
769 $self->{nc}
770 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
771 } else {
772 $self->{set_nc}->($self);
773 }
774
775 redo A;
776 } elsif ($self->{nc} == 0x003E) { # >
777 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
778 line => $self->{line_prev}, ## "<" in "</>"
779 column => $self->{column_prev} - 1);
780 $self->{state} = DATA_STATE;
781 $self->{s_kwd} = '';
782 if ($self->{is_xml}) {
783
784 ## XML5: No parse error.
785
786 ## NOTE: This parser raises a parse error, since it supports
787 ## XML1, not XML5.
788
789 ## NOTE: A short end tag token.
790 my $ct = {type => END_TAG_TOKEN,
791 tag_name => '',
792 line => $self->{line_prev},
793 column => $self->{column_prev} - 1,
794 };
795
796 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
797 $self->{line_prev} = $self->{line};
798 $self->{column_prev} = $self->{column};
799 $self->{column}++;
800 $self->{nc}
801 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
802 } else {
803 $self->{set_nc}->($self);
804 }
805
806 return ($ct);
807 } else {
808
809
810 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
811 $self->{line_prev} = $self->{line};
812 $self->{column_prev} = $self->{column};
813 $self->{column}++;
814 $self->{nc}
815 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
816 } else {
817 $self->{set_nc}->($self);
818 }
819
820 }
821 redo A;
822 } elsif ($self->{nc} == -1) {
823
824 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
825 $self->{s_kwd} = '';
826 $self->{state} = DATA_STATE;
827 # reconsume
828
829 return ({type => CHARACTER_TOKEN, data => '</',
830 line => $l, column => $c,
831 });
832
833 redo A;
834 } elsif (not $self->{is_xml} or
835 $is_space->{$self->{nc}}) {
836
837 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
838 line => $self->{line_prev}, # "<" of "</"
839 column => $self->{column_prev} - 1);
840 $self->{state} = BOGUS_COMMENT_STATE;
841 $self->{ct} = {type => COMMENT_TOKEN, data => '',
842 line => $self->{line_prev}, # "<" of "</"
843 column => $self->{column_prev} - 1,
844 };
845 ## NOTE: $self->{nc} is intentionally left as is.
846 ## Although the "anything else" case of the spec not explicitly
847 ## states that the next input character is to be reconsumed,
848 ## it will be included to the |data| of the comment token
849 ## generated from the bogus end tag, as defined in the
850 ## "bogus comment state" entry.
851 redo A;
852 } else {
853 ## XML5: "</:" is a parse error.
854
855 $self->{ct} = {type => END_TAG_TOKEN,
856 tag_name => chr ($self->{nc}),
857 line => $l, column => $c};
858 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
859
860 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
861 $self->{line_prev} = $self->{line};
862 $self->{column_prev} = $self->{column};
863 $self->{column}++;
864 $self->{nc}
865 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
866 } else {
867 $self->{set_nc}->($self);
868 }
869
870 redo A;
871 }
872 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
873 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
874 if (length $ch) {
875 my $CH = $ch;
876 $ch =~ tr/a-z/A-Z/;
877 my $nch = chr $self->{nc};
878 if ($nch eq $ch or $nch eq $CH) {
879
880 ## Stay in the state.
881 $self->{s_kwd} .= $nch;
882
883 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
884 $self->{line_prev} = $self->{line};
885 $self->{column_prev} = $self->{column};
886 $self->{column}++;
887 $self->{nc}
888 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
889 } else {
890 $self->{set_nc}->($self);
891 }
892
893 redo A;
894 } else {
895
896 $self->{state} = DATA_STATE;
897 $self->{s_kwd} = '';
898 ## Reconsume.
899 return ({type => CHARACTER_TOKEN,
900 data => '</' . $self->{s_kwd},
901 line => $self->{line_prev},
902 column => $self->{column_prev} - 1 - length $self->{s_kwd},
903 });
904 redo A;
905 }
906 } else { # after "<{tag-name}"
907 unless ($is_space->{$self->{nc}} or
908 {
909 0x003E => 1, # >
910 0x002F => 1, # /
911 -1 => 1, # EOF
912 }->{$self->{nc}}) {
913
914 ## Reconsume.
915 $self->{state} = DATA_STATE;
916 $self->{s_kwd} = '';
917 return ({type => CHARACTER_TOKEN,
918 data => '</' . $self->{s_kwd},
919 line => $self->{line_prev},
920 column => $self->{column_prev} - 1 - length $self->{s_kwd},
921 });
922 redo A;
923 } else {
924
925 $self->{ct}
926 = {type => END_TAG_TOKEN,
927 tag_name => $self->{last_stag_name},
928 line => $self->{line_prev},
929 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
930 $self->{state} = TAG_NAME_STATE;
931 ## Reconsume.
932 redo A;
933 }
934 }
935 } elsif ($self->{state} == TAG_NAME_STATE) {
936 if ($is_space->{$self->{nc}}) {
937
938 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
939
940 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
941 $self->{line_prev} = $self->{line};
942 $self->{column_prev} = $self->{column};
943 $self->{column}++;
944 $self->{nc}
945 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
946 } else {
947 $self->{set_nc}->($self);
948 }
949
950 redo A;
951 } elsif ($self->{nc} == 0x003E) { # >
952 if ($self->{ct}->{type} == START_TAG_TOKEN) {
953
954 $self->{last_stag_name} = $self->{ct}->{tag_name};
955 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
956 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
957 #if ($self->{ct}->{attributes}) {
958 # ## NOTE: This should never be reached.
959 # !!! cp (36);
960 # !!! parse-error (type => 'end tag attribute');
961 #} else {
962
963 #}
964 } else {
965 die "$0: $self->{ct}->{type}: Unknown token type";
966 }
967 $self->{state} = DATA_STATE;
968 $self->{s_kwd} = '';
969
970 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
971 $self->{line_prev} = $self->{line};
972 $self->{column_prev} = $self->{column};
973 $self->{column}++;
974 $self->{nc}
975 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
976 } else {
977 $self->{set_nc}->($self);
978 }
979
980
981 return ($self->{ct}); # start tag or end tag
982
983 redo A;
984 } elsif (0x0041 <= $self->{nc} and
985 $self->{nc} <= 0x005A) { # A..Z
986
987 $self->{ct}->{tag_name}
988 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
989 # start tag or end tag
990 ## Stay in this state
991
992 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
993 $self->{line_prev} = $self->{line};
994 $self->{column_prev} = $self->{column};
995 $self->{column}++;
996 $self->{nc}
997 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
998 } else {
999 $self->{set_nc}->($self);
1000 }
1001
1002 redo A;
1003 } elsif ($self->{nc} == -1) {
1004 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1005 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1006
1007 $self->{last_stag_name} = $self->{ct}->{tag_name};
1008 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1009 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1010 #if ($self->{ct}->{attributes}) {
1011 # ## NOTE: This state should never be reached.
1012 # !!! cp (40);
1013 # !!! parse-error (type => 'end tag attribute');
1014 #} else {
1015
1016 #}
1017 } else {
1018 die "$0: $self->{ct}->{type}: Unknown token type";
1019 }
1020 $self->{state} = DATA_STATE;
1021 $self->{s_kwd} = '';
1022 # reconsume
1023
1024 return ($self->{ct}); # start tag or end tag
1025
1026 redo A;
1027 } elsif ($self->{nc} == 0x002F) { # /
1028
1029 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1030
1031 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1032 $self->{line_prev} = $self->{line};
1033 $self->{column_prev} = $self->{column};
1034 $self->{column}++;
1035 $self->{nc}
1036 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1037 } else {
1038 $self->{set_nc}->($self);
1039 }
1040
1041 redo A;
1042 } else {
1043
1044 $self->{ct}->{tag_name} .= chr $self->{nc};
1045 # start tag or end tag
1046 ## Stay in the state
1047
1048 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1049 $self->{line_prev} = $self->{line};
1050 $self->{column_prev} = $self->{column};
1051 $self->{column}++;
1052 $self->{nc}
1053 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1054 } else {
1055 $self->{set_nc}->($self);
1056 }
1057
1058 redo A;
1059 }
1060 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1061 if ($is_space->{$self->{nc}}) {
1062
1063 ## Stay in the state
1064
1065 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1066 $self->{line_prev} = $self->{line};
1067 $self->{column_prev} = $self->{column};
1068 $self->{column}++;
1069 $self->{nc}
1070 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1071 } else {
1072 $self->{set_nc}->($self);
1073 }
1074
1075 redo A;
1076 } elsif ($self->{nc} == 0x003E) { # >
1077 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1078
1079 $self->{last_stag_name} = $self->{ct}->{tag_name};
1080 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1081 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1082 if ($self->{ct}->{attributes}) {
1083
1084 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1085 } else {
1086
1087 }
1088 } else {
1089 die "$0: $self->{ct}->{type}: Unknown token type";
1090 }
1091 $self->{state} = DATA_STATE;
1092 $self->{s_kwd} = '';
1093
1094 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1095 $self->{line_prev} = $self->{line};
1096 $self->{column_prev} = $self->{column};
1097 $self->{column}++;
1098 $self->{nc}
1099 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1100 } else {
1101 $self->{set_nc}->($self);
1102 }
1103
1104
1105 return ($self->{ct}); # start tag or end tag
1106
1107 redo A;
1108 } elsif (0x0041 <= $self->{nc} and
1109 $self->{nc} <= 0x005A) { # A..Z
1110
1111 $self->{ca}
1112 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1113 value => '',
1114 line => $self->{line}, column => $self->{column}};
1115 $self->{state} = ATTRIBUTE_NAME_STATE;
1116
1117 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1118 $self->{line_prev} = $self->{line};
1119 $self->{column_prev} = $self->{column};
1120 $self->{column}++;
1121 $self->{nc}
1122 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1123 } else {
1124 $self->{set_nc}->($self);
1125 }
1126
1127 redo A;
1128 } elsif ($self->{nc} == 0x002F) { # /
1129
1130 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1131
1132 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1133 $self->{line_prev} = $self->{line};
1134 $self->{column_prev} = $self->{column};
1135 $self->{column}++;
1136 $self->{nc}
1137 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1138 } else {
1139 $self->{set_nc}->($self);
1140 }
1141
1142 redo A;
1143 } elsif ($self->{nc} == -1) {
1144 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1145 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1146
1147 $self->{last_stag_name} = $self->{ct}->{tag_name};
1148 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1149 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1150 if ($self->{ct}->{attributes}) {
1151
1152 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1153 } else {
1154
1155 }
1156 } else {
1157 die "$0: $self->{ct}->{type}: Unknown token type";
1158 }
1159 $self->{state} = DATA_STATE;
1160 $self->{s_kwd} = '';
1161 # reconsume
1162
1163 return ($self->{ct}); # start tag or end tag
1164
1165 redo A;
1166 } else {
1167 if ({
1168 0x0022 => 1, # "
1169 0x0027 => 1, # '
1170 0x003D => 1, # =
1171 }->{$self->{nc}}) {
1172
1173 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1174 } else {
1175
1176 }
1177 $self->{ca}
1178 = {name => chr ($self->{nc}),
1179 value => '',
1180 line => $self->{line}, column => $self->{column}};
1181 $self->{state} = ATTRIBUTE_NAME_STATE;
1182
1183 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1184 $self->{line_prev} = $self->{line};
1185 $self->{column_prev} = $self->{column};
1186 $self->{column}++;
1187 $self->{nc}
1188 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1189 } else {
1190 $self->{set_nc}->($self);
1191 }
1192
1193 redo A;
1194 }
1195 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1196 my $before_leave = sub {
1197 if (exists $self->{ct}->{attributes} # start tag or end tag
1198 ->{$self->{ca}->{name}}) { # MUST
1199
1200 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1201 ## Discard $self->{ca} # MUST
1202 } else {
1203
1204 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1205 = $self->{ca};
1206 }
1207 }; # $before_leave
1208
1209 if ($is_space->{$self->{nc}}) {
1210
1211 $before_leave->();
1212 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1213
1214 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1215 $self->{line_prev} = $self->{line};
1216 $self->{column_prev} = $self->{column};
1217 $self->{column}++;
1218 $self->{nc}
1219 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1220 } else {
1221 $self->{set_nc}->($self);
1222 }
1223
1224 redo A;
1225 } elsif ($self->{nc} == 0x003D) { # =
1226
1227 $before_leave->();
1228 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1229
1230 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1231 $self->{line_prev} = $self->{line};
1232 $self->{column_prev} = $self->{column};
1233 $self->{column}++;
1234 $self->{nc}
1235 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1236 } else {
1237 $self->{set_nc}->($self);
1238 }
1239
1240 redo A;
1241 } elsif ($self->{nc} == 0x003E) { # >
1242 $before_leave->();
1243 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1244
1245 $self->{last_stag_name} = $self->{ct}->{tag_name};
1246 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1247
1248 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1249 if ($self->{ct}->{attributes}) {
1250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1251 }
1252 } else {
1253 die "$0: $self->{ct}->{type}: Unknown token type";
1254 }
1255 $self->{state} = DATA_STATE;
1256 $self->{s_kwd} = '';
1257
1258 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1259 $self->{line_prev} = $self->{line};
1260 $self->{column_prev} = $self->{column};
1261 $self->{column}++;
1262 $self->{nc}
1263 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1264 } else {
1265 $self->{set_nc}->($self);
1266 }
1267
1268
1269 return ($self->{ct}); # start tag or end tag
1270
1271 redo A;
1272 } elsif (0x0041 <= $self->{nc} and
1273 $self->{nc} <= 0x005A) { # A..Z
1274
1275 $self->{ca}->{name}
1276 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1277 ## Stay in the state
1278
1279 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1280 $self->{line_prev} = $self->{line};
1281 $self->{column_prev} = $self->{column};
1282 $self->{column}++;
1283 $self->{nc}
1284 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1285 } else {
1286 $self->{set_nc}->($self);
1287 }
1288
1289 redo A;
1290 } elsif ($self->{nc} == 0x002F) { # /
1291
1292 $before_leave->();
1293 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1294
1295 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1296 $self->{line_prev} = $self->{line};
1297 $self->{column_prev} = $self->{column};
1298 $self->{column}++;
1299 $self->{nc}
1300 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1301 } else {
1302 $self->{set_nc}->($self);
1303 }
1304
1305 redo A;
1306 } elsif ($self->{nc} == -1) {
1307 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1308 $before_leave->();
1309 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1310
1311 $self->{last_stag_name} = $self->{ct}->{tag_name};
1312 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1313 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1314 if ($self->{ct}->{attributes}) {
1315
1316 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1317 } else {
1318 ## NOTE: This state should never be reached.
1319
1320 }
1321 } else {
1322 die "$0: $self->{ct}->{type}: Unknown token type";
1323 }
1324 $self->{state} = DATA_STATE;
1325 $self->{s_kwd} = '';
1326 # reconsume
1327
1328 return ($self->{ct}); # start tag or end tag
1329
1330 redo A;
1331 } else {
1332 if ($self->{nc} == 0x0022 or # "
1333 $self->{nc} == 0x0027) { # '
1334
1335 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1336 } else {
1337
1338 }
1339 $self->{ca}->{name} .= chr ($self->{nc});
1340 ## Stay in the state
1341
1342 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1343 $self->{line_prev} = $self->{line};
1344 $self->{column_prev} = $self->{column};
1345 $self->{column}++;
1346 $self->{nc}
1347 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1348 } else {
1349 $self->{set_nc}->($self);
1350 }
1351
1352 redo A;
1353 }
1354 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1355 if ($is_space->{$self->{nc}}) {
1356
1357 ## Stay in the state
1358
1359 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1360 $self->{line_prev} = $self->{line};
1361 $self->{column_prev} = $self->{column};
1362 $self->{column}++;
1363 $self->{nc}
1364 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1365 } else {
1366 $self->{set_nc}->($self);
1367 }
1368
1369 redo A;
1370 } elsif ($self->{nc} == 0x003D) { # =
1371
1372 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1373
1374 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1375 $self->{line_prev} = $self->{line};
1376 $self->{column_prev} = $self->{column};
1377 $self->{column}++;
1378 $self->{nc}
1379 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1380 } else {
1381 $self->{set_nc}->($self);
1382 }
1383
1384 redo A;
1385 } elsif ($self->{nc} == 0x003E) { # >
1386 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1387
1388 $self->{last_stag_name} = $self->{ct}->{tag_name};
1389 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1390 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1391 if ($self->{ct}->{attributes}) {
1392
1393 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1394 } else {
1395 ## NOTE: This state should never be reached.
1396
1397 }
1398 } else {
1399 die "$0: $self->{ct}->{type}: Unknown token type";
1400 }
1401 $self->{state} = DATA_STATE;
1402 $self->{s_kwd} = '';
1403
1404 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1405 $self->{line_prev} = $self->{line};
1406 $self->{column_prev} = $self->{column};
1407 $self->{column}++;
1408 $self->{nc}
1409 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1410 } else {
1411 $self->{set_nc}->($self);
1412 }
1413
1414
1415 return ($self->{ct}); # start tag or end tag
1416
1417 redo A;
1418 } elsif (0x0041 <= $self->{nc} and
1419 $self->{nc} <= 0x005A) { # A..Z
1420
1421 $self->{ca}
1422 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1423 value => '',
1424 line => $self->{line}, column => $self->{column}};
1425 $self->{state} = ATTRIBUTE_NAME_STATE;
1426
1427 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1428 $self->{line_prev} = $self->{line};
1429 $self->{column_prev} = $self->{column};
1430 $self->{column}++;
1431 $self->{nc}
1432 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1433 } else {
1434 $self->{set_nc}->($self);
1435 }
1436
1437 redo A;
1438 } elsif ($self->{nc} == 0x002F) { # /
1439
1440 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1441
1442 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1443 $self->{line_prev} = $self->{line};
1444 $self->{column_prev} = $self->{column};
1445 $self->{column}++;
1446 $self->{nc}
1447 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1448 } else {
1449 $self->{set_nc}->($self);
1450 }
1451
1452 redo A;
1453 } elsif ($self->{nc} == -1) {
1454 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1455 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1456
1457 $self->{last_stag_name} = $self->{ct}->{tag_name};
1458 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1459 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1460 if ($self->{ct}->{attributes}) {
1461
1462 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1463 } else {
1464 ## NOTE: This state should never be reached.
1465
1466 }
1467 } else {
1468 die "$0: $self->{ct}->{type}: Unknown token type";
1469 }
1470 $self->{s_kwd} = '';
1471 $self->{state} = DATA_STATE;
1472 # reconsume
1473
1474 return ($self->{ct}); # start tag or end tag
1475
1476 redo A;
1477 } else {
1478 if ($self->{nc} == 0x0022 or # "
1479 $self->{nc} == 0x0027) { # '
1480
1481 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1482 } else {
1483
1484 }
1485 $self->{ca}
1486 = {name => chr ($self->{nc}),
1487 value => '',
1488 line => $self->{line}, column => $self->{column}};
1489 $self->{state} = ATTRIBUTE_NAME_STATE;
1490
1491 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1492 $self->{line_prev} = $self->{line};
1493 $self->{column_prev} = $self->{column};
1494 $self->{column}++;
1495 $self->{nc}
1496 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1497 } else {
1498 $self->{set_nc}->($self);
1499 }
1500
1501 redo A;
1502 }
1503 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1504 if ($is_space->{$self->{nc}}) {
1505
1506 ## Stay in the state
1507
1508 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1509 $self->{line_prev} = $self->{line};
1510 $self->{column_prev} = $self->{column};
1511 $self->{column}++;
1512 $self->{nc}
1513 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1514 } else {
1515 $self->{set_nc}->($self);
1516 }
1517
1518 redo A;
1519 } elsif ($self->{nc} == 0x0022) { # "
1520
1521 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1522
1523 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1524 $self->{line_prev} = $self->{line};
1525 $self->{column_prev} = $self->{column};
1526 $self->{column}++;
1527 $self->{nc}
1528 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1529 } else {
1530 $self->{set_nc}->($self);
1531 }
1532
1533 redo A;
1534 } elsif ($self->{nc} == 0x0026) { # &
1535
1536 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1537 ## reconsume
1538 redo A;
1539 } elsif ($self->{nc} == 0x0027) { # '
1540
1541 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1542
1543 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1544 $self->{line_prev} = $self->{line};
1545 $self->{column_prev} = $self->{column};
1546 $self->{column}++;
1547 $self->{nc}
1548 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1549 } else {
1550 $self->{set_nc}->($self);
1551 }
1552
1553 redo A;
1554 } elsif ($self->{nc} == 0x003E) { # >
1555 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1556 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1557
1558 $self->{last_stag_name} = $self->{ct}->{tag_name};
1559 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1560 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1561 if ($self->{ct}->{attributes}) {
1562
1563 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1564 } else {
1565 ## NOTE: This state should never be reached.
1566
1567 }
1568 } else {
1569 die "$0: $self->{ct}->{type}: Unknown token type";
1570 }
1571 $self->{state} = DATA_STATE;
1572 $self->{s_kwd} = '';
1573
1574 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1575 $self->{line_prev} = $self->{line};
1576 $self->{column_prev} = $self->{column};
1577 $self->{column}++;
1578 $self->{nc}
1579 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1580 } else {
1581 $self->{set_nc}->($self);
1582 }
1583
1584
1585 return ($self->{ct}); # start tag or end tag
1586
1587 redo A;
1588 } elsif ($self->{nc} == -1) {
1589 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1590 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1591
1592 $self->{last_stag_name} = $self->{ct}->{tag_name};
1593 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1594 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1595 if ($self->{ct}->{attributes}) {
1596
1597 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1598 } else {
1599 ## NOTE: This state should never be reached.
1600
1601 }
1602 } else {
1603 die "$0: $self->{ct}->{type}: Unknown token type";
1604 }
1605 $self->{state} = DATA_STATE;
1606 $self->{s_kwd} = '';
1607 ## reconsume
1608
1609 return ($self->{ct}); # start tag or end tag
1610
1611 redo A;
1612 } else {
1613 if ($self->{nc} == 0x003D) { # =
1614
1615 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1616 } else {
1617
1618 }
1619 $self->{ca}->{value} .= chr ($self->{nc});
1620 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1621
1622 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1623 $self->{line_prev} = $self->{line};
1624 $self->{column_prev} = $self->{column};
1625 $self->{column}++;
1626 $self->{nc}
1627 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1628 } else {
1629 $self->{set_nc}->($self);
1630 }
1631
1632 redo A;
1633 }
1634 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1635 if ($self->{nc} == 0x0022) { # "
1636
1637 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1638
1639 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1640 $self->{line_prev} = $self->{line};
1641 $self->{column_prev} = $self->{column};
1642 $self->{column}++;
1643 $self->{nc}
1644 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1645 } else {
1646 $self->{set_nc}->($self);
1647 }
1648
1649 redo A;
1650 } elsif ($self->{nc} == 0x0026) { # &
1651
1652 ## NOTE: In the spec, the tokenizer is switched to the
1653 ## "entity in attribute value state". In this implementation, the
1654 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1655 ## implementation of the "consume a character reference" algorithm.
1656 $self->{prev_state} = $self->{state};
1657 $self->{entity_add} = 0x0022; # "
1658 $self->{state} = ENTITY_STATE;
1659
1660 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1661 $self->{line_prev} = $self->{line};
1662 $self->{column_prev} = $self->{column};
1663 $self->{column}++;
1664 $self->{nc}
1665 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1666 } else {
1667 $self->{set_nc}->($self);
1668 }
1669
1670 redo A;
1671 } elsif ($self->{nc} == -1) {
1672 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1673 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1674
1675 $self->{last_stag_name} = $self->{ct}->{tag_name};
1676 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1677 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1678 if ($self->{ct}->{attributes}) {
1679
1680 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1681 } else {
1682 ## NOTE: This state should never be reached.
1683
1684 }
1685 } else {
1686 die "$0: $self->{ct}->{type}: Unknown token type";
1687 }
1688 $self->{state} = DATA_STATE;
1689 $self->{s_kwd} = '';
1690 ## reconsume
1691
1692 return ($self->{ct}); # start tag or end tag
1693
1694 redo A;
1695 } else {
1696
1697 $self->{ca}->{value} .= chr ($self->{nc});
1698 $self->{read_until}->($self->{ca}->{value},
1699 q["&],
1700 length $self->{ca}->{value});
1701
1702 ## Stay in the state
1703
1704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705 $self->{line_prev} = $self->{line};
1706 $self->{column_prev} = $self->{column};
1707 $self->{column}++;
1708 $self->{nc}
1709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710 } else {
1711 $self->{set_nc}->($self);
1712 }
1713
1714 redo A;
1715 }
1716 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1717 if ($self->{nc} == 0x0027) { # '
1718
1719 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1720
1721 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1722 $self->{line_prev} = $self->{line};
1723 $self->{column_prev} = $self->{column};
1724 $self->{column}++;
1725 $self->{nc}
1726 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1727 } else {
1728 $self->{set_nc}->($self);
1729 }
1730
1731 redo A;
1732 } elsif ($self->{nc} == 0x0026) { # &
1733
1734 ## NOTE: In the spec, the tokenizer is switched to the
1735 ## "entity in attribute value state". In this implementation, the
1736 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1737 ## implementation of the "consume a character reference" algorithm.
1738 $self->{entity_add} = 0x0027; # '
1739 $self->{prev_state} = $self->{state};
1740 $self->{state} = ENTITY_STATE;
1741
1742 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1743 $self->{line_prev} = $self->{line};
1744 $self->{column_prev} = $self->{column};
1745 $self->{column}++;
1746 $self->{nc}
1747 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1748 } else {
1749 $self->{set_nc}->($self);
1750 }
1751
1752 redo A;
1753 } elsif ($self->{nc} == -1) {
1754 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1755 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1756
1757 $self->{last_stag_name} = $self->{ct}->{tag_name};
1758 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1759 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1760 if ($self->{ct}->{attributes}) {
1761
1762 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1763 } else {
1764 ## NOTE: This state should never be reached.
1765
1766 }
1767 } else {
1768 die "$0: $self->{ct}->{type}: Unknown token type";
1769 }
1770 $self->{state} = DATA_STATE;
1771 $self->{s_kwd} = '';
1772 ## reconsume
1773
1774 return ($self->{ct}); # start tag or end tag
1775
1776 redo A;
1777 } else {
1778
1779 $self->{ca}->{value} .= chr ($self->{nc});
1780 $self->{read_until}->($self->{ca}->{value},
1781 q['&],
1782 length $self->{ca}->{value});
1783
1784 ## Stay in the state
1785
1786 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1787 $self->{line_prev} = $self->{line};
1788 $self->{column_prev} = $self->{column};
1789 $self->{column}++;
1790 $self->{nc}
1791 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1792 } else {
1793 $self->{set_nc}->($self);
1794 }
1795
1796 redo A;
1797 }
1798 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1799 if ($is_space->{$self->{nc}}) {
1800
1801 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1802
1803 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1804 $self->{line_prev} = $self->{line};
1805 $self->{column_prev} = $self->{column};
1806 $self->{column}++;
1807 $self->{nc}
1808 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1809 } else {
1810 $self->{set_nc}->($self);
1811 }
1812
1813 redo A;
1814 } elsif ($self->{nc} == 0x0026) { # &
1815
1816 ## NOTE: In the spec, the tokenizer is switched to the
1817 ## "entity in attribute value state". In this implementation, the
1818 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1819 ## implementation of the "consume a character reference" algorithm.
1820 $self->{entity_add} = -1;
1821 $self->{prev_state} = $self->{state};
1822 $self->{state} = ENTITY_STATE;
1823
1824 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1825 $self->{line_prev} = $self->{line};
1826 $self->{column_prev} = $self->{column};
1827 $self->{column}++;
1828 $self->{nc}
1829 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1830 } else {
1831 $self->{set_nc}->($self);
1832 }
1833
1834 redo A;
1835 } elsif ($self->{nc} == 0x003E) { # >
1836 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1837
1838 $self->{last_stag_name} = $self->{ct}->{tag_name};
1839 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1840 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1841 if ($self->{ct}->{attributes}) {
1842
1843 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1844 } else {
1845 ## NOTE: This state should never be reached.
1846
1847 }
1848 } else {
1849 die "$0: $self->{ct}->{type}: Unknown token type";
1850 }
1851 $self->{state} = DATA_STATE;
1852 $self->{s_kwd} = '';
1853
1854 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1855 $self->{line_prev} = $self->{line};
1856 $self->{column_prev} = $self->{column};
1857 $self->{column}++;
1858 $self->{nc}
1859 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1860 } else {
1861 $self->{set_nc}->($self);
1862 }
1863
1864
1865 return ($self->{ct}); # start tag or end tag
1866
1867 redo A;
1868 } elsif ($self->{nc} == -1) {
1869 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1870 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1871
1872 $self->{last_stag_name} = $self->{ct}->{tag_name};
1873 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1874 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1875 if ($self->{ct}->{attributes}) {
1876
1877 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1878 } else {
1879 ## NOTE: This state should never be reached.
1880
1881 }
1882 } else {
1883 die "$0: $self->{ct}->{type}: Unknown token type";
1884 }
1885 $self->{state} = DATA_STATE;
1886 $self->{s_kwd} = '';
1887 ## reconsume
1888
1889 return ($self->{ct}); # start tag or end tag
1890
1891 redo A;
1892 } else {
1893 if ({
1894 0x0022 => 1, # "
1895 0x0027 => 1, # '
1896 0x003D => 1, # =
1897 }->{$self->{nc}}) {
1898
1899 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1900 } else {
1901
1902 }
1903 $self->{ca}->{value} .= chr ($self->{nc});
1904 $self->{read_until}->($self->{ca}->{value},
1905 q["'=& >],
1906 length $self->{ca}->{value});
1907
1908 ## Stay in the state
1909
1910 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1911 $self->{line_prev} = $self->{line};
1912 $self->{column_prev} = $self->{column};
1913 $self->{column}++;
1914 $self->{nc}
1915 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1916 } else {
1917 $self->{set_nc}->($self);
1918 }
1919
1920 redo A;
1921 }
1922 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1923 if ($is_space->{$self->{nc}}) {
1924
1925 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1926
1927 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1928 $self->{line_prev} = $self->{line};
1929 $self->{column_prev} = $self->{column};
1930 $self->{column}++;
1931 $self->{nc}
1932 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1933 } else {
1934 $self->{set_nc}->($self);
1935 }
1936
1937 redo A;
1938 } elsif ($self->{nc} == 0x003E) { # >
1939 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1940
1941 $self->{last_stag_name} = $self->{ct}->{tag_name};
1942 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1943 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1944 if ($self->{ct}->{attributes}) {
1945
1946 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1947 } else {
1948 ## NOTE: This state should never be reached.
1949
1950 }
1951 } else {
1952 die "$0: $self->{ct}->{type}: Unknown token type";
1953 }
1954 $self->{state} = DATA_STATE;
1955 $self->{s_kwd} = '';
1956
1957 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1958 $self->{line_prev} = $self->{line};
1959 $self->{column_prev} = $self->{column};
1960 $self->{column}++;
1961 $self->{nc}
1962 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1963 } else {
1964 $self->{set_nc}->($self);
1965 }
1966
1967
1968 return ($self->{ct}); # start tag or end tag
1969
1970 redo A;
1971 } elsif ($self->{nc} == 0x002F) { # /
1972
1973 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1974
1975 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1976 $self->{line_prev} = $self->{line};
1977 $self->{column_prev} = $self->{column};
1978 $self->{column}++;
1979 $self->{nc}
1980 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1981 } else {
1982 $self->{set_nc}->($self);
1983 }
1984
1985 redo A;
1986 } elsif ($self->{nc} == -1) {
1987 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1988 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1989
1990 $self->{last_stag_name} = $self->{ct}->{tag_name};
1991 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1992 if ($self->{ct}->{attributes}) {
1993
1994 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1995 } else {
1996 ## NOTE: This state should never be reached.
1997
1998 }
1999 } else {
2000 die "$0: $self->{ct}->{type}: Unknown token type";
2001 }
2002 $self->{state} = DATA_STATE;
2003 $self->{s_kwd} = '';
2004 ## Reconsume.
2005 return ($self->{ct}); # start tag or end tag
2006 redo A;
2007 } else {
2008
2009 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2010 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2011 ## reconsume
2012 redo A;
2013 }
2014 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2015 if ($self->{nc} == 0x003E) { # >
2016 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2017
2018 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2019 ## TODO: Different type than slash in start tag
2020 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2021 if ($self->{ct}->{attributes}) {
2022
2023 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2024 } else {
2025
2026 }
2027 ## TODO: Test |<title></title/>|
2028 } else {
2029
2030 $self->{self_closing} = 1;
2031 }
2032
2033 $self->{state} = DATA_STATE;
2034 $self->{s_kwd} = '';
2035
2036 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2037 $self->{line_prev} = $self->{line};
2038 $self->{column_prev} = $self->{column};
2039 $self->{column}++;
2040 $self->{nc}
2041 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2042 } else {
2043 $self->{set_nc}->($self);
2044 }
2045
2046
2047 return ($self->{ct}); # start tag or end tag
2048
2049 redo A;
2050 } elsif ($self->{nc} == -1) {
2051 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2052 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2053
2054 $self->{last_stag_name} = $self->{ct}->{tag_name};
2055 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2056 if ($self->{ct}->{attributes}) {
2057
2058 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2059 } else {
2060 ## NOTE: This state should never be reached.
2061
2062 }
2063 } else {
2064 die "$0: $self->{ct}->{type}: Unknown token type";
2065 }
2066 $self->{state} = DATA_STATE;
2067 $self->{s_kwd} = '';
2068 ## Reconsume.
2069 return ($self->{ct}); # start tag or end tag
2070 redo A;
2071 } else {
2072
2073 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2074 ## TODO: This error type is wrong.
2075 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2076 ## Reconsume.
2077 redo A;
2078 }
2079 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2080 ## (only happen if PCDATA state)
2081
2082 ## NOTE: Unlike spec's "bogus comment state", this implementation
2083 ## consumes characters one-by-one basis.
2084
2085 if ($self->{nc} == 0x003E) { # >
2086
2087 $self->{state} = DATA_STATE;
2088 $self->{s_kwd} = '';
2089
2090 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2091 $self->{line_prev} = $self->{line};
2092 $self->{column_prev} = $self->{column};
2093 $self->{column}++;
2094 $self->{nc}
2095 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2096 } else {
2097 $self->{set_nc}->($self);
2098 }
2099
2100
2101 return ($self->{ct}); # comment
2102 redo A;
2103 } elsif ($self->{nc} == -1) {
2104
2105 $self->{state} = DATA_STATE;
2106 $self->{s_kwd} = '';
2107 ## reconsume
2108
2109 return ($self->{ct}); # comment
2110 redo A;
2111 } else {
2112
2113 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2114 $self->{read_until}->($self->{ct}->{data},
2115 q[>],
2116 length $self->{ct}->{data});
2117
2118 ## Stay in the state.
2119
2120 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2121 $self->{line_prev} = $self->{line};
2122 $self->{column_prev} = $self->{column};
2123 $self->{column}++;
2124 $self->{nc}
2125 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2126 } else {
2127 $self->{set_nc}->($self);
2128 }
2129
2130 redo A;
2131 }
2132 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2133 ## (only happen if PCDATA state)
2134
2135 if ($self->{nc} == 0x002D) { # -
2136
2137 $self->{state} = MD_HYPHEN_STATE;
2138
2139 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2140 $self->{line_prev} = $self->{line};
2141 $self->{column_prev} = $self->{column};
2142 $self->{column}++;
2143 $self->{nc}
2144 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2145 } else {
2146 $self->{set_nc}->($self);
2147 }
2148
2149 redo A;
2150 } elsif ($self->{nc} == 0x0044 or # D
2151 $self->{nc} == 0x0064) { # d
2152 ## ASCII case-insensitive.
2153
2154 $self->{state} = MD_DOCTYPE_STATE;
2155 $self->{s_kwd} = chr $self->{nc};
2156
2157 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2158 $self->{line_prev} = $self->{line};
2159 $self->{column_prev} = $self->{column};
2160 $self->{column}++;
2161 $self->{nc}
2162 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2163 } else {
2164 $self->{set_nc}->($self);
2165 }
2166
2167 redo A;
2168 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2169 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2170 $self->{is_xml}) and
2171 $self->{nc} == 0x005B) { # [
2172
2173 $self->{state} = MD_CDATA_STATE;
2174 $self->{s_kwd} = '[';
2175
2176 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2177 $self->{line_prev} = $self->{line};
2178 $self->{column_prev} = $self->{column};
2179 $self->{column}++;
2180 $self->{nc}
2181 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2182 } else {
2183 $self->{set_nc}->($self);
2184 }
2185
2186 redo A;
2187 } else {
2188
2189 }
2190
2191 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2192 line => $self->{line_prev},
2193 column => $self->{column_prev} - 1);
2194 ## Reconsume.
2195 $self->{state} = BOGUS_COMMENT_STATE;
2196 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2197 line => $self->{line_prev},
2198 column => $self->{column_prev} - 1,
2199 };
2200 redo A;
2201 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2202 if ($self->{nc} == 0x002D) { # -
2203
2204 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2205 line => $self->{line_prev},
2206 column => $self->{column_prev} - 2,
2207 };
2208 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2209
2210 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2211 $self->{line_prev} = $self->{line};
2212 $self->{column_prev} = $self->{column};
2213 $self->{column}++;
2214 $self->{nc}
2215 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2216 } else {
2217 $self->{set_nc}->($self);
2218 }
2219
2220 redo A;
2221 } else {
2222
2223 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2224 line => $self->{line_prev},
2225 column => $self->{column_prev} - 2);
2226 $self->{state} = BOGUS_COMMENT_STATE;
2227 ## Reconsume.
2228 $self->{ct} = {type => COMMENT_TOKEN,
2229 data => '-',
2230 line => $self->{line_prev},
2231 column => $self->{column_prev} - 2,
2232 };
2233 redo A;
2234 }
2235 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2236 ## ASCII case-insensitive.
2237 if ($self->{nc} == [
2238 undef,
2239 0x004F, # O
2240 0x0043, # C
2241 0x0054, # T
2242 0x0059, # Y
2243 0x0050, # P
2244 ]->[length $self->{s_kwd}] or
2245 $self->{nc} == [
2246 undef,
2247 0x006F, # o
2248 0x0063, # c
2249 0x0074, # t
2250 0x0079, # y
2251 0x0070, # p
2252 ]->[length $self->{s_kwd}]) {
2253
2254 ## Stay in the state.
2255 $self->{s_kwd} .= chr $self->{nc};
2256
2257 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2258 $self->{line_prev} = $self->{line};
2259 $self->{column_prev} = $self->{column};
2260 $self->{column}++;
2261 $self->{nc}
2262 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2263 } else {
2264 $self->{set_nc}->($self);
2265 }
2266
2267 redo A;
2268 } elsif ((length $self->{s_kwd}) == 6 and
2269 ($self->{nc} == 0x0045 or # E
2270 $self->{nc} == 0x0065)) { # e
2271 if ($self->{s_kwd} ne 'DOCTYP') {
2272
2273 ## XML5: case-sensitive.
2274 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2275 text => 'DOCTYPE',
2276 line => $self->{line_prev},
2277 column => $self->{column_prev} - 5);
2278 } else {
2279
2280 }
2281 $self->{state} = DOCTYPE_STATE;
2282 $self->{ct} = {type => DOCTYPE_TOKEN,
2283 quirks => 1,
2284 line => $self->{line_prev},
2285 column => $self->{column_prev} - 7,
2286 };
2287
2288 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2289 $self->{line_prev} = $self->{line};
2290 $self->{column_prev} = $self->{column};
2291 $self->{column}++;
2292 $self->{nc}
2293 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2294 } else {
2295 $self->{set_nc}->($self);
2296 }
2297
2298 redo A;
2299 } else {
2300
2301 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2302 line => $self->{line_prev},
2303 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2304 $self->{state} = BOGUS_COMMENT_STATE;
2305 ## Reconsume.
2306 $self->{ct} = {type => COMMENT_TOKEN,
2307 data => $self->{s_kwd},
2308 line => $self->{line_prev},
2309 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2310 };
2311 redo A;
2312 }
2313 } elsif ($self->{state} == MD_CDATA_STATE) {
2314 if ($self->{nc} == {
2315 '[' => 0x0043, # C
2316 '[C' => 0x0044, # D
2317 '[CD' => 0x0041, # A
2318 '[CDA' => 0x0054, # T
2319 '[CDAT' => 0x0041, # A
2320 }->{$self->{s_kwd}}) {
2321
2322 ## Stay in the state.
2323 $self->{s_kwd} .= chr $self->{nc};
2324
2325 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2326 $self->{line_prev} = $self->{line};
2327 $self->{column_prev} = $self->{column};
2328 $self->{column}++;
2329 $self->{nc}
2330 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2331 } else {
2332 $self->{set_nc}->($self);
2333 }
2334
2335 redo A;
2336 } elsif ($self->{s_kwd} eq '[CDATA' and
2337 $self->{nc} == 0x005B) { # [
2338 if ($self->{is_xml} and
2339 not $self->{tainted} and
2340 @{$self->{open_elements} or []} == 0) {
2341
2342 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2343 line => $self->{line_prev},
2344 column => $self->{column_prev} - 7);
2345 $self->{tainted} = 1;
2346 } else {
2347
2348 }
2349
2350 $self->{ct} = {type => CHARACTER_TOKEN,
2351 data => '',
2352 line => $self->{line_prev},
2353 column => $self->{column_prev} - 7};
2354 $self->{state} = CDATA_SECTION_STATE;
2355
2356 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2357 $self->{line_prev} = $self->{line};
2358 $self->{column_prev} = $self->{column};
2359 $self->{column}++;
2360 $self->{nc}
2361 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2362 } else {
2363 $self->{set_nc}->($self);
2364 }
2365
2366 redo A;
2367 } else {
2368
2369 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2370 line => $self->{line_prev},
2371 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2372 $self->{state} = BOGUS_COMMENT_STATE;
2373 ## Reconsume.
2374 $self->{ct} = {type => COMMENT_TOKEN,
2375 data => $self->{s_kwd},
2376 line => $self->{line_prev},
2377 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2378 };
2379 redo A;
2380 }
2381 } elsif ($self->{state} == COMMENT_START_STATE) {
2382 if ($self->{nc} == 0x002D) { # -
2383
2384 $self->{state} = COMMENT_START_DASH_STATE;
2385
2386 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2387 $self->{line_prev} = $self->{line};
2388 $self->{column_prev} = $self->{column};
2389 $self->{column}++;
2390 $self->{nc}
2391 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2392 } else {
2393 $self->{set_nc}->($self);
2394 }
2395
2396 redo A;
2397 } elsif ($self->{nc} == 0x003E) { # >
2398
2399 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2400 $self->{state} = DATA_STATE;
2401 $self->{s_kwd} = '';
2402
2403 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2404 $self->{line_prev} = $self->{line};
2405 $self->{column_prev} = $self->{column};
2406 $self->{column}++;
2407 $self->{nc}
2408 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2409 } else {
2410 $self->{set_nc}->($self);
2411 }
2412
2413
2414 return ($self->{ct}); # comment
2415
2416 redo A;
2417 } elsif ($self->{nc} == -1) {
2418
2419 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2420 $self->{state} = DATA_STATE;
2421 $self->{s_kwd} = '';
2422 ## reconsume
2423
2424 return ($self->{ct}); # comment
2425
2426 redo A;
2427 } else {
2428
2429 $self->{ct}->{data} # comment
2430 .= chr ($self->{nc});
2431 $self->{state} = COMMENT_STATE;
2432
2433 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2434 $self->{line_prev} = $self->{line};
2435 $self->{column_prev} = $self->{column};
2436 $self->{column}++;
2437 $self->{nc}
2438 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2439 } else {
2440 $self->{set_nc}->($self);
2441 }
2442
2443 redo A;
2444 }
2445 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2446 if ($self->{nc} == 0x002D) { # -
2447
2448 $self->{state} = COMMENT_END_STATE;
2449
2450 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2451 $self->{line_prev} = $self->{line};
2452 $self->{column_prev} = $self->{column};
2453 $self->{column}++;
2454 $self->{nc}
2455 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2456 } else {
2457 $self->{set_nc}->($self);
2458 }
2459
2460 redo A;
2461 } elsif ($self->{nc} == 0x003E) { # >
2462
2463 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2464 $self->{state} = DATA_STATE;
2465 $self->{s_kwd} = '';
2466
2467 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2468 $self->{line_prev} = $self->{line};
2469 $self->{column_prev} = $self->{column};
2470 $self->{column}++;
2471 $self->{nc}
2472 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2473 } else {
2474 $self->{set_nc}->($self);
2475 }
2476
2477
2478 return ($self->{ct}); # comment
2479
2480 redo A;
2481 } elsif ($self->{nc} == -1) {
2482
2483 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2484 $self->{state} = DATA_STATE;
2485 $self->{s_kwd} = '';
2486 ## reconsume
2487
2488 return ($self->{ct}); # comment
2489
2490 redo A;
2491 } else {
2492
2493 $self->{ct}->{data} # comment
2494 .= '-' . chr ($self->{nc});
2495 $self->{state} = COMMENT_STATE;
2496
2497 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2498 $self->{line_prev} = $self->{line};
2499 $self->{column_prev} = $self->{column};
2500 $self->{column}++;
2501 $self->{nc}
2502 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2503 } else {
2504 $self->{set_nc}->($self);
2505 }
2506
2507 redo A;
2508 }
2509 } elsif ($self->{state} == COMMENT_STATE) {
2510 if ($self->{nc} == 0x002D) { # -
2511
2512 $self->{state} = COMMENT_END_DASH_STATE;
2513
2514 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2515 $self->{line_prev} = $self->{line};
2516 $self->{column_prev} = $self->{column};
2517 $self->{column}++;
2518 $self->{nc}
2519 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2520 } else {
2521 $self->{set_nc}->($self);
2522 }
2523
2524 redo A;
2525 } elsif ($self->{nc} == -1) {
2526
2527 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2528 $self->{state} = DATA_STATE;
2529 $self->{s_kwd} = '';
2530 ## reconsume
2531
2532 return ($self->{ct}); # comment
2533
2534 redo A;
2535 } else {
2536
2537 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2538 $self->{read_until}->($self->{ct}->{data},
2539 q[-],
2540 length $self->{ct}->{data});
2541
2542 ## Stay in the state
2543
2544 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2545 $self->{line_prev} = $self->{line};
2546 $self->{column_prev} = $self->{column};
2547 $self->{column}++;
2548 $self->{nc}
2549 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2550 } else {
2551 $self->{set_nc}->($self);
2552 }
2553
2554 redo A;
2555 }
2556 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2557 ## XML5: "comment dash state".
2558
2559 if ($self->{nc} == 0x002D) { # -
2560
2561 $self->{state} = COMMENT_END_STATE;
2562
2563 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2564 $self->{line_prev} = $self->{line};
2565 $self->{column_prev} = $self->{column};
2566 $self->{column}++;
2567 $self->{nc}
2568 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2569 } else {
2570 $self->{set_nc}->($self);
2571 }
2572
2573 redo A;
2574 } elsif ($self->{nc} == -1) {
2575
2576 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2577 $self->{s_kwd} = '';
2578 $self->{state} = DATA_STATE;
2579 $self->{s_kwd} = '';
2580 ## reconsume
2581
2582 return ($self->{ct}); # comment
2583
2584 redo A;
2585 } else {
2586
2587 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2588 $self->{state} = COMMENT_STATE;
2589
2590 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2591 $self->{line_prev} = $self->{line};
2592 $self->{column_prev} = $self->{column};
2593 $self->{column}++;
2594 $self->{nc}
2595 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2596 } else {
2597 $self->{set_nc}->($self);
2598 }
2599
2600 redo A;
2601 }
2602 } elsif ($self->{state} == COMMENT_END_STATE) {
2603 if ($self->{nc} == 0x003E) { # >
2604
2605 $self->{state} = DATA_STATE;
2606 $self->{s_kwd} = '';
2607
2608 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2609 $self->{line_prev} = $self->{line};
2610 $self->{column_prev} = $self->{column};
2611 $self->{column}++;
2612 $self->{nc}
2613 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2614 } else {
2615 $self->{set_nc}->($self);
2616 }
2617
2618
2619 return ($self->{ct}); # comment
2620
2621 redo A;
2622 } elsif ($self->{nc} == 0x002D) { # -
2623
2624 ## XML5: Not a parse error.
2625 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2626 line => $self->{line_prev},
2627 column => $self->{column_prev});
2628 $self->{ct}->{data} .= '-'; # comment
2629 ## Stay in the state
2630
2631 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2632 $self->{line_prev} = $self->{line};
2633 $self->{column_prev} = $self->{column};
2634 $self->{column}++;
2635 $self->{nc}
2636 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2637 } else {
2638 $self->{set_nc}->($self);
2639 }
2640
2641 redo A;
2642 } elsif ($self->{nc} == -1) {
2643
2644 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2645 $self->{state} = DATA_STATE;
2646 $self->{s_kwd} = '';
2647 ## reconsume
2648
2649 return ($self->{ct}); # comment
2650
2651 redo A;
2652 } else {
2653
2654 ## XML5: Not a parse error.
2655 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2656 line => $self->{line_prev},
2657 column => $self->{column_prev});
2658 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2659 $self->{state} = COMMENT_STATE;
2660
2661 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2662 $self->{line_prev} = $self->{line};
2663 $self->{column_prev} = $self->{column};
2664 $self->{column}++;
2665 $self->{nc}
2666 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2667 } else {
2668 $self->{set_nc}->($self);
2669 }
2670
2671 redo A;
2672 }
2673 } elsif ($self->{state} == DOCTYPE_STATE) {
2674 if ($is_space->{$self->{nc}}) {
2675
2676 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2677
2678 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2679 $self->{line_prev} = $self->{line};
2680 $self->{column_prev} = $self->{column};
2681 $self->{column}++;
2682 $self->{nc}
2683 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2684 } else {
2685 $self->{set_nc}->($self);
2686 }
2687
2688 redo A;
2689 } else {
2690
2691 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2692 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2693 ## reconsume
2694 redo A;
2695 }
2696 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2697 if ($is_space->{$self->{nc}}) {
2698
2699 ## Stay in the state
2700
2701 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2702 $self->{line_prev} = $self->{line};
2703 $self->{column_prev} = $self->{column};
2704 $self->{column}++;
2705 $self->{nc}
2706 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2707 } else {
2708 $self->{set_nc}->($self);
2709 }
2710
2711 redo A;
2712 } elsif ($self->{nc} == 0x003E) { # >
2713
2714 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2715 $self->{state} = DATA_STATE;
2716 $self->{s_kwd} = '';
2717
2718 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2719 $self->{line_prev} = $self->{line};
2720 $self->{column_prev} = $self->{column};
2721 $self->{column}++;
2722 $self->{nc}
2723 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2724 } else {
2725 $self->{set_nc}->($self);
2726 }
2727
2728
2729 return ($self->{ct}); # DOCTYPE (quirks)
2730
2731 redo A;
2732 } elsif ($self->{nc} == -1) {
2733
2734 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2735 $self->{state} = DATA_STATE;
2736 $self->{s_kwd} = '';
2737 ## reconsume
2738
2739 return ($self->{ct}); # DOCTYPE (quirks)
2740
2741 redo A;
2742 } else {
2743
2744 $self->{ct}->{name} = chr $self->{nc};
2745 delete $self->{ct}->{quirks};
2746 $self->{state} = DOCTYPE_NAME_STATE;
2747
2748 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2749 $self->{line_prev} = $self->{line};
2750 $self->{column_prev} = $self->{column};
2751 $self->{column}++;
2752 $self->{nc}
2753 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2754 } else {
2755 $self->{set_nc}->($self);
2756 }
2757
2758 redo A;
2759 }
2760 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2761 ## ISSUE: Redundant "First," in the spec.
2762 if ($is_space->{$self->{nc}}) {
2763
2764 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2765
2766 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2767 $self->{line_prev} = $self->{line};
2768 $self->{column_prev} = $self->{column};
2769 $self->{column}++;
2770 $self->{nc}
2771 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2772 } else {
2773 $self->{set_nc}->($self);
2774 }
2775
2776 redo A;
2777 } elsif ($self->{nc} == 0x003E) { # >
2778
2779 $self->{state} = DATA_STATE;
2780 $self->{s_kwd} = '';
2781
2782 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2783 $self->{line_prev} = $self->{line};
2784 $self->{column_prev} = $self->{column};
2785 $self->{column}++;
2786 $self->{nc}
2787 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2788 } else {
2789 $self->{set_nc}->($self);
2790 }
2791
2792
2793 return ($self->{ct}); # DOCTYPE
2794
2795 redo A;
2796 } elsif ($self->{nc} == -1) {
2797
2798 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2799 $self->{state} = DATA_STATE;
2800 $self->{s_kwd} = '';
2801 ## reconsume
2802
2803 $self->{ct}->{quirks} = 1;
2804 return ($self->{ct}); # DOCTYPE
2805
2806 redo A;
2807 } else {
2808
2809 $self->{ct}->{name}
2810 .= chr ($self->{nc}); # DOCTYPE
2811 ## Stay in the state
2812
2813 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2814 $self->{line_prev} = $self->{line};
2815 $self->{column_prev} = $self->{column};
2816 $self->{column}++;
2817 $self->{nc}
2818 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2819 } else {
2820 $self->{set_nc}->($self);
2821 }
2822
2823 redo A;
2824 }
2825 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2826 if ($is_space->{$self->{nc}}) {
2827
2828 ## Stay in the state
2829
2830 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2831 $self->{line_prev} = $self->{line};
2832 $self->{column_prev} = $self->{column};
2833 $self->{column}++;
2834 $self->{nc}
2835 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2836 } else {
2837 $self->{set_nc}->($self);
2838 }
2839
2840 redo A;
2841 } elsif ($self->{nc} == 0x003E) { # >
2842
2843 $self->{state} = DATA_STATE;
2844 $self->{s_kwd} = '';
2845
2846 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2847 $self->{line_prev} = $self->{line};
2848 $self->{column_prev} = $self->{column};
2849 $self->{column}++;
2850 $self->{nc}
2851 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2852 } else {
2853 $self->{set_nc}->($self);
2854 }
2855
2856
2857 return ($self->{ct}); # DOCTYPE
2858
2859 redo A;
2860 } elsif ($self->{nc} == -1) {
2861
2862 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2863 $self->{state} = DATA_STATE;
2864 $self->{s_kwd} = '';
2865 ## reconsume
2866
2867 $self->{ct}->{quirks} = 1;
2868 return ($self->{ct}); # DOCTYPE
2869
2870 redo A;
2871 } elsif ($self->{nc} == 0x0050 or # P
2872 $self->{nc} == 0x0070) { # p
2873 $self->{state} = PUBLIC_STATE;
2874 $self->{s_kwd} = chr $self->{nc};
2875
2876 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2877 $self->{line_prev} = $self->{line};
2878 $self->{column_prev} = $self->{column};
2879 $self->{column}++;
2880 $self->{nc}
2881 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2882 } else {
2883 $self->{set_nc}->($self);
2884 }
2885
2886 redo A;
2887 } elsif ($self->{nc} == 0x0053 or # S
2888 $self->{nc} == 0x0073) { # s
2889 $self->{state} = SYSTEM_STATE;
2890 $self->{s_kwd} = chr $self->{nc};
2891
2892 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2893 $self->{line_prev} = $self->{line};
2894 $self->{column_prev} = $self->{column};
2895 $self->{column}++;
2896 $self->{nc}
2897 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2898 } else {
2899 $self->{set_nc}->($self);
2900 }
2901
2902 redo A;
2903 } else {
2904
2905 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2906 $self->{ct}->{quirks} = 1;
2907
2908 $self->{state} = BOGUS_DOCTYPE_STATE;
2909
2910 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2911 $self->{line_prev} = $self->{line};
2912 $self->{column_prev} = $self->{column};
2913 $self->{column}++;
2914 $self->{nc}
2915 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2916 } else {
2917 $self->{set_nc}->($self);
2918 }
2919
2920 redo A;
2921 }
2922 } elsif ($self->{state} == PUBLIC_STATE) {
2923 ## ASCII case-insensitive
2924 if ($self->{nc} == [
2925 undef,
2926 0x0055, # U
2927 0x0042, # B
2928 0x004C, # L
2929 0x0049, # I
2930 ]->[length $self->{s_kwd}] or
2931 $self->{nc} == [
2932 undef,
2933 0x0075, # u
2934 0x0062, # b
2935 0x006C, # l
2936 0x0069, # i
2937 ]->[length $self->{s_kwd}]) {
2938
2939 ## Stay in the state.
2940 $self->{s_kwd} .= chr $self->{nc};
2941
2942 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2943 $self->{line_prev} = $self->{line};
2944 $self->{column_prev} = $self->{column};
2945 $self->{column}++;
2946 $self->{nc}
2947 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2948 } else {
2949 $self->{set_nc}->($self);
2950 }
2951
2952 redo A;
2953 } elsif ((length $self->{s_kwd}) == 5 and
2954 ($self->{nc} == 0x0043 or # C
2955 $self->{nc} == 0x0063)) { # c
2956
2957 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2958
2959 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2960 $self->{line_prev} = $self->{line};
2961 $self->{column_prev} = $self->{column};
2962 $self->{column}++;
2963 $self->{nc}
2964 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2965 } else {
2966 $self->{set_nc}->($self);
2967 }
2968
2969 redo A;
2970 } else {
2971
2972 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2973 line => $self->{line_prev},
2974 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2975 $self->{ct}->{quirks} = 1;
2976
2977 $self->{state} = BOGUS_DOCTYPE_STATE;
2978 ## Reconsume.
2979 redo A;
2980 }
2981 } elsif ($self->{state} == SYSTEM_STATE) {
2982 ## ASCII case-insensitive
2983 if ($self->{nc} == [
2984 undef,
2985 0x0059, # Y
2986 0x0053, # S
2987 0x0054, # T
2988 0x0045, # E
2989 ]->[length $self->{s_kwd}] or
2990 $self->{nc} == [
2991 undef,
2992 0x0079, # y
2993 0x0073, # s
2994 0x0074, # t
2995 0x0065, # e
2996 ]->[length $self->{s_kwd}]) {
2997
2998 ## Stay in the state.
2999 $self->{s_kwd} .= chr $self->{nc};
3000
3001 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3002 $self->{line_prev} = $self->{line};
3003 $self->{column_prev} = $self->{column};
3004 $self->{column}++;
3005 $self->{nc}
3006 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3007 } else {
3008 $self->{set_nc}->($self);
3009 }
3010
3011 redo A;
3012 } elsif ((length $self->{s_kwd}) == 5 and
3013 ($self->{nc} == 0x004D or # M
3014 $self->{nc} == 0x006D)) { # m
3015
3016 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3017
3018 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3019 $self->{line_prev} = $self->{line};
3020 $self->{column_prev} = $self->{column};
3021 $self->{column}++;
3022 $self->{nc}
3023 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3024 } else {
3025 $self->{set_nc}->($self);
3026 }
3027
3028 redo A;
3029 } else {
3030
3031 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3032 line => $self->{line_prev},
3033 column => $self->{column_prev} + 1 - length $self->{s_kwd});
3034 $self->{ct}->{quirks} = 1;
3035
3036 $self->{state} = BOGUS_DOCTYPE_STATE;
3037 ## Reconsume.
3038 redo A;
3039 }
3040 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3041 if ($is_space->{$self->{nc}}) {
3042
3043 ## Stay in the state
3044
3045 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3046 $self->{line_prev} = $self->{line};
3047 $self->{column_prev} = $self->{column};
3048 $self->{column}++;
3049 $self->{nc}
3050 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3051 } else {
3052 $self->{set_nc}->($self);
3053 }
3054
3055 redo A;
3056 } elsif ($self->{nc} eq 0x0022) { # "
3057
3058 $self->{ct}->{pubid} = ''; # DOCTYPE
3059 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3060
3061 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3062 $self->{line_prev} = $self->{line};
3063 $self->{column_prev} = $self->{column};
3064 $self->{column}++;
3065 $self->{nc}
3066 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3067 } else {
3068 $self->{set_nc}->($self);
3069 }
3070
3071 redo A;
3072 } elsif ($self->{nc} eq 0x0027) { # '
3073
3074 $self->{ct}->{pubid} = ''; # DOCTYPE
3075 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3076
3077 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3078 $self->{line_prev} = $self->{line};
3079 $self->{column_prev} = $self->{column};
3080 $self->{column}++;
3081 $self->{nc}
3082 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3083 } else {
3084 $self->{set_nc}->($self);
3085 }
3086
3087 redo A;
3088 } elsif ($self->{nc} eq 0x003E) { # >
3089
3090 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3091
3092 $self->{state} = DATA_STATE;
3093 $self->{s_kwd} = '';
3094
3095 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3096 $self->{line_prev} = $self->{line};
3097 $self->{column_prev} = $self->{column};
3098 $self->{column}++;
3099 $self->{nc}
3100 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3101 } else {
3102 $self->{set_nc}->($self);
3103 }
3104
3105
3106 $self->{ct}->{quirks} = 1;
3107 return ($self->{ct}); # DOCTYPE
3108
3109 redo A;
3110 } elsif ($self->{nc} == -1) {
3111
3112 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3113
3114 $self->{state} = DATA_STATE;
3115 $self->{s_kwd} = '';
3116 ## reconsume
3117
3118 $self->{ct}->{quirks} = 1;
3119 return ($self->{ct}); # DOCTYPE
3120
3121 redo A;
3122 } else {
3123
3124 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3125 $self->{ct}->{quirks} = 1;
3126
3127 $self->{state} = BOGUS_DOCTYPE_STATE;
3128
3129 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3130 $self->{line_prev} = $self->{line};
3131 $self->{column_prev} = $self->{column};
3132 $self->{column}++;
3133 $self->{nc}
3134 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3135 } else {
3136 $self->{set_nc}->($self);
3137 }
3138
3139 redo A;
3140 }
3141 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3142 if ($self->{nc} == 0x0022) { # "
3143
3144 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3145
3146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3147 $self->{line_prev} = $self->{line};
3148 $self->{column_prev} = $self->{column};
3149 $self->{column}++;
3150 $self->{nc}
3151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3152 } else {
3153 $self->{set_nc}->($self);
3154 }
3155
3156 redo A;
3157 } elsif ($self->{nc} == 0x003E) { # >
3158
3159 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3160
3161 $self->{state} = DATA_STATE;
3162 $self->{s_kwd} = '';
3163
3164 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3165 $self->{line_prev} = $self->{line};
3166 $self->{column_prev} = $self->{column};
3167 $self->{column}++;
3168 $self->{nc}
3169 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3170 } else {
3171 $self->{set_nc}->($self);
3172 }
3173
3174
3175 $self->{ct}->{quirks} = 1;
3176 return ($self->{ct}); # DOCTYPE
3177
3178 redo A;
3179 } elsif ($self->{nc} == -1) {
3180
3181 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3182
3183 $self->{state} = DATA_STATE;
3184 $self->{s_kwd} = '';
3185 ## reconsume
3186
3187 $self->{ct}->{quirks} = 1;
3188 return ($self->{ct}); # DOCTYPE
3189
3190 redo A;
3191 } else {
3192
3193 $self->{ct}->{pubid} # DOCTYPE
3194 .= chr $self->{nc};
3195 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3196 length $self->{ct}->{pubid});
3197
3198 ## Stay in the state
3199
3200 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3201 $self->{line_prev} = $self->{line};
3202 $self->{column_prev} = $self->{column};
3203 $self->{column}++;
3204 $self->{nc}
3205 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3206 } else {
3207 $self->{set_nc}->($self);
3208 }
3209
3210 redo A;
3211 }
3212 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3213 if ($self->{nc} == 0x0027) { # '
3214
3215 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3216
3217 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3218 $self->{line_prev} = $self->{line};
3219 $self->{column_prev} = $self->{column};
3220 $self->{column}++;
3221 $self->{nc}
3222 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3223 } else {
3224 $self->{set_nc}->($self);
3225 }
3226
3227 redo A;
3228 } elsif ($self->{nc} == 0x003E) { # >
3229
3230 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3231
3232 $self->{state} = DATA_STATE;
3233 $self->{s_kwd} = '';
3234
3235 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3236 $self->{line_prev} = $self->{line};
3237 $self->{column_prev} = $self->{column};
3238 $self->{column}++;
3239 $self->{nc}
3240 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3241 } else {
3242 $self->{set_nc}->($self);
3243 }
3244
3245
3246 $self->{ct}->{quirks} = 1;
3247 return ($self->{ct}); # DOCTYPE
3248
3249 redo A;
3250 } elsif ($self->{nc} == -1) {
3251
3252 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3253
3254 $self->{state} = DATA_STATE;
3255 $self->{s_kwd} = '';
3256 ## reconsume
3257
3258 $self->{ct}->{quirks} = 1;
3259 return ($self->{ct}); # DOCTYPE
3260
3261 redo A;
3262 } else {
3263
3264 $self->{ct}->{pubid} # DOCTYPE
3265 .= chr $self->{nc};
3266 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3267 length $self->{ct}->{pubid});
3268
3269 ## Stay in the state
3270
3271 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3272 $self->{line_prev} = $self->{line};
3273 $self->{column_prev} = $self->{column};
3274 $self->{column}++;
3275 $self->{nc}
3276 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3277 } else {
3278 $self->{set_nc}->($self);
3279 }
3280
3281 redo A;
3282 }
3283 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3284 if ($is_space->{$self->{nc}}) {
3285
3286 ## Stay in the state
3287
3288 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3289 $self->{line_prev} = $self->{line};
3290 $self->{column_prev} = $self->{column};
3291 $self->{column}++;
3292 $self->{nc}
3293 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3294 } else {
3295 $self->{set_nc}->($self);
3296 }
3297
3298 redo A;
3299 } elsif ($self->{nc} == 0x0022) { # "
3300
3301 $self->{ct}->{sysid} = ''; # DOCTYPE
3302 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3303
3304 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3305 $self->{line_prev} = $self->{line};
3306 $self->{column_prev} = $self->{column};
3307 $self->{column}++;
3308 $self->{nc}
3309 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3310 } else {
3311 $self->{set_nc}->($self);
3312 }
3313
3314 redo A;
3315 } elsif ($self->{nc} == 0x0027) { # '
3316
3317 $self->{ct}->{sysid} = ''; # DOCTYPE
3318 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3319
3320 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3321 $self->{line_prev} = $self->{line};
3322 $self->{column_prev} = $self->{column};
3323 $self->{column}++;
3324 $self->{nc}
3325 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3326 } else {
3327 $self->{set_nc}->($self);
3328 }
3329
3330 redo A;
3331 } elsif ($self->{nc} == 0x003E) { # >
3332
3333 $self->{state} = DATA_STATE;
3334 $self->{s_kwd} = '';
3335
3336 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3337 $self->{line_prev} = $self->{line};
3338 $self->{column_prev} = $self->{column};
3339 $self->{column}++;
3340 $self->{nc}
3341 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3342 } else {
3343 $self->{set_nc}->($self);
3344 }
3345
3346
3347 return ($self->{ct}); # DOCTYPE
3348
3349 redo A;
3350 } elsif ($self->{nc} == -1) {
3351
3352 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3353
3354 $self->{state} = DATA_STATE;
3355 $self->{s_kwd} = '';
3356 ## reconsume
3357
3358 $self->{ct}->{quirks} = 1;
3359 return ($self->{ct}); # DOCTYPE
3360
3361 redo A;
3362 } else {
3363
3364 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3365 $self->{ct}->{quirks} = 1;
3366
3367 $self->{state} = BOGUS_DOCTYPE_STATE;
3368
3369 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3370 $self->{line_prev} = $self->{line};
3371 $self->{column_prev} = $self->{column};
3372 $self->{column}++;
3373 $self->{nc}
3374 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3375 } else {
3376 $self->{set_nc}->($self);
3377 }
3378
3379 redo A;
3380 }
3381 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3382 if ($is_space->{$self->{nc}}) {
3383
3384 ## Stay in the state
3385
3386 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3387 $self->{line_prev} = $self->{line};
3388 $self->{column_prev} = $self->{column};
3389 $self->{column}++;
3390 $self->{nc}
3391 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3392 } else {
3393 $self->{set_nc}->($self);
3394 }
3395
3396 redo A;
3397 } elsif ($self->{nc} == 0x0022) { # "
3398
3399 $self->{ct}->{sysid} = ''; # DOCTYPE
3400 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3401
3402 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403 $self->{line_prev} = $self->{line};
3404 $self->{column_prev} = $self->{column};
3405 $self->{column}++;
3406 $self->{nc}
3407 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408 } else {
3409 $self->{set_nc}->($self);
3410 }
3411
3412 redo A;
3413 } elsif ($self->{nc} == 0x0027) { # '
3414
3415 $self->{ct}->{sysid} = ''; # DOCTYPE
3416 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3417
3418 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3419 $self->{line_prev} = $self->{line};
3420 $self->{column_prev} = $self->{column};
3421 $self->{column}++;
3422 $self->{nc}
3423 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3424 } else {
3425 $self->{set_nc}->($self);
3426 }
3427
3428 redo A;
3429 } elsif ($self->{nc} == 0x003E) { # >
3430
3431 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3432 $self->{state} = DATA_STATE;
3433 $self->{s_kwd} = '';
3434
3435 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3436 $self->{line_prev} = $self->{line};
3437 $self->{column_prev} = $self->{column};
3438 $self->{column}++;
3439 $self->{nc}
3440 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3441 } else {
3442 $self->{set_nc}->($self);
3443 }
3444
3445
3446 $self->{ct}->{quirks} = 1;
3447 return ($self->{ct}); # DOCTYPE
3448
3449 redo A;
3450 } elsif ($self->{nc} == -1) {
3451
3452 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3453
3454 $self->{state} = DATA_STATE;
3455 $self->{s_kwd} = '';
3456 ## reconsume
3457
3458 $self->{ct}->{quirks} = 1;
3459 return ($self->{ct}); # DOCTYPE
3460
3461 redo A;
3462 } else {
3463
3464 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3465 $self->{ct}->{quirks} = 1;
3466
3467 $self->{state} = BOGUS_DOCTYPE_STATE;
3468
3469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3470 $self->{line_prev} = $self->{line};
3471 $self->{column_prev} = $self->{column};
3472 $self->{column}++;
3473 $self->{nc}
3474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3475 } else {
3476 $self->{set_nc}->($self);
3477 }
3478
3479 redo A;
3480 }
3481 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3482 if ($self->{nc} == 0x0022) { # "
3483
3484 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3485
3486 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3487 $self->{line_prev} = $self->{line};
3488 $self->{column_prev} = $self->{column};
3489 $self->{column}++;
3490 $self->{nc}
3491 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3492 } else {
3493 $self->{set_nc}->($self);
3494 }
3495
3496 redo A;
3497 } elsif ($self->{nc} == 0x003E) { # >
3498
3499 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3500
3501 $self->{state} = DATA_STATE;
3502 $self->{s_kwd} = '';
3503
3504 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3505 $self->{line_prev} = $self->{line};
3506 $self->{column_prev} = $self->{column};
3507 $self->{column}++;
3508 $self->{nc}
3509 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3510 } else {
3511 $self->{set_nc}->($self);
3512 }
3513
3514
3515 $self->{ct}->{quirks} = 1;
3516 return ($self->{ct}); # DOCTYPE
3517
3518 redo A;
3519 } elsif ($self->{nc} == -1) {
3520
3521 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3522
3523 $self->{state} = DATA_STATE;
3524 $self->{s_kwd} = '';
3525 ## reconsume
3526
3527 $self->{ct}->{quirks} = 1;
3528 return ($self->{ct}); # DOCTYPE
3529
3530 redo A;
3531 } else {
3532
3533 $self->{ct}->{sysid} # DOCTYPE
3534 .= chr $self->{nc};
3535 $self->{read_until}->($self->{ct}->{sysid}, q[">],
3536 length $self->{ct}->{sysid});
3537
3538 ## Stay in the state
3539
3540 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3541 $self->{line_prev} = $self->{line};
3542 $self->{column_prev} = $self->{column};
3543 $self->{column}++;
3544 $self->{nc}
3545 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3546 } else {
3547 $self->{set_nc}->($self);
3548 }
3549
3550 redo A;
3551 }
3552 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3553 if ($self->{nc} == 0x0027) { # '
3554
3555 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3556
3557 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3558 $self->{line_prev} = $self->{line};
3559 $self->{column_prev} = $self->{column};
3560 $self->{column}++;
3561 $self->{nc}
3562 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3563 } else {
3564 $self->{set_nc}->($self);
3565 }
3566
3567 redo A;
3568 } elsif ($self->{nc} == 0x003E) { # >
3569
3570 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3571
3572 $self->{state} = DATA_STATE;
3573 $self->{s_kwd} = '';
3574
3575 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3576 $self->{line_prev} = $self->{line};
3577 $self->{column_prev} = $self->{column};
3578 $self->{column}++;
3579 $self->{nc}
3580 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3581 } else {
3582 $self->{set_nc}->($self);
3583 }
3584
3585
3586 $self->{ct}->{quirks} = 1;
3587 return ($self->{ct}); # DOCTYPE
3588
3589 redo A;
3590 } elsif ($self->{nc} == -1) {
3591
3592 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3593
3594 $self->{state} = DATA_STATE;
3595 $self->{s_kwd} = '';
3596 ## reconsume
3597
3598 $self->{ct}->{quirks} = 1;
3599 return ($self->{ct}); # DOCTYPE
3600
3601 redo A;
3602 } else {
3603
3604 $self->{ct}->{sysid} # DOCTYPE
3605 .= chr $self->{nc};
3606 $self->{read_until}->($self->{ct}->{sysid}, q['>],
3607 length $self->{ct}->{sysid});
3608
3609 ## Stay in the state
3610
3611 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3612 $self->{line_prev} = $self->{line};
3613 $self->{column_prev} = $self->{column};
3614 $self->{column}++;
3615 $self->{nc}
3616 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3617 } else {
3618 $self->{set_nc}->($self);
3619 }
3620
3621 redo A;
3622 }
3623 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3624 if ($is_space->{$self->{nc}}) {
3625
3626 ## Stay in the state
3627
3628 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3629 $self->{line_prev} = $self->{line};
3630 $self->{column_prev} = $self->{column};
3631 $self->{column}++;
3632 $self->{nc}
3633 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3634 } else {
3635 $self->{set_nc}->($self);
3636 }
3637
3638 redo A;
3639 } elsif ($self->{nc} == 0x003E) { # >
3640
3641 $self->{state} = DATA_STATE;
3642 $self->{s_kwd} = '';
3643
3644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3645 $self->{line_prev} = $self->{line};
3646 $self->{column_prev} = $self->{column};
3647 $self->{column}++;
3648 $self->{nc}
3649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3650 } else {
3651 $self->{set_nc}->($self);
3652 }
3653
3654
3655 return ($self->{ct}); # DOCTYPE
3656
3657 redo A;
3658 } elsif ($self->{nc} == -1) {
3659
3660 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3661 $self->{state} = DATA_STATE;
3662 $self->{s_kwd} = '';
3663 ## reconsume
3664
3665 $self->{ct}->{quirks} = 1;
3666 return ($self->{ct}); # DOCTYPE
3667
3668 redo A;
3669 } else {
3670
3671 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3672 #$self->{ct}->{quirks} = 1;
3673
3674 $self->{state} = BOGUS_DOCTYPE_STATE;
3675
3676 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3677 $self->{line_prev} = $self->{line};
3678 $self->{column_prev} = $self->{column};
3679 $self->{column}++;
3680 $self->{nc}
3681 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3682 } else {
3683 $self->{set_nc}->($self);
3684 }
3685
3686 redo A;
3687 }
3688 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3689 if ($self->{nc} == 0x003E) { # >
3690
3691 $self->{state} = DATA_STATE;
3692 $self->{s_kwd} = '';
3693
3694 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3695 $self->{line_prev} = $self->{line};
3696 $self->{column_prev} = $self->{column};
3697 $self->{column}++;
3698 $self->{nc}
3699 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3700 } else {
3701 $self->{set_nc}->($self);
3702 }
3703
3704
3705 return ($self->{ct}); # DOCTYPE
3706
3707 redo A;
3708 } elsif ($self->{nc} == -1) {
3709
3710 $self->{state} = DATA_STATE;
3711 $self->{s_kwd} = '';
3712 ## reconsume
3713
3714 return ($self->{ct}); # DOCTYPE
3715
3716 redo A;
3717 } else {
3718
3719 my $s = '';
3720 $self->{read_until}->($s, q[>], 0);
3721
3722 ## Stay in the state
3723
3724 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3725 $self->{line_prev} = $self->{line};
3726 $self->{column_prev} = $self->{column};
3727 $self->{column}++;
3728 $self->{nc}
3729 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3730 } else {
3731 $self->{set_nc}->($self);
3732 }
3733
3734 redo A;
3735 }
3736 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3737 ## NOTE: "CDATA section state" in the state is jointly implemented
3738 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3739 ## and |CDATA_SECTION_MSE2_STATE|.
3740
3741 ## XML5: "CDATA state".
3742
3743 if ($self->{nc} == 0x005D) { # ]
3744
3745 $self->{state} = CDATA_SECTION_MSE1_STATE;
3746
3747 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3748 $self->{line_prev} = $self->{line};
3749 $self->{column_prev} = $self->{column};
3750 $self->{column}++;
3751 $self->{nc}
3752 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3753 } else {
3754 $self->{set_nc}->($self);
3755 }
3756
3757 redo A;
3758 } elsif ($self->{nc} == -1) {
3759 if ($self->{is_xml}) {
3760
3761 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3762 } else {
3763
3764 }
3765
3766 $self->{state} = DATA_STATE;
3767 $self->{s_kwd} = '';
3768 ## Reconsume.
3769 if (length $self->{ct}->{data}) { # character
3770
3771 return ($self->{ct}); # character
3772 } else {
3773
3774 ## No token to emit. $self->{ct} is discarded.
3775 }
3776 redo A;
3777 } else {
3778
3779 $self->{ct}->{data} .= chr $self->{nc};
3780 $self->{read_until}->($self->{ct}->{data},
3781 q<]>,
3782 length $self->{ct}->{data});
3783
3784 ## Stay in the state.
3785
3786 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3787 $self->{line_prev} = $self->{line};
3788 $self->{column_prev} = $self->{column};
3789 $self->{column}++;
3790 $self->{nc}
3791 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3792 } else {
3793 $self->{set_nc}->($self);
3794 }
3795
3796 redo A;
3797 }
3798
3799 ## ISSUE: "text tokens" in spec.
3800 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3801 ## XML5: "CDATA bracket state".
3802
3803 if ($self->{nc} == 0x005D) { # ]
3804
3805 $self->{state} = CDATA_SECTION_MSE2_STATE;
3806
3807 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3808 $self->{line_prev} = $self->{line};
3809 $self->{column_prev} = $self->{column};
3810 $self->{column}++;
3811 $self->{nc}
3812 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3813 } else {
3814 $self->{set_nc}->($self);
3815 }
3816
3817 redo A;
3818 } else {
3819
3820 ## XML5: If EOF, "]" is not appended and changed to the data state.
3821 $self->{ct}->{data} .= ']';
3822 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3823 ## Reconsume.
3824 redo A;
3825 }
3826 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3827 ## XML5: "CDATA end state".
3828
3829 if ($self->{nc} == 0x003E) { # >
3830 $self->{state} = DATA_STATE;
3831 $self->{s_kwd} = '';
3832
3833 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3834 $self->{line_prev} = $self->{line};
3835 $self->{column_prev} = $self->{column};
3836 $self->{column}++;
3837 $self->{nc}
3838 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3839 } else {
3840 $self->{set_nc}->($self);
3841 }
3842
3843 if (length $self->{ct}->{data}) { # character
3844
3845 return ($self->{ct}); # character
3846 } else {
3847
3848 ## No token to emit. $self->{ct} is discarded.
3849 }
3850 redo A;
3851 } elsif ($self->{nc} == 0x005D) { # ]
3852 # character
3853 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3854 ## Stay in the state.
3855
3856 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3857 $self->{line_prev} = $self->{line};
3858 $self->{column_prev} = $self->{column};
3859 $self->{column}++;
3860 $self->{nc}
3861 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3862 } else {
3863 $self->{set_nc}->($self);
3864 }
3865
3866 redo A;
3867 } else {
3868
3869 $self->{ct}->{data} .= ']]'; # character
3870 $self->{state} = CDATA_SECTION_STATE;
3871 ## Reconsume. ## XML5: Emit.
3872 redo A;
3873 }
3874 } elsif ($self->{state} == ENTITY_STATE) {
3875 if ($is_space->{$self->{nc}} or
3876 {
3877 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3878 $self->{entity_add} => 1,
3879 }->{$self->{nc}}) {
3880
3881 ## Don't consume
3882 ## No error
3883 ## Return nothing.
3884 #
3885 } elsif ($self->{nc} == 0x0023) { # #
3886
3887 $self->{state} = ENTITY_HASH_STATE;
3888 $self->{s_kwd} = '#';
3889
3890 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3891 $self->{line_prev} = $self->{line};
3892 $self->{column_prev} = $self->{column};
3893 $self->{column}++;
3894 $self->{nc}
3895 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3896 } else {
3897 $self->{set_nc}->($self);
3898 }
3899
3900 redo A;
3901 } elsif ((0x0041 <= $self->{nc} and
3902 $self->{nc} <= 0x005A) or # A..Z
3903 (0x0061 <= $self->{nc} and
3904 $self->{nc} <= 0x007A)) { # a..z
3905
3906 require Whatpm::_NamedEntityList;
3907 $self->{state} = ENTITY_NAME_STATE;
3908 $self->{s_kwd} = chr $self->{nc};
3909 $self->{entity__value} = $self->{s_kwd};
3910 $self->{entity__match} = 0;
3911
3912 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3913 $self->{line_prev} = $self->{line};
3914 $self->{column_prev} = $self->{column};
3915 $self->{column}++;
3916 $self->{nc}
3917 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3918 } else {
3919 $self->{set_nc}->($self);
3920 }
3921
3922 redo A;
3923 } else {
3924
3925 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3926 ## Return nothing.
3927 #
3928 }
3929
3930 ## NOTE: No character is consumed by the "consume a character
3931 ## reference" algorithm. In other word, there is an "&" character
3932 ## that does not introduce a character reference, which would be
3933 ## appended to the parent element or the attribute value in later
3934 ## process of the tokenizer.
3935
3936 if ($self->{prev_state} == DATA_STATE) {
3937
3938 $self->{state} = $self->{prev_state};
3939 $self->{s_kwd} = '';
3940 ## Reconsume.
3941 return ({type => CHARACTER_TOKEN, data => '&',
3942 line => $self->{line_prev},
3943 column => $self->{column_prev},
3944 });
3945 redo A;
3946 } else {
3947
3948 $self->{ca}->{value} .= '&';
3949 $self->{state} = $self->{prev_state};
3950 $self->{s_kwd} = '';
3951 ## Reconsume.
3952 redo A;
3953 }
3954 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3955 if ($self->{nc} == 0x0078 or # x
3956 $self->{nc} == 0x0058) { # X
3957
3958 $self->{state} = HEXREF_X_STATE;
3959 $self->{s_kwd} .= chr $self->{nc};
3960
3961 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3962 $self->{line_prev} = $self->{line};
3963 $self->{column_prev} = $self->{column};
3964 $self->{column}++;
3965 $self->{nc}
3966 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3967 } else {
3968 $self->{set_nc}->($self);
3969 }
3970
3971 redo A;
3972 } elsif (0x0030 <= $self->{nc} and
3973 $self->{nc} <= 0x0039) { # 0..9
3974
3975 $self->{state} = NCR_NUM_STATE;
3976 $self->{s_kwd} = $self->{nc} - 0x0030;
3977
3978 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3979 $self->{line_prev} = $self->{line};
3980 $self->{column_prev} = $self->{column};
3981 $self->{column}++;
3982 $self->{nc}
3983 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3984 } else {
3985 $self->{set_nc}->($self);
3986 }
3987
3988 redo A;
3989 } else {
3990 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3991 line => $self->{line_prev},
3992 column => $self->{column_prev} - 1);
3993
3994 ## NOTE: According to the spec algorithm, nothing is returned,
3995 ## and then "&#" is appended to the parent element or the attribute
3996 ## value in the later processing.
3997
3998 if ($self->{prev_state} == DATA_STATE) {
3999
4000 $self->{state} = $self->{prev_state};
4001 $self->{s_kwd} = '';
4002 ## Reconsume.
4003 return ({type => CHARACTER_TOKEN,
4004 data => '&#',
4005 line => $self->{line_prev},
4006 column => $self->{column_prev} - 1,
4007 });
4008 redo A;
4009 } else {
4010
4011 $self->{ca}->{value} .= '&#';
4012 $self->{state} = $self->{prev_state};
4013 $self->{s_kwd} = '';
4014 ## Reconsume.
4015 redo A;
4016 }
4017 }
4018 } elsif ($self->{state} == NCR_NUM_STATE) {
4019 if (0x0030 <= $self->{nc} and
4020 $self->{nc} <= 0x0039) { # 0..9
4021
4022 $self->{s_kwd} *= 10;
4023 $self->{s_kwd} += $self->{nc} - 0x0030;
4024
4025 ## Stay in the state.
4026
4027 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4028 $self->{line_prev} = $self->{line};
4029 $self->{column_prev} = $self->{column};
4030 $self->{column}++;
4031 $self->{nc}
4032 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4033 } else {
4034 $self->{set_nc}->($self);
4035 }
4036
4037 redo A;
4038 } elsif ($self->{nc} == 0x003B) { # ;
4039
4040
4041 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4042 $self->{line_prev} = $self->{line};
4043 $self->{column_prev} = $self->{column};
4044 $self->{column}++;
4045 $self->{nc}
4046 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4047 } else {
4048 $self->{set_nc}->($self);
4049 }
4050
4051 #
4052 } else {
4053
4054 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4055 ## Reconsume.
4056 #
4057 }
4058
4059 my $code = $self->{s_kwd};
4060 my $l = $self->{line_prev};
4061 my $c = $self->{column_prev};
4062 if ($charref_map->{$code}) {
4063
4064 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4065 text => (sprintf 'U+%04X', $code),
4066 line => $l, column => $c);
4067 $code = $charref_map->{$code};
4068 } elsif ($code > 0x10FFFF) {
4069
4070 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4071 text => (sprintf 'U-%08X', $code),
4072 line => $l, column => $c);
4073 $code = 0xFFFD;
4074 }
4075
4076 if ($self->{prev_state} == DATA_STATE) {
4077
4078 $self->{state} = $self->{prev_state};
4079 $self->{s_kwd} = '';
4080 ## Reconsume.
4081 return ({type => CHARACTER_TOKEN, data => chr $code,
4082 has_reference => 1,
4083 line => $l, column => $c,
4084 });
4085 redo A;
4086 } else {
4087
4088 $self->{ca}->{value} .= chr $code;
4089 $self->{ca}->{has_reference} = 1;
4090 $self->{state} = $self->{prev_state};
4091 $self->{s_kwd} = '';
4092 ## Reconsume.
4093 redo A;
4094 }
4095 } elsif ($self->{state} == HEXREF_X_STATE) {
4096 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4097 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4098 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4099 # 0..9, A..F, a..f
4100
4101 $self->{state} = HEXREF_HEX_STATE;
4102 $self->{s_kwd} = 0;
4103 ## Reconsume.
4104 redo A;
4105 } else {
4106 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4107 line => $self->{line_prev},
4108 column => $self->{column_prev} - 2);
4109
4110 ## NOTE: According to the spec algorithm, nothing is returned,
4111 ## and then "&#" followed by "X" or "x" is appended to the parent
4112 ## element or the attribute value in the later processing.
4113
4114 if ($self->{prev_state} == DATA_STATE) {
4115
4116 $self->{state} = $self->{prev_state};
4117 $self->{s_kwd} = '';
4118 ## Reconsume.
4119 return ({type => CHARACTER_TOKEN,
4120 data => '&' . $self->{s_kwd},
4121 line => $self->{line_prev},
4122 column => $self->{column_prev} - length $self->{s_kwd},
4123 });
4124 redo A;
4125 } else {
4126
4127 $self->{ca}->{value} .= '&' . $self->{s_kwd};
4128 $self->{state} = $self->{prev_state};
4129 $self->{s_kwd} = '';
4130 ## Reconsume.
4131 redo A;
4132 }
4133 }
4134 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4135 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4136 # 0..9
4137
4138 $self->{s_kwd} *= 0x10;
4139 $self->{s_kwd} += $self->{nc} - 0x0030;
4140 ## Stay in the state.
4141
4142 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4143 $self->{line_prev} = $self->{line};
4144 $self->{column_prev} = $self->{column};
4145 $self->{column}++;
4146 $self->{nc}
4147 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4148 } else {
4149 $self->{set_nc}->($self);
4150 }
4151
4152 redo A;
4153 } elsif (0x0061 <= $self->{nc} and
4154 $self->{nc} <= 0x0066) { # a..f
4155
4156 $self->{s_kwd} *= 0x10;
4157 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4158 ## Stay in the state.
4159
4160 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4161 $self->{line_prev} = $self->{line};
4162 $self->{column_prev} = $self->{column};
4163 $self->{column}++;
4164 $self->{nc}
4165 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4166 } else {
4167 $self->{set_nc}->($self);
4168 }
4169
4170 redo A;
4171 } elsif (0x0041 <= $self->{nc} and
4172 $self->{nc} <= 0x0046) { # A..F
4173
4174 $self->{s_kwd} *= 0x10;
4175 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4176 ## Stay in the state.
4177
4178 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4179 $self->{line_prev} = $self->{line};
4180 $self->{column_prev} = $self->{column};
4181 $self->{column}++;
4182 $self->{nc}
4183 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4184 } else {
4185 $self->{set_nc}->($self);
4186 }
4187
4188 redo A;
4189 } elsif ($self->{nc} == 0x003B) { # ;
4190
4191
4192 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4193 $self->{line_prev} = $self->{line};
4194 $self->{column_prev} = $self->{column};
4195 $self->{column}++;
4196 $self->{nc}
4197 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4198 } else {
4199 $self->{set_nc}->($self);
4200 }
4201
4202 #
4203 } else {
4204
4205 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4206 line => $self->{line},
4207 column => $self->{column});
4208 ## Reconsume.
4209 #
4210 }
4211
4212 my $code = $self->{s_kwd};
4213 my $l = $self->{line_prev};
4214 my $c = $self->{column_prev};
4215 if ($charref_map->{$code}) {
4216
4217 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4218 text => (sprintf 'U+%04X', $code),
4219 line => $l, column => $c);
4220 $code = $charref_map->{$code};
4221 } elsif ($code > 0x10FFFF) {
4222
4223 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4224 text => (sprintf 'U-%08X', $code),
4225 line => $l, column => $c);
4226 $code = 0xFFFD;
4227 }
4228
4229 if ($self->{prev_state} == DATA_STATE) {
4230
4231 $self->{state} = $self->{prev_state};
4232 $self->{s_kwd} = '';
4233 ## Reconsume.
4234 return ({type => CHARACTER_TOKEN, data => chr $code,
4235 has_reference => 1,
4236 line => $l, column => $c,
4237 });
4238 redo A;
4239 } else {
4240
4241 $self->{ca}->{value} .= chr $code;
4242 $self->{ca}->{has_reference} = 1;
4243 $self->{state} = $self->{prev_state};
4244 $self->{s_kwd} = '';
4245 ## Reconsume.
4246 redo A;
4247 }
4248 } elsif ($self->{state} == ENTITY_NAME_STATE) {
4249 if (length $self->{s_kwd} < 30 and
4250 ## NOTE: Some number greater than the maximum length of entity name
4251 ((0x0041 <= $self->{nc} and # a
4252 $self->{nc} <= 0x005A) or # x
4253 (0x0061 <= $self->{nc} and # a
4254 $self->{nc} <= 0x007A) or # z
4255 (0x0030 <= $self->{nc} and # 0
4256 $self->{nc} <= 0x0039) or # 9
4257 $self->{nc} == 0x003B)) { # ;
4258 our $EntityChar;
4259 $self->{s_kwd} .= chr $self->{nc};
4260 if (defined $EntityChar->{$self->{s_kwd}}) {
4261 if ($self->{nc} == 0x003B) { # ;
4262
4263 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4264 $self->{entity__match} = 1;
4265
4266 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4267 $self->{line_prev} = $self->{line};
4268 $self->{column_prev} = $self->{column};
4269 $self->{column}++;
4270 $self->{nc}
4271 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4272 } else {
4273 $self->{set_nc}->($self);
4274 }
4275
4276 #
4277 } else {
4278
4279 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4280 $self->{entity__match} = -1;
4281 ## Stay in the state.
4282
4283 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4284 $self->{line_prev} = $self->{line};
4285 $self->{column_prev} = $self->{column};
4286 $self->{column}++;
4287 $self->{nc}
4288 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4289 } else {
4290 $self->{set_nc}->($self);
4291 }
4292
4293 redo A;
4294 }
4295 } else {
4296
4297 $self->{entity__value} .= chr $self->{nc};
4298 $self->{entity__match} *= 2;
4299 ## Stay in the state.
4300
4301 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4302 $self->{line_prev} = $self->{line};
4303 $self->{column_prev} = $self->{column};
4304 $self->{column}++;
4305 $self->{nc}
4306 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4307 } else {
4308 $self->{set_nc}->($self);
4309 }
4310
4311 redo A;
4312 }
4313 }
4314
4315 my $data;
4316 my $has_ref;
4317 if ($self->{entity__match} > 0) {
4318
4319 $data = $self->{entity__value};
4320 $has_ref = 1;
4321 #
4322 } elsif ($self->{entity__match} < 0) {
4323 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4324 if ($self->{prev_state} != DATA_STATE and # in attribute
4325 $self->{entity__match} < -1) {
4326
4327 $data = '&' . $self->{s_kwd};
4328 #
4329 } else {
4330
4331 $data = $self->{entity__value};
4332 $has_ref = 1;
4333 #
4334 }
4335 } else {
4336
4337 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4338 line => $self->{line_prev},
4339 column => $self->{column_prev} - length $self->{s_kwd});
4340 $data = '&' . $self->{s_kwd};
4341 #
4342 }
4343
4344 ## NOTE: In these cases, when a character reference is found,
4345 ## it is consumed and a character token is returned, or, otherwise,
4346 ## nothing is consumed and returned, according to the spec algorithm.
4347 ## In this implementation, anything that has been examined by the
4348 ## tokenizer is appended to the parent element or the attribute value
4349 ## as string, either literal string when no character reference or
4350 ## entity-replaced string otherwise, in this stage, since any characters
4351 ## that would not be consumed are appended in the data state or in an
4352 ## appropriate attribute value state anyway.
4353
4354 if ($self->{prev_state} == DATA_STATE) {
4355
4356 $self->{state} = $self->{prev_state};
4357 $self->{s_kwd} = '';
4358 ## Reconsume.
4359 return ({type => CHARACTER_TOKEN,
4360 data => $data,
4361 has_reference => $has_ref,
4362 line => $self->{line_prev},
4363 column => $self->{column_prev} + 1 - length $self->{s_kwd},
4364 });
4365 redo A;
4366 } else {
4367
4368 $self->{ca}->{value} .= $data;
4369 $self->{ca}->{has_reference} = 1 if $has_ref;
4370 $self->{state} = $self->{prev_state};
4371 $self->{s_kwd} = '';
4372 ## Reconsume.
4373 redo A;
4374 }
4375
4376 ## XML-only states
4377
4378 } elsif ($self->{state} == PI_STATE) {
4379 if ($is_space->{$self->{nc}} or
4380 $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4381 $self->{nc} == -1) {
4382 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4383 line => $self->{line_prev},
4384 column => $self->{column_prev}
4385 - 1 * ($self->{nc} != -1));
4386 $self->{state} = BOGUS_COMMENT_STATE;
4387 ## Reconsume.
4388 $self->{ct} = {type => COMMENT_TOKEN,
4389 data => '?',
4390 line => $self->{line_prev},
4391 column => $self->{column_prev}
4392 - 1 * ($self->{nc} != -1),
4393 };
4394 redo A;
4395 } else {
4396 $self->{ct} = {type => PI_TOKEN,
4397 target => chr $self->{nc},
4398 data => '',
4399 line => $self->{line_prev},
4400 column => $self->{column_prev} - 1,
4401 };
4402 $self->{state} = PI_TARGET_STATE;
4403
4404 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4405 $self->{line_prev} = $self->{line};
4406 $self->{column_prev} = $self->{column};
4407 $self->{column}++;
4408 $self->{nc}
4409 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4410 } else {
4411 $self->{set_nc}->($self);
4412 }
4413
4414 redo A;
4415 }
4416 } elsif ($self->{state} == PI_TARGET_STATE) {
4417 if ($is_space->{$self->{nc}}) {
4418 $self->{state} = PI_TARGET_AFTER_STATE;
4419
4420 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4421 $self->{line_prev} = $self->{line};
4422 $self->{column_prev} = $self->{column};
4423 $self->{column}++;
4424 $self->{nc}
4425 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4426 } else {
4427 $self->{set_nc}->($self);
4428 }
4429
4430 redo A;
4431 } elsif ($self->{nc} == -1) {
4432 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4433 $self->{state} = DATA_STATE;
4434 $self->{s_kwd} = '';
4435 ## Reconsume.
4436 return ($self->{ct}); # pi
4437 redo A;
4438 } elsif ($self->{nc} == 0x003F) { # ?
4439 $self->{state} = PI_AFTER_STATE;
4440
4441 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4442 $self->{line_prev} = $self->{line};
4443 $self->{column_prev} = $self->{column};
4444 $self->{column}++;
4445 $self->{nc}
4446 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4447 } else {
4448 $self->{set_nc}->($self);
4449 }
4450
4451 redo A;
4452 } else {
4453 ## XML5: typo ("tag name" -> "target")
4454 $self->{ct}->{target} .= chr $self->{nc}; # pi
4455
4456 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4457 $self->{line_prev} = $self->{line};
4458 $self->{column_prev} = $self->{column};
4459 $self->{column}++;
4460 $self->{nc}
4461 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4462 } else {
4463 $self->{set_nc}->($self);
4464 }
4465
4466 redo A;
4467 }
4468 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4469 if ($is_space->{$self->{nc}}) {
4470 ## Stay in the state.
4471
4472 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4473 $self->{line_prev} = $self->{line};
4474 $self->{column_prev} = $self->{column};
4475 $self->{column}++;
4476 $self->{nc}
4477 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4478 } else {
4479 $self->{set_nc}->($self);
4480 }
4481
4482 redo A;
4483 } else {
4484 $self->{state} = PI_DATA_STATE;
4485 ## Reprocess.
4486 redo A;
4487 }
4488 } elsif ($self->{state} == PI_DATA_STATE) {
4489 if ($self->{nc} == 0x003F) { # ?
4490 $self->{state} = PI_DATA_AFTER_STATE;
4491
4492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4493 $self->{line_prev} = $self->{line};
4494 $self->{column_prev} = $self->{column};
4495 $self->{column}++;
4496 $self->{nc}
4497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4498 } else {
4499 $self->{set_nc}->($self);
4500 }
4501
4502 redo A;
4503 } elsif ($self->{nc} == -1) {
4504 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4505 $self->{state} = DATA_STATE;
4506 $self->{s_kwd} = '';
4507 ## Reprocess.
4508 return ($self->{ct}); # pi
4509 redo A;
4510 } else {
4511 $self->{ct}->{data} .= chr $self->{nc}; # pi
4512 $self->{read_until}->($self->{ct}->{data}, q[?],
4513 length $self->{ct}->{data});
4514 ## Stay in the state.
4515
4516 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4517 $self->{line_prev} = $self->{line};
4518 $self->{column_prev} = $self->{column};
4519 $self->{column}++;
4520 $self->{nc}
4521 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4522 } else {
4523 $self->{set_nc}->($self);
4524 }
4525
4526 ## Reprocess.
4527 redo A;
4528 }
4529 } elsif ($self->{state} == PI_AFTER_STATE) {
4530 if ($self->{nc} == 0x003E) { # >
4531 $self->{state} = DATA_STATE;
4532 $self->{s_kwd} = '';
4533
4534 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4535 $self->{line_prev} = $self->{line};
4536 $self->{column_prev} = $self->{column};
4537 $self->{column}++;
4538 $self->{nc}
4539 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4540 } else {
4541 $self->{set_nc}->($self);
4542 }
4543
4544 return ($self->{ct}); # pi
4545 redo A;
4546 } elsif ($self->{nc} == 0x003F) { # ?
4547 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4548 line => $self->{line_prev},
4549 column => $self->{column_prev}); ## XML5: no error
4550 $self->{ct}->{data} .= '?';
4551 $self->{state} = PI_DATA_AFTER_STATE;
4552
4553 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4554 $self->{line_prev} = $self->{line};
4555 $self->{column_prev} = $self->{column};
4556 $self->{column}++;
4557 $self->{nc}
4558 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4559 } else {
4560 $self->{set_nc}->($self);
4561 }
4562
4563 redo A;
4564 } else {
4565 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4566 line => $self->{line_prev},
4567 column => $self->{column_prev}
4568 + 1 * ($self->{nc} == -1)); ## XML5: no error
4569 $self->{ct}->{data} .= '?'; ## XML5: not appended
4570 $self->{state} = PI_DATA_STATE;
4571 ## Reprocess.
4572 redo A;
4573 }
4574 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4575 ## XML5: Same as "pi after state" in XML5
4576 if ($self->{nc} == 0x003E) { # >
4577 $self->{state} = DATA_STATE;
4578 $self->{s_kwd} = '';
4579
4580 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4581 $self->{line_prev} = $self->{line};
4582 $self->{column_prev} = $self->{column};
4583 $self->{column}++;
4584 $self->{nc}
4585 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4586 } else {
4587 $self->{set_nc}->($self);
4588 }
4589
4590 return ($self->{ct}); # pi
4591 redo A;
4592 } elsif ($self->{nc} == 0x003F) { # ?
4593 $self->{ct}->{data} .= '?';
4594 ## Stay in the state.
4595
4596 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4597 $self->{line_prev} = $self->{line};
4598 $self->{column_prev} = $self->{column};
4599 $self->{column}++;
4600 $self->{nc}
4601 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4602 } else {
4603 $self->{set_nc}->($self);
4604 }
4605
4606 redo A;
4607 } else {
4608 $self->{ct}->{data} .= '?'; ## XML5: not appended
4609 $self->{state} = PI_DATA_STATE;
4610 ## Reprocess.
4611 redo A;
4612 }
4613
4614 } else {
4615 die "$0: $self->{state}: Unknown state";
4616 }
4617 } # A
4618
4619 die "$0: _get_next_token: unexpected case";
4620 } # _get_next_token
4621
4622 1;
4623 ## $Date: 2008/10/15 08:05:47 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24