/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (show annotations) (download)
Tue Oct 14 05:34:05 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.2: +6 -4 lines
++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 05:33:48 -0000
	* Tokenizer.pm.src: Introduced "in_xml" flag for CDATA section
	support in XML.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 05:34:00 -0000
	* Parser.pm.src: Set |in_xml| flag for tokenizer.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: A bug on end tag handling fixed.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.2 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN { # Runs at compile time: make this package an Exporter of the token type constants.
6 require Exporter;
7 push our @ISA, 'Exporter';
8 ## Token type constants that a caller may import individually by name.
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 );
19 ## The |:token| tag imports all of the token type constants at once.
20 our %EXPORT_TAGS = (
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 }
33
34 ## Token types
35 ## Integer identifiers stored in the |type| field of a token hash.
36 sub DOCTYPE_TOKEN () { 1 }
37 sub COMMENT_TOKEN () { 2 }
38 sub START_TAG_TOKEN () { 3 }
39 sub END_TAG_TOKEN () { 4 }
40 sub END_OF_FILE_TOKEN () { 5 }
41 sub CHARACTER_TOKEN () { 6 }
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
45 package Whatpm::HTML;
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48
49 ## Content model flags
50 ## Bit flags; the tokenizer tests them against |$self->{content_model}| with |&|.
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54 ## Content models, each defined as a combination of the |CM_*| bit flags.
55 sub PLAINTEXT_CONTENT_MODEL () { 0 }
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59
60 ## Tokenizer states
61 ## Values held in |$self->{state}|. Numbers 1 and 12 are unused: their states are commented out below.
62 sub DATA_STATE () { 0 }
63 #sub ENTITY_DATA_STATE () { 1 }
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: The "entity data state", the "entity in attribute value state", and
107 ## the "consume a character reference" algorithm are jointly implemented
108 ## using the following six states (44 to 49):
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## Tree constructor state constants (see Whatpm::HTML for the full
118 ## list and descriptions)
119 ## NOTE(review): both constants below appear to have the same numeric value (bit 11); presumably they belong to separate flag sets - confirm against Whatpm::HTML.
120 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121 sub FOREIGN_EL () { 0b1_00000000000 }
122
123 ## Character reference mappings
124 ## Maps a code point to its replacement code point. Presumably consulted when a numeric character reference is resolved; the lookup is outside this excerpt - confirm.
125 my $charref_map = {
126 0x0D => 0x000A,
127 0x80 => 0x20AC,
128 0x81 => 0xFFFD,
129 0x82 => 0x201A,
130 0x83 => 0x0192,
131 0x84 => 0x201E,
132 0x85 => 0x2026,
133 0x86 => 0x2020,
134 0x87 => 0x2021,
135 0x88 => 0x02C6,
136 0x89 => 0x2030,
137 0x8A => 0x0160,
138 0x8B => 0x2039,
139 0x8C => 0x0152,
140 0x8D => 0xFFFD,
141 0x8E => 0x017D,
142 0x8F => 0xFFFD,
143 0x90 => 0xFFFD,
144 0x91 => 0x2018,
145 0x92 => 0x2019,
146 0x93 => 0x201C,
147 0x94 => 0x201D,
148 0x95 => 0x2022,
149 0x96 => 0x2013,
150 0x97 => 0x2014,
151 0x98 => 0x02DC,
152 0x99 => 0x2122,
153 0x9A => 0x0161,
154 0x9B => 0x203A,
155 0x9C => 0x0153,
156 0x9D => 0xFFFD,
157 0x9E => 0x017E,
158 0x9F => 0x0178,
159 }; # $charref_map
160 $charref_map->{$_} = 0xFFFD # every code point listed below is mapped to U+FFFD
161 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168
169 ## Implementations MUST act as if they used the state machine in the spec
170
171 sub _initialize_tokenizer ($) { # Resets the tokenizer fields of |$self| and reads the first input character.
172 my $self = shift;
173
174 ## NOTE: Fields set by |new| constructor:
175 #$self->{level}
176 #$self->{set_nc}
177 #$self->{parse_error}
178 #$self->{is_xml} (if XML)
179
180 $self->{state} = DATA_STATE; # MUST
181 #$self->{s_kwd}; # state keyword - initialized when used
182 #$self->{entity__value}; # initialized when used
183 #$self->{entity__match}; # initialized when used
184 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185 undef $self->{ct}; # current token
186 undef $self->{ca}; # current attribute
187 undef $self->{last_stag_name}; # last emitted start tag name
188 #$self->{prev_state}; # initialized when used
189 delete $self->{self_closing};
190 $self->{char_buffer} = '';
191 $self->{char_buffer_pos} = 0;
192 $self->{nc} = -1; # next input character
193 #$self->{next_nc}
194 ## NOTE: |char_buffer| was just emptied above, so the test below is always false here and |set_nc| is always called.
195 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
196 $self->{line_prev} = $self->{line};
197 $self->{column_prev} = $self->{column};
198 $self->{column}++;
199 $self->{nc}
200 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
201 } else {
202 $self->{set_nc}->($self);
203 }
204 ## List of pending tokens; |_get_next_token| returns tokens from it before tokenizing further.
205 $self->{token} = [];
206 # $self->{escape}
207 } # _initialize_tokenizer
208
209 ## A token has:
210 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
211 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
212 ## ->{name} (DOCTYPE_TOKEN)
213 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
214 ## ->{pubid} (DOCTYPE_TOKEN)
215 ## ->{sysid} (DOCTYPE_TOKEN)
216 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
217 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
218 ## ->{name}
219 ## ->{value}
220 ## ->{has_reference} == 1 or 0
221 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
222 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
223 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
224 ## while the token is pushed back to the stack.
225
226 ## Emitted token MUST immediately be handled by the tree construction state.
227
228 ## Before each step, UA MAY check to see if either one of the scripts in
229 ## "list of scripts that will execute as soon as possible" or the first
230 ## script in the "list of scripts that will execute asynchronously",
231 ## has completed loading. If one has, then it MUST be executed
232 ## and removed from the list.
233
234 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
235 ## (This requirement was dropped from HTML5 spec, unfortunately.)
236
237 my $is_space = { # Lookup set keyed by code point; the VT and CR entries are commented out, so they are not in the set.
238 0x0009 => 1, # CHARACTER TABULATION (HT)
239 0x000A => 1, # LINE FEED (LF)
240 #0x000B => 0, # LINE TABULATION (VT)
241 0x000C => 1, # FORM FEED (FF)
242 #0x000D => 1, # CARRIAGE RETURN (CR)
243 0x0020 => 1, # SPACE (SP)
244 };
245
246 sub _get_next_token ($) {
247 my $self = shift;
248
249 if ($self->{self_closing}) {
250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
251 ## NOTE: The |self_closing| flag is only set by start tag token.
252 ## In addition, when a start tag token is emitted, it is always set to
253 ## |ct|.
254 delete $self->{self_closing};
255 }
256
257 if (@{$self->{token}}) {
258 $self->{self_closing} = $self->{token}->[0]->{self_closing};
259 return shift @{$self->{token}};
260 }
261
262 A: {
263 if ($self->{state} == PCDATA_STATE) {
264 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
265
266 if ($self->{nc} == 0x0026) { # &
267
268 ## NOTE: In the spec, the tokenizer is switched to the
269 ## "entity data state". In this implementation, the tokenizer
270 ## is switched to the |ENTITY_STATE|, which is an implementation
271 ## of the "consume a character reference" algorithm.
272 $self->{entity_add} = -1;
273 $self->{prev_state} = DATA_STATE;
274 $self->{state} = ENTITY_STATE;
275
276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
277 $self->{line_prev} = $self->{line};
278 $self->{column_prev} = $self->{column};
279 $self->{column}++;
280 $self->{nc}
281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
282 } else {
283 $self->{set_nc}->($self);
284 }
285
286 redo A;
287 } elsif ($self->{nc} == 0x003C) { # <
288
289 $self->{state} = TAG_OPEN_STATE;
290
291 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
292 $self->{line_prev} = $self->{line};
293 $self->{column_prev} = $self->{column};
294 $self->{column}++;
295 $self->{nc}
296 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
297 } else {
298 $self->{set_nc}->($self);
299 }
300
301 redo A;
302 } elsif ($self->{nc} == -1) {
303
304 return ({type => END_OF_FILE_TOKEN,
305 line => $self->{line}, column => $self->{column}});
306 last A; ## TODO: ok?
307 } else {
308
309 #
310 }
311
312 # Anything else
313 my $token = {type => CHARACTER_TOKEN,
314 data => chr $self->{nc},
315 line => $self->{line}, column => $self->{column},
316 };
317 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
318
319 ## Stay in the state.
320
321 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
322 $self->{line_prev} = $self->{line};
323 $self->{column_prev} = $self->{column};
324 $self->{column}++;
325 $self->{nc}
326 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
327 } else {
328 $self->{set_nc}->($self);
329 }
330
331 return ($token);
332 redo A;
333 } elsif ($self->{state} == DATA_STATE) {
334 $self->{s_kwd} = '' unless defined $self->{s_kwd};
335 if ($self->{nc} == 0x0026) { # &
336 $self->{s_kwd} = '';
337 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
338 not $self->{escape}) {
339
340 ## NOTE: In the spec, the tokenizer is switched to the
341 ## "entity data state". In this implementation, the tokenizer
342 ## is switched to the |ENTITY_STATE|, which is an implementation
343 ## of the "consume a character reference" algorithm.
344 $self->{entity_add} = -1;
345 $self->{prev_state} = DATA_STATE;
346 $self->{state} = ENTITY_STATE;
347
348 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
349 $self->{line_prev} = $self->{line};
350 $self->{column_prev} = $self->{column};
351 $self->{column}++;
352 $self->{nc}
353 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
354 } else {
355 $self->{set_nc}->($self);
356 }
357
358 redo A;
359 } else {
360
361 #
362 }
363 } elsif ($self->{nc} == 0x002D) { # -
364 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
365 $self->{s_kwd} .= '-';
366
367 if ($self->{s_kwd} eq '<!--') {
368
369 $self->{escape} = 1; # unless $self->{escape};
370 $self->{s_kwd} = '--';
371 #
372 } elsif ($self->{s_kwd} eq '---') {
373
374 $self->{s_kwd} = '--';
375 #
376 } else {
377
378 #
379 }
380 }
381
382 #
383 } elsif ($self->{nc} == 0x0021) { # !
384 if (length $self->{s_kwd}) {
385
386 $self->{s_kwd} .= '!';
387 #
388 } else {
389
390 #$self->{s_kwd} = '';
391 #
392 }
393 #
394 } elsif ($self->{nc} == 0x003C) { # <
395 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
396 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
397 not $self->{escape})) {
398
399 $self->{state} = TAG_OPEN_STATE;
400
401 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
402 $self->{line_prev} = $self->{line};
403 $self->{column_prev} = $self->{column};
404 $self->{column}++;
405 $self->{nc}
406 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
407 } else {
408 $self->{set_nc}->($self);
409 }
410
411 redo A;
412 } else {
413
414 $self->{s_kwd} = '';
415 #
416 }
417 } elsif ($self->{nc} == 0x003E) { # >
418 if ($self->{escape} and
419 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
420 if ($self->{s_kwd} eq '--') {
421
422 delete $self->{escape};
423 } else {
424
425 }
426 } else {
427
428 }
429
430 $self->{s_kwd} = '';
431 #
432 } elsif ($self->{nc} == -1) {
433
434 $self->{s_kwd} = '';
435 return ({type => END_OF_FILE_TOKEN,
436 line => $self->{line}, column => $self->{column}});
437 last A; ## TODO: ok?
438 } else {
439
440 $self->{s_kwd} = '';
441 #
442 }
443
444 # Anything else
445 my $token = {type => CHARACTER_TOKEN,
446 data => chr $self->{nc},
447 line => $self->{line}, column => $self->{column},
448 };
449 if ($self->{read_until}->($token->{data}, q[-!<>&],
450 length $token->{data})) {
451 $self->{s_kwd} = '';
452 }
453
454 ## Stay in the data state.
455 if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
456
457 $self->{state} = PCDATA_STATE;
458 } else {
459
460 ## Stay in the state.
461 }
462
463 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
464 $self->{line_prev} = $self->{line};
465 $self->{column_prev} = $self->{column};
466 $self->{column}++;
467 $self->{nc}
468 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
469 } else {
470 $self->{set_nc}->($self);
471 }
472
473 return ($token);
474 redo A;
475 } elsif ($self->{state} == TAG_OPEN_STATE) {
476 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
477 if ($self->{nc} == 0x002F) { # /
478
479
480 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
481 $self->{line_prev} = $self->{line};
482 $self->{column_prev} = $self->{column};
483 $self->{column}++;
484 $self->{nc}
485 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
486 } else {
487 $self->{set_nc}->($self);
488 }
489
490 $self->{state} = CLOSE_TAG_OPEN_STATE;
491 redo A;
492 } elsif ($self->{nc} == 0x0021) { # !
493
494 $self->{s_kwd} = '<' unless $self->{escape};
495 #
496 } else {
497
498 #
499 }
500
501 ## reconsume
502 $self->{state} = DATA_STATE;
503 return ({type => CHARACTER_TOKEN, data => '<',
504 line => $self->{line_prev},
505 column => $self->{column_prev},
506 });
507 redo A;
508 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
509 if ($self->{nc} == 0x0021) { # !
510
511 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
512
513 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
514 $self->{line_prev} = $self->{line};
515 $self->{column_prev} = $self->{column};
516 $self->{column}++;
517 $self->{nc}
518 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
519 } else {
520 $self->{set_nc}->($self);
521 }
522
523 redo A;
524 } elsif ($self->{nc} == 0x002F) { # /
525
526 $self->{state} = CLOSE_TAG_OPEN_STATE;
527
528 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
529 $self->{line_prev} = $self->{line};
530 $self->{column_prev} = $self->{column};
531 $self->{column}++;
532 $self->{nc}
533 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
534 } else {
535 $self->{set_nc}->($self);
536 }
537
538 redo A;
539 } elsif (0x0041 <= $self->{nc} and
540 $self->{nc} <= 0x005A) { # A..Z
541
542 $self->{ct}
543 = {type => START_TAG_TOKEN,
544 tag_name => chr ($self->{nc} + 0x0020),
545 line => $self->{line_prev},
546 column => $self->{column_prev}};
547 $self->{state} = TAG_NAME_STATE;
548
549 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
550 $self->{line_prev} = $self->{line};
551 $self->{column_prev} = $self->{column};
552 $self->{column}++;
553 $self->{nc}
554 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
555 } else {
556 $self->{set_nc}->($self);
557 }
558
559 redo A;
560 } elsif (0x0061 <= $self->{nc} and
561 $self->{nc} <= 0x007A) { # a..z
562
563 $self->{ct} = {type => START_TAG_TOKEN,
564 tag_name => chr ($self->{nc}),
565 line => $self->{line_prev},
566 column => $self->{column_prev}};
567 $self->{state} = TAG_NAME_STATE;
568
569 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
570 $self->{line_prev} = $self->{line};
571 $self->{column_prev} = $self->{column};
572 $self->{column}++;
573 $self->{nc}
574 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
575 } else {
576 $self->{set_nc}->($self);
577 }
578
579 redo A;
580 } elsif ($self->{nc} == 0x003E) { # >
581
582 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
583 line => $self->{line_prev},
584 column => $self->{column_prev});
585 $self->{state} = DATA_STATE;
586
587 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
588 $self->{line_prev} = $self->{line};
589 $self->{column_prev} = $self->{column};
590 $self->{column}++;
591 $self->{nc}
592 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
593 } else {
594 $self->{set_nc}->($self);
595 }
596
597
598 return ({type => CHARACTER_TOKEN, data => '<>',
599 line => $self->{line_prev},
600 column => $self->{column_prev},
601 });
602
603 redo A;
604 } elsif ($self->{nc} == 0x003F) { # ?
605
606 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
607 line => $self->{line_prev},
608 column => $self->{column_prev});
609 $self->{state} = BOGUS_COMMENT_STATE;
610 $self->{ct} = {type => COMMENT_TOKEN, data => '',
611 line => $self->{line_prev},
612 column => $self->{column_prev},
613 };
614 ## $self->{nc} is intentionally left as is
615 redo A;
616 } else {
617
618 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
619 line => $self->{line_prev},
620 column => $self->{column_prev});
621 $self->{state} = DATA_STATE;
622 ## reconsume
623
624 return ({type => CHARACTER_TOKEN, data => '<',
625 line => $self->{line_prev},
626 column => $self->{column_prev},
627 });
628
629 redo A;
630 }
631 } else {
632 die "$0: $self->{content_model} in tag open";
633 }
634 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
635 ## NOTE: The "close tag open state" in the spec is implemented as
636 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
637
638 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
639 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
640 if (defined $self->{last_stag_name}) {
641 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
642 $self->{s_kwd} = '';
643 ## Reconsume.
644 redo A;
645 } else {
646 ## No start tag token has ever been emitted
647 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
648
649 $self->{state} = DATA_STATE;
650 ## Reconsume.
651 return ({type => CHARACTER_TOKEN, data => '</',
652 line => $l, column => $c,
653 });
654 redo A;
655 }
656 }
657
658 if (0x0041 <= $self->{nc} and
659 $self->{nc} <= 0x005A) { # A..Z
660
661 $self->{ct}
662 = {type => END_TAG_TOKEN,
663 tag_name => chr ($self->{nc} + 0x0020),
664 line => $l, column => $c};
665 $self->{state} = TAG_NAME_STATE;
666
667 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
668 $self->{line_prev} = $self->{line};
669 $self->{column_prev} = $self->{column};
670 $self->{column}++;
671 $self->{nc}
672 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
673 } else {
674 $self->{set_nc}->($self);
675 }
676
677 redo A;
678 } elsif (0x0061 <= $self->{nc} and
679 $self->{nc} <= 0x007A) { # a..z
680
681 $self->{ct} = {type => END_TAG_TOKEN,
682 tag_name => chr ($self->{nc}),
683 line => $l, column => $c};
684 $self->{state} = TAG_NAME_STATE;
685
686 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
687 $self->{line_prev} = $self->{line};
688 $self->{column_prev} = $self->{column};
689 $self->{column}++;
690 $self->{nc}
691 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
692 } else {
693 $self->{set_nc}->($self);
694 }
695
696 redo A;
697 } elsif ($self->{nc} == 0x003E) { # >
698
699 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
700 line => $self->{line_prev}, ## "<" in "</>"
701 column => $self->{column_prev} - 1);
702 $self->{state} = DATA_STATE;
703
704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705 $self->{line_prev} = $self->{line};
706 $self->{column_prev} = $self->{column};
707 $self->{column}++;
708 $self->{nc}
709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710 } else {
711 $self->{set_nc}->($self);
712 }
713
714 redo A;
715 } elsif ($self->{nc} == -1) {
716
717 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
718 $self->{state} = DATA_STATE;
719 # reconsume
720
721 return ({type => CHARACTER_TOKEN, data => '</',
722 line => $l, column => $c,
723 });
724
725 redo A;
726 } else {
727
728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
729 $self->{state} = BOGUS_COMMENT_STATE;
730 $self->{ct} = {type => COMMENT_TOKEN, data => '',
731 line => $self->{line_prev}, # "<" of "</"
732 column => $self->{column_prev} - 1,
733 };
734 ## NOTE: $self->{nc} is intentionally left as is.
735 ## Although the "anything else" case of the spec not explicitly
736 ## states that the next input character is to be reconsumed,
737 ## it will be included to the |data| of the comment token
738 ## generated from the bogus end tag, as defined in the
739 ## "bogus comment state" entry.
740 redo A;
741 }
742 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
743 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
744 if (length $ch) {
745 my $CH = $ch;
746 $ch =~ tr/a-z/A-Z/;
747 my $nch = chr $self->{nc};
748 if ($nch eq $ch or $nch eq $CH) {
749
750 ## Stay in the state.
751 $self->{s_kwd} .= $nch;
752
753 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
754 $self->{line_prev} = $self->{line};
755 $self->{column_prev} = $self->{column};
756 $self->{column}++;
757 $self->{nc}
758 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
759 } else {
760 $self->{set_nc}->($self);
761 }
762
763 redo A;
764 } else {
765
766 $self->{state} = DATA_STATE;
767 ## Reconsume.
768 return ({type => CHARACTER_TOKEN,
769 data => '</' . $self->{s_kwd},
770 line => $self->{line_prev},
771 column => $self->{column_prev} - 1 - length $self->{s_kwd},
772 });
773 redo A;
774 }
775 } else { # after "<{tag-name}"
776 unless ($is_space->{$self->{nc}} or
777 {
778 0x003E => 1, # >
779 0x002F => 1, # /
780 -1 => 1, # EOF
781 }->{$self->{nc}}) {
782
783 ## Reconsume.
784 $self->{state} = DATA_STATE;
785 return ({type => CHARACTER_TOKEN,
786 data => '</' . $self->{s_kwd},
787 line => $self->{line_prev},
788 column => $self->{column_prev} - 1 - length $self->{s_kwd},
789 });
790 redo A;
791 } else {
792
793 $self->{ct}
794 = {type => END_TAG_TOKEN,
795 tag_name => $self->{last_stag_name},
796 line => $self->{line_prev},
797 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
798 $self->{state} = TAG_NAME_STATE;
799 ## Reconsume.
800 redo A;
801 }
802 }
803 } elsif ($self->{state} == TAG_NAME_STATE) {
804 if ($is_space->{$self->{nc}}) {
805
806 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
807
808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
809 $self->{line_prev} = $self->{line};
810 $self->{column_prev} = $self->{column};
811 $self->{column}++;
812 $self->{nc}
813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
814 } else {
815 $self->{set_nc}->($self);
816 }
817
818 redo A;
819 } elsif ($self->{nc} == 0x003E) { # >
820 if ($self->{ct}->{type} == START_TAG_TOKEN) {
821
822 $self->{last_stag_name} = $self->{ct}->{tag_name};
823 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
824 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
825 #if ($self->{ct}->{attributes}) {
826 # ## NOTE: This should never be reached.
827 # !!! cp (36);
828 # !!! parse-error (type => 'end tag attribute');
829 #} else {
830
831 #}
832 } else {
833 die "$0: $self->{ct}->{type}: Unknown token type";
834 }
835 $self->{state} = DATA_STATE;
836
837 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
838 $self->{line_prev} = $self->{line};
839 $self->{column_prev} = $self->{column};
840 $self->{column}++;
841 $self->{nc}
842 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
843 } else {
844 $self->{set_nc}->($self);
845 }
846
847
848 return ($self->{ct}); # start tag or end tag
849
850 redo A;
851 } elsif (0x0041 <= $self->{nc} and
852 $self->{nc} <= 0x005A) { # A..Z
853
854 $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
855 # start tag or end tag
856 ## Stay in this state
857
858 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
859 $self->{line_prev} = $self->{line};
860 $self->{column_prev} = $self->{column};
861 $self->{column}++;
862 $self->{nc}
863 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
864 } else {
865 $self->{set_nc}->($self);
866 }
867
868 redo A;
869 } elsif ($self->{nc} == -1) {
870 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
871 if ($self->{ct}->{type} == START_TAG_TOKEN) {
872
873 $self->{last_stag_name} = $self->{ct}->{tag_name};
874 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
875 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
876 #if ($self->{ct}->{attributes}) {
877 # ## NOTE: This state should never be reached.
878 # !!! cp (40);
879 # !!! parse-error (type => 'end tag attribute');
880 #} else {
881
882 #}
883 } else {
884 die "$0: $self->{ct}->{type}: Unknown token type";
885 }
886 $self->{state} = DATA_STATE;
887 # reconsume
888
889 return ($self->{ct}); # start tag or end tag
890
891 redo A;
892 } elsif ($self->{nc} == 0x002F) { # /
893
894 $self->{state} = SELF_CLOSING_START_TAG_STATE;
895
896 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
897 $self->{line_prev} = $self->{line};
898 $self->{column_prev} = $self->{column};
899 $self->{column}++;
900 $self->{nc}
901 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
902 } else {
903 $self->{set_nc}->($self);
904 }
905
906 redo A;
907 } else {
908
909 $self->{ct}->{tag_name} .= chr $self->{nc};
910 # start tag or end tag
911 ## Stay in the state
912
913 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
914 $self->{line_prev} = $self->{line};
915 $self->{column_prev} = $self->{column};
916 $self->{column}++;
917 $self->{nc}
918 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
919 } else {
920 $self->{set_nc}->($self);
921 }
922
923 redo A;
924 }
925 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
926 if ($is_space->{$self->{nc}}) {
927
928 ## Stay in the state
929
930 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
931 $self->{line_prev} = $self->{line};
932 $self->{column_prev} = $self->{column};
933 $self->{column}++;
934 $self->{nc}
935 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
936 } else {
937 $self->{set_nc}->($self);
938 }
939
940 redo A;
941 } elsif ($self->{nc} == 0x003E) { # >
942 if ($self->{ct}->{type} == START_TAG_TOKEN) {
943
944 $self->{last_stag_name} = $self->{ct}->{tag_name};
945 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
946 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
947 if ($self->{ct}->{attributes}) {
948
949 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
950 } else {
951
952 }
953 } else {
954 die "$0: $self->{ct}->{type}: Unknown token type";
955 }
956 $self->{state} = DATA_STATE;
957
958 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
959 $self->{line_prev} = $self->{line};
960 $self->{column_prev} = $self->{column};
961 $self->{column}++;
962 $self->{nc}
963 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
964 } else {
965 $self->{set_nc}->($self);
966 }
967
968
969 return ($self->{ct}); # start tag or end tag
970
971 redo A;
972 } elsif (0x0041 <= $self->{nc} and
973 $self->{nc} <= 0x005A) { # A..Z
974
975 $self->{ca}
976 = {name => chr ($self->{nc} + 0x0020),
977 value => '',
978 line => $self->{line}, column => $self->{column}};
979 $self->{state} = ATTRIBUTE_NAME_STATE;
980
981 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
982 $self->{line_prev} = $self->{line};
983 $self->{column_prev} = $self->{column};
984 $self->{column}++;
985 $self->{nc}
986 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
987 } else {
988 $self->{set_nc}->($self);
989 }
990
991 redo A;
992 } elsif ($self->{nc} == 0x002F) { # /
993
994 $self->{state} = SELF_CLOSING_START_TAG_STATE;
995
996 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
997 $self->{line_prev} = $self->{line};
998 $self->{column_prev} = $self->{column};
999 $self->{column}++;
1000 $self->{nc}
1001 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1002 } else {
1003 $self->{set_nc}->($self);
1004 }
1005
1006 redo A;
1007 } elsif ($self->{nc} == -1) {
1008 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1009 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1010
1011 $self->{last_stag_name} = $self->{ct}->{tag_name};
1012 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1013 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1014 if ($self->{ct}->{attributes}) {
1015
1016 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1017 } else {
1018
1019 }
1020 } else {
1021 die "$0: $self->{ct}->{type}: Unknown token type";
1022 }
1023 $self->{state} = DATA_STATE;
1024 # reconsume
1025
1026 return ($self->{ct}); # start tag or end tag
1027
1028 redo A;
1029 } else {
1030 if ({
1031 0x0022 => 1, # "
1032 0x0027 => 1, # '
1033 0x003D => 1, # =
1034 }->{$self->{nc}}) {
1035
1036 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1037 } else {
1038
1039 }
1040 $self->{ca}
1041 = {name => chr ($self->{nc}),
1042 value => '',
1043 line => $self->{line}, column => $self->{column}};
1044 $self->{state} = ATTRIBUTE_NAME_STATE;
1045
1046 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1047 $self->{line_prev} = $self->{line};
1048 $self->{column_prev} = $self->{column};
1049 $self->{column}++;
1050 $self->{nc}
1051 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1052 } else {
1053 $self->{set_nc}->($self);
1054 }
1055
1056 redo A;
1057 }
1058 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1059 my $before_leave = sub { ## Run before leaving this state: store the attribute being built ($self->{ca}) on the current token, or report it as a duplicate.
1060 if (exists $self->{ct}->{attributes} # start tag or end tag; NOTE: this lookup autovivifies {attributes} if it is absent
1061 ->{$self->{ca}->{name}}) { # MUST: an attribute with this name is already stored on the token
1062 ## Duplicate name: the error is reported at the line/column saved in $self->{ca}.
1063 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1064 ## Discard $self->{ca} # MUST (nothing is stored here, so the earlier attribute of that name is kept)
1065 } else {
1066 ## First attribute with this name: store the whole $self->{ca} hash under its name.
1067 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1068 = $self->{ca};
1069 }
1070 }; # $before_leave
1071
1072 if ($is_space->{$self->{nc}}) {
1073
1074 $before_leave->();
1075 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1076
1077 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1078 $self->{line_prev} = $self->{line};
1079 $self->{column_prev} = $self->{column};
1080 $self->{column}++;
1081 $self->{nc}
1082 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1083 } else {
1084 $self->{set_nc}->($self);
1085 }
1086
1087 redo A;
1088 } elsif ($self->{nc} == 0x003D) { # =
1089
1090 $before_leave->();
1091 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1092
1093 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1094 $self->{line_prev} = $self->{line};
1095 $self->{column_prev} = $self->{column};
1096 $self->{column}++;
1097 $self->{nc}
1098 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1099 } else {
1100 $self->{set_nc}->($self);
1101 }
1102
1103 redo A;
1104 } elsif ($self->{nc} == 0x003E) { # >
1105 $before_leave->();
1106 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1107
1108 $self->{last_stag_name} = $self->{ct}->{tag_name};
1109 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1110
1111 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1112 if ($self->{ct}->{attributes}) {
1113 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1114 }
1115 } else {
1116 die "$0: $self->{ct}->{type}: Unknown token type";
1117 }
1118 $self->{state} = DATA_STATE;
1119
1120 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1121 $self->{line_prev} = $self->{line};
1122 $self->{column_prev} = $self->{column};
1123 $self->{column}++;
1124 $self->{nc}
1125 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1126 } else {
1127 $self->{set_nc}->($self);
1128 }
1129
1130
1131 return ($self->{ct}); # start tag or end tag
1132
1133 redo A;
1134 } elsif (0x0041 <= $self->{nc} and
1135 $self->{nc} <= 0x005A) { # A..Z
1136
1137 $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
1138 ## Stay in the state
1139
1140 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1141 $self->{line_prev} = $self->{line};
1142 $self->{column_prev} = $self->{column};
1143 $self->{column}++;
1144 $self->{nc}
1145 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1146 } else {
1147 $self->{set_nc}->($self);
1148 }
1149
1150 redo A;
1151 } elsif ($self->{nc} == 0x002F) { # /
1152
1153 $before_leave->();
1154 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1155
1156 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1157 $self->{line_prev} = $self->{line};
1158 $self->{column_prev} = $self->{column};
1159 $self->{column}++;
1160 $self->{nc}
1161 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1162 } else {
1163 $self->{set_nc}->($self);
1164 }
1165
1166 redo A;
1167 } elsif ($self->{nc} == -1) {
1168 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1169 $before_leave->();
1170 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1171
1172 $self->{last_stag_name} = $self->{ct}->{tag_name};
1173 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1174 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1175 if ($self->{ct}->{attributes}) {
1176
1177 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1178 } else {
1179 ## NOTE: This state should never be reached.
1180
1181 }
1182 } else {
1183 die "$0: $self->{ct}->{type}: Unknown token type";
1184 }
1185 $self->{state} = DATA_STATE;
1186 # reconsume
1187
1188 return ($self->{ct}); # start tag or end tag
1189
1190 redo A;
1191 } else {
1192 if ($self->{nc} == 0x0022 or # "
1193 $self->{nc} == 0x0027) { # '
1194
1195 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1196 } else {
1197
1198 }
1199 $self->{ca}->{name} .= chr ($self->{nc});
1200 ## Stay in the state
1201
1202 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1203 $self->{line_prev} = $self->{line};
1204 $self->{column_prev} = $self->{column};
1205 $self->{column}++;
1206 $self->{nc}
1207 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1208 } else {
1209 $self->{set_nc}->($self);
1210 }
1211
1212 redo A;
1213 }
1214 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1215 if ($is_space->{$self->{nc}}) {
1216
1217 ## Stay in the state
1218
1219 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1220 $self->{line_prev} = $self->{line};
1221 $self->{column_prev} = $self->{column};
1222 $self->{column}++;
1223 $self->{nc}
1224 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1225 } else {
1226 $self->{set_nc}->($self);
1227 }
1228
1229 redo A;
1230 } elsif ($self->{nc} == 0x003D) { # =
1231
1232 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1233
1234 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1235 $self->{line_prev} = $self->{line};
1236 $self->{column_prev} = $self->{column};
1237 $self->{column}++;
1238 $self->{nc}
1239 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1240 } else {
1241 $self->{set_nc}->($self);
1242 }
1243
1244 redo A;
1245 } elsif ($self->{nc} == 0x003E) { # >
1246 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1247
1248 $self->{last_stag_name} = $self->{ct}->{tag_name};
1249 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1250 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1251 if ($self->{ct}->{attributes}) {
1252
1253 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1254 } else {
1255 ## NOTE: This state should never be reached.
1256
1257 }
1258 } else {
1259 die "$0: $self->{ct}->{type}: Unknown token type";
1260 }
1261 $self->{state} = DATA_STATE;
1262
1263 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1264 $self->{line_prev} = $self->{line};
1265 $self->{column_prev} = $self->{column};
1266 $self->{column}++;
1267 $self->{nc}
1268 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1269 } else {
1270 $self->{set_nc}->($self);
1271 }
1272
1273
1274 return ($self->{ct}); # start tag or end tag
1275
1276 redo A;
1277 } elsif (0x0041 <= $self->{nc} and
1278 $self->{nc} <= 0x005A) { # A..Z
1279
1280 $self->{ca}
1281 = {name => chr ($self->{nc} + 0x0020),
1282 value => '',
1283 line => $self->{line}, column => $self->{column}};
1284 $self->{state} = ATTRIBUTE_NAME_STATE;
1285
1286 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1287 $self->{line_prev} = $self->{line};
1288 $self->{column_prev} = $self->{column};
1289 $self->{column}++;
1290 $self->{nc}
1291 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1292 } else {
1293 $self->{set_nc}->($self);
1294 }
1295
1296 redo A;
1297 } elsif ($self->{nc} == 0x002F) { # /
1298
1299 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1300
1301 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1302 $self->{line_prev} = $self->{line};
1303 $self->{column_prev} = $self->{column};
1304 $self->{column}++;
1305 $self->{nc}
1306 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1307 } else {
1308 $self->{set_nc}->($self);
1309 }
1310
1311 redo A;
1312 } elsif ($self->{nc} == -1) {
1313 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1314 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1315
1316 $self->{last_stag_name} = $self->{ct}->{tag_name};
1317 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1318 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1319 if ($self->{ct}->{attributes}) {
1320
1321 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1322 } else {
1323 ## NOTE: This state should never be reached.
1324
1325 }
1326 } else {
1327 die "$0: $self->{ct}->{type}: Unknown token type";
1328 }
1329 $self->{state} = DATA_STATE;
1330 # reconsume
1331
1332 return ($self->{ct}); # start tag or end tag
1333
1334 redo A;
1335 } else {
1336 if ($self->{nc} == 0x0022 or # "
1337 $self->{nc} == 0x0027) { # '
1338
1339 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1340 } else {
1341
1342 }
1343 $self->{ca}
1344 = {name => chr ($self->{nc}),
1345 value => '',
1346 line => $self->{line}, column => $self->{column}};
1347 $self->{state} = ATTRIBUTE_NAME_STATE;
1348
1349 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1350 $self->{line_prev} = $self->{line};
1351 $self->{column_prev} = $self->{column};
1352 $self->{column}++;
1353 $self->{nc}
1354 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1355 } else {
1356 $self->{set_nc}->($self);
1357 }
1358
1359 redo A;
1360 }
1361 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1362 if ($is_space->{$self->{nc}}) {
1363
1364 ## Stay in the state
1365
1366 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1367 $self->{line_prev} = $self->{line};
1368 $self->{column_prev} = $self->{column};
1369 $self->{column}++;
1370 $self->{nc}
1371 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1372 } else {
1373 $self->{set_nc}->($self);
1374 }
1375
1376 redo A;
1377 } elsif ($self->{nc} == 0x0022) { # "
1378
1379 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1380
1381 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1382 $self->{line_prev} = $self->{line};
1383 $self->{column_prev} = $self->{column};
1384 $self->{column}++;
1385 $self->{nc}
1386 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1387 } else {
1388 $self->{set_nc}->($self);
1389 }
1390
1391 redo A;
1392 } elsif ($self->{nc} == 0x0026) { # &
1393
1394 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1395 ## reconsume
1396 redo A;
1397 } elsif ($self->{nc} == 0x0027) { # '
1398
1399 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1400
1401 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1402 $self->{line_prev} = $self->{line};
1403 $self->{column_prev} = $self->{column};
1404 $self->{column}++;
1405 $self->{nc}
1406 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1407 } else {
1408 $self->{set_nc}->($self);
1409 }
1410
1411 redo A;
1412 } elsif ($self->{nc} == 0x003E) { # >
1413 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1414 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1415
1416 $self->{last_stag_name} = $self->{ct}->{tag_name};
1417 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1418 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1419 if ($self->{ct}->{attributes}) {
1420
1421 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1422 } else {
1423 ## NOTE: This state should never be reached.
1424
1425 }
1426 } else {
1427 die "$0: $self->{ct}->{type}: Unknown token type";
1428 }
1429 $self->{state} = DATA_STATE;
1430
1431 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1432 $self->{line_prev} = $self->{line};
1433 $self->{column_prev} = $self->{column};
1434 $self->{column}++;
1435 $self->{nc}
1436 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1437 } else {
1438 $self->{set_nc}->($self);
1439 }
1440
1441
1442 return ($self->{ct}); # start tag or end tag
1443
1444 redo A;
1445 } elsif ($self->{nc} == -1) {
1446 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1447 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1448
1449 $self->{last_stag_name} = $self->{ct}->{tag_name};
1450 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1451 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1452 if ($self->{ct}->{attributes}) {
1453
1454 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1455 } else {
1456 ## NOTE: This state should never be reached.
1457
1458 }
1459 } else {
1460 die "$0: $self->{ct}->{type}: Unknown token type";
1461 }
1462 $self->{state} = DATA_STATE;
1463 ## reconsume
1464
1465 return ($self->{ct}); # start tag or end tag
1466
1467 redo A;
1468 } else {
1469 if ($self->{nc} == 0x003D) { # =
1470
1471 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1472 } else {
1473
1474 }
1475 $self->{ca}->{value} .= chr ($self->{nc});
1476 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1477
1478 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479 $self->{line_prev} = $self->{line};
1480 $self->{column_prev} = $self->{column};
1481 $self->{column}++;
1482 $self->{nc}
1483 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484 } else {
1485 $self->{set_nc}->($self);
1486 }
1487
1488 redo A;
1489 }
1490 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1491 if ($self->{nc} == 0x0022) { # "
1492
1493 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1494
1495 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1496 $self->{line_prev} = $self->{line};
1497 $self->{column_prev} = $self->{column};
1498 $self->{column}++;
1499 $self->{nc}
1500 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1501 } else {
1502 $self->{set_nc}->($self);
1503 }
1504
1505 redo A;
1506 } elsif ($self->{nc} == 0x0026) { # &
1507
1508 ## NOTE: In the spec, the tokenizer is switched to the
1509 ## "entity in attribute value state". In this implementation, the
1510 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1511 ## implementation of the "consume a character reference" algorithm.
1512 $self->{prev_state} = $self->{state};
1513 $self->{entity_add} = 0x0022; # "
1514 $self->{state} = ENTITY_STATE;
1515
1516 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517 $self->{line_prev} = $self->{line};
1518 $self->{column_prev} = $self->{column};
1519 $self->{column}++;
1520 $self->{nc}
1521 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522 } else {
1523 $self->{set_nc}->($self);
1524 }
1525
1526 redo A;
1527 } elsif ($self->{nc} == -1) {
1528 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1529 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1530
1531 $self->{last_stag_name} = $self->{ct}->{tag_name};
1532 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1533 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1534 if ($self->{ct}->{attributes}) {
1535
1536 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1537 } else {
1538 ## NOTE: This state should never be reached.
1539
1540 }
1541 } else {
1542 die "$0: $self->{ct}->{type}: Unknown token type";
1543 }
1544 $self->{state} = DATA_STATE;
1545 ## reconsume
1546
1547 return ($self->{ct}); # start tag or end tag
1548
1549 redo A;
1550 } else {
1551
1552 $self->{ca}->{value} .= chr ($self->{nc});
1553 $self->{read_until}->($self->{ca}->{value},
1554 q["&],
1555 length $self->{ca}->{value});
1556
1557 ## Stay in the state
1558
1559 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1560 $self->{line_prev} = $self->{line};
1561 $self->{column_prev} = $self->{column};
1562 $self->{column}++;
1563 $self->{nc}
1564 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1565 } else {
1566 $self->{set_nc}->($self);
1567 }
1568
1569 redo A;
1570 }
1571 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1572 if ($self->{nc} == 0x0027) { # '
1573
1574 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1575
1576 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1577 $self->{line_prev} = $self->{line};
1578 $self->{column_prev} = $self->{column};
1579 $self->{column}++;
1580 $self->{nc}
1581 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1582 } else {
1583 $self->{set_nc}->($self);
1584 }
1585
1586 redo A;
1587 } elsif ($self->{nc} == 0x0026) { # &
1588
1589 ## NOTE: In the spec, the tokenizer is switched to the
1590 ## "entity in attribute value state". In this implementation, the
1591 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1592 ## implementation of the "consume a character reference" algorithm.
1593 $self->{entity_add} = 0x0027; # '
1594 $self->{prev_state} = $self->{state};
1595 $self->{state} = ENTITY_STATE;
1596
1597 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1598 $self->{line_prev} = $self->{line};
1599 $self->{column_prev} = $self->{column};
1600 $self->{column}++;
1601 $self->{nc}
1602 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1603 } else {
1604 $self->{set_nc}->($self);
1605 }
1606
1607 redo A;
1608 } elsif ($self->{nc} == -1) {
1609 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1610 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1611
1612 $self->{last_stag_name} = $self->{ct}->{tag_name};
1613 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1614 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1615 if ($self->{ct}->{attributes}) {
1616
1617 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1618 } else {
1619 ## NOTE: This state should never be reached.
1620
1621 }
1622 } else {
1623 die "$0: $self->{ct}->{type}: Unknown token type";
1624 }
1625 $self->{state} = DATA_STATE;
1626 ## reconsume
1627
1628 return ($self->{ct}); # start tag or end tag
1629
1630 redo A;
1631 } else {
1632
1633 $self->{ca}->{value} .= chr ($self->{nc});
1634 $self->{read_until}->($self->{ca}->{value},
1635 q['&],
1636 length $self->{ca}->{value});
1637
1638 ## Stay in the state
1639
1640 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1641 $self->{line_prev} = $self->{line};
1642 $self->{column_prev} = $self->{column};
1643 $self->{column}++;
1644 $self->{nc}
1645 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1646 } else {
1647 $self->{set_nc}->($self);
1648 }
1649
1650 redo A;
1651 }
1652 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1653 if ($is_space->{$self->{nc}}) {
1654
1655 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1656
1657 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1658 $self->{line_prev} = $self->{line};
1659 $self->{column_prev} = $self->{column};
1660 $self->{column}++;
1661 $self->{nc}
1662 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1663 } else {
1664 $self->{set_nc}->($self);
1665 }
1666
1667 redo A;
1668 } elsif ($self->{nc} == 0x0026) { # &
1669
1670 ## NOTE: In the spec, the tokenizer is switched to the
1671 ## "entity in attribute value state". In this implementation, the
1672 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1673 ## implementation of the "consume a character reference" algorithm.
1674 $self->{entity_add} = -1;
1675 $self->{prev_state} = $self->{state};
1676 $self->{state} = ENTITY_STATE;
1677
1678 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1679 $self->{line_prev} = $self->{line};
1680 $self->{column_prev} = $self->{column};
1681 $self->{column}++;
1682 $self->{nc}
1683 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1684 } else {
1685 $self->{set_nc}->($self);
1686 }
1687
1688 redo A;
1689 } elsif ($self->{nc} == 0x003E) { # >
1690 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1691
1692 $self->{last_stag_name} = $self->{ct}->{tag_name};
1693 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1694 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1695 if ($self->{ct}->{attributes}) {
1696
1697 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1698 } else {
1699 ## NOTE: This state should never be reached.
1700
1701 }
1702 } else {
1703 die "$0: $self->{ct}->{type}: Unknown token type";
1704 }
1705 $self->{state} = DATA_STATE;
1706
1707 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1708 $self->{line_prev} = $self->{line};
1709 $self->{column_prev} = $self->{column};
1710 $self->{column}++;
1711 $self->{nc}
1712 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1713 } else {
1714 $self->{set_nc}->($self);
1715 }
1716
1717
1718 return ($self->{ct}); # start tag or end tag
1719
1720 redo A;
1721 } elsif ($self->{nc} == -1) {
1722 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1723 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1724
1725 $self->{last_stag_name} = $self->{ct}->{tag_name};
1726 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1727 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1728 if ($self->{ct}->{attributes}) {
1729
1730 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1731 } else {
1732 ## NOTE: This state should never be reached.
1733
1734 }
1735 } else {
1736 die "$0: $self->{ct}->{type}: Unknown token type";
1737 }
1738 $self->{state} = DATA_STATE;
1739 ## reconsume
1740
1741 return ($self->{ct}); # start tag or end tag
1742
1743 redo A;
1744 } else {
1745 if ({
1746 0x0022 => 1, # "
1747 0x0027 => 1, # '
1748 0x003D => 1, # =
1749 }->{$self->{nc}}) {
1750
1751 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1752 } else {
1753
1754 }
1755 $self->{ca}->{value} .= chr ($self->{nc});
1756 $self->{read_until}->($self->{ca}->{value},
1757 q["'=& >],
1758 length $self->{ca}->{value});
1759
1760 ## Stay in the state
1761
1762 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1763 $self->{line_prev} = $self->{line};
1764 $self->{column_prev} = $self->{column};
1765 $self->{column}++;
1766 $self->{nc}
1767 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1768 } else {
1769 $self->{set_nc}->($self);
1770 }
1771
1772 redo A;
1773 }
1774 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1775 if ($is_space->{$self->{nc}}) {
1776
1777 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1778
1779 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1780 $self->{line_prev} = $self->{line};
1781 $self->{column_prev} = $self->{column};
1782 $self->{column}++;
1783 $self->{nc}
1784 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1785 } else {
1786 $self->{set_nc}->($self);
1787 }
1788
1789 redo A;
1790 } elsif ($self->{nc} == 0x003E) { # >
1791 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1792
1793 $self->{last_stag_name} = $self->{ct}->{tag_name};
1794 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1795 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1796 if ($self->{ct}->{attributes}) {
1797
1798 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1799 } else {
1800 ## NOTE: This state should never be reached.
1801
1802 }
1803 } else {
1804 die "$0: $self->{ct}->{type}: Unknown token type";
1805 }
1806 $self->{state} = DATA_STATE;
1807
1808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809 $self->{line_prev} = $self->{line};
1810 $self->{column_prev} = $self->{column};
1811 $self->{column}++;
1812 $self->{nc}
1813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814 } else {
1815 $self->{set_nc}->($self);
1816 }
1817
1818
1819 return ($self->{ct}); # start tag or end tag
1820
1821 redo A;
1822 } elsif ($self->{nc} == 0x002F) { # /
1823
1824 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1825
1826 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1827 $self->{line_prev} = $self->{line};
1828 $self->{column_prev} = $self->{column};
1829 $self->{column}++;
1830 $self->{nc}
1831 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1832 } else {
1833 $self->{set_nc}->($self);
1834 }
1835
1836 redo A;
1837 } elsif ($self->{nc} == -1) {
1838 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1839 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1840
1841 $self->{last_stag_name} = $self->{ct}->{tag_name};
1842 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1843 if ($self->{ct}->{attributes}) {
1844
1845 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1846 } else {
1847 ## NOTE: This state should never be reached.
1848
1849 }
1850 } else {
1851 die "$0: $self->{ct}->{type}: Unknown token type";
1852 }
1853 $self->{state} = DATA_STATE;
1854 ## Reconsume.
1855 return ($self->{ct}); # start tag or end tag
1856 redo A;
1857 } else {
1858
1859 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1860 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1861 ## reconsume
1862 redo A;
1863 }
1864 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1865 if ($self->{nc} == 0x003E) { # >
1866 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1867
1868 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1869 ## TODO: Different type than slash in start tag
1870 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1871 if ($self->{ct}->{attributes}) {
1872
1873 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1874 } else {
1875
1876 }
1877 ## TODO: Test |<title></title/>|
1878 } else {
1879
1880 $self->{self_closing} = 1;
1881 }
1882
1883 $self->{state} = DATA_STATE;
1884
1885 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1886 $self->{line_prev} = $self->{line};
1887 $self->{column_prev} = $self->{column};
1888 $self->{column}++;
1889 $self->{nc}
1890 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1891 } else {
1892 $self->{set_nc}->($self);
1893 }
1894
1895
1896 return ($self->{ct}); # start tag or end tag
1897
1898 redo A;
1899 } elsif ($self->{nc} == -1) {
1900 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1901 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1902
1903 $self->{last_stag_name} = $self->{ct}->{tag_name};
1904 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1905 if ($self->{ct}->{attributes}) {
1906
1907 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1908 } else {
1909 ## NOTE: This state should never be reached.
1910
1911 }
1912 } else {
1913 die "$0: $self->{ct}->{type}: Unknown token type";
1914 }
1915 $self->{state} = DATA_STATE;
1916 ## Reconsume.
1917 return ($self->{ct}); # start tag or end tag
1918 redo A;
1919 } else {
1920
1921 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1922 ## TODO: This error type is wrong.
1923 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1924 ## Reconsume.
1925 redo A;
1926 }
1927 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1928 ## (only happens in PCDATA state)
1929
1930 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1931 ## consumes characters on a one-by-one basis.
1932
1933 if ($self->{nc} == 0x003E) { # >
1934
1935 $self->{state} = DATA_STATE;
1936
1937 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1938 $self->{line_prev} = $self->{line};
1939 $self->{column_prev} = $self->{column};
1940 $self->{column}++;
1941 $self->{nc}
1942 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1943 } else {
1944 $self->{set_nc}->($self);
1945 }
1946
1947
1948 return ($self->{ct}); # comment
1949 redo A;
1950 } elsif ($self->{nc} == -1) {
1951
1952 $self->{state} = DATA_STATE;
1953 ## reconsume
1954
1955 return ($self->{ct}); # comment
1956 redo A;
1957 } else {
1958
1959 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1960 $self->{read_until}->($self->{ct}->{data},
1961 q[>],
1962 length $self->{ct}->{data});
1963
1964 ## Stay in the state.
1965
1966 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1967 $self->{line_prev} = $self->{line};
1968 $self->{column_prev} = $self->{column};
1969 $self->{column}++;
1970 $self->{nc}
1971 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1972 } else {
1973 $self->{set_nc}->($self);
1974 }
1975
1976 redo A;
1977 }
1978 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1979 ## (only happens in PCDATA state)
1980
1981 if ($self->{nc} == 0x002D) { # -
1982
1983 $self->{state} = MD_HYPHEN_STATE;
1984
1985 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1986 $self->{line_prev} = $self->{line};
1987 $self->{column_prev} = $self->{column};
1988 $self->{column}++;
1989 $self->{nc}
1990 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1991 } else {
1992 $self->{set_nc}->($self);
1993 }
1994
1995 redo A;
1996 } elsif ($self->{nc} == 0x0044 or # D
1997 $self->{nc} == 0x0064) { # d
1998 ## ASCII case-insensitive.
1999
2000 $self->{state} = MD_DOCTYPE_STATE;
2001 $self->{s_kwd} = chr $self->{nc};
2002
2003 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2004 $self->{line_prev} = $self->{line};
2005 $self->{column_prev} = $self->{column};
2006 $self->{column}++;
2007 $self->{nc}
2008 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2009 } else {
2010 $self->{set_nc}->($self);
2011 }
2012
2013 redo A;
2014 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2015 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2016 $self->{is_xml}) and
2017 $self->{nc} == 0x005B) { # [
2018
2019 $self->{state} = MD_CDATA_STATE;
2020 $self->{s_kwd} = '[';
2021
2022 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2023 $self->{line_prev} = $self->{line};
2024 $self->{column_prev} = $self->{column};
2025 $self->{column}++;
2026 $self->{nc}
2027 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2028 } else {
2029 $self->{set_nc}->($self);
2030 }
2031
2032 redo A;
2033 } else {
2034
2035 }
2036
2037 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2038 line => $self->{line_prev},
2039 column => $self->{column_prev} - 1);
2040 ## Reconsume.
2041 $self->{state} = BOGUS_COMMENT_STATE;
2042 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2043 line => $self->{line_prev},
2044 column => $self->{column_prev} - 1,
2045 };
2046 redo A;
2047 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2048 if ($self->{nc} == 0x002D) { # -
2049
2050 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2051 line => $self->{line_prev},
2052 column => $self->{column_prev} - 2,
2053 };
2054 $self->{state} = COMMENT_START_STATE;
2055
2056 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2057 $self->{line_prev} = $self->{line};
2058 $self->{column_prev} = $self->{column};
2059 $self->{column}++;
2060 $self->{nc}
2061 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2062 } else {
2063 $self->{set_nc}->($self);
2064 }
2065
2066 redo A;
2067 } else {
2068
2069 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2070 line => $self->{line_prev},
2071 column => $self->{column_prev} - 2);
2072 $self->{state} = BOGUS_COMMENT_STATE;
2073 ## Reconsume.
2074 $self->{ct} = {type => COMMENT_TOKEN,
2075 data => '-',
2076 line => $self->{line_prev},
2077 column => $self->{column_prev} - 2,
2078 };
2079 redo A;
2080 }
2081 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2082 ## ASCII case-insensitive.
2083 if ($self->{nc} == [
2084 undef,
2085 0x004F, # O
2086 0x0043, # C
2087 0x0054, # T
2088 0x0059, # Y
2089 0x0050, # P
2090 ]->[length $self->{s_kwd}] or
2091 $self->{nc} == [
2092 undef,
2093 0x006F, # o
2094 0x0063, # c
2095 0x0074, # t
2096 0x0079, # y
2097 0x0070, # p
2098 ]->[length $self->{s_kwd}]) {
2099
2100 ## Stay in the state.
2101 $self->{s_kwd} .= chr $self->{nc};
2102
2103 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2104 $self->{line_prev} = $self->{line};
2105 $self->{column_prev} = $self->{column};
2106 $self->{column}++;
2107 $self->{nc}
2108 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2109 } else {
2110 $self->{set_nc}->($self);
2111 }
2112
2113 redo A;
2114 } elsif ((length $self->{s_kwd}) == 6 and
2115 ($self->{nc} == 0x0045 or # E
2116 $self->{nc} == 0x0065)) { # e
2117
2118 $self->{state} = DOCTYPE_STATE;
2119 $self->{ct} = {type => DOCTYPE_TOKEN,
2120 quirks => 1,
2121 line => $self->{line_prev},
2122 column => $self->{column_prev} - 7,
2123 };
2124
2125 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2126 $self->{line_prev} = $self->{line};
2127 $self->{column_prev} = $self->{column};
2128 $self->{column}++;
2129 $self->{nc}
2130 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2131 } else {
2132 $self->{set_nc}->($self);
2133 }
2134
2135 redo A;
2136 } else {
2137
2138 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2139 line => $self->{line_prev},
2140 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2141 $self->{state} = BOGUS_COMMENT_STATE;
2142 ## Reconsume.
2143 $self->{ct} = {type => COMMENT_TOKEN,
2144 data => $self->{s_kwd},
2145 line => $self->{line_prev},
2146 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2147 };
2148 redo A;
2149 }
2150 } elsif ($self->{state} == MD_CDATA_STATE) {
2151 if ($self->{nc} == {
2152 '[' => 0x0043, # C
2153 '[C' => 0x0044, # D
2154 '[CD' => 0x0041, # A
2155 '[CDA' => 0x0054, # T
2156 '[CDAT' => 0x0041, # A
2157 }->{$self->{s_kwd}}) {
2158
2159 ## Stay in the state.
2160 $self->{s_kwd} .= chr $self->{nc};
2161
2162 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2163 $self->{line_prev} = $self->{line};
2164 $self->{column_prev} = $self->{column};
2165 $self->{column}++;
2166 $self->{nc}
2167 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2168 } else {
2169 $self->{set_nc}->($self);
2170 }
2171
2172 redo A;
2173 } elsif ($self->{s_kwd} eq '[CDATA' and
2174 $self->{nc} == 0x005B) { # [
2175
2176 $self->{ct} = {type => CHARACTER_TOKEN,
2177 data => '',
2178 line => $self->{line_prev},
2179 column => $self->{column_prev} - 7};
2180 $self->{state} = CDATA_SECTION_STATE;
2181
2182 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2183 $self->{line_prev} = $self->{line};
2184 $self->{column_prev} = $self->{column};
2185 $self->{column}++;
2186 $self->{nc}
2187 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2188 } else {
2189 $self->{set_nc}->($self);
2190 }
2191
2192 redo A;
2193 } else {
2194
2195 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2196 line => $self->{line_prev},
2197 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2198 $self->{state} = BOGUS_COMMENT_STATE;
2199 ## Reconsume.
2200 $self->{ct} = {type => COMMENT_TOKEN,
2201 data => $self->{s_kwd},
2202 line => $self->{line_prev},
2203 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2204 };
2205 redo A;
2206 }
2207 } elsif ($self->{state} == COMMENT_START_STATE) {
2208 if ($self->{nc} == 0x002D) { # -
2209
2210 $self->{state} = COMMENT_START_DASH_STATE;
2211
2212 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2213 $self->{line_prev} = $self->{line};
2214 $self->{column_prev} = $self->{column};
2215 $self->{column}++;
2216 $self->{nc}
2217 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2218 } else {
2219 $self->{set_nc}->($self);
2220 }
2221
2222 redo A;
2223 } elsif ($self->{nc} == 0x003E) { # >
2224
2225 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2226 $self->{state} = DATA_STATE;
2227
2228 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2229 $self->{line_prev} = $self->{line};
2230 $self->{column_prev} = $self->{column};
2231 $self->{column}++;
2232 $self->{nc}
2233 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2234 } else {
2235 $self->{set_nc}->($self);
2236 }
2237
2238
2239 return ($self->{ct}); # comment
2240
2241 redo A;
2242 } elsif ($self->{nc} == -1) {
2243
2244 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2245 $self->{state} = DATA_STATE;
2246 ## reconsume
2247
2248 return ($self->{ct}); # comment
2249
2250 redo A;
2251 } else {
2252
2253 $self->{ct}->{data} # comment
2254 .= chr ($self->{nc});
2255 $self->{state} = COMMENT_STATE;
2256
2257 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2258 $self->{line_prev} = $self->{line};
2259 $self->{column_prev} = $self->{column};
2260 $self->{column}++;
2261 $self->{nc}
2262 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2263 } else {
2264 $self->{set_nc}->($self);
2265 }
2266
2267 redo A;
2268 }
2269 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2270 if ($self->{nc} == 0x002D) { # -
2271
2272 $self->{state} = COMMENT_END_STATE;
2273
2274 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2275 $self->{line_prev} = $self->{line};
2276 $self->{column_prev} = $self->{column};
2277 $self->{column}++;
2278 $self->{nc}
2279 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2280 } else {
2281 $self->{set_nc}->($self);
2282 }
2283
2284 redo A;
2285 } elsif ($self->{nc} == 0x003E) { # >
2286
2287 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2288 $self->{state} = DATA_STATE;
2289
2290 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2291 $self->{line_prev} = $self->{line};
2292 $self->{column_prev} = $self->{column};
2293 $self->{column}++;
2294 $self->{nc}
2295 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2296 } else {
2297 $self->{set_nc}->($self);
2298 }
2299
2300
2301 return ($self->{ct}); # comment
2302
2303 redo A;
2304 } elsif ($self->{nc} == -1) {
2305
2306 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2307 $self->{state} = DATA_STATE;
2308 ## reconsume
2309
2310 return ($self->{ct}); # comment
2311
2312 redo A;
2313 } else {
2314
2315 $self->{ct}->{data} # comment
2316 .= '-' . chr ($self->{nc});
2317 $self->{state} = COMMENT_STATE;
2318
2319 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2320 $self->{line_prev} = $self->{line};
2321 $self->{column_prev} = $self->{column};
2322 $self->{column}++;
2323 $self->{nc}
2324 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2325 } else {
2326 $self->{set_nc}->($self);
2327 }
2328
2329 redo A;
2330 }
2331 } elsif ($self->{state} == COMMENT_STATE) {
2332 if ($self->{nc} == 0x002D) { # -
2333
2334 $self->{state} = COMMENT_END_DASH_STATE;
2335
2336 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2337 $self->{line_prev} = $self->{line};
2338 $self->{column_prev} = $self->{column};
2339 $self->{column}++;
2340 $self->{nc}
2341 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2342 } else {
2343 $self->{set_nc}->($self);
2344 }
2345
2346 redo A;
2347 } elsif ($self->{nc} == -1) {
2348
2349 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2350 $self->{state} = DATA_STATE;
2351 ## reconsume
2352
2353 return ($self->{ct}); # comment
2354
2355 redo A;
2356 } else {
2357
2358 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2359 $self->{read_until}->($self->{ct}->{data},
2360 q[-],
2361 length $self->{ct}->{data});
2362
2363 ## Stay in the state
2364
2365 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2366 $self->{line_prev} = $self->{line};
2367 $self->{column_prev} = $self->{column};
2368 $self->{column}++;
2369 $self->{nc}
2370 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2371 } else {
2372 $self->{set_nc}->($self);
2373 }
2374
2375 redo A;
2376 }
2377 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2378 if ($self->{nc} == 0x002D) { # -
2379
2380 $self->{state} = COMMENT_END_STATE;
2381
2382 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2383 $self->{line_prev} = $self->{line};
2384 $self->{column_prev} = $self->{column};
2385 $self->{column}++;
2386 $self->{nc}
2387 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2388 } else {
2389 $self->{set_nc}->($self);
2390 }
2391
2392 redo A;
2393 } elsif ($self->{nc} == -1) {
2394
2395 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2396 $self->{state} = DATA_STATE;
2397 ## reconsume
2398
2399 return ($self->{ct}); # comment
2400
2401 redo A;
2402 } else {
2403
2404 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2405 $self->{state} = COMMENT_STATE;
2406
2407 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2408 $self->{line_prev} = $self->{line};
2409 $self->{column_prev} = $self->{column};
2410 $self->{column}++;
2411 $self->{nc}
2412 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2413 } else {
2414 $self->{set_nc}->($self);
2415 }
2416
2417 redo A;
2418 }
2419 } elsif ($self->{state} == COMMENT_END_STATE) {
2420 if ($self->{nc} == 0x003E) { # >
2421
2422 $self->{state} = DATA_STATE;
2423
2424 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2425 $self->{line_prev} = $self->{line};
2426 $self->{column_prev} = $self->{column};
2427 $self->{column}++;
2428 $self->{nc}
2429 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2430 } else {
2431 $self->{set_nc}->($self);
2432 }
2433
2434
2435 return ($self->{ct}); # comment
2436
2437 redo A;
2438 } elsif ($self->{nc} == 0x002D) { # -
2439
2440 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2441 line => $self->{line_prev},
2442 column => $self->{column_prev});
2443 $self->{ct}->{data} .= '-'; # comment
2444 ## Stay in the state
2445
2446 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2447 $self->{line_prev} = $self->{line};
2448 $self->{column_prev} = $self->{column};
2449 $self->{column}++;
2450 $self->{nc}
2451 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2452 } else {
2453 $self->{set_nc}->($self);
2454 }
2455
2456 redo A;
2457 } elsif ($self->{nc} == -1) {
2458
2459 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2460 $self->{state} = DATA_STATE;
2461 ## reconsume
2462
2463 return ($self->{ct}); # comment
2464
2465 redo A;
2466 } else {
2467
2468 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2469 line => $self->{line_prev},
2470 column => $self->{column_prev});
2471 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2472 $self->{state} = COMMENT_STATE;
2473
2474 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2475 $self->{line_prev} = $self->{line};
2476 $self->{column_prev} = $self->{column};
2477 $self->{column}++;
2478 $self->{nc}
2479 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2480 } else {
2481 $self->{set_nc}->($self);
2482 }
2483
2484 redo A;
2485 }
2486 } elsif ($self->{state} == DOCTYPE_STATE) {
2487 if ($is_space->{$self->{nc}}) {
2488
2489 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2490
2491 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2492 $self->{line_prev} = $self->{line};
2493 $self->{column_prev} = $self->{column};
2494 $self->{column}++;
2495 $self->{nc}
2496 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2497 } else {
2498 $self->{set_nc}->($self);
2499 }
2500
2501 redo A;
2502 } else {
2503
2504 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2505 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2506 ## reconsume
2507 redo A;
2508 }
2509 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2510 if ($is_space->{$self->{nc}}) {
2511
2512 ## Stay in the state
2513
2514 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2515 $self->{line_prev} = $self->{line};
2516 $self->{column_prev} = $self->{column};
2517 $self->{column}++;
2518 $self->{nc}
2519 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2520 } else {
2521 $self->{set_nc}->($self);
2522 }
2523
2524 redo A;
2525 } elsif ($self->{nc} == 0x003E) { # >
2526
2527 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2528 $self->{state} = DATA_STATE;
2529
2530 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2531 $self->{line_prev} = $self->{line};
2532 $self->{column_prev} = $self->{column};
2533 $self->{column}++;
2534 $self->{nc}
2535 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2536 } else {
2537 $self->{set_nc}->($self);
2538 }
2539
2540
2541 return ($self->{ct}); # DOCTYPE (quirks)
2542
2543 redo A;
2544 } elsif ($self->{nc} == -1) {
2545
2546 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2547 $self->{state} = DATA_STATE;
2548 ## reconsume
2549
2550 return ($self->{ct}); # DOCTYPE (quirks)
2551
2552 redo A;
2553 } else {
2554
2555 $self->{ct}->{name} = chr $self->{nc};
2556 delete $self->{ct}->{quirks};
2557 $self->{state} = DOCTYPE_NAME_STATE;
2558
2559 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2560 $self->{line_prev} = $self->{line};
2561 $self->{column_prev} = $self->{column};
2562 $self->{column}++;
2563 $self->{nc}
2564 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2565 } else {
2566 $self->{set_nc}->($self);
2567 }
2568
2569 redo A;
2570 }
2571 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2572 ## ISSUE: Redundant "First," in the spec.
2573 if ($is_space->{$self->{nc}}) {
2574
2575 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2576
2577 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2578 $self->{line_prev} = $self->{line};
2579 $self->{column_prev} = $self->{column};
2580 $self->{column}++;
2581 $self->{nc}
2582 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2583 } else {
2584 $self->{set_nc}->($self);
2585 }
2586
2587 redo A;
2588 } elsif ($self->{nc} == 0x003E) { # >
2589
2590 $self->{state} = DATA_STATE;
2591
2592 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2593 $self->{line_prev} = $self->{line};
2594 $self->{column_prev} = $self->{column};
2595 $self->{column}++;
2596 $self->{nc}
2597 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2598 } else {
2599 $self->{set_nc}->($self);
2600 }
2601
2602
2603 return ($self->{ct}); # DOCTYPE
2604
2605 redo A;
2606 } elsif ($self->{nc} == -1) {
2607
2608 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2609 $self->{state} = DATA_STATE;
2610 ## reconsume
2611
2612 $self->{ct}->{quirks} = 1;
2613 return ($self->{ct}); # DOCTYPE
2614
2615 redo A;
2616 } else {
2617
2618 $self->{ct}->{name}
2619 .= chr ($self->{nc}); # DOCTYPE
2620 ## Stay in the state
2621
2622 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2623 $self->{line_prev} = $self->{line};
2624 $self->{column_prev} = $self->{column};
2625 $self->{column}++;
2626 $self->{nc}
2627 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2628 } else {
2629 $self->{set_nc}->($self);
2630 }
2631
2632 redo A;
2633 }
2634 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2635 if ($is_space->{$self->{nc}}) {
2636
2637 ## Stay in the state
2638
2639 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2640 $self->{line_prev} = $self->{line};
2641 $self->{column_prev} = $self->{column};
2642 $self->{column}++;
2643 $self->{nc}
2644 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2645 } else {
2646 $self->{set_nc}->($self);
2647 }
2648
2649 redo A;
2650 } elsif ($self->{nc} == 0x003E) { # >
2651
2652 $self->{state} = DATA_STATE;
2653
2654 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2655 $self->{line_prev} = $self->{line};
2656 $self->{column_prev} = $self->{column};
2657 $self->{column}++;
2658 $self->{nc}
2659 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2660 } else {
2661 $self->{set_nc}->($self);
2662 }
2663
2664
2665 return ($self->{ct}); # DOCTYPE
2666
2667 redo A;
2668 } elsif ($self->{nc} == -1) {
2669
2670 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2671 $self->{state} = DATA_STATE;
2672 ## reconsume
2673
2674 $self->{ct}->{quirks} = 1;
2675 return ($self->{ct}); # DOCTYPE
2676
2677 redo A;
2678 } elsif ($self->{nc} == 0x0050 or # P
2679 $self->{nc} == 0x0070) { # p
2680 $self->{state} = PUBLIC_STATE;
2681 $self->{s_kwd} = chr $self->{nc};
2682
2683 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2684 $self->{line_prev} = $self->{line};
2685 $self->{column_prev} = $self->{column};
2686 $self->{column}++;
2687 $self->{nc}
2688 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2689 } else {
2690 $self->{set_nc}->($self);
2691 }
2692
2693 redo A;
2694 } elsif ($self->{nc} == 0x0053 or # S
2695 $self->{nc} == 0x0073) { # s
2696 $self->{state} = SYSTEM_STATE;
2697 $self->{s_kwd} = chr $self->{nc};
2698
2699 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2700 $self->{line_prev} = $self->{line};
2701 $self->{column_prev} = $self->{column};
2702 $self->{column}++;
2703 $self->{nc}
2704 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2705 } else {
2706 $self->{set_nc}->($self);
2707 }
2708
2709 redo A;
2710 } else {
2711
2712 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2713 $self->{ct}->{quirks} = 1;
2714
2715 $self->{state} = BOGUS_DOCTYPE_STATE;
2716
2717 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2718 $self->{line_prev} = $self->{line};
2719 $self->{column_prev} = $self->{column};
2720 $self->{column}++;
2721 $self->{nc}
2722 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2723 } else {
2724 $self->{set_nc}->($self);
2725 }
2726
2727 redo A;
2728 }
2729 } elsif ($self->{state} == PUBLIC_STATE) {
2730 ## ASCII case-insensitive
2731 if ($self->{nc} == [
2732 undef,
2733 0x0055, # U
2734 0x0042, # B
2735 0x004C, # L
2736 0x0049, # I
2737 ]->[length $self->{s_kwd}] or
2738 $self->{nc} == [
2739 undef,
2740 0x0075, # u
2741 0x0062, # b
2742 0x006C, # l
2743 0x0069, # i
2744 ]->[length $self->{s_kwd}]) {
2745
2746 ## Stay in the state.
2747 $self->{s_kwd} .= chr $self->{nc};
2748
2749 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2750 $self->{line_prev} = $self->{line};
2751 $self->{column_prev} = $self->{column};
2752 $self->{column}++;
2753 $self->{nc}
2754 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2755 } else {
2756 $self->{set_nc}->($self);
2757 }
2758
2759 redo A;
2760 } elsif ((length $self->{s_kwd}) == 5 and
2761 ($self->{nc} == 0x0043 or # C
2762 $self->{nc} == 0x0063)) { # c
2763
2764 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2765
2766 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2767 $self->{line_prev} = $self->{line};
2768 $self->{column_prev} = $self->{column};
2769 $self->{column}++;
2770 $self->{nc}
2771 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2772 } else {
2773 $self->{set_nc}->($self);
2774 }
2775
2776 redo A;
2777 } else {
2778
2779 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2780 line => $self->{line_prev},
2781 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2782 $self->{ct}->{quirks} = 1;
2783
2784 $self->{state} = BOGUS_DOCTYPE_STATE;
2785 ## Reconsume.
2786 redo A;
2787 }
2788 } elsif ($self->{state} == SYSTEM_STATE) {
2789 ## ASCII case-insensitive
2790 if ($self->{nc} == [
2791 undef,
2792 0x0059, # Y
2793 0x0053, # S
2794 0x0054, # T
2795 0x0045, # E
2796 ]->[length $self->{s_kwd}] or
2797 $self->{nc} == [
2798 undef,
2799 0x0079, # y
2800 0x0073, # s
2801 0x0074, # t
2802 0x0065, # e
2803 ]->[length $self->{s_kwd}]) {
2804
2805 ## Stay in the state.
2806 $self->{s_kwd} .= chr $self->{nc};
2807
2808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2809 $self->{line_prev} = $self->{line};
2810 $self->{column_prev} = $self->{column};
2811 $self->{column}++;
2812 $self->{nc}
2813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2814 } else {
2815 $self->{set_nc}->($self);
2816 }
2817
2818 redo A;
2819 } elsif ((length $self->{s_kwd}) == 5 and
2820 ($self->{nc} == 0x004D or # M
2821 $self->{nc} == 0x006D)) { # m
2822
2823 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2824
2825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2826 $self->{line_prev} = $self->{line};
2827 $self->{column_prev} = $self->{column};
2828 $self->{column}++;
2829 $self->{nc}
2830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2831 } else {
2832 $self->{set_nc}->($self);
2833 }
2834
2835 redo A;
2836 } else {
2837
2838 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2839 line => $self->{line_prev},
2840 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2841 $self->{ct}->{quirks} = 1;
2842
2843 $self->{state} = BOGUS_DOCTYPE_STATE;
2844 ## Reconsume.
2845 redo A;
2846 }
2847 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2848 if ($is_space->{$self->{nc}}) {
2849
2850 ## Stay in the state
2851
2852 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2853 $self->{line_prev} = $self->{line};
2854 $self->{column_prev} = $self->{column};
2855 $self->{column}++;
2856 $self->{nc}
2857 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2858 } else {
2859 $self->{set_nc}->($self);
2860 }
2861
2862 redo A;
2863 } elsif ($self->{nc} eq 0x0022) { # "
2864
2865 $self->{ct}->{pubid} = ''; # DOCTYPE
2866 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2867
2868 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2869 $self->{line_prev} = $self->{line};
2870 $self->{column_prev} = $self->{column};
2871 $self->{column}++;
2872 $self->{nc}
2873 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2874 } else {
2875 $self->{set_nc}->($self);
2876 }
2877
2878 redo A;
2879 } elsif ($self->{nc} eq 0x0027) { # '
2880
2881 $self->{ct}->{pubid} = ''; # DOCTYPE
2882 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2883
2884 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2885 $self->{line_prev} = $self->{line};
2886 $self->{column_prev} = $self->{column};
2887 $self->{column}++;
2888 $self->{nc}
2889 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2890 } else {
2891 $self->{set_nc}->($self);
2892 }
2893
2894 redo A;
2895 } elsif ($self->{nc} eq 0x003E) { # >
2896
2897 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2898
2899 $self->{state} = DATA_STATE;
2900
2901 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2902 $self->{line_prev} = $self->{line};
2903 $self->{column_prev} = $self->{column};
2904 $self->{column}++;
2905 $self->{nc}
2906 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2907 } else {
2908 $self->{set_nc}->($self);
2909 }
2910
2911
2912 $self->{ct}->{quirks} = 1;
2913 return ($self->{ct}); # DOCTYPE
2914
2915 redo A;
2916 } elsif ($self->{nc} == -1) {
2917
2918 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2919
2920 $self->{state} = DATA_STATE;
2921 ## reconsume
2922
2923 $self->{ct}->{quirks} = 1;
2924 return ($self->{ct}); # DOCTYPE
2925
2926 redo A;
2927 } else {
2928
2929 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
2930 $self->{ct}->{quirks} = 1;
2931
2932 $self->{state} = BOGUS_DOCTYPE_STATE;
2933
2934 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2935 $self->{line_prev} = $self->{line};
2936 $self->{column_prev} = $self->{column};
2937 $self->{column}++;
2938 $self->{nc}
2939 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2940 } else {
2941 $self->{set_nc}->($self);
2942 }
2943
2944 redo A;
2945 }
2946 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2947 if ($self->{nc} == 0x0022) { # "
2948
2949 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2950
2951 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2952 $self->{line_prev} = $self->{line};
2953 $self->{column_prev} = $self->{column};
2954 $self->{column}++;
2955 $self->{nc}
2956 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2957 } else {
2958 $self->{set_nc}->($self);
2959 }
2960
2961 redo A;
2962 } elsif ($self->{nc} == 0x003E) { # >
2963
2964 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2965
2966 $self->{state} = DATA_STATE;
2967
2968 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2969 $self->{line_prev} = $self->{line};
2970 $self->{column_prev} = $self->{column};
2971 $self->{column}++;
2972 $self->{nc}
2973 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2974 } else {
2975 $self->{set_nc}->($self);
2976 }
2977
2978
2979 $self->{ct}->{quirks} = 1;
2980 return ($self->{ct}); # DOCTYPE
2981
2982 redo A;
2983 } elsif ($self->{nc} == -1) {
2984
2985 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2986
2987 $self->{state} = DATA_STATE;
2988 ## reconsume
2989
2990 $self->{ct}->{quirks} = 1;
2991 return ($self->{ct}); # DOCTYPE
2992
2993 redo A;
2994 } else {
2995
2996 $self->{ct}->{pubid} # DOCTYPE
2997 .= chr $self->{nc};
2998 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2999 length $self->{ct}->{pubid});
3000
3001 ## Stay in the state
3002
3003 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3004 $self->{line_prev} = $self->{line};
3005 $self->{column_prev} = $self->{column};
3006 $self->{column}++;
3007 $self->{nc}
3008 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3009 } else {
3010 $self->{set_nc}->($self);
3011 }
3012
3013 redo A;
3014 }
3015 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3016 if ($self->{nc} == 0x0027) { # '
3017
3018 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3019
3020 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3021 $self->{line_prev} = $self->{line};
3022 $self->{column_prev} = $self->{column};
3023 $self->{column}++;
3024 $self->{nc}
3025 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3026 } else {
3027 $self->{set_nc}->($self);
3028 }
3029
3030 redo A;
3031 } elsif ($self->{nc} == 0x003E) { # >
3032
3033 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3034
3035 $self->{state} = DATA_STATE;
3036
3037 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3038 $self->{line_prev} = $self->{line};
3039 $self->{column_prev} = $self->{column};
3040 $self->{column}++;
3041 $self->{nc}
3042 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3043 } else {
3044 $self->{set_nc}->($self);
3045 }
3046
3047
3048 $self->{ct}->{quirks} = 1;
3049 return ($self->{ct}); # DOCTYPE
3050
3051 redo A;
3052 } elsif ($self->{nc} == -1) {
3053
3054 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3055
3056 $self->{state} = DATA_STATE;
3057 ## reconsume
3058
3059 $self->{ct}->{quirks} = 1;
3060 return ($self->{ct}); # DOCTYPE
3061
3062 redo A;
3063 } else {
3064
3065 $self->{ct}->{pubid} # DOCTYPE
3066 .= chr $self->{nc};
3067 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3068 length $self->{ct}->{pubid});
3069
3070 ## Stay in the state
3071
3072 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3073 $self->{line_prev} = $self->{line};
3074 $self->{column_prev} = $self->{column};
3075 $self->{column}++;
3076 $self->{nc}
3077 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3078 } else {
3079 $self->{set_nc}->($self);
3080 }
3081
3082 redo A;
3083 }
3084 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3085 if ($is_space->{$self->{nc}}) {
3086
3087 ## Stay in the state
3088
3089 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3090 $self->{line_prev} = $self->{line};
3091 $self->{column_prev} = $self->{column};
3092 $self->{column}++;
3093 $self->{nc}
3094 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3095 } else {
3096 $self->{set_nc}->($self);
3097 }
3098
3099 redo A;
3100 } elsif ($self->{nc} == 0x0022) { # "
3101
3102 $self->{ct}->{sysid} = ''; # DOCTYPE
3103 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3104
3105 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106 $self->{line_prev} = $self->{line};
3107 $self->{column_prev} = $self->{column};
3108 $self->{column}++;
3109 $self->{nc}
3110 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111 } else {
3112 $self->{set_nc}->($self);
3113 }
3114
3115 redo A;
3116 } elsif ($self->{nc} == 0x0027) { # '
3117
3118 $self->{ct}->{sysid} = ''; # DOCTYPE
3119 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3120
3121 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122 $self->{line_prev} = $self->{line};
3123 $self->{column_prev} = $self->{column};
3124 $self->{column}++;
3125 $self->{nc}
3126 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127 } else {
3128 $self->{set_nc}->($self);
3129 }
3130
3131 redo A;
3132 } elsif ($self->{nc} == 0x003E) { # >
3133
3134 $self->{state} = DATA_STATE;
3135
3136 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137 $self->{line_prev} = $self->{line};
3138 $self->{column_prev} = $self->{column};
3139 $self->{column}++;
3140 $self->{nc}
3141 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142 } else {
3143 $self->{set_nc}->($self);
3144 }
3145
3146
3147 return ($self->{ct}); # DOCTYPE
3148
3149 redo A;
3150 } elsif ($self->{nc} == -1) {
3151
3152 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3153
3154 $self->{state} = DATA_STATE;
3155 ## reconsume
3156
3157 $self->{ct}->{quirks} = 1;
3158 return ($self->{ct}); # DOCTYPE
3159
3160 redo A;
3161 } else {
3162
3163 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3164 $self->{ct}->{quirks} = 1;
3165
3166 $self->{state} = BOGUS_DOCTYPE_STATE;
3167
3168 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3169 $self->{line_prev} = $self->{line};
3170 $self->{column_prev} = $self->{column};
3171 $self->{column}++;
3172 $self->{nc}
3173 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3174 } else {
3175 $self->{set_nc}->($self);
3176 }
3177
3178 redo A;
3179 }
3180 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3181 if ($is_space->{$self->{nc}}) {
3182
3183 ## Stay in the state
3184
3185 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3186 $self->{line_prev} = $self->{line};
3187 $self->{column_prev} = $self->{column};
3188 $self->{column}++;
3189 $self->{nc}
3190 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3191 } else {
3192 $self->{set_nc}->($self);
3193 }
3194
3195 redo A;
3196 } elsif ($self->{nc} == 0x0022) { # "
3197
3198 $self->{ct}->{sysid} = ''; # DOCTYPE
3199 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3200
3201 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3202 $self->{line_prev} = $self->{line};
3203 $self->{column_prev} = $self->{column};
3204 $self->{column}++;
3205 $self->{nc}
3206 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3207 } else {
3208 $self->{set_nc}->($self);
3209 }
3210
3211 redo A;
3212 } elsif ($self->{nc} == 0x0027) { # '
3213
3214 $self->{ct}->{sysid} = ''; # DOCTYPE
3215 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3216
3217 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3218 $self->{line_prev} = $self->{line};
3219 $self->{column_prev} = $self->{column};
3220 $self->{column}++;
3221 $self->{nc}
3222 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3223 } else {
3224 $self->{set_nc}->($self);
3225 }
3226
3227 redo A;
3228 } elsif ($self->{nc} == 0x003E) { # >
3229
3230 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3231 $self->{state} = DATA_STATE;
3232
3233 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3234 $self->{line_prev} = $self->{line};
3235 $self->{column_prev} = $self->{column};
3236 $self->{column}++;
3237 $self->{nc}
3238 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3239 } else {
3240 $self->{set_nc}->($self);
3241 }
3242
3243
3244 $self->{ct}->{quirks} = 1;
3245 return ($self->{ct}); # DOCTYPE
3246
3247 redo A;
3248 } elsif ($self->{nc} == -1) {
3249
3250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3251
3252 $self->{state} = DATA_STATE;
3253 ## reconsume
3254
3255 $self->{ct}->{quirks} = 1;
3256 return ($self->{ct}); # DOCTYPE
3257
3258 redo A;
3259 } else {
3260
3261 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3262 $self->{ct}->{quirks} = 1;
3263
3264 $self->{state} = BOGUS_DOCTYPE_STATE;
3265
3266 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3267 $self->{line_prev} = $self->{line};
3268 $self->{column_prev} = $self->{column};
3269 $self->{column}++;
3270 $self->{nc}
3271 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3272 } else {
3273 $self->{set_nc}->($self);
3274 }
3275
3276 redo A;
3277 }
3278 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3279 if ($self->{nc} == 0x0022) { # "
3280
3281 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3282
3283 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3284 $self->{line_prev} = $self->{line};
3285 $self->{column_prev} = $self->{column};
3286 $self->{column}++;
3287 $self->{nc}
3288 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3289 } else {
3290 $self->{set_nc}->($self);
3291 }
3292
3293 redo A;
3294 } elsif ($self->{nc} == 0x003E) { # >
3295
3296 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3297
3298 $self->{state} = DATA_STATE;
3299
3300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301 $self->{line_prev} = $self->{line};
3302 $self->{column_prev} = $self->{column};
3303 $self->{column}++;
3304 $self->{nc}
3305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306 } else {
3307 $self->{set_nc}->($self);
3308 }
3309
3310
3311 $self->{ct}->{quirks} = 1;
3312 return ($self->{ct}); # DOCTYPE
3313
3314 redo A;
3315 } elsif ($self->{nc} == -1) {
3316
3317 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3318
3319 $self->{state} = DATA_STATE;
3320 ## reconsume
3321
3322 $self->{ct}->{quirks} = 1;
3323 return ($self->{ct}); # DOCTYPE
3324
3325 redo A;
3326 } else {
3327
3328 $self->{ct}->{sysid} # DOCTYPE
3329 .= chr $self->{nc};
3330 $self->{read_until}->($self->{ct}->{sysid}, q[">],
3331 length $self->{ct}->{sysid});
3332
3333 ## Stay in the state
3334
3335 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3336 $self->{line_prev} = $self->{line};
3337 $self->{column_prev} = $self->{column};
3338 $self->{column}++;
3339 $self->{nc}
3340 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3341 } else {
3342 $self->{set_nc}->($self);
3343 }
3344
3345 redo A;
3346 }
3347 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3348 if ($self->{nc} == 0x0027) { # '
3349
3350 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3351
3352 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3353 $self->{line_prev} = $self->{line};
3354 $self->{column_prev} = $self->{column};
3355 $self->{column}++;
3356 $self->{nc}
3357 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3358 } else {
3359 $self->{set_nc}->($self);
3360 }
3361
3362 redo A;
3363 } elsif ($self->{nc} == 0x003E) { # >
3364
3365 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3366
3367 $self->{state} = DATA_STATE;
3368
3369 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3370 $self->{line_prev} = $self->{line};
3371 $self->{column_prev} = $self->{column};
3372 $self->{column}++;
3373 $self->{nc}
3374 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3375 } else {
3376 $self->{set_nc}->($self);
3377 }
3378
3379
3380 $self->{ct}->{quirks} = 1;
3381 return ($self->{ct}); # DOCTYPE
3382
3383 redo A;
3384 } elsif ($self->{nc} == -1) {
3385
3386 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3387
3388 $self->{state} = DATA_STATE;
3389 ## reconsume
3390
3391 $self->{ct}->{quirks} = 1;
3392 return ($self->{ct}); # DOCTYPE
3393
3394 redo A;
3395 } else {
3396
3397 $self->{ct}->{sysid} # DOCTYPE
3398 .= chr $self->{nc};
3399 $self->{read_until}->($self->{ct}->{sysid}, q['>],
3400 length $self->{ct}->{sysid});
3401
3402 ## Stay in the state
3403
3404 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3405 $self->{line_prev} = $self->{line};
3406 $self->{column_prev} = $self->{column};
3407 $self->{column}++;
3408 $self->{nc}
3409 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3410 } else {
3411 $self->{set_nc}->($self);
3412 }
3413
3414 redo A;
3415 }
3416 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3417 if ($is_space->{$self->{nc}}) {
3418
3419 ## Stay in the state
3420
3421 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3422 $self->{line_prev} = $self->{line};
3423 $self->{column_prev} = $self->{column};
3424 $self->{column}++;
3425 $self->{nc}
3426 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3427 } else {
3428 $self->{set_nc}->($self);
3429 }
3430
3431 redo A;
3432 } elsif ($self->{nc} == 0x003E) { # >
3433
3434 $self->{state} = DATA_STATE;
3435
3436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3437 $self->{line_prev} = $self->{line};
3438 $self->{column_prev} = $self->{column};
3439 $self->{column}++;
3440 $self->{nc}
3441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3442 } else {
3443 $self->{set_nc}->($self);
3444 }
3445
3446
3447 return ($self->{ct}); # DOCTYPE
3448
3449 redo A;
3450 } elsif ($self->{nc} == -1) {
3451
3452 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3453 $self->{state} = DATA_STATE;
3454 ## reconsume
3455
3456 $self->{ct}->{quirks} = 1;
3457 return ($self->{ct}); # DOCTYPE
3458
3459 redo A;
3460 } else {
3461
3462 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3463 #$self->{ct}->{quirks} = 1;
3464
3465 $self->{state} = BOGUS_DOCTYPE_STATE;
3466
3467 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3468 $self->{line_prev} = $self->{line};
3469 $self->{column_prev} = $self->{column};
3470 $self->{column}++;
3471 $self->{nc}
3472 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3473 } else {
3474 $self->{set_nc}->($self);
3475 }
3476
3477 redo A;
3478 }
3479 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3480 if ($self->{nc} == 0x003E) { # >
3481
3482 $self->{state} = DATA_STATE;
3483
3484 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3485 $self->{line_prev} = $self->{line};
3486 $self->{column_prev} = $self->{column};
3487 $self->{column}++;
3488 $self->{nc}
3489 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3490 } else {
3491 $self->{set_nc}->($self);
3492 }
3493
3494
3495 return ($self->{ct}); # DOCTYPE
3496
3497 redo A;
3498 } elsif ($self->{nc} == -1) {
3499
3500 $self->{state} = DATA_STATE;
3501 ## reconsume
3502
3503 return ($self->{ct}); # DOCTYPE
3504
3505 redo A;
3506 } else {
3507
3508 my $s = '';
3509 $self->{read_until}->($s, q[>], 0);
3510
3511 ## Stay in the state
3512
3513 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3514 $self->{line_prev} = $self->{line};
3515 $self->{column_prev} = $self->{column};
3516 $self->{column}++;
3517 $self->{nc}
3518 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3519 } else {
3520 $self->{set_nc}->($self);
3521 }
3522
3523 redo A;
3524 }
3525 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3526 ## NOTE: "CDATA section state" in the spec is jointly implemented
3527 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3528 ## and |CDATA_SECTION_MSE2_STATE|.
3529
3530 if ($self->{nc} == 0x005D) { # ]
3531
3532 $self->{state} = CDATA_SECTION_MSE1_STATE;
3533
3534 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3535 $self->{line_prev} = $self->{line};
3536 $self->{column_prev} = $self->{column};
3537 $self->{column}++;
3538 $self->{nc}
3539 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3540 } else {
3541 $self->{set_nc}->($self);
3542 }
3543
3544 redo A;
3545 } elsif ($self->{nc} == -1) {
3546 $self->{state} = DATA_STATE;
3547
3548 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3549 $self->{line_prev} = $self->{line};
3550 $self->{column_prev} = $self->{column};
3551 $self->{column}++;
3552 $self->{nc}
3553 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3554 } else {
3555 $self->{set_nc}->($self);
3556 }
3557
3558 if (length $self->{ct}->{data}) { # character
3559
3560 return ($self->{ct}); # character
3561 } else {
3562
3563 ## No token to emit. $self->{ct} is discarded.
3564 }
3565 redo A;
3566 } else {
3567
3568 $self->{ct}->{data} .= chr $self->{nc};
3569 $self->{read_until}->($self->{ct}->{data},
3570 q<]>,
3571 length $self->{ct}->{data});
3572
3573 ## Stay in the state.
3574
3575 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3576 $self->{line_prev} = $self->{line};
3577 $self->{column_prev} = $self->{column};
3578 $self->{column}++;
3579 $self->{nc}
3580 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3581 } else {
3582 $self->{set_nc}->($self);
3583 }
3584
3585 redo A;
3586 }
3587
3588 ## ISSUE: "text tokens" in spec.
3589 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3590 if ($self->{nc} == 0x005D) { # ]
3591
3592 $self->{state} = CDATA_SECTION_MSE2_STATE;
3593
3594 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3595 $self->{line_prev} = $self->{line};
3596 $self->{column_prev} = $self->{column};
3597 $self->{column}++;
3598 $self->{nc}
3599 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3600 } else {
3601 $self->{set_nc}->($self);
3602 }
3603
3604 redo A;
3605 } else {
3606
3607 $self->{ct}->{data} .= ']';
3608 $self->{state} = CDATA_SECTION_STATE;
3609 ## Reconsume.
3610 redo A;
3611 }
3612 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3613 if ($self->{nc} == 0x003E) { # >
3614 $self->{state} = DATA_STATE;
3615
3616 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3617 $self->{line_prev} = $self->{line};
3618 $self->{column_prev} = $self->{column};
3619 $self->{column}++;
3620 $self->{nc}
3621 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3622 } else {
3623 $self->{set_nc}->($self);
3624 }
3625
3626 if (length $self->{ct}->{data}) { # character
3627
3628 return ($self->{ct}); # character
3629 } else {
3630
3631 ## No token to emit. $self->{ct} is discarded.
3632 }
3633 redo A;
3634 } elsif ($self->{nc} == 0x005D) { # ]
3635 # character
3636 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3637 ## Stay in the state.
3638
3639 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3640 $self->{line_prev} = $self->{line};
3641 $self->{column_prev} = $self->{column};
3642 $self->{column}++;
3643 $self->{nc}
3644 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3645 } else {
3646 $self->{set_nc}->($self);
3647 }
3648
3649 redo A;
3650 } else {
3651
3652 $self->{ct}->{data} .= ']]'; # character
3653 $self->{state} = CDATA_SECTION_STATE;
3654 ## Reconsume.
3655 redo A;
3656 }
3657 } elsif ($self->{state} == ENTITY_STATE) {
3658 if ($is_space->{$self->{nc}} or
3659 {
3660 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3661 $self->{entity_add} => 1,
3662 }->{$self->{nc}}) {
3663
3664 ## Don't consume
3665 ## No error
3666 ## Return nothing.
3667 #
3668 } elsif ($self->{nc} == 0x0023) { # #
3669
3670 $self->{state} = ENTITY_HASH_STATE;
3671 $self->{s_kwd} = '#';
3672
3673 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3674 $self->{line_prev} = $self->{line};
3675 $self->{column_prev} = $self->{column};
3676 $self->{column}++;
3677 $self->{nc}
3678 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3679 } else {
3680 $self->{set_nc}->($self);
3681 }
3682
3683 redo A;
3684 } elsif ((0x0041 <= $self->{nc} and
3685 $self->{nc} <= 0x005A) or # A..Z
3686 (0x0061 <= $self->{nc} and
3687 $self->{nc} <= 0x007A)) { # a..z
3688
3689 require Whatpm::_NamedEntityList;
3690 $self->{state} = ENTITY_NAME_STATE;
3691 $self->{s_kwd} = chr $self->{nc};
3692 $self->{entity__value} = $self->{s_kwd};
3693 $self->{entity__match} = 0;
3694
3695 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3696 $self->{line_prev} = $self->{line};
3697 $self->{column_prev} = $self->{column};
3698 $self->{column}++;
3699 $self->{nc}
3700 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3701 } else {
3702 $self->{set_nc}->($self);
3703 }
3704
3705 redo A;
3706 } else {
3707
3708 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3709 ## Return nothing.
3710 #
3711 }
3712
3713 ## NOTE: No character is consumed by the "consume a character
3714 ## reference" algorithm. In other words, there is an "&" character
3715 ## that does not introduce a character reference, which would be
3716 ## appended to the parent element or the attribute value in later
3717 ## processing by the tokenizer.
3718
3719 if ($self->{prev_state} == DATA_STATE) {
3720
3721 $self->{state} = $self->{prev_state};
3722 ## Reconsume.
3723 return ({type => CHARACTER_TOKEN, data => '&',
3724 line => $self->{line_prev},
3725 column => $self->{column_prev},
3726 });
3727 redo A;
3728 } else {
3729
3730 $self->{ca}->{value} .= '&';
3731 $self->{state} = $self->{prev_state};
3732 ## Reconsume.
3733 redo A;
3734 }
3735 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3736 if ($self->{nc} == 0x0078 or # x
3737 $self->{nc} == 0x0058) { # X
3738
3739 $self->{state} = HEXREF_X_STATE;
3740 $self->{s_kwd} .= chr $self->{nc};
3741
3742 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3743 $self->{line_prev} = $self->{line};
3744 $self->{column_prev} = $self->{column};
3745 $self->{column}++;
3746 $self->{nc}
3747 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3748 } else {
3749 $self->{set_nc}->($self);
3750 }
3751
3752 redo A;
3753 } elsif (0x0030 <= $self->{nc} and
3754 $self->{nc} <= 0x0039) { # 0..9
3755
3756 $self->{state} = NCR_NUM_STATE;
3757 $self->{s_kwd} = $self->{nc} - 0x0030;
3758
3759 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3760 $self->{line_prev} = $self->{line};
3761 $self->{column_prev} = $self->{column};
3762 $self->{column}++;
3763 $self->{nc}
3764 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3765 } else {
3766 $self->{set_nc}->($self);
3767 }
3768
3769 redo A;
3770 } else {
3771 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3772 line => $self->{line_prev},
3773 column => $self->{column_prev} - 1);
3774
3775 ## NOTE: According to the spec algorithm, nothing is returned,
3776 ## and then "&#" is appended to the parent element or the attribute
3777 ## value in the later processing.
3778
3779 if ($self->{prev_state} == DATA_STATE) {
3780
3781 $self->{state} = $self->{prev_state};
3782 ## Reconsume.
3783 return ({type => CHARACTER_TOKEN,
3784 data => '&#',
3785 line => $self->{line_prev},
3786 column => $self->{column_prev} - 1,
3787 });
3788 redo A;
3789 } else {
3790
3791 $self->{ca}->{value} .= '&#';
3792 $self->{state} = $self->{prev_state};
3793 ## Reconsume.
3794 redo A;
3795 }
3796 }
3797 } elsif ($self->{state} == NCR_NUM_STATE) {
3798 if (0x0030 <= $self->{nc} and
3799 $self->{nc} <= 0x0039) { # 0..9
3800
3801 $self->{s_kwd} *= 10;
3802 $self->{s_kwd} += $self->{nc} - 0x0030;
3803
3804 ## Stay in the state.
3805
3806 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3807 $self->{line_prev} = $self->{line};
3808 $self->{column_prev} = $self->{column};
3809 $self->{column}++;
3810 $self->{nc}
3811 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3812 } else {
3813 $self->{set_nc}->($self);
3814 }
3815
3816 redo A;
3817 } elsif ($self->{nc} == 0x003B) { # ;
3818
3819
3820 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3821 $self->{line_prev} = $self->{line};
3822 $self->{column_prev} = $self->{column};
3823 $self->{column}++;
3824 $self->{nc}
3825 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3826 } else {
3827 $self->{set_nc}->($self);
3828 }
3829
3830 #
3831 } else {
3832
3833 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3834 ## Reconsume.
3835 #
3836 }
3837
3838 my $code = $self->{s_kwd};
3839 my $l = $self->{line_prev};
3840 my $c = $self->{column_prev};
3841 if ($charref_map->{$code}) {
3842
3843 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3844 text => (sprintf 'U+%04X', $code),
3845 line => $l, column => $c);
3846 $code = $charref_map->{$code};
3847 } elsif ($code > 0x10FFFF) {
3848
3849 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3850 text => (sprintf 'U-%08X', $code),
3851 line => $l, column => $c);
3852 $code = 0xFFFD;
3853 }
3854
3855 if ($self->{prev_state} == DATA_STATE) {
3856
3857 $self->{state} = $self->{prev_state};
3858 ## Reconsume.
3859 return ({type => CHARACTER_TOKEN, data => chr $code,
3860 line => $l, column => $c,
3861 });
3862 redo A;
3863 } else {
3864
3865 $self->{ca}->{value} .= chr $code;
3866 $self->{ca}->{has_reference} = 1;
3867 $self->{state} = $self->{prev_state};
3868 ## Reconsume.
3869 redo A;
3870 }
3871 } elsif ($self->{state} == HEXREF_X_STATE) {
3872 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3873 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3874 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3875 # 0..9, A..F, a..f
3876
3877 $self->{state} = HEXREF_HEX_STATE;
3878 $self->{s_kwd} = 0;
3879 ## Reconsume.
3880 redo A;
3881 } else {
3882 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3883 line => $self->{line_prev},
3884 column => $self->{column_prev} - 2);
3885
3886 ## NOTE: According to the spec algorithm, nothing is returned,
3887 ## and then "&#" followed by "X" or "x" is appended to the parent
3888 ## element or the attribute value in the later processing.
3889
3890 if ($self->{prev_state} == DATA_STATE) {
3891
3892 $self->{state} = $self->{prev_state};
3893 ## Reconsume.
3894 return ({type => CHARACTER_TOKEN,
3895 data => '&' . $self->{s_kwd},
3896 line => $self->{line_prev},
3897 column => $self->{column_prev} - length $self->{s_kwd},
3898 });
3899 redo A;
3900 } else {
3901
3902 $self->{ca}->{value} .= '&' . $self->{s_kwd};
3903 $self->{state} = $self->{prev_state};
3904 ## Reconsume.
3905 redo A;
3906 }
3907 }
3908 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3909 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3910 # 0..9
3911
3912 $self->{s_kwd} *= 0x10;
3913 $self->{s_kwd} += $self->{nc} - 0x0030;
3914 ## Stay in the state.
3915
3916 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3917 $self->{line_prev} = $self->{line};
3918 $self->{column_prev} = $self->{column};
3919 $self->{column}++;
3920 $self->{nc}
3921 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3922 } else {
3923 $self->{set_nc}->($self);
3924 }
3925
3926 redo A;
3927 } elsif (0x0061 <= $self->{nc} and
3928 $self->{nc} <= 0x0066) { # a..f
3929
3930 $self->{s_kwd} *= 0x10;
3931 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
3932 ## Stay in the state.
3933
3934 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3935 $self->{line_prev} = $self->{line};
3936 $self->{column_prev} = $self->{column};
3937 $self->{column}++;
3938 $self->{nc}
3939 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3940 } else {
3941 $self->{set_nc}->($self);
3942 }
3943
3944 redo A;
3945 } elsif (0x0041 <= $self->{nc} and
3946 $self->{nc} <= 0x0046) { # A..F
3947
3948 $self->{s_kwd} *= 0x10;
3949 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
3950 ## Stay in the state.
3951
3952 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3953 $self->{line_prev} = $self->{line};
3954 $self->{column_prev} = $self->{column};
3955 $self->{column}++;
3956 $self->{nc}
3957 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3958 } else {
3959 $self->{set_nc}->($self);
3960 }
3961
3962 redo A;
3963 } elsif ($self->{nc} == 0x003B) { # ;
3964
3965
3966 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3967 $self->{line_prev} = $self->{line};
3968 $self->{column_prev} = $self->{column};
3969 $self->{column}++;
3970 $self->{nc}
3971 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3972 } else {
3973 $self->{set_nc}->($self);
3974 }
3975
3976 #
3977 } else {
3978
3979 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
3980 line => $self->{line},
3981 column => $self->{column});
3982 ## Reconsume.
3983 #
3984 }
3985
3986 my $code = $self->{s_kwd};
3987 my $l = $self->{line_prev};
3988 my $c = $self->{column_prev};
3989 if ($charref_map->{$code}) {
3990
3991 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3992 text => (sprintf 'U+%04X', $code),
3993 line => $l, column => $c);
3994 $code = $charref_map->{$code};
3995 } elsif ($code > 0x10FFFF) {
3996
3997 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3998 text => (sprintf 'U-%08X', $code),
3999 line => $l, column => $c);
4000 $code = 0xFFFD;
4001 }
4002
4003 if ($self->{prev_state} == DATA_STATE) {
4004
4005 $self->{state} = $self->{prev_state};
4006 ## Reconsume.
4007 return ({type => CHARACTER_TOKEN, data => chr $code,
4008 line => $l, column => $c,
4009 });
4010 redo A;
4011 } else {
4012
4013 $self->{ca}->{value} .= chr $code;
4014 $self->{ca}->{has_reference} = 1;
4015 $self->{state} = $self->{prev_state};
4016 ## Reconsume.
4017 redo A;
4018 }
4019 } elsif ($self->{state} == ENTITY_NAME_STATE) {
4020 if (length $self->{s_kwd} < 30 and
4021 ## NOTE: Some number greater than the maximum length of entity name
4022 ((0x0041 <= $self->{nc} and # A
4023 $self->{nc} <= 0x005A) or # Z
4024 (0x0061 <= $self->{nc} and # a
4025 $self->{nc} <= 0x007A) or # z
4026 (0x0030 <= $self->{nc} and # 0
4027 $self->{nc} <= 0x0039) or # 9
4028 $self->{nc} == 0x003B)) { # ;
4029 our $EntityChar;
4030 $self->{s_kwd} .= chr $self->{nc};
4031 if (defined $EntityChar->{$self->{s_kwd}}) {
4032 if ($self->{nc} == 0x003B) { # ;
4033
4034 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4035 $self->{entity__match} = 1;
4036
4037 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4038 $self->{line_prev} = $self->{line};
4039 $self->{column_prev} = $self->{column};
4040 $self->{column}++;
4041 $self->{nc}
4042 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4043 } else {
4044 $self->{set_nc}->($self);
4045 }
4046
4047 #
4048 } else {
4049
4050 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4051 $self->{entity__match} = -1;
4052 ## Stay in the state.
4053
4054 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4055 $self->{line_prev} = $self->{line};
4056 $self->{column_prev} = $self->{column};
4057 $self->{column}++;
4058 $self->{nc}
4059 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4060 } else {
4061 $self->{set_nc}->($self);
4062 }
4063
4064 redo A;
4065 }
4066 } else {
4067
4068 $self->{entity__value} .= chr $self->{nc};
4069 $self->{entity__match} *= 2;
4070 ## Stay in the state.
4071
4072 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4073 $self->{line_prev} = $self->{line};
4074 $self->{column_prev} = $self->{column};
4075 $self->{column}++;
4076 $self->{nc}
4077 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4078 } else {
4079 $self->{set_nc}->($self);
4080 }
4081
4082 redo A;
4083 }
4084 }
4085
4086 my $data;
4087 my $has_ref;
4088 if ($self->{entity__match} > 0) {
4089
4090 $data = $self->{entity__value};
4091 $has_ref = 1;
4092 #
4093 } elsif ($self->{entity__match} < 0) {
4094 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4095 if ($self->{prev_state} != DATA_STATE and # in attribute
4096 $self->{entity__match} < -1) {
4097
4098 $data = '&' . $self->{s_kwd};
4099 #
4100 } else {
4101
4102 $data = $self->{entity__value};
4103 $has_ref = 1;
4104 #
4105 }
4106 } else {
4107
4108 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4109 line => $self->{line_prev},
4110 column => $self->{column_prev} - length $self->{s_kwd});
4111 $data = '&' . $self->{s_kwd};
4112 #
4113 }
4114
4115 ## NOTE: In these cases, when a character reference is found,
4116 ## it is consumed and a character token is returned, or, otherwise,
4117 ## nothing is consumed and returned, according to the spec algorithm.
4118 ## In this implementation, anything that has been examined by the
4119 ## tokenizer is appended to the parent element or the attribute value
4120 ## as string, either literal string when no character reference or
4121 ## entity-replaced string otherwise, in this stage, since any characters
4122 ## that would not be consumed are appended in the data state or in an
4123 ## appropriate attribute value state anyway.
4124
4125 if ($self->{prev_state} == DATA_STATE) {
4126
4127 $self->{state} = $self->{prev_state};
4128 ## Reconsume.
4129 return ({type => CHARACTER_TOKEN,
4130 data => $data,
4131 line => $self->{line_prev},
4132 column => $self->{column_prev} + 1 - length $self->{s_kwd},
4133 });
4134 redo A;
4135 } else {
4136
4137 $self->{ca}->{value} .= $data;
4138 $self->{ca}->{has_reference} = 1 if $has_ref;
4139 $self->{state} = $self->{prev_state};
4140 ## Reconsume.
4141 redo A;
4142 }
4143 } else {
4144 die "$0: $self->{state}: Unknown state";
4145 }
4146 } # A
4147
4148 die "$0: _get_next_token: unexpected case";
4149 } # _get_next_token
4150
4151 1;
4152 ## $Date: 2008/10/14 04:32:49 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24