/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (show annotations) (download)
Tue Oct 14 04:32:49 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.1: +44 -11 lines
++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 04:28:43 -0000
	* Tokenizer.pm.src: Make *_TOKEN (token type constants)
	exportable.  New token types, PI_TOKEN for XML and ABORT_TOKEN for
	document.write() or incremental parsing, are added for future
	extensions.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 04:27:29 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Makefile, Parser.pm.src: New files.

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.1 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
## Exporter setup.  Each token type constant can be imported by name,
## and the |:token| tag imports all of them at once.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## The exportable names; the same list serves both |@EXPORT_OK| and
  ## the |:token| tag.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
  );

  our @EXPORT_OK = @token_type_names;
  our %EXPORT_TAGS = (token => [@token_type_names]);
}
33
34 ## Token types
35
## Numeric identifiers stored in the |type| field of a token.
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
  PI_TOKEN          => 7, # XML5
  ABORT_TOKEN       => 8, # Not a token actually
};
44
45 package Whatpm::HTML;
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48
49 ## Content model flags
50
## Bit flags naming the kinds of markup that are recognized in data.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, each one a combination of the flags.  They are
## declared in a second group because the flag constants have to exist
## before they can be used in these values.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
59
60 ## Tokenizer states
61
## Numeric identifiers of the tokenizer states; the current one is kept
## in |$self->{state}|.  The values 1 and 12 are unassigned because the
## states that used them are commented out.
use constant {
  DATA_STATE => 0,
  #ENTITY_DATA_STATE => 1,
  TAG_OPEN_STATE => 2,
  CLOSE_TAG_OPEN_STATE => 3,
  TAG_NAME_STATE => 4,
  BEFORE_ATTRIBUTE_NAME_STATE => 5,
  ATTRIBUTE_NAME_STATE => 6,
  AFTER_ATTRIBUTE_NAME_STATE => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE => 11,
  #ENTITY_IN_ATTRIBUTE_VALUE_STATE => 12,
  MARKUP_DECLARATION_OPEN_STATE => 13,
  COMMENT_START_STATE => 14,
  COMMENT_START_DASH_STATE => 15,
  COMMENT_STATE => 16,
  COMMENT_END_STATE => 17,
  COMMENT_END_DASH_STATE => 18,
  BOGUS_COMMENT_STATE => 19,
  DOCTYPE_STATE => 20,
  BEFORE_DOCTYPE_NAME_STATE => 21,
  DOCTYPE_NAME_STATE => 22,
  AFTER_DOCTYPE_NAME_STATE => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 31,
  BOGUS_DOCTYPE_STATE => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE => 33,
  SELF_CLOSING_START_TAG_STATE => 34,
  CDATA_SECTION_STATE => 35,
  MD_HYPHEN_STATE => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE => 41, # "CDATA section state" in the spec
  PUBLIC_STATE => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE => 43, # "after DOCTYPE name state" in the spec
  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE => 44,
  ENTITY_HASH_STATE => 45,
  NCR_NUM_STATE => 46,
  HEXREF_X_STATE => 47,
  HEXREF_HEX_STATE => 48,
  ENTITY_NAME_STATE => 49,
  PCDATA_STATE => 50, # "data state" in the spec
};
116
117 ## Tree constructor state constants (see Whatpm::HTML for the full
118 ## list and descriptions)
119
## NOTE: Both literals below denote the same number (bit 11 set, i.e.
## 2048); they differ only in digit grouping.
use constant {
  IN_FOREIGN_CONTENT_IM => 0b100000000000,
  FOREIGN_EL            => 0b1_00000000000,
};
122
123 ## Character reference mappings
124
## Replacement table for character references: a key is a referenced
## code point and its value is the code point to be used instead.
my $charref_map = {
  0x0D => 0x000A,
}; # $charref_map

## 0x80..0x9F: one replacement per code point, in ascending key order.
@{$charref_map}{0x80 .. 0x9F} = (
  0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
  0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
);

## Every code point listed here is replaced by U+FFFD.
$charref_map->{$_} = 0xFFFD
    for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
        ## U+nFFFE and U+nFFFF for each n in 0x00..0x10, that is
        ## 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, ..., 0x10FFFE, 0x10FFFF.
        (map { (($_ << 16) | 0xFFFE, ($_ << 16) | 0xFFFF) } 0x00 .. 0x10);
168
169 ## Implementations MUST act as if they used the state machine in the spec.
170
## Resets the tokenizer-related fields of the parser object and then
## obtains the first input character.
##
## Argument: |$self| - the parser object (a hash reference).  Its
## |set_nc| field must already hold a code reference; it is invoked as
## |$self->{set_nc}->($self)|.
##
## Returns: the value of the last statement, i.e. the new (empty)
## |$self->{token}| array reference.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}

  $self->{state} = DATA_STATE; # MUST
  #$self->{s_kwd}; # state keyword - initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied, so there is never a
  ## buffered character to take here; the first input character always
  ## comes from |set_nc|.  (The former "read from |char_buffer|" branch
  ## could not be reached and has been removed.)
  $self->{set_nc}->($self);

  $self->{token} = []; # queue of tokens read by |_get_next_token|
  # $self->{escape}
} # _initialize_tokenizer
207
208 ## A token has:
209 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
210 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
211 ## ->{name} (DOCTYPE_TOKEN)
212 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
213 ## ->{pubid} (DOCTYPE_TOKEN)
214 ## ->{sysid} (DOCTYPE_TOKEN)
215 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
216 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
217 ## ->{name}
218 ## ->{value}
219 ## ->{has_reference} == 1 or 0
220 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
221 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
222 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
223 ## while the token is pushed back to the stack.
224
225 ## Emitted token MUST immediately be handled by the tree construction state.
226
227 ## Before each step, UA MAY check to see if either one of the scripts in
228 ## "list of scripts that will execute as soon as possible" or the first
229 ## script in the "list of scripts that will execute asynchronously",
230 ## has completed loading. If one has, then it MUST be executed
231 ## and removed from the list.
232
233 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
234 ## (This requirement was dropped from HTML5 spec, unfortunately.)
235
## Lookup table of the code points that are tested for as white space
## (|$is_space->{$self->{nc}}|).
my $is_space = {
  map { ($_ => 1) }
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF)
    0x0020, # SPACE (SP)
};
## Not in the table: 0x000B LINE TABULATION (VT) and
## 0x000D CARRIAGE RETURN (CR).
244
245 sub _get_next_token ($) {
246 my $self = shift;
247
248 if ($self->{self_closing}) {
249 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
250 ## NOTE: The |self_closing| flag is only set by start tag token.
251 ## In addition, when a start tag token is emitted, it is always set to
252 ## |ct|.
253 delete $self->{self_closing};
254 }
255
256 if (@{$self->{token}}) {
257 $self->{self_closing} = $self->{token}->[0]->{self_closing};
258 return shift @{$self->{token}};
259 }
260
261 A: {
262 if ($self->{state} == PCDATA_STATE) {
263 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
264
265 if ($self->{nc} == 0x0026) { # &
266
267 ## NOTE: In the spec, the tokenizer is switched to the
268 ## "entity data state". In this implementation, the tokenizer
269 ## is switched to the |ENTITY_STATE|, which is an implementation
270 ## of the "consume a character reference" algorithm.
271 $self->{entity_add} = -1;
272 $self->{prev_state} = DATA_STATE;
273 $self->{state} = ENTITY_STATE;
274
275 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
276 $self->{line_prev} = $self->{line};
277 $self->{column_prev} = $self->{column};
278 $self->{column}++;
279 $self->{nc}
280 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
281 } else {
282 $self->{set_nc}->($self);
283 }
284
285 redo A;
286 } elsif ($self->{nc} == 0x003C) { # <
287
288 $self->{state} = TAG_OPEN_STATE;
289
290 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
291 $self->{line_prev} = $self->{line};
292 $self->{column_prev} = $self->{column};
293 $self->{column}++;
294 $self->{nc}
295 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
296 } else {
297 $self->{set_nc}->($self);
298 }
299
300 redo A;
301 } elsif ($self->{nc} == -1) {
302
303 return ({type => END_OF_FILE_TOKEN,
304 line => $self->{line}, column => $self->{column}});
305 last A; ## TODO: ok?
306 } else {
307
308 #
309 }
310
311 # Anything else
312 my $token = {type => CHARACTER_TOKEN,
313 data => chr $self->{nc},
314 line => $self->{line}, column => $self->{column},
315 };
316 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
317
318 ## Stay in the state.
319
320 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
321 $self->{line_prev} = $self->{line};
322 $self->{column_prev} = $self->{column};
323 $self->{column}++;
324 $self->{nc}
325 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
326 } else {
327 $self->{set_nc}->($self);
328 }
329
330 return ($token);
331 redo A;
332 } elsif ($self->{state} == DATA_STATE) {
333 $self->{s_kwd} = '' unless defined $self->{s_kwd};
334 if ($self->{nc} == 0x0026) { # &
335 $self->{s_kwd} = '';
336 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
337 not $self->{escape}) {
338
339 ## NOTE: In the spec, the tokenizer is switched to the
340 ## "entity data state". In this implementation, the tokenizer
341 ## is switched to the |ENTITY_STATE|, which is an implementation
342 ## of the "consume a character reference" algorithm.
343 $self->{entity_add} = -1;
344 $self->{prev_state} = DATA_STATE;
345 $self->{state} = ENTITY_STATE;
346
347 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
348 $self->{line_prev} = $self->{line};
349 $self->{column_prev} = $self->{column};
350 $self->{column}++;
351 $self->{nc}
352 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
353 } else {
354 $self->{set_nc}->($self);
355 }
356
357 redo A;
358 } else {
359
360 #
361 }
362 } elsif ($self->{nc} == 0x002D) { # -
363 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
364 $self->{s_kwd} .= '-';
365
366 if ($self->{s_kwd} eq '<!--') {
367
368 $self->{escape} = 1; # unless $self->{escape};
369 $self->{s_kwd} = '--';
370 #
371 } elsif ($self->{s_kwd} eq '---') {
372
373 $self->{s_kwd} = '--';
374 #
375 } else {
376
377 #
378 }
379 }
380
381 #
382 } elsif ($self->{nc} == 0x0021) { # !
383 if (length $self->{s_kwd}) {
384
385 $self->{s_kwd} .= '!';
386 #
387 } else {
388
389 #$self->{s_kwd} = '';
390 #
391 }
392 #
393 } elsif ($self->{nc} == 0x003C) { # <
394 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
395 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
396 not $self->{escape})) {
397
398 $self->{state} = TAG_OPEN_STATE;
399
400 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
401 $self->{line_prev} = $self->{line};
402 $self->{column_prev} = $self->{column};
403 $self->{column}++;
404 $self->{nc}
405 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
406 } else {
407 $self->{set_nc}->($self);
408 }
409
410 redo A;
411 } else {
412
413 $self->{s_kwd} = '';
414 #
415 }
416 } elsif ($self->{nc} == 0x003E) { # >
417 if ($self->{escape} and
418 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
419 if ($self->{s_kwd} eq '--') {
420
421 delete $self->{escape};
422 } else {
423
424 }
425 } else {
426
427 }
428
429 $self->{s_kwd} = '';
430 #
431 } elsif ($self->{nc} == -1) {
432
433 $self->{s_kwd} = '';
434 return ({type => END_OF_FILE_TOKEN,
435 line => $self->{line}, column => $self->{column}});
436 last A; ## TODO: ok?
437 } else {
438
439 $self->{s_kwd} = '';
440 #
441 }
442
443 # Anything else
444 my $token = {type => CHARACTER_TOKEN,
445 data => chr $self->{nc},
446 line => $self->{line}, column => $self->{column},
447 };
448 if ($self->{read_until}->($token->{data}, q[-!<>&],
449 length $token->{data})) {
450 $self->{s_kwd} = '';
451 }
452
453 ## Stay in the data state.
454 if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
455
456 $self->{state} = PCDATA_STATE;
457 } else {
458
459 ## Stay in the state.
460 }
461
462 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
463 $self->{line_prev} = $self->{line};
464 $self->{column_prev} = $self->{column};
465 $self->{column}++;
466 $self->{nc}
467 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
468 } else {
469 $self->{set_nc}->($self);
470 }
471
472 return ($token);
473 redo A;
474 } elsif ($self->{state} == TAG_OPEN_STATE) {
475 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
476 if ($self->{nc} == 0x002F) { # /
477
478
479 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
480 $self->{line_prev} = $self->{line};
481 $self->{column_prev} = $self->{column};
482 $self->{column}++;
483 $self->{nc}
484 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
485 } else {
486 $self->{set_nc}->($self);
487 }
488
489 $self->{state} = CLOSE_TAG_OPEN_STATE;
490 redo A;
491 } elsif ($self->{nc} == 0x0021) { # !
492
493 $self->{s_kwd} = '<' unless $self->{escape};
494 #
495 } else {
496
497 #
498 }
499
500 ## reconsume
501 $self->{state} = DATA_STATE;
502 return ({type => CHARACTER_TOKEN, data => '<',
503 line => $self->{line_prev},
504 column => $self->{column_prev},
505 });
506 redo A;
507 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
508 if ($self->{nc} == 0x0021) { # !
509
510 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
511
512 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
513 $self->{line_prev} = $self->{line};
514 $self->{column_prev} = $self->{column};
515 $self->{column}++;
516 $self->{nc}
517 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
518 } else {
519 $self->{set_nc}->($self);
520 }
521
522 redo A;
523 } elsif ($self->{nc} == 0x002F) { # /
524
525 $self->{state} = CLOSE_TAG_OPEN_STATE;
526
527 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
528 $self->{line_prev} = $self->{line};
529 $self->{column_prev} = $self->{column};
530 $self->{column}++;
531 $self->{nc}
532 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
533 } else {
534 $self->{set_nc}->($self);
535 }
536
537 redo A;
538 } elsif (0x0041 <= $self->{nc} and
539 $self->{nc} <= 0x005A) { # A..Z
540
541 $self->{ct}
542 = {type => START_TAG_TOKEN,
543 tag_name => chr ($self->{nc} + 0x0020),
544 line => $self->{line_prev},
545 column => $self->{column_prev}};
546 $self->{state} = TAG_NAME_STATE;
547
548 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
549 $self->{line_prev} = $self->{line};
550 $self->{column_prev} = $self->{column};
551 $self->{column}++;
552 $self->{nc}
553 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
554 } else {
555 $self->{set_nc}->($self);
556 }
557
558 redo A;
559 } elsif (0x0061 <= $self->{nc} and
560 $self->{nc} <= 0x007A) { # a..z
561
562 $self->{ct} = {type => START_TAG_TOKEN,
563 tag_name => chr ($self->{nc}),
564 line => $self->{line_prev},
565 column => $self->{column_prev}};
566 $self->{state} = TAG_NAME_STATE;
567
568 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
569 $self->{line_prev} = $self->{line};
570 $self->{column_prev} = $self->{column};
571 $self->{column}++;
572 $self->{nc}
573 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
574 } else {
575 $self->{set_nc}->($self);
576 }
577
578 redo A;
579 } elsif ($self->{nc} == 0x003E) { # >
580
581 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
582 line => $self->{line_prev},
583 column => $self->{column_prev});
584 $self->{state} = DATA_STATE;
585
586 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
587 $self->{line_prev} = $self->{line};
588 $self->{column_prev} = $self->{column};
589 $self->{column}++;
590 $self->{nc}
591 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
592 } else {
593 $self->{set_nc}->($self);
594 }
595
596
597 return ({type => CHARACTER_TOKEN, data => '<>',
598 line => $self->{line_prev},
599 column => $self->{column_prev},
600 });
601
602 redo A;
603 } elsif ($self->{nc} == 0x003F) { # ?
604
605 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
606 line => $self->{line_prev},
607 column => $self->{column_prev});
608 $self->{state} = BOGUS_COMMENT_STATE;
609 $self->{ct} = {type => COMMENT_TOKEN, data => '',
610 line => $self->{line_prev},
611 column => $self->{column_prev},
612 };
613 ## $self->{nc} is intentionally left as is
614 redo A;
615 } else {
616
617 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
618 line => $self->{line_prev},
619 column => $self->{column_prev});
620 $self->{state} = DATA_STATE;
621 ## reconsume
622
623 return ({type => CHARACTER_TOKEN, data => '<',
624 line => $self->{line_prev},
625 column => $self->{column_prev},
626 });
627
628 redo A;
629 }
630 } else {
631 die "$0: $self->{content_model} in tag open";
632 }
633 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
634 ## NOTE: The "close tag open state" in the spec is implemented as
635 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
636
637 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
638 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
639 if (defined $self->{last_stag_name}) {
640 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
641 $self->{s_kwd} = '';
642 ## Reconsume.
643 redo A;
644 } else {
645 ## No start tag token has ever been emitted
646 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
647
648 $self->{state} = DATA_STATE;
649 ## Reconsume.
650 return ({type => CHARACTER_TOKEN, data => '</',
651 line => $l, column => $c,
652 });
653 redo A;
654 }
655 }
656
657 if (0x0041 <= $self->{nc} and
658 $self->{nc} <= 0x005A) { # A..Z
659
660 $self->{ct}
661 = {type => END_TAG_TOKEN,
662 tag_name => chr ($self->{nc} + 0x0020),
663 line => $l, column => $c};
664 $self->{state} = TAG_NAME_STATE;
665
666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
667 $self->{line_prev} = $self->{line};
668 $self->{column_prev} = $self->{column};
669 $self->{column}++;
670 $self->{nc}
671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
672 } else {
673 $self->{set_nc}->($self);
674 }
675
676 redo A;
677 } elsif (0x0061 <= $self->{nc} and
678 $self->{nc} <= 0x007A) { # a..z
679
680 $self->{ct} = {type => END_TAG_TOKEN,
681 tag_name => chr ($self->{nc}),
682 line => $l, column => $c};
683 $self->{state} = TAG_NAME_STATE;
684
685 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686 $self->{line_prev} = $self->{line};
687 $self->{column_prev} = $self->{column};
688 $self->{column}++;
689 $self->{nc}
690 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691 } else {
692 $self->{set_nc}->($self);
693 }
694
695 redo A;
696 } elsif ($self->{nc} == 0x003E) { # >
697
698 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
699 line => $self->{line_prev}, ## "<" in "</>"
700 column => $self->{column_prev} - 1);
701 $self->{state} = DATA_STATE;
702
703 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
704 $self->{line_prev} = $self->{line};
705 $self->{column_prev} = $self->{column};
706 $self->{column}++;
707 $self->{nc}
708 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
709 } else {
710 $self->{set_nc}->($self);
711 }
712
713 redo A;
714 } elsif ($self->{nc} == -1) {
715
716 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
717 $self->{state} = DATA_STATE;
718 # reconsume
719
720 return ({type => CHARACTER_TOKEN, data => '</',
721 line => $l, column => $c,
722 });
723
724 redo A;
725 } else {
726
727 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
728 $self->{state} = BOGUS_COMMENT_STATE;
729 $self->{ct} = {type => COMMENT_TOKEN, data => '',
730 line => $self->{line_prev}, # "<" of "</"
731 column => $self->{column_prev} - 1,
732 };
733 ## NOTE: $self->{nc} is intentionally left as is.
734 ## Although the "anything else" case of the spec not explicitly
735 ## states that the next input character is to be reconsumed,
736 ## it will be included to the |data| of the comment token
737 ## generated from the bogus end tag, as defined in the
738 ## "bogus comment state" entry.
739 redo A;
740 }
741 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
742 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
743 if (length $ch) {
744 my $CH = $ch;
745 $ch =~ tr/a-z/A-Z/;
746 my $nch = chr $self->{nc};
747 if ($nch eq $ch or $nch eq $CH) {
748
749 ## Stay in the state.
750 $self->{s_kwd} .= $nch;
751
752 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
753 $self->{line_prev} = $self->{line};
754 $self->{column_prev} = $self->{column};
755 $self->{column}++;
756 $self->{nc}
757 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
758 } else {
759 $self->{set_nc}->($self);
760 }
761
762 redo A;
763 } else {
764
765 $self->{state} = DATA_STATE;
766 ## Reconsume.
767 return ({type => CHARACTER_TOKEN,
768 data => '</' . $self->{s_kwd},
769 line => $self->{line_prev},
770 column => $self->{column_prev} - 1 - length $self->{s_kwd},
771 });
772 redo A;
773 }
774 } else { # after "<{tag-name}"
775 unless ($is_space->{$self->{nc}} or
776 {
777 0x003E => 1, # >
778 0x002F => 1, # /
779 -1 => 1, # EOF
780 }->{$self->{nc}}) {
781
782 ## Reconsume.
783 $self->{state} = DATA_STATE;
784 return ({type => CHARACTER_TOKEN,
785 data => '</' . $self->{s_kwd},
786 line => $self->{line_prev},
787 column => $self->{column_prev} - 1 - length $self->{s_kwd},
788 });
789 redo A;
790 } else {
791
792 $self->{ct}
793 = {type => END_TAG_TOKEN,
794 tag_name => $self->{last_stag_name},
795 line => $self->{line_prev},
796 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
797 $self->{state} = TAG_NAME_STATE;
798 ## Reconsume.
799 redo A;
800 }
801 }
802 } elsif ($self->{state} == TAG_NAME_STATE) {
803 if ($is_space->{$self->{nc}}) {
804
805 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
806
807 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
808 $self->{line_prev} = $self->{line};
809 $self->{column_prev} = $self->{column};
810 $self->{column}++;
811 $self->{nc}
812 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
813 } else {
814 $self->{set_nc}->($self);
815 }
816
817 redo A;
818 } elsif ($self->{nc} == 0x003E) { # >
819 if ($self->{ct}->{type} == START_TAG_TOKEN) {
820
821 $self->{last_stag_name} = $self->{ct}->{tag_name};
822 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
823 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
824 #if ($self->{ct}->{attributes}) {
825 # ## NOTE: This should never be reached.
826 # !!! cp (36);
827 # !!! parse-error (type => 'end tag attribute');
828 #} else {
829
830 #}
831 } else {
832 die "$0: $self->{ct}->{type}: Unknown token type";
833 }
834 $self->{state} = DATA_STATE;
835
836 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
837 $self->{line_prev} = $self->{line};
838 $self->{column_prev} = $self->{column};
839 $self->{column}++;
840 $self->{nc}
841 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
842 } else {
843 $self->{set_nc}->($self);
844 }
845
846
847 return ($self->{ct}); # start tag or end tag
848
849 redo A;
850 } elsif (0x0041 <= $self->{nc} and
851 $self->{nc} <= 0x005A) { # A..Z
852
853 $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
854 # start tag or end tag
855 ## Stay in this state
856
857 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
858 $self->{line_prev} = $self->{line};
859 $self->{column_prev} = $self->{column};
860 $self->{column}++;
861 $self->{nc}
862 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
863 } else {
864 $self->{set_nc}->($self);
865 }
866
867 redo A;
868 } elsif ($self->{nc} == -1) {
869 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
870 if ($self->{ct}->{type} == START_TAG_TOKEN) {
871
872 $self->{last_stag_name} = $self->{ct}->{tag_name};
873 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
874 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
875 #if ($self->{ct}->{attributes}) {
876 # ## NOTE: This state should never be reached.
877 # !!! cp (40);
878 # !!! parse-error (type => 'end tag attribute');
879 #} else {
880
881 #}
882 } else {
883 die "$0: $self->{ct}->{type}: Unknown token type";
884 }
885 $self->{state} = DATA_STATE;
886 # reconsume
887
888 return ($self->{ct}); # start tag or end tag
889
890 redo A;
891 } elsif ($self->{nc} == 0x002F) { # /
892
893 $self->{state} = SELF_CLOSING_START_TAG_STATE;
894
895 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
896 $self->{line_prev} = $self->{line};
897 $self->{column_prev} = $self->{column};
898 $self->{column}++;
899 $self->{nc}
900 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
901 } else {
902 $self->{set_nc}->($self);
903 }
904
905 redo A;
906 } else {
907
908 $self->{ct}->{tag_name} .= chr $self->{nc};
909 # start tag or end tag
910 ## Stay in the state
911
912 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
913 $self->{line_prev} = $self->{line};
914 $self->{column_prev} = $self->{column};
915 $self->{column}++;
916 $self->{nc}
917 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
918 } else {
919 $self->{set_nc}->($self);
920 }
921
922 redo A;
923 }
924 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
925 if ($is_space->{$self->{nc}}) {
926
927 ## Stay in the state
928
929 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
930 $self->{line_prev} = $self->{line};
931 $self->{column_prev} = $self->{column};
932 $self->{column}++;
933 $self->{nc}
934 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
935 } else {
936 $self->{set_nc}->($self);
937 }
938
939 redo A;
940 } elsif ($self->{nc} == 0x003E) { # >
941 if ($self->{ct}->{type} == START_TAG_TOKEN) {
942
943 $self->{last_stag_name} = $self->{ct}->{tag_name};
944 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
945 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
946 if ($self->{ct}->{attributes}) {
947
948 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
949 } else {
950
951 }
952 } else {
953 die "$0: $self->{ct}->{type}: Unknown token type";
954 }
955 $self->{state} = DATA_STATE;
956
957 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
958 $self->{line_prev} = $self->{line};
959 $self->{column_prev} = $self->{column};
960 $self->{column}++;
961 $self->{nc}
962 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
963 } else {
964 $self->{set_nc}->($self);
965 }
966
967
968 return ($self->{ct}); # start tag or end tag
969
970 redo A;
971 } elsif (0x0041 <= $self->{nc} and
972 $self->{nc} <= 0x005A) { # A..Z
973
974 $self->{ca}
975 = {name => chr ($self->{nc} + 0x0020),
976 value => '',
977 line => $self->{line}, column => $self->{column}};
978 $self->{state} = ATTRIBUTE_NAME_STATE;
979
980 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
981 $self->{line_prev} = $self->{line};
982 $self->{column_prev} = $self->{column};
983 $self->{column}++;
984 $self->{nc}
985 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
986 } else {
987 $self->{set_nc}->($self);
988 }
989
990 redo A;
991 } elsif ($self->{nc} == 0x002F) { # /
992
993 $self->{state} = SELF_CLOSING_START_TAG_STATE;
994
995 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
996 $self->{line_prev} = $self->{line};
997 $self->{column_prev} = $self->{column};
998 $self->{column}++;
999 $self->{nc}
1000 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1001 } else {
1002 $self->{set_nc}->($self);
1003 }
1004
1005 redo A;
1006 } elsif ($self->{nc} == -1) {
1007 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1008 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1009
1010 $self->{last_stag_name} = $self->{ct}->{tag_name};
1011 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1012 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1013 if ($self->{ct}->{attributes}) {
1014
1015 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1016 } else {
1017
1018 }
1019 } else {
1020 die "$0: $self->{ct}->{type}: Unknown token type";
1021 }
1022 $self->{state} = DATA_STATE;
1023 # reconsume
1024
1025 return ($self->{ct}); # start tag or end tag
1026
1027 redo A;
1028 } else {
1029 if ({
1030 0x0022 => 1, # "
1031 0x0027 => 1, # '
1032 0x003D => 1, # =
1033 }->{$self->{nc}}) {
1034
1035 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1036 } else {
1037
1038 }
1039 $self->{ca}
1040 = {name => chr ($self->{nc}),
1041 value => '',
1042 line => $self->{line}, column => $self->{column}};
1043 $self->{state} = ATTRIBUTE_NAME_STATE;
1044
1045 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1046 $self->{line_prev} = $self->{line};
1047 $self->{column_prev} = $self->{column};
1048 $self->{column}++;
1049 $self->{nc}
1050 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1051 } else {
1052 $self->{set_nc}->($self);
1053 }
1054
1055 redo A;
1056 }
1057 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1058 my $before_leave = sub { ## Called by the branches below that leave the attribute name state: files the finished attribute $self->{ca} on the current tag token $self->{ct}.
1059 if (exists $self->{ct}->{attributes} # start tag or end tag
1060 ->{$self->{ca}->{name}}) { # MUST
1061 ## An attribute with this name is already recorded on the token: report it at the line/column saved when this attribute was started.
1062 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1063 ## Discard $self->{ca} # MUST -- nothing is stored here, so the entry recorded earlier is left untouched.
1064 } else {
1065 ## No attribute of this name yet: store the whole attribute record (name, value, line, column) under its name.
1066 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1067 = $self->{ca};
1068 }
1069 }; # $before_leave
1070
1071 if ($is_space->{$self->{nc}}) {
1072
1073 $before_leave->();
1074 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1075
1076 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1077 $self->{line_prev} = $self->{line};
1078 $self->{column_prev} = $self->{column};
1079 $self->{column}++;
1080 $self->{nc}
1081 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1082 } else {
1083 $self->{set_nc}->($self);
1084 }
1085
1086 redo A;
1087 } elsif ($self->{nc} == 0x003D) { # =
1088
1089 $before_leave->();
1090 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1091
1092 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1093 $self->{line_prev} = $self->{line};
1094 $self->{column_prev} = $self->{column};
1095 $self->{column}++;
1096 $self->{nc}
1097 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1098 } else {
1099 $self->{set_nc}->($self);
1100 }
1101
1102 redo A;
1103 } elsif ($self->{nc} == 0x003E) { # >
1104 $before_leave->();
1105 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1106
1107 $self->{last_stag_name} = $self->{ct}->{tag_name};
1108 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1109
1110 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1111 if ($self->{ct}->{attributes}) {
1112 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1113 }
1114 } else {
1115 die "$0: $self->{ct}->{type}: Unknown token type";
1116 }
1117 $self->{state} = DATA_STATE;
1118
1119 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1120 $self->{line_prev} = $self->{line};
1121 $self->{column_prev} = $self->{column};
1122 $self->{column}++;
1123 $self->{nc}
1124 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1125 } else {
1126 $self->{set_nc}->($self);
1127 }
1128
1129
1130 return ($self->{ct}); # start tag or end tag
1131
1132 redo A;
1133 } elsif (0x0041 <= $self->{nc} and
1134 $self->{nc} <= 0x005A) { # A..Z
1135
1136 $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
1137 ## Stay in the state
1138
1139 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1140 $self->{line_prev} = $self->{line};
1141 $self->{column_prev} = $self->{column};
1142 $self->{column}++;
1143 $self->{nc}
1144 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1145 } else {
1146 $self->{set_nc}->($self);
1147 }
1148
1149 redo A;
1150 } elsif ($self->{nc} == 0x002F) { # /
1151
1152 $before_leave->();
1153 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1154
1155 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1156 $self->{line_prev} = $self->{line};
1157 $self->{column_prev} = $self->{column};
1158 $self->{column}++;
1159 $self->{nc}
1160 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1161 } else {
1162 $self->{set_nc}->($self);
1163 }
1164
1165 redo A;
1166 } elsif ($self->{nc} == -1) {
1167 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1168 $before_leave->();
1169 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1170
1171 $self->{last_stag_name} = $self->{ct}->{tag_name};
1172 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1173 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1174 if ($self->{ct}->{attributes}) {
1175
1176 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1177 } else {
1178 ## NOTE: This state should never be reached.
1179
1180 }
1181 } else {
1182 die "$0: $self->{ct}->{type}: Unknown token type";
1183 }
1184 $self->{state} = DATA_STATE;
1185 # reconsume
1186
1187 return ($self->{ct}); # start tag or end tag
1188
1189 redo A;
1190 } else {
1191 if ($self->{nc} == 0x0022 or # "
1192 $self->{nc} == 0x0027) { # '
1193
1194 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1195 } else {
1196
1197 }
1198 $self->{ca}->{name} .= chr ($self->{nc});
1199 ## Stay in the state
1200
1201 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1202 $self->{line_prev} = $self->{line};
1203 $self->{column_prev} = $self->{column};
1204 $self->{column}++;
1205 $self->{nc}
1206 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1207 } else {
1208 $self->{set_nc}->($self);
1209 }
1210
1211 redo A;
1212 }
1213 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1214 if ($is_space->{$self->{nc}}) {
1215
1216 ## Stay in the state
1217
1218 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1219 $self->{line_prev} = $self->{line};
1220 $self->{column_prev} = $self->{column};
1221 $self->{column}++;
1222 $self->{nc}
1223 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1224 } else {
1225 $self->{set_nc}->($self);
1226 }
1227
1228 redo A;
1229 } elsif ($self->{nc} == 0x003D) { # =
1230
1231 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1232
1233 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1234 $self->{line_prev} = $self->{line};
1235 $self->{column_prev} = $self->{column};
1236 $self->{column}++;
1237 $self->{nc}
1238 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1239 } else {
1240 $self->{set_nc}->($self);
1241 }
1242
1243 redo A;
1244 } elsif ($self->{nc} == 0x003E) { # >
1245 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1246
1247 $self->{last_stag_name} = $self->{ct}->{tag_name};
1248 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1249 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1250 if ($self->{ct}->{attributes}) {
1251
1252 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1253 } else {
1254 ## NOTE: This state should never be reached.
1255
1256 }
1257 } else {
1258 die "$0: $self->{ct}->{type}: Unknown token type";
1259 }
1260 $self->{state} = DATA_STATE;
1261
1262 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1263 $self->{line_prev} = $self->{line};
1264 $self->{column_prev} = $self->{column};
1265 $self->{column}++;
1266 $self->{nc}
1267 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1268 } else {
1269 $self->{set_nc}->($self);
1270 }
1271
1272
1273 return ($self->{ct}); # start tag or end tag
1274
1275 redo A;
1276 } elsif (0x0041 <= $self->{nc} and
1277 $self->{nc} <= 0x005A) { # A..Z
1278
1279 $self->{ca}
1280 = {name => chr ($self->{nc} + 0x0020),
1281 value => '',
1282 line => $self->{line}, column => $self->{column}};
1283 $self->{state} = ATTRIBUTE_NAME_STATE;
1284
1285 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1286 $self->{line_prev} = $self->{line};
1287 $self->{column_prev} = $self->{column};
1288 $self->{column}++;
1289 $self->{nc}
1290 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1291 } else {
1292 $self->{set_nc}->($self);
1293 }
1294
1295 redo A;
1296 } elsif ($self->{nc} == 0x002F) { # /
1297
1298 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1299
1300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301 $self->{line_prev} = $self->{line};
1302 $self->{column_prev} = $self->{column};
1303 $self->{column}++;
1304 $self->{nc}
1305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306 } else {
1307 $self->{set_nc}->($self);
1308 }
1309
1310 redo A;
1311 } elsif ($self->{nc} == -1) {
1312 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1313 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1314
1315 $self->{last_stag_name} = $self->{ct}->{tag_name};
1316 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1317 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1318 if ($self->{ct}->{attributes}) {
1319
1320 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1321 } else {
1322 ## NOTE: This state should never be reached.
1323
1324 }
1325 } else {
1326 die "$0: $self->{ct}->{type}: Unknown token type";
1327 }
1328 $self->{state} = DATA_STATE;
1329 # reconsume
1330
1331 return ($self->{ct}); # start tag or end tag
1332
1333 redo A;
1334 } else {
1335 if ($self->{nc} == 0x0022 or # "
1336 $self->{nc} == 0x0027) { # '
1337
1338 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1339 } else {
1340
1341 }
1342 $self->{ca}
1343 = {name => chr ($self->{nc}),
1344 value => '',
1345 line => $self->{line}, column => $self->{column}};
1346 $self->{state} = ATTRIBUTE_NAME_STATE;
1347
1348 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1349 $self->{line_prev} = $self->{line};
1350 $self->{column_prev} = $self->{column};
1351 $self->{column}++;
1352 $self->{nc}
1353 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1354 } else {
1355 $self->{set_nc}->($self);
1356 }
1357
1358 redo A;
1359 }
1360 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1361 if ($is_space->{$self->{nc}}) {
1362
1363 ## Stay in the state
1364
1365 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1366 $self->{line_prev} = $self->{line};
1367 $self->{column_prev} = $self->{column};
1368 $self->{column}++;
1369 $self->{nc}
1370 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1371 } else {
1372 $self->{set_nc}->($self);
1373 }
1374
1375 redo A;
1376 } elsif ($self->{nc} == 0x0022) { # "
1377
1378 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1379
1380 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1381 $self->{line_prev} = $self->{line};
1382 $self->{column_prev} = $self->{column};
1383 $self->{column}++;
1384 $self->{nc}
1385 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1386 } else {
1387 $self->{set_nc}->($self);
1388 }
1389
1390 redo A;
1391 } elsif ($self->{nc} == 0x0026) { # &
1392
1393 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1394 ## reconsume
1395 redo A;
1396 } elsif ($self->{nc} == 0x0027) { # '
1397
1398 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1399
1400 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1401 $self->{line_prev} = $self->{line};
1402 $self->{column_prev} = $self->{column};
1403 $self->{column}++;
1404 $self->{nc}
1405 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1406 } else {
1407 $self->{set_nc}->($self);
1408 }
1409
1410 redo A;
1411 } elsif ($self->{nc} == 0x003E) { # >
1412 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1413 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1414
1415 $self->{last_stag_name} = $self->{ct}->{tag_name};
1416 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1417 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1418 if ($self->{ct}->{attributes}) {
1419
1420 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1421 } else {
1422 ## NOTE: This state should never be reached.
1423
1424 }
1425 } else {
1426 die "$0: $self->{ct}->{type}: Unknown token type";
1427 }
1428 $self->{state} = DATA_STATE;
1429
1430 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1431 $self->{line_prev} = $self->{line};
1432 $self->{column_prev} = $self->{column};
1433 $self->{column}++;
1434 $self->{nc}
1435 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1436 } else {
1437 $self->{set_nc}->($self);
1438 }
1439
1440
1441 return ($self->{ct}); # start tag or end tag
1442
1443 redo A;
1444 } elsif ($self->{nc} == -1) {
1445 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1446 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1447
1448 $self->{last_stag_name} = $self->{ct}->{tag_name};
1449 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1450 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1451 if ($self->{ct}->{attributes}) {
1452
1453 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1454 } else {
1455 ## NOTE: This state should never be reached.
1456
1457 }
1458 } else {
1459 die "$0: $self->{ct}->{type}: Unknown token type";
1460 }
1461 $self->{state} = DATA_STATE;
1462 ## reconsume
1463
1464 return ($self->{ct}); # start tag or end tag
1465
1466 redo A;
1467 } else {
1468 if ($self->{nc} == 0x003D) { # =
1469
1470 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1471 } else {
1472
1473 }
1474 $self->{ca}->{value} .= chr ($self->{nc});
1475 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1476
1477 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1478 $self->{line_prev} = $self->{line};
1479 $self->{column_prev} = $self->{column};
1480 $self->{column}++;
1481 $self->{nc}
1482 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1483 } else {
1484 $self->{set_nc}->($self);
1485 }
1486
1487 redo A;
1488 }
1489 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1490 if ($self->{nc} == 0x0022) { # "
1491
1492 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1493
1494 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1495 $self->{line_prev} = $self->{line};
1496 $self->{column_prev} = $self->{column};
1497 $self->{column}++;
1498 $self->{nc}
1499 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1500 } else {
1501 $self->{set_nc}->($self);
1502 }
1503
1504 redo A;
1505 } elsif ($self->{nc} == 0x0026) { # &
1506
1507 ## NOTE: In the spec, the tokenizer is switched to the
1508 ## "entity in attribute value state". In this implementation, the
1509 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1510 ## implementation of the "consume a character reference" algorithm.
1511 $self->{prev_state} = $self->{state};
1512 $self->{entity_add} = 0x0022; # "
1513 $self->{state} = ENTITY_STATE;
1514
1515 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1516 $self->{line_prev} = $self->{line};
1517 $self->{column_prev} = $self->{column};
1518 $self->{column}++;
1519 $self->{nc}
1520 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1521 } else {
1522 $self->{set_nc}->($self);
1523 }
1524
1525 redo A;
1526 } elsif ($self->{nc} == -1) {
1527 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1528 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1529
1530 $self->{last_stag_name} = $self->{ct}->{tag_name};
1531 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1532 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1533 if ($self->{ct}->{attributes}) {
1534
1535 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1536 } else {
1537 ## NOTE: This state should never be reached.
1538
1539 }
1540 } else {
1541 die "$0: $self->{ct}->{type}: Unknown token type";
1542 }
1543 $self->{state} = DATA_STATE;
1544 ## reconsume
1545
1546 return ($self->{ct}); # start tag or end tag
1547
1548 redo A;
1549 } else {
1550
1551 $self->{ca}->{value} .= chr ($self->{nc});
1552 $self->{read_until}->($self->{ca}->{value},
1553 q["&],
1554 length $self->{ca}->{value});
1555
1556 ## Stay in the state
1557
1558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1559 $self->{line_prev} = $self->{line};
1560 $self->{column_prev} = $self->{column};
1561 $self->{column}++;
1562 $self->{nc}
1563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1564 } else {
1565 $self->{set_nc}->($self);
1566 }
1567
1568 redo A;
1569 }
1570 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1571 if ($self->{nc} == 0x0027) { # '
1572
1573 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1574
1575 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1576 $self->{line_prev} = $self->{line};
1577 $self->{column_prev} = $self->{column};
1578 $self->{column}++;
1579 $self->{nc}
1580 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1581 } else {
1582 $self->{set_nc}->($self);
1583 }
1584
1585 redo A;
1586 } elsif ($self->{nc} == 0x0026) { # &
1587
1588 ## NOTE: In the spec, the tokenizer is switched to the
1589 ## "entity in attribute value state". In this implementation, the
1590 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1591 ## implementation of the "consume a character reference" algorithm.
1592 $self->{entity_add} = 0x0027; # '
1593 $self->{prev_state} = $self->{state};
1594 $self->{state} = ENTITY_STATE;
1595
1596 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1597 $self->{line_prev} = $self->{line};
1598 $self->{column_prev} = $self->{column};
1599 $self->{column}++;
1600 $self->{nc}
1601 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1602 } else {
1603 $self->{set_nc}->($self);
1604 }
1605
1606 redo A;
1607 } elsif ($self->{nc} == -1) {
1608 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1609 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1610
1611 $self->{last_stag_name} = $self->{ct}->{tag_name};
1612 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1613 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1614 if ($self->{ct}->{attributes}) {
1615
1616 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1617 } else {
1618 ## NOTE: This state should never be reached.
1619
1620 }
1621 } else {
1622 die "$0: $self->{ct}->{type}: Unknown token type";
1623 }
1624 $self->{state} = DATA_STATE;
1625 ## reconsume
1626
1627 return ($self->{ct}); # start tag or end tag
1628
1629 redo A;
1630 } else {
1631
1632 $self->{ca}->{value} .= chr ($self->{nc});
1633 $self->{read_until}->($self->{ca}->{value},
1634 q['&],
1635 length $self->{ca}->{value});
1636
1637 ## Stay in the state
1638
1639 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1640 $self->{line_prev} = $self->{line};
1641 $self->{column_prev} = $self->{column};
1642 $self->{column}++;
1643 $self->{nc}
1644 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1645 } else {
1646 $self->{set_nc}->($self);
1647 }
1648
1649 redo A;
1650 }
1651 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1652 if ($is_space->{$self->{nc}}) {
1653
1654 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1655
1656 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1657 $self->{line_prev} = $self->{line};
1658 $self->{column_prev} = $self->{column};
1659 $self->{column}++;
1660 $self->{nc}
1661 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1662 } else {
1663 $self->{set_nc}->($self);
1664 }
1665
1666 redo A;
1667 } elsif ($self->{nc} == 0x0026) { # &
1668
1669 ## NOTE: In the spec, the tokenizer is switched to the
1670 ## "entity in attribute value state". In this implementation, the
1671 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1672 ## implementation of the "consume a character reference" algorithm.
1673 $self->{entity_add} = -1;
1674 $self->{prev_state} = $self->{state};
1675 $self->{state} = ENTITY_STATE;
1676
1677 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1678 $self->{line_prev} = $self->{line};
1679 $self->{column_prev} = $self->{column};
1680 $self->{column}++;
1681 $self->{nc}
1682 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1683 } else {
1684 $self->{set_nc}->($self);
1685 }
1686
1687 redo A;
1688 } elsif ($self->{nc} == 0x003E) { # >
1689 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1690
1691 $self->{last_stag_name} = $self->{ct}->{tag_name};
1692 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1693 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1694 if ($self->{ct}->{attributes}) {
1695
1696 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1697 } else {
1698 ## NOTE: This state should never be reached.
1699
1700 }
1701 } else {
1702 die "$0: $self->{ct}->{type}: Unknown token type";
1703 }
1704 $self->{state} = DATA_STATE;
1705
1706 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1707 $self->{line_prev} = $self->{line};
1708 $self->{column_prev} = $self->{column};
1709 $self->{column}++;
1710 $self->{nc}
1711 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1712 } else {
1713 $self->{set_nc}->($self);
1714 }
1715
1716
1717 return ($self->{ct}); # start tag or end tag
1718
1719 redo A;
1720 } elsif ($self->{nc} == -1) {
1721 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1722 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1723
1724 $self->{last_stag_name} = $self->{ct}->{tag_name};
1725 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1726 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1727 if ($self->{ct}->{attributes}) {
1728
1729 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1730 } else {
1731 ## NOTE: This state should never be reached.
1732
1733 }
1734 } else {
1735 die "$0: $self->{ct}->{type}: Unknown token type";
1736 }
1737 $self->{state} = DATA_STATE;
1738 ## reconsume
1739
1740 return ($self->{ct}); # start tag or end tag
1741
1742 redo A;
1743 } else {
1744 if ({
1745 0x0022 => 1, # "
1746 0x0027 => 1, # '
1747 0x003D => 1, # =
1748 }->{$self->{nc}}) {
1749
1750 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1751 } else {
1752
1753 }
1754 $self->{ca}->{value} .= chr ($self->{nc});
1755 $self->{read_until}->($self->{ca}->{value},
1756 q["'=& >],
1757 length $self->{ca}->{value});
1758
1759 ## Stay in the state
1760
1761 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1762 $self->{line_prev} = $self->{line};
1763 $self->{column_prev} = $self->{column};
1764 $self->{column}++;
1765 $self->{nc}
1766 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1767 } else {
1768 $self->{set_nc}->($self);
1769 }
1770
1771 redo A;
1772 }
1773 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1774 if ($is_space->{$self->{nc}}) {
1775
1776 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1777
1778 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1779 $self->{line_prev} = $self->{line};
1780 $self->{column_prev} = $self->{column};
1781 $self->{column}++;
1782 $self->{nc}
1783 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1784 } else {
1785 $self->{set_nc}->($self);
1786 }
1787
1788 redo A;
1789 } elsif ($self->{nc} == 0x003E) { # >
1790 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1791
1792 $self->{last_stag_name} = $self->{ct}->{tag_name};
1793 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1794 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1795 if ($self->{ct}->{attributes}) {
1796
1797 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1798 } else {
1799 ## NOTE: This state should never be reached.
1800
1801 }
1802 } else {
1803 die "$0: $self->{ct}->{type}: Unknown token type";
1804 }
1805 $self->{state} = DATA_STATE;
1806
1807 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1808 $self->{line_prev} = $self->{line};
1809 $self->{column_prev} = $self->{column};
1810 $self->{column}++;
1811 $self->{nc}
1812 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1813 } else {
1814 $self->{set_nc}->($self);
1815 }
1816
1817
1818 return ($self->{ct}); # start tag or end tag
1819
1820 redo A;
1821 } elsif ($self->{nc} == 0x002F) { # /
1822
1823 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1824
1825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1826 $self->{line_prev} = $self->{line};
1827 $self->{column_prev} = $self->{column};
1828 $self->{column}++;
1829 $self->{nc}
1830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1831 } else {
1832 $self->{set_nc}->($self);
1833 }
1834
1835 redo A;
1836 } elsif ($self->{nc} == -1) {
1837 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1838 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1839
1840 $self->{last_stag_name} = $self->{ct}->{tag_name};
1841 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1842 if ($self->{ct}->{attributes}) {
1843
1844 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1845 } else {
1846 ## NOTE: This state should never be reached.
1847
1848 }
1849 } else {
1850 die "$0: $self->{ct}->{type}: Unknown token type";
1851 }
1852 $self->{state} = DATA_STATE;
1853 ## Reconsume.
1854 return ($self->{ct}); # start tag or end tag
1855 redo A;
1856 } else {
1857
1858 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1859 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1860 ## reconsume
1861 redo A;
1862 }
1863 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1864 if ($self->{nc} == 0x003E) { # >
1865 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1866
1867 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1868 ## TODO: Different type than slash in start tag
1869 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1870 if ($self->{ct}->{attributes}) {
1871
1872 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1873 } else {
1874
1875 }
1876 ## TODO: Test |<title></title/>|
1877 } else {
1878
1879 $self->{self_closing} = 1;
1880 }
1881
1882 $self->{state} = DATA_STATE;
1883
1884 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1885 $self->{line_prev} = $self->{line};
1886 $self->{column_prev} = $self->{column};
1887 $self->{column}++;
1888 $self->{nc}
1889 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1890 } else {
1891 $self->{set_nc}->($self);
1892 }
1893
1894
1895 return ($self->{ct}); # start tag or end tag
1896
1897 redo A;
1898 } elsif ($self->{nc} == -1) {
1899 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1900 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1901
1902 $self->{last_stag_name} = $self->{ct}->{tag_name};
1903 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1904 if ($self->{ct}->{attributes}) {
1905
1906 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1907 } else {
1908 ## NOTE: This state should never be reached.
1909
1910 }
1911 } else {
1912 die "$0: $self->{ct}->{type}: Unknown token type";
1913 }
1914 $self->{state} = DATA_STATE;
1915 ## Reconsume.
1916 return ($self->{ct}); # start tag or end tag
1917 redo A;
1918 } else {
1919
1920 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1921 ## TODO: This error type is wrong.
1922 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1923 ## Reconsume.
1924 redo A;
1925 }
1926 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1927 ## (only happen if PCDATA state)
1928
1929 ## NOTE: Unlike the spec's "bogus comment state", this implementation
1930 ## consumes characters on a one-by-one basis.
1931
1932 if ($self->{nc} == 0x003E) { # >
1933
1934 $self->{state} = DATA_STATE;
1935
1936 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1937 $self->{line_prev} = $self->{line};
1938 $self->{column_prev} = $self->{column};
1939 $self->{column}++;
1940 $self->{nc}
1941 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1942 } else {
1943 $self->{set_nc}->($self);
1944 }
1945
1946
1947 return ($self->{ct}); # comment
1948 redo A;
1949 } elsif ($self->{nc} == -1) {
1950
1951 $self->{state} = DATA_STATE;
1952 ## reconsume
1953
1954 return ($self->{ct}); # comment
1955 redo A;
1956 } else {
1957
1958 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1959 $self->{read_until}->($self->{ct}->{data},
1960 q[>],
1961 length $self->{ct}->{data});
1962
1963 ## Stay in the state.
1964
1965 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1966 $self->{line_prev} = $self->{line};
1967 $self->{column_prev} = $self->{column};
1968 $self->{column}++;
1969 $self->{nc}
1970 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1971 } else {
1972 $self->{set_nc}->($self);
1973 }
1974
1975 redo A;
1976 }
1977 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1978 ## (only happen if PCDATA state)
1979
1980 if ($self->{nc} == 0x002D) { # -
1981
1982 $self->{state} = MD_HYPHEN_STATE;
1983
1984 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1985 $self->{line_prev} = $self->{line};
1986 $self->{column_prev} = $self->{column};
1987 $self->{column}++;
1988 $self->{nc}
1989 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1990 } else {
1991 $self->{set_nc}->($self);
1992 }
1993
1994 redo A;
1995 } elsif ($self->{nc} == 0x0044 or # D
1996 $self->{nc} == 0x0064) { # d
1997 ## ASCII case-insensitive.
1998
1999 $self->{state} = MD_DOCTYPE_STATE;
2000 $self->{s_kwd} = chr $self->{nc};
2001
2002 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2003 $self->{line_prev} = $self->{line};
2004 $self->{column_prev} = $self->{column};
2005 $self->{column}++;
2006 $self->{nc}
2007 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2008 } else {
2009 $self->{set_nc}->($self);
2010 }
2011
2012 redo A;
2013 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2014 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2015 $self->{nc} == 0x005B) { # [
2016
2017 $self->{state} = MD_CDATA_STATE;
2018 $self->{s_kwd} = '[';
2019
2020 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2021 $self->{line_prev} = $self->{line};
2022 $self->{column_prev} = $self->{column};
2023 $self->{column}++;
2024 $self->{nc}
2025 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2026 } else {
2027 $self->{set_nc}->($self);
2028 }
2029
2030 redo A;
2031 } else {
2032
2033 }
2034
2035 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2036 line => $self->{line_prev},
2037 column => $self->{column_prev} - 1);
2038 ## Reconsume.
2039 $self->{state} = BOGUS_COMMENT_STATE;
2040 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2041 line => $self->{line_prev},
2042 column => $self->{column_prev} - 1,
2043 };
2044 redo A;
2045 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2046 if ($self->{nc} == 0x002D) { # -
2047
2048 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2049 line => $self->{line_prev},
2050 column => $self->{column_prev} - 2,
2051 };
2052 $self->{state} = COMMENT_START_STATE;
2053
2054 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055 $self->{line_prev} = $self->{line};
2056 $self->{column_prev} = $self->{column};
2057 $self->{column}++;
2058 $self->{nc}
2059 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2060 } else {
2061 $self->{set_nc}->($self);
2062 }
2063
2064 redo A;
2065 } else {
2066
2067 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2068 line => $self->{line_prev},
2069 column => $self->{column_prev} - 2);
2070 $self->{state} = BOGUS_COMMENT_STATE;
2071 ## Reconsume.
2072 $self->{ct} = {type => COMMENT_TOKEN,
2073 data => '-',
2074 line => $self->{line_prev},
2075 column => $self->{column_prev} - 2,
2076 };
2077 redo A;
2078 }
2079 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
## MD_DOCTYPE_STATE: match the keyword "DOCTYPE", ASCII
## case-insensitively.  |s_kwd| holds the letters matched so far
## (presumably seeded with the leading "D" before this state is
## entered -- the table below starts at "O").
my $c = $self->{nc};
my $matched = length $self->{s_kwd};

## Upper-case code point expected at each offset.  Slot 0 and slots
## past the end are undefined; they must never match.  (A bare
## |$c == undef| is numerically |$c == 0| and would accept U+0000.)
my $expected = [
  undef,
  0x004F, # O
  0x0043, # C
  0x0054, # T
  0x0059, # Y
  0x0050, # P
]->[$matched];

if (defined $expected and
    ($c == $expected or             # upper case
     $c == $expected + 0x0020)) {   # lower case
  ## Stay in the state.
  $self->{s_kwd} .= chr $c;
} elsif ($matched == 6 and
         ($c == 0x0045 or # E
          $c == 0x0065)) { # e
  ## Whole keyword seen: start a DOCTYPE token.  It is flagged
  ## quirks until a later state clears the flag.
  $self->{state} = DOCTYPE_STATE;
  $self->{ct} = {type => DOCTYPE_TOKEN,
                 quirks => 1,
                 line => $self->{line_prev},
                 column => $self->{column_prev} - 7,
                };
} else {
  ## Not the keyword: what was matched so far becomes the data of a
  ## bogus comment.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 1 - $matched);
  $self->{state} = BOGUS_COMMENT_STATE;
  ## Reconsume.
  $self->{ct} = {type => COMMENT_TOKEN,
                 data => $self->{s_kwd},
                 line => $self->{line_prev},
                 column => $self->{column_prev} - 1 - $matched,
                };
  redo A;
}

## Consume the next input character.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  $self->{line_prev} = $self->{line};
  $self->{column_prev} = $self->{column};
  $self->{column}++;
  $self->{nc}
      = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
  $self->{set_nc}->($self);
}

redo A;
2148 } elsif ($self->{state} == MD_CDATA_STATE) {
## MD_CDATA_STATE: match the keyword "[CDATA[" (case-sensitive).
## |s_kwd| holds the part matched so far.
my $c = $self->{nc};

## Code point that must follow each proper prefix.  Any other value
## of |s_kwd| yields undef, which must never match.  (A bare
## |$c == undef| is numerically |$c == 0| and would accept U+0000.)
my $expected = {
  '[' => 0x0043, # C
  '[C' => 0x0044, # D
  '[CD' => 0x0041, # A
  '[CDA' => 0x0054, # T
  '[CDAT' => 0x0041, # A
}->{$self->{s_kwd}};

if (defined $expected and $c == $expected) {
  ## Stay in the state.
  $self->{s_kwd} .= chr $c;
} elsif ($self->{s_kwd} eq '[CDATA' and
         $c == 0x005B) { # [
  ## Whole keyword seen: the section content is collected in a
  ## character token.
  $self->{ct} = {type => CHARACTER_TOKEN,
                 data => '',
                 line => $self->{line_prev},
                 column => $self->{column_prev} - 7};
  $self->{state} = CDATA_SECTION_STATE;
} else {
  ## Not the keyword: what was matched so far becomes the data of a
  ## bogus comment.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                         line => $self->{line_prev},
                         column => $self->{column_prev} - 1 - length $self->{s_kwd});
  $self->{state} = BOGUS_COMMENT_STATE;
  ## Reconsume.
  $self->{ct} = {type => COMMENT_TOKEN,
                 data => $self->{s_kwd},
                 line => $self->{line_prev},
                 column => $self->{column_prev} - 1 - length $self->{s_kwd},
                };
  redo A;
}

## Consume the next input character.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  $self->{line_prev} = $self->{line};
  $self->{column_prev} = $self->{column};
  $self->{column}++;
  $self->{nc}
      = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
  $self->{set_nc}->($self);
}

redo A;
2205 } elsif ($self->{state} == COMMENT_START_STATE) {
## COMMENT_START_STATE (entered from MD_HYPHEN_STATE once the comment
## token exists):
##   "-"           -> COMMENT_START_DASH_STATE
##   ">"           -> parse error; return the comment token
##   -1            -> parse error; return the comment token, reconsume
##   anything else -> append to the comment data, COMMENT_STATE
my $c = $self->{nc};
my $consume = 1; # cleared when the character is to be reconsumed
my $emit = 0;    # set when the comment token is to be returned

if ($c == 0x002D) { # -
  $self->{state} = COMMENT_START_DASH_STATE;
} elsif ($c == 0x003E) { # >
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
  $self->{state} = DATA_STATE;
  $emit = 1;
} elsif ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
  $self->{state} = DATA_STATE;
  $consume = 0; ## reconsume
  $emit = 1;
} else {
  $self->{ct}->{data} .= chr ($c); # comment
  $self->{state} = COMMENT_STATE;
}

if ($consume) {
  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }
}

return ($self->{ct}) if $emit; # comment
redo A;
2267 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
## COMMENT_START_DASH_STATE (a "-" was seen in COMMENT_START_STATE):
##   "-"           -> COMMENT_END_STATE
##   ">"           -> parse error; return the comment token
##   -1            -> parse error; return the comment token, reconsume
##   anything else -> append "-" plus the character, COMMENT_STATE
my $c = $self->{nc};
my $consume = 1; # cleared when the character is to be reconsumed
my $emit = 0;    # set when the comment token is to be returned

if ($c == 0x002D) { # -
  $self->{state} = COMMENT_END_STATE;
} elsif ($c == 0x003E) { # >
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
  $self->{state} = DATA_STATE;
  $emit = 1;
} elsif ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
  $self->{state} = DATA_STATE;
  $consume = 0; ## reconsume
  $emit = 1;
} else {
  $self->{ct}->{data} .= '-' . chr ($c); # comment
  $self->{state} = COMMENT_STATE;
}

if ($consume) {
  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }
}

return ($self->{ct}) if $emit; # comment
redo A;
2329 } elsif ($self->{state} == COMMENT_STATE) {
## COMMENT_STATE: inside the comment data.
##   "-"           -> COMMENT_END_DASH_STATE
##   -1            -> parse error; return the comment token, reconsume
##   anything else -> append to the data, then hand the data buffer to
##                    the |read_until| callback with "-" as the
##                    delimiter set; stay in this state
my $c = $self->{nc};

if ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
  $self->{state} = DATA_STATE;
  ## reconsume
  return ($self->{ct}); # comment
}

if ($c == 0x002D) { # -
  $self->{state} = COMMENT_END_DASH_STATE;
} else {
  $self->{ct}->{data} .= chr ($c); # comment
  $self->{read_until}->($self->{ct}->{data},
                        q[-],
                        length $self->{ct}->{data});
  ## Stay in the state
}

## Consume the next input character.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
  $self->{column}++;
  $self->{nc}
      = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
  $self->{set_nc}->($self);
}

redo A;
2375 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
## COMMENT_END_DASH_STATE (one "-" seen inside the comment data):
##   "-"           -> COMMENT_END_STATE
##   -1            -> parse error; return the comment token, reconsume
##   anything else -> append "-" plus the character, back to COMMENT_STATE
my $c = $self->{nc};

if ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
  $self->{state} = DATA_STATE;
  ## reconsume
  return ($self->{ct}); # comment
}

if ($c == 0x002D) { # -
  $self->{state} = COMMENT_END_STATE;
} else {
  $self->{ct}->{data} .= '-' . chr ($c); # comment
  $self->{state} = COMMENT_STATE;
}

## Consume the next input character.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
  $self->{column}++;
  $self->{nc}
      = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
  $self->{set_nc}->($self);
}

redo A;
2417 } elsif ($self->{state} == COMMENT_END_STATE) {
## COMMENT_END_STATE ("--" seen):
##   ">"           -> return the comment token, DATA_STATE
##   "-"           -> parse error; append "-" and stay in this state
##   -1            -> parse error; return the comment token, reconsume
##   anything else -> parse error; append "--" plus the character,
##                    back to COMMENT_STATE
my $c = $self->{nc};
my $consume = 1; # cleared when the character is to be reconsumed
my $emit = 0;    # set when the comment token is to be returned

if ($c == 0x003E) { # >
  $self->{state} = DATA_STATE;
  $emit = 1;
} elsif ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
  $self->{state} = DATA_STATE;
  $consume = 0; ## reconsume
  $emit = 1;
} else {
  ## Both remaining cases report the stray dash at the position of
  ## the previous character.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
                         line => $self->{line_prev},
                         column => $self->{column_prev});
  if ($c == 0x002D) { # -
    $self->{ct}->{data} .= '-'; # comment
    ## Stay in the state
  } else {
    $self->{ct}->{data} .= '--' . chr ($c); # comment
    $self->{state} = COMMENT_STATE;
  }
}

if ($consume) {
  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }
}

return ($self->{ct}) if $emit; # comment
redo A;
2484 } elsif ($self->{state} == DOCTYPE_STATE) {
## DOCTYPE_STATE: a white space character is expected after the
## keyword.  It is consumed if present; otherwise a parse error is
## reported and the character is reconsumed.  The next state is
## BEFORE_DOCTYPE_NAME_STATE in both cases.
my $space_seen = $is_space->{$self->{nc}};

unless ($space_seen) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
}
$self->{state} = BEFORE_DOCTYPE_NAME_STATE;

if ($space_seen) {
  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }
} ## else: reconsume

redo A;
2507 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
## BEFORE_DOCTYPE_NAME_STATE:
##   white space   -> skipped
##   ">"           -> parse error; return the DOCTYPE token as is
##   -1            -> parse error; return the DOCTYPE token, reconsume
##   anything else -> first character of the name; the |quirks| flag
##                    is removed from the token; DOCTYPE_NAME_STATE
my $c = $self->{nc};
my $consume = 1; # cleared when the character is to be reconsumed
my $emit = 0;    # set when the DOCTYPE token is to be returned

if ($is_space->{$c}) {
  ## Stay in the state
} elsif ($c == 0x003E) { # >
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
  $self->{state} = DATA_STATE;
  $emit = 1;
} elsif ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
  $self->{state} = DATA_STATE;
  $consume = 0; ## reconsume
  $emit = 1;
} else {
  $self->{ct}->{name} = chr $c;
  delete $self->{ct}->{quirks};
  $self->{state} = DOCTYPE_NAME_STATE;
}

if ($consume) {
  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }
}

return ($self->{ct}) if $emit; # DOCTYPE (quirks)
redo A;
2569 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
## ISSUE: Redundant "First," in the spec.

## DOCTYPE_NAME_STATE:
##   white space   -> AFTER_DOCTYPE_NAME_STATE
##   ">"           -> return the DOCTYPE token
##   -1            -> parse error; flag quirks; return the token, reconsume
##   anything else -> append to the name, stay in this state
my $c = $self->{nc};

if ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
  $self->{state} = DATA_STATE;
  ## reconsume
  $self->{ct}->{quirks} = 1;
  return ($self->{ct}); # DOCTYPE
}

my $emit = 0; # set when the DOCTYPE token is to be returned
if ($is_space->{$c}) {
  $self->{state} = AFTER_DOCTYPE_NAME_STATE;
} elsif ($c == 0x003E) { # >
  $self->{state} = DATA_STATE;
  $emit = 1;
} else {
  $self->{ct}->{name} .= chr ($c); # DOCTYPE
  ## Stay in the state
}

## Consume the next input character.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
  $self->{column}++;
  $self->{nc}
      = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
  $self->{set_nc}->($self);
}

return ($self->{ct}) if $emit; # DOCTYPE
redo A;
2632 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
## AFTER_DOCTYPE_NAME_STATE:
##   white space   -> skipped
##   ">"           -> return the DOCTYPE token
##   -1            -> parse error; flag quirks; return the token, reconsume
##   "P" / "p"     -> PUBLIC_STATE, keyword buffer seeded with the letter
##   "S" / "s"     -> SYSTEM_STATE, keyword buffer seeded with the letter
##   anything else -> parse error; flag quirks; BOGUS_DOCTYPE_STATE
my $c = $self->{nc};

if ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
  $self->{state} = DATA_STATE;
  ## reconsume
  $self->{ct}->{quirks} = 1;
  return ($self->{ct}); # DOCTYPE
}

my $emit = 0; # set when the DOCTYPE token is to be returned
if ($is_space->{$c}) {
  ## Stay in the state
} elsif ($c == 0x003E) { # >
  $self->{state} = DATA_STATE;
  $emit = 1;
} elsif ($c == 0x0050 or # P
         $c == 0x0070) { # p
  $self->{state} = PUBLIC_STATE;
  $self->{s_kwd} = chr $c;
} elsif ($c == 0x0053 or # S
         $c == 0x0073) { # s
  $self->{state} = SYSTEM_STATE;
  $self->{s_kwd} = chr $c;
} else {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
  $self->{ct}->{quirks} = 1;
  $self->{state} = BOGUS_DOCTYPE_STATE;
}

## Consume the next input character.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
  $self->{column}++;
  $self->{nc}
      = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
  $self->{set_nc}->($self);
}

return ($self->{ct}) if $emit; # DOCTYPE
redo A;
2727 } elsif ($self->{state} == PUBLIC_STATE) {
## PUBLIC_STATE: match the rest of the keyword "PUBLIC", ASCII
## case-insensitively.  |s_kwd| holds the letters matched so far; it
## is seeded with the first letter in AFTER_DOCTYPE_NAME_STATE.
my $c = $self->{nc};
my $matched = length $self->{s_kwd};

## Upper-case code point expected at each offset.  Slot 0 and slots
## past the end are undefined; they must never match.  (A bare
## |$c == undef| is numerically |$c == 0| and would accept U+0000.)
my $expected = [
  undef,
  0x0055, # U
  0x0042, # B
  0x004C, # L
  0x0049, # I
]->[$matched];

if (defined $expected and
    ($c == $expected or             # upper case
     $c == $expected + 0x0020)) {   # lower case
  ## Stay in the state.
  $self->{s_kwd} .= chr $c;
} elsif ($matched == 5 and
         ($c == 0x0043 or # C
          $c == 0x0063)) { # c
  $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
} else {
  ## Not the keyword: report the error at the column where the
  ## keyword candidate started and treat the DOCTYPE as bogus.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
                         line => $self->{line_prev},
                         column => $self->{column_prev} + 1 - $matched);
  $self->{ct}->{quirks} = 1;

  $self->{state} = BOGUS_DOCTYPE_STATE;
  ## Reconsume.
  redo A;
}

## Consume the next input character.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  $self->{line_prev} = $self->{line};
  $self->{column_prev} = $self->{column};
  $self->{column}++;
  $self->{nc}
      = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
  $self->{set_nc}->($self);
}

redo A;
2786 } elsif ($self->{state} == SYSTEM_STATE) {
## SYSTEM_STATE: match the rest of the keyword "SYSTEM", ASCII
## case-insensitively.  |s_kwd| holds the letters matched so far; it
## is seeded with the first letter in AFTER_DOCTYPE_NAME_STATE.
my $c = $self->{nc};
my $matched = length $self->{s_kwd};

## Upper-case code point expected at each offset.  Slot 0 and slots
## past the end are undefined; they must never match.  (A bare
## |$c == undef| is numerically |$c == 0| and would accept U+0000.)
my $expected = [
  undef,
  0x0059, # Y
  0x0053, # S
  0x0054, # T
  0x0045, # E
]->[$matched];

if (defined $expected and
    ($c == $expected or             # upper case
     $c == $expected + 0x0020)) {   # lower case
  ## Stay in the state.
  $self->{s_kwd} .= chr $c;
} elsif ($matched == 5 and
         ($c == 0x004D or # M
          $c == 0x006D)) { # m
  $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
} else {
  ## Not the keyword: report the error at the column where the
  ## keyword candidate started and treat the DOCTYPE as bogus.
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
                         line => $self->{line_prev},
                         column => $self->{column_prev} + 1 - $matched);
  $self->{ct}->{quirks} = 1;

  $self->{state} = BOGUS_DOCTYPE_STATE;
  ## Reconsume.
  redo A;
}

## Consume the next input character.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  $self->{line_prev} = $self->{line};
  $self->{column_prev} = $self->{column};
  $self->{column}++;
  $self->{nc}
      = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
  $self->{set_nc}->($self);
}

redo A;
2845 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
## BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE (keyword "PUBLIC" matched):
##   white space   -> skipped
##   '"' / "'"     -> start an empty public identifier, go to the
##                    matching quoted-identifier state
##   ">"           -> parse error; flag quirks; return the DOCTYPE token
##   -1            -> parse error; flag quirks; return the token, reconsume
##   anything else -> parse error; flag quirks; BOGUS_DOCTYPE_STATE
##
## |nc| is a code point (an integer), so it is compared with the
## numeric |==| throughout, like every other state.
my $c = $self->{nc};
my $consume = 1; # cleared when the character is to be reconsumed
my $emit = 0;    # set when the DOCTYPE token is to be returned (quirks)

if ($is_space->{$c}) {
  ## Stay in the state
} elsif ($c == 0x0022) { # "
  $self->{ct}->{pubid} = ''; # DOCTYPE
  $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
} elsif ($c == 0x0027) { # '
  $self->{ct}->{pubid} = ''; # DOCTYPE
  $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
} elsif ($c == 0x003E) { # >
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
  $self->{state} = DATA_STATE;
  $emit = 1;
} elsif ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
  $self->{state} = DATA_STATE;
  $consume = 0; ## reconsume
  $emit = 1;
} else {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
  $self->{ct}->{quirks} = 1;
  $self->{state} = BOGUS_DOCTYPE_STATE;
}

if ($consume) {
  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }
}

if ($emit) {
  $self->{ct}->{quirks} = 1;
  return ($self->{ct}); # DOCTYPE
}
redo A;
2944 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
## DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE:
##   '"'           -> AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE
##   ">"           -> parse error; flag quirks; return the DOCTYPE token
##   -1            -> parse error; flag quirks; return the token, reconsume
##   anything else -> append to the public identifier, then hand the
##                    buffer to the |read_until| callback with '"' and
##                    ">" as the delimiter set; stay in this state
my $c = $self->{nc};
my $consume = 1; # cleared when the character is to be reconsumed
my $emit = 0;    # set when the DOCTYPE token is to be returned (quirks)

if ($c == 0x0022) { # "
  $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
} elsif ($c == 0x003E) { # >
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
  $self->{state} = DATA_STATE;
  $emit = 1;
} elsif ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
  $self->{state} = DATA_STATE;
  $consume = 0; ## reconsume
  $emit = 1;
} else {
  $self->{ct}->{pubid} .= chr $c; # DOCTYPE
  $self->{read_until}->($self->{ct}->{pubid}, q[">],
                        length $self->{ct}->{pubid});
  ## Stay in the state
}

if ($consume) {
  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }
}

if ($emit) {
  $self->{ct}->{quirks} = 1;
  return ($self->{ct}); # DOCTYPE
}
redo A;
3013 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
## DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE:
##   "'"           -> AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE
##   ">"           -> parse error; flag quirks; return the DOCTYPE token
##   -1            -> parse error; flag quirks; return the token, reconsume
##   anything else -> append to the public identifier, then hand the
##                    buffer to the |read_until| callback with "'" and
##                    ">" as the delimiter set; stay in this state
my $c = $self->{nc};
my $consume = 1; # cleared when the character is to be reconsumed
my $emit = 0;    # set when the DOCTYPE token is to be returned (quirks)

if ($c == 0x0027) { # '
  $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
} elsif ($c == 0x003E) { # >
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
  $self->{state} = DATA_STATE;
  $emit = 1;
} elsif ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
  $self->{state} = DATA_STATE;
  $consume = 0; ## reconsume
  $emit = 1;
} else {
  $self->{ct}->{pubid} .= chr $c; # DOCTYPE
  $self->{read_until}->($self->{ct}->{pubid}, q['>],
                        length $self->{ct}->{pubid});
  ## Stay in the state
}

if ($consume) {
  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }
}

if ($emit) {
  $self->{ct}->{quirks} = 1;
  return ($self->{ct}); # DOCTYPE
}
redo A;
3082 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
## AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE:
##   white space   -> skipped
##   '"' / "'"     -> start an empty system identifier, go to the
##                    matching quoted-identifier state
##   ">"           -> return the DOCTYPE token
##   -1            -> parse error; flag quirks; return the token, reconsume
##   anything else -> parse error; flag quirks; BOGUS_DOCTYPE_STATE
my $c = $self->{nc};

if ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
  $self->{state} = DATA_STATE;
  ## reconsume
  $self->{ct}->{quirks} = 1;
  return ($self->{ct}); # DOCTYPE
}

my $emit = 0; # set when the DOCTYPE token is to be returned
if ($is_space->{$c}) {
  ## Stay in the state
} elsif ($c == 0x0022) { # "
  $self->{ct}->{sysid} = ''; # DOCTYPE
  $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
} elsif ($c == 0x0027) { # '
  $self->{ct}->{sysid} = ''; # DOCTYPE
  $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
} elsif ($c == 0x003E) { # >
  $self->{state} = DATA_STATE;
  $emit = 1;
} else {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
  $self->{ct}->{quirks} = 1;
  $self->{state} = BOGUS_DOCTYPE_STATE;
}

## Consume the next input character.
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
  @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
  $self->{column}++;
  $self->{nc}
      = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
  $self->{set_nc}->($self);
}

return ($self->{ct}) if $emit; # DOCTYPE
redo A;
3178 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
## BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE (keyword "SYSTEM" matched):
##   white space   -> skipped
##   '"' / "'"     -> start an empty system identifier, go to the
##                    matching quoted-identifier state
##   ">"           -> parse error; flag quirks; return the DOCTYPE token
##   -1            -> parse error; flag quirks; return the token, reconsume
##   anything else -> parse error; flag quirks; BOGUS_DOCTYPE_STATE
my $c = $self->{nc};
my $consume = 1; # cleared when the character is to be reconsumed
my $emit = 0;    # set when the DOCTYPE token is to be returned (quirks)

if ($is_space->{$c}) {
  ## Stay in the state
} elsif ($c == 0x0022) { # "
  $self->{ct}->{sysid} = ''; # DOCTYPE
  $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
} elsif ($c == 0x0027) { # '
  $self->{ct}->{sysid} = ''; # DOCTYPE
  $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
} elsif ($c == 0x003E) { # >
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
  $self->{state} = DATA_STATE;
  $emit = 1;
} elsif ($c == -1) {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
  $self->{state} = DATA_STATE;
  $consume = 0; ## reconsume
  $emit = 1;
} else {
  $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
  $self->{ct}->{quirks} = 1;
  $self->{state} = BOGUS_DOCTYPE_STATE;
}

if ($consume) {
  ## Consume the next input character.
  if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
    @$self{qw(line_prev column_prev)} = @$self{qw(line column)};
    $self->{column}++;
    $self->{nc}
        = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
  } else {
    $self->{set_nc}->($self);
  }
}

if ($emit) {
  $self->{ct}->{quirks} = 1;
  return ($self->{ct}); # DOCTYPE
}
redo A;
3276 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3277 if ($self->{nc} == 0x0022) { # "
3278
3279 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3280
3281 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3282 $self->{line_prev} = $self->{line};
3283 $self->{column_prev} = $self->{column};
3284 $self->{column}++;
3285 $self->{nc}
3286 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3287 } else {
3288 $self->{set_nc}->($self);
3289 }
3290
3291 redo A;
3292 } elsif ($self->{nc} == 0x003E) { # >
3293
3294 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3295
3296 $self->{state} = DATA_STATE;
3297
3298 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3299 $self->{line_prev} = $self->{line};
3300 $self->{column_prev} = $self->{column};
3301 $self->{column}++;
3302 $self->{nc}
3303 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3304 } else {
3305 $self->{set_nc}->($self);
3306 }
3307
3308
3309 $self->{ct}->{quirks} = 1;
3310 return ($self->{ct}); # DOCTYPE
3311
3312 redo A;
3313 } elsif ($self->{nc} == -1) {
3314
3315 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3316
3317 $self->{state} = DATA_STATE;
3318 ## reconsume
3319
3320 $self->{ct}->{quirks} = 1;
3321 return ($self->{ct}); # DOCTYPE
3322
3323 redo A;
3324 } else {
3325
3326 $self->{ct}->{sysid} # DOCTYPE
3327 .= chr $self->{nc};
3328 $self->{read_until}->($self->{ct}->{sysid}, q[">],
3329 length $self->{ct}->{sysid});
3330
3331 ## Stay in the state
3332
3333 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3334 $self->{line_prev} = $self->{line};
3335 $self->{column_prev} = $self->{column};
3336 $self->{column}++;
3337 $self->{nc}
3338 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3339 } else {
3340 $self->{set_nc}->($self);
3341 }
3342
3343 redo A;
3344 }
3345 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3346 if ($self->{nc} == 0x0027) { # '
3347
3348 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3349
3350 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3351 $self->{line_prev} = $self->{line};
3352 $self->{column_prev} = $self->{column};
3353 $self->{column}++;
3354 $self->{nc}
3355 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3356 } else {
3357 $self->{set_nc}->($self);
3358 }
3359
3360 redo A;
3361 } elsif ($self->{nc} == 0x003E) { # >
3362
3363 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3364
3365 $self->{state} = DATA_STATE;
3366
3367 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3368 $self->{line_prev} = $self->{line};
3369 $self->{column_prev} = $self->{column};
3370 $self->{column}++;
3371 $self->{nc}
3372 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3373 } else {
3374 $self->{set_nc}->($self);
3375 }
3376
3377
3378 $self->{ct}->{quirks} = 1;
3379 return ($self->{ct}); # DOCTYPE
3380
3381 redo A;
3382 } elsif ($self->{nc} == -1) {
3383
3384 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3385
3386 $self->{state} = DATA_STATE;
3387 ## reconsume
3388
3389 $self->{ct}->{quirks} = 1;
3390 return ($self->{ct}); # DOCTYPE
3391
3392 redo A;
3393 } else {
3394
3395 $self->{ct}->{sysid} # DOCTYPE
3396 .= chr $self->{nc};
3397 $self->{read_until}->($self->{ct}->{sysid}, q['>],
3398 length $self->{ct}->{sysid});
3399
3400 ## Stay in the state
3401
3402 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403 $self->{line_prev} = $self->{line};
3404 $self->{column_prev} = $self->{column};
3405 $self->{column}++;
3406 $self->{nc}
3407 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408 } else {
3409 $self->{set_nc}->($self);
3410 }
3411
3412 redo A;
3413 }
3414 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3415 if ($is_space->{$self->{nc}}) {
3416
3417 ## Stay in the state
3418
3419 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3420 $self->{line_prev} = $self->{line};
3421 $self->{column_prev} = $self->{column};
3422 $self->{column}++;
3423 $self->{nc}
3424 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3425 } else {
3426 $self->{set_nc}->($self);
3427 }
3428
3429 redo A;
3430 } elsif ($self->{nc} == 0x003E) { # >
3431
3432 $self->{state} = DATA_STATE;
3433
3434 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3435 $self->{line_prev} = $self->{line};
3436 $self->{column_prev} = $self->{column};
3437 $self->{column}++;
3438 $self->{nc}
3439 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3440 } else {
3441 $self->{set_nc}->($self);
3442 }
3443
3444
3445 return ($self->{ct}); # DOCTYPE
3446
3447 redo A;
3448 } elsif ($self->{nc} == -1) {
3449
3450 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3451 $self->{state} = DATA_STATE;
3452 ## reconsume
3453
3454 $self->{ct}->{quirks} = 1;
3455 return ($self->{ct}); # DOCTYPE
3456
3457 redo A;
3458 } else {
3459
3460 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3461 #$self->{ct}->{quirks} = 1;
3462
3463 $self->{state} = BOGUS_DOCTYPE_STATE;
3464
3465 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3466 $self->{line_prev} = $self->{line};
3467 $self->{column_prev} = $self->{column};
3468 $self->{column}++;
3469 $self->{nc}
3470 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3471 } else {
3472 $self->{set_nc}->($self);
3473 }
3474
3475 redo A;
3476 }
3477 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3478 if ($self->{nc} == 0x003E) { # >
3479
3480 $self->{state} = DATA_STATE;
3481
3482 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3483 $self->{line_prev} = $self->{line};
3484 $self->{column_prev} = $self->{column};
3485 $self->{column}++;
3486 $self->{nc}
3487 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3488 } else {
3489 $self->{set_nc}->($self);
3490 }
3491
3492
3493 return ($self->{ct}); # DOCTYPE
3494
3495 redo A;
3496 } elsif ($self->{nc} == -1) {
3497
3498 $self->{state} = DATA_STATE;
3499 ## reconsume
3500
3501 return ($self->{ct}); # DOCTYPE
3502
3503 redo A;
3504 } else {
3505
3506 my $s = '';
3507 $self->{read_until}->($s, q[>], 0);
3508
3509 ## Stay in the state
3510
3511 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3512 $self->{line_prev} = $self->{line};
3513 $self->{column_prev} = $self->{column};
3514 $self->{column}++;
3515 $self->{nc}
3516 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3517 } else {
3518 $self->{set_nc}->($self);
3519 }
3520
3521 redo A;
3522 }
3523 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3524 ## NOTE: "CDATA section state" in the spec is jointly implemented
3525 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3526 ## and |CDATA_SECTION_MSE2_STATE|.
3527
3528 if ($self->{nc} == 0x005D) { # ]
3529
3530 $self->{state} = CDATA_SECTION_MSE1_STATE;
3531
3532 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3533 $self->{line_prev} = $self->{line};
3534 $self->{column_prev} = $self->{column};
3535 $self->{column}++;
3536 $self->{nc}
3537 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3538 } else {
3539 $self->{set_nc}->($self);
3540 }
3541
3542 redo A;
3543 } elsif ($self->{nc} == -1) {
3544 $self->{state} = DATA_STATE;
3545
3546 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3547 $self->{line_prev} = $self->{line};
3548 $self->{column_prev} = $self->{column};
3549 $self->{column}++;
3550 $self->{nc}
3551 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3552 } else {
3553 $self->{set_nc}->($self);
3554 }
3555
3556 if (length $self->{ct}->{data}) { # character
3557
3558 return ($self->{ct}); # character
3559 } else {
3560
3561 ## No token to emit. $self->{ct} is discarded.
3562 }
3563 redo A;
3564 } else {
3565
3566 $self->{ct}->{data} .= chr $self->{nc};
3567 $self->{read_until}->($self->{ct}->{data},
3568 q<]>,
3569 length $self->{ct}->{data});
3570
3571 ## Stay in the state.
3572
3573 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3574 $self->{line_prev} = $self->{line};
3575 $self->{column_prev} = $self->{column};
3576 $self->{column}++;
3577 $self->{nc}
3578 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3579 } else {
3580 $self->{set_nc}->($self);
3581 }
3582
3583 redo A;
3584 }
3585
3586 ## ISSUE: "text tokens" in spec.
3587 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3588 if ($self->{nc} == 0x005D) { # ]
3589
3590 $self->{state} = CDATA_SECTION_MSE2_STATE;
3591
3592 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3593 $self->{line_prev} = $self->{line};
3594 $self->{column_prev} = $self->{column};
3595 $self->{column}++;
3596 $self->{nc}
3597 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3598 } else {
3599 $self->{set_nc}->($self);
3600 }
3601
3602 redo A;
3603 } else {
3604
3605 $self->{ct}->{data} .= ']';
3606 $self->{state} = CDATA_SECTION_STATE;
3607 ## Reconsume.
3608 redo A;
3609 }
3610 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3611 if ($self->{nc} == 0x003E) { # >
3612 $self->{state} = DATA_STATE;
3613
3614 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3615 $self->{line_prev} = $self->{line};
3616 $self->{column_prev} = $self->{column};
3617 $self->{column}++;
3618 $self->{nc}
3619 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3620 } else {
3621 $self->{set_nc}->($self);
3622 }
3623
3624 if (length $self->{ct}->{data}) { # character
3625
3626 return ($self->{ct}); # character
3627 } else {
3628
3629 ## No token to emit. $self->{ct} is discarded.
3630 }
3631 redo A;
3632 } elsif ($self->{nc} == 0x005D) { # ]
3633 # character
3634 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3635 ## Stay in the state.
3636
3637 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3638 $self->{line_prev} = $self->{line};
3639 $self->{column_prev} = $self->{column};
3640 $self->{column}++;
3641 $self->{nc}
3642 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3643 } else {
3644 $self->{set_nc}->($self);
3645 }
3646
3647 redo A;
3648 } else {
3649
3650 $self->{ct}->{data} .= ']]'; # character
3651 $self->{state} = CDATA_SECTION_STATE;
3652 ## Reconsume.
3653 redo A;
3654 }
3655 } elsif ($self->{state} == ENTITY_STATE) {
3656 if ($is_space->{$self->{nc}} or
3657 {
3658 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3659 $self->{entity_add} => 1,
3660 }->{$self->{nc}}) {
3661
3662 ## Don't consume
3663 ## No error
3664 ## Return nothing.
3665 #
3666 } elsif ($self->{nc} == 0x0023) { # #
3667
3668 $self->{state} = ENTITY_HASH_STATE;
3669 $self->{s_kwd} = '#';
3670
3671 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3672 $self->{line_prev} = $self->{line};
3673 $self->{column_prev} = $self->{column};
3674 $self->{column}++;
3675 $self->{nc}
3676 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3677 } else {
3678 $self->{set_nc}->($self);
3679 }
3680
3681 redo A;
3682 } elsif ((0x0041 <= $self->{nc} and
3683 $self->{nc} <= 0x005A) or # A..Z
3684 (0x0061 <= $self->{nc} and
3685 $self->{nc} <= 0x007A)) { # a..z
3686
3687 require Whatpm::_NamedEntityList;
3688 $self->{state} = ENTITY_NAME_STATE;
3689 $self->{s_kwd} = chr $self->{nc};
3690 $self->{entity__value} = $self->{s_kwd};
3691 $self->{entity__match} = 0;
3692
3693 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3694 $self->{line_prev} = $self->{line};
3695 $self->{column_prev} = $self->{column};
3696 $self->{column}++;
3697 $self->{nc}
3698 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3699 } else {
3700 $self->{set_nc}->($self);
3701 }
3702
3703 redo A;
3704 } else {
3705
3706 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3707 ## Return nothing.
3708 #
3709 }
3710
3711 ## NOTE: No character is consumed by the "consume a character
3712 ## reference" algorithm. In other words, there is an "&" character
3713 ## that does not introduce a character reference, which would be
3714 ## appended to the parent element or the attribute value in later
3715 ## processing by the tokenizer.
3716
3717 if ($self->{prev_state} == DATA_STATE) {
3718
3719 $self->{state} = $self->{prev_state};
3720 ## Reconsume.
3721 return ({type => CHARACTER_TOKEN, data => '&',
3722 line => $self->{line_prev},
3723 column => $self->{column_prev},
3724 });
3725 redo A;
3726 } else {
3727
3728 $self->{ca}->{value} .= '&';
3729 $self->{state} = $self->{prev_state};
3730 ## Reconsume.
3731 redo A;
3732 }
3733 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3734 if ($self->{nc} == 0x0078 or # x
3735 $self->{nc} == 0x0058) { # X
3736
3737 $self->{state} = HEXREF_X_STATE;
3738 $self->{s_kwd} .= chr $self->{nc};
3739
3740 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3741 $self->{line_prev} = $self->{line};
3742 $self->{column_prev} = $self->{column};
3743 $self->{column}++;
3744 $self->{nc}
3745 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3746 } else {
3747 $self->{set_nc}->($self);
3748 }
3749
3750 redo A;
3751 } elsif (0x0030 <= $self->{nc} and
3752 $self->{nc} <= 0x0039) { # 0..9
3753
3754 $self->{state} = NCR_NUM_STATE;
3755 $self->{s_kwd} = $self->{nc} - 0x0030;
3756
3757 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3758 $self->{line_prev} = $self->{line};
3759 $self->{column_prev} = $self->{column};
3760 $self->{column}++;
3761 $self->{nc}
3762 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3763 } else {
3764 $self->{set_nc}->($self);
3765 }
3766
3767 redo A;
3768 } else {
3769 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3770 line => $self->{line_prev},
3771 column => $self->{column_prev} - 1);
3772
3773 ## NOTE: According to the spec algorithm, nothing is returned,
3774 ## and then "&#" is appended to the parent element or the attribute
3775 ## value in the later processing.
3776
3777 if ($self->{prev_state} == DATA_STATE) {
3778
3779 $self->{state} = $self->{prev_state};
3780 ## Reconsume.
3781 return ({type => CHARACTER_TOKEN,
3782 data => '&#',
3783 line => $self->{line_prev},
3784 column => $self->{column_prev} - 1,
3785 });
3786 redo A;
3787 } else {
3788
3789 $self->{ca}->{value} .= '&#';
3790 $self->{state} = $self->{prev_state};
3791 ## Reconsume.
3792 redo A;
3793 }
3794 }
3795 } elsif ($self->{state} == NCR_NUM_STATE) {
3796 if (0x0030 <= $self->{nc} and
3797 $self->{nc} <= 0x0039) { # 0..9
3798
3799 $self->{s_kwd} *= 10;
3800 $self->{s_kwd} += $self->{nc} - 0x0030;
3801
3802 ## Stay in the state.
3803
3804 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3805 $self->{line_prev} = $self->{line};
3806 $self->{column_prev} = $self->{column};
3807 $self->{column}++;
3808 $self->{nc}
3809 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3810 } else {
3811 $self->{set_nc}->($self);
3812 }
3813
3814 redo A;
3815 } elsif ($self->{nc} == 0x003B) { # ;
3816
3817
3818 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3819 $self->{line_prev} = $self->{line};
3820 $self->{column_prev} = $self->{column};
3821 $self->{column}++;
3822 $self->{nc}
3823 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3824 } else {
3825 $self->{set_nc}->($self);
3826 }
3827
3828 #
3829 } else {
3830
3831 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3832 ## Reconsume.
3833 #
3834 }
3835
3836 my $code = $self->{s_kwd};
3837 my $l = $self->{line_prev};
3838 my $c = $self->{column_prev};
3839 if ($charref_map->{$code}) {
3840
3841 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3842 text => (sprintf 'U+%04X', $code),
3843 line => $l, column => $c);
3844 $code = $charref_map->{$code};
3845 } elsif ($code > 0x10FFFF) {
3846
3847 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3848 text => (sprintf 'U-%08X', $code),
3849 line => $l, column => $c);
3850 $code = 0xFFFD;
3851 }
3852
3853 if ($self->{prev_state} == DATA_STATE) {
3854
3855 $self->{state} = $self->{prev_state};
3856 ## Reconsume.
3857 return ({type => CHARACTER_TOKEN, data => chr $code,
3858 line => $l, column => $c,
3859 });
3860 redo A;
3861 } else {
3862
3863 $self->{ca}->{value} .= chr $code;
3864 $self->{ca}->{has_reference} = 1;
3865 $self->{state} = $self->{prev_state};
3866 ## Reconsume.
3867 redo A;
3868 }
3869 } elsif ($self->{state} == HEXREF_X_STATE) {
3870 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3871 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3872 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3873 # 0..9, A..F, a..f
3874
3875 $self->{state} = HEXREF_HEX_STATE;
3876 $self->{s_kwd} = 0;
3877 ## Reconsume.
3878 redo A;
3879 } else {
3880 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3881 line => $self->{line_prev},
3882 column => $self->{column_prev} - 2);
3883
3884 ## NOTE: According to the spec algorithm, nothing is returned,
3885 ## and then "&#" followed by "X" or "x" is appended to the parent
3886 ## element or the attribute value in the later processing.
3887
3888 if ($self->{prev_state} == DATA_STATE) {
3889
3890 $self->{state} = $self->{prev_state};
3891 ## Reconsume.
3892 return ({type => CHARACTER_TOKEN,
3893 data => '&' . $self->{s_kwd},
3894 line => $self->{line_prev},
3895 column => $self->{column_prev} - length $self->{s_kwd},
3896 });
3897 redo A;
3898 } else {
3899
3900 $self->{ca}->{value} .= '&' . $self->{s_kwd};
3901 $self->{state} = $self->{prev_state};
3902 ## Reconsume.
3903 redo A;
3904 }
3905 }
3906 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3907 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3908 # 0..9
3909
3910 $self->{s_kwd} *= 0x10;
3911 $self->{s_kwd} += $self->{nc} - 0x0030;
3912 ## Stay in the state.
3913
3914 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3915 $self->{line_prev} = $self->{line};
3916 $self->{column_prev} = $self->{column};
3917 $self->{column}++;
3918 $self->{nc}
3919 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3920 } else {
3921 $self->{set_nc}->($self);
3922 }
3923
3924 redo A;
3925 } elsif (0x0061 <= $self->{nc} and
3926 $self->{nc} <= 0x0066) { # a..f
3927
3928 $self->{s_kwd} *= 0x10;
3929 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
3930 ## Stay in the state.
3931
3932 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3933 $self->{line_prev} = $self->{line};
3934 $self->{column_prev} = $self->{column};
3935 $self->{column}++;
3936 $self->{nc}
3937 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3938 } else {
3939 $self->{set_nc}->($self);
3940 }
3941
3942 redo A;
3943 } elsif (0x0041 <= $self->{nc} and
3944 $self->{nc} <= 0x0046) { # A..F
3945
3946 $self->{s_kwd} *= 0x10;
3947 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
3948 ## Stay in the state.
3949
3950 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3951 $self->{line_prev} = $self->{line};
3952 $self->{column_prev} = $self->{column};
3953 $self->{column}++;
3954 $self->{nc}
3955 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3956 } else {
3957 $self->{set_nc}->($self);
3958 }
3959
3960 redo A;
3961 } elsif ($self->{nc} == 0x003B) { # ;
3962
3963
3964 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3965 $self->{line_prev} = $self->{line};
3966 $self->{column_prev} = $self->{column};
3967 $self->{column}++;
3968 $self->{nc}
3969 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3970 } else {
3971 $self->{set_nc}->($self);
3972 }
3973
3974 #
3975 } else {
3976
3977 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
3978 line => $self->{line},
3979 column => $self->{column});
3980 ## Reconsume.
3981 #
3982 }
3983
3984 my $code = $self->{s_kwd};
3985 my $l = $self->{line_prev};
3986 my $c = $self->{column_prev};
3987 if ($charref_map->{$code}) {
3988
3989 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3990 text => (sprintf 'U+%04X', $code),
3991 line => $l, column => $c);
3992 $code = $charref_map->{$code};
3993 } elsif ($code > 0x10FFFF) {
3994
3995 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3996 text => (sprintf 'U-%08X', $code),
3997 line => $l, column => $c);
3998 $code = 0xFFFD;
3999 }
4000
4001 if ($self->{prev_state} == DATA_STATE) {
4002
4003 $self->{state} = $self->{prev_state};
4004 ## Reconsume.
4005 return ({type => CHARACTER_TOKEN, data => chr $code,
4006 line => $l, column => $c,
4007 });
4008 redo A;
4009 } else {
4010
4011 $self->{ca}->{value} .= chr $code;
4012 $self->{ca}->{has_reference} = 1;
4013 $self->{state} = $self->{prev_state};
4014 ## Reconsume.
4015 redo A;
4016 }
4017 } elsif ($self->{state} == ENTITY_NAME_STATE) {
4018 if (length $self->{s_kwd} < 30 and
4019 ## NOTE: Some number greater than the maximum length of entity name
4020 ((0x0041 <= $self->{nc} and # a
4021 $self->{nc} <= 0x005A) or # x
4022 (0x0061 <= $self->{nc} and # a
4023 $self->{nc} <= 0x007A) or # z
4024 (0x0030 <= $self->{nc} and # 0
4025 $self->{nc} <= 0x0039) or # 9
4026 $self->{nc} == 0x003B)) { # ;
4027 our $EntityChar;
4028 $self->{s_kwd} .= chr $self->{nc};
4029 if (defined $EntityChar->{$self->{s_kwd}}) {
4030 if ($self->{nc} == 0x003B) { # ;
4031
4032 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4033 $self->{entity__match} = 1;
4034
4035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036 $self->{line_prev} = $self->{line};
4037 $self->{column_prev} = $self->{column};
4038 $self->{column}++;
4039 $self->{nc}
4040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041 } else {
4042 $self->{set_nc}->($self);
4043 }
4044
4045 #
4046 } else {
4047
4048 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4049 $self->{entity__match} = -1;
4050 ## Stay in the state.
4051
4052 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4053 $self->{line_prev} = $self->{line};
4054 $self->{column_prev} = $self->{column};
4055 $self->{column}++;
4056 $self->{nc}
4057 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4058 } else {
4059 $self->{set_nc}->($self);
4060 }
4061
4062 redo A;
4063 }
4064 } else {
4065
4066 $self->{entity__value} .= chr $self->{nc};
4067 $self->{entity__match} *= 2;
4068 ## Stay in the state.
4069
4070 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4071 $self->{line_prev} = $self->{line};
4072 $self->{column_prev} = $self->{column};
4073 $self->{column}++;
4074 $self->{nc}
4075 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4076 } else {
4077 $self->{set_nc}->($self);
4078 }
4079
4080 redo A;
4081 }
4082 }
4083
4084 my $data;
4085 my $has_ref;
4086 if ($self->{entity__match} > 0) {
4087
4088 $data = $self->{entity__value};
4089 $has_ref = 1;
4090 #
4091 } elsif ($self->{entity__match} < 0) {
4092 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4093 if ($self->{prev_state} != DATA_STATE and # in attribute
4094 $self->{entity__match} < -1) {
4095
4096 $data = '&' . $self->{s_kwd};
4097 #
4098 } else {
4099
4100 $data = $self->{entity__value};
4101 $has_ref = 1;
4102 #
4103 }
4104 } else {
4105
4106 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4107 line => $self->{line_prev},
4108 column => $self->{column_prev} - length $self->{s_kwd});
4109 $data = '&' . $self->{s_kwd};
4110 #
4111 }
4112
4113 ## NOTE: In these cases, when a character reference is found,
4114 ## it is consumed and a character token is returned, or, otherwise,
4115 ## nothing is consumed and returned, according to the spec algorithm.
4116 ## In this implementation, anything that has been examined by the
4117 ## tokenizer is appended to the parent element or the attribute value
4118 ## as string, either literal string when no character reference or
4119 ## entity-replaced string otherwise, in this stage, since any characters
4120 ## that would not be consumed are appended in the data state or in an
4121 ## appropriate attribute value state anyway.
4122
4123 if ($self->{prev_state} == DATA_STATE) {
4124
4125 $self->{state} = $self->{prev_state};
4126 ## Reconsume.
4127 return ({type => CHARACTER_TOKEN,
4128 data => $data,
4129 line => $self->{line_prev},
4130 column => $self->{column_prev} + 1 - length $self->{s_kwd},
4131 });
4132 redo A;
4133 } else {
4134
4135 $self->{ca}->{value} .= $data;
4136 $self->{ca}->{has_reference} = 1 if $has_ref;
4137 $self->{state} = $self->{prev_state};
4138 ## Reconsume.
4139 redo A;
4140 }
4141 } else {
4142 die "$0: $self->{state}: Unknown state";
4143 }
4144 } # A
4145
4146 die "$0: _get_next_token: unexpected case";
4147 } # _get_next_token
4148
4149 1;
4150 ## $Date: 2008/10/14 02:27:58 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24