/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (show annotations) (download)
Tue Oct 14 14:57:52 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.5: +16 -2 lines
++ whatpm/t/xml/ChangeLog	14 Oct 2008 14:56:52 -0000
	* cdata-1.dat: Tests on CDATA section outside of the root element
	added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 14:57:28 -0000
	* Tokenizer.pm.src: Parse error if CDATA section is not closed or
	is placed outside of the root element.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 );
19
20 our %EXPORT_TAGS = (
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 }
33
34 ## Token types
35
36 sub DOCTYPE_TOKEN () { 1 }
37 sub COMMENT_TOKEN () { 2 }
38 sub START_TAG_TOKEN () { 3 }
39 sub END_TAG_TOKEN () { 4 }
40 sub END_OF_FILE_TOKEN () { 5 }
41 sub CHARACTER_TOKEN () { 6 }
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
45 package Whatpm::HTML;
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48
49 ## Content model flags
50
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54
55 sub PLAINTEXT_CONTENT_MODEL () { 0 }
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59
60 ## Tokenizer states
61
62 sub DATA_STATE () { 0 }
63 #sub ENTITY_DATA_STATE () { 1 }
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## Tree constructor state constants (see Whatpm::HTML for the full
118 ## list and descriptions)
119
120 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121 sub FOREIGN_EL () { 0b1_00000000000 }
122
123 ## Character reference mappings
124
125 my $charref_map = {
126 0x0D => 0x000A,
127 0x80 => 0x20AC,
128 0x81 => 0xFFFD,
129 0x82 => 0x201A,
130 0x83 => 0x0192,
131 0x84 => 0x201E,
132 0x85 => 0x2026,
133 0x86 => 0x2020,
134 0x87 => 0x2021,
135 0x88 => 0x02C6,
136 0x89 => 0x2030,
137 0x8A => 0x0160,
138 0x8B => 0x2039,
139 0x8C => 0x0152,
140 0x8D => 0xFFFD,
141 0x8E => 0x017D,
142 0x8F => 0xFFFD,
143 0x90 => 0xFFFD,
144 0x91 => 0x2018,
145 0x92 => 0x2019,
146 0x93 => 0x201C,
147 0x94 => 0x201D,
148 0x95 => 0x2022,
149 0x96 => 0x2013,
150 0x97 => 0x2014,
151 0x98 => 0x02DC,
152 0x99 => 0x2122,
153 0x9A => 0x0161,
154 0x9B => 0x203A,
155 0x9C => 0x0153,
156 0x9D => 0xFFFD,
157 0x9E => 0x017E,
158 0x9F => 0x0178,
159 }; # $charref_map
160 $charref_map->{$_} = 0xFFFD
161 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168
169 ## Implementations MUST act as if state machine in the spec
170
171 sub _initialize_tokenizer ($) {
172 my $self = shift;
173
174 ## NOTE: Fields set by |new| constructor:
175 #$self->{level}
176 #$self->{set_nc}
177 #$self->{parse_error}
178 #$self->{is_xml} (if XML)
179
180 $self->{state} = DATA_STATE; # MUST
181 $self->{s_kwd} = ''; # state keyword
182 #$self->{entity__value}; # initialized when used
183 #$self->{entity__match}; # initialized when used
184 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185 undef $self->{ct}; # current token
186 undef $self->{ca}; # current attribute
187 undef $self->{last_stag_name}; # last emitted start tag name
188 #$self->{prev_state}; # initialized when used
189 delete $self->{self_closing};
190 $self->{char_buffer} = '';
191 $self->{char_buffer_pos} = 0;
192 $self->{nc} = -1; # next input character
193 #$self->{next_nc}
194
195 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
196 $self->{line_prev} = $self->{line};
197 $self->{column_prev} = $self->{column};
198 $self->{column}++;
199 $self->{nc}
200 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
201 } else {
202 $self->{set_nc}->($self);
203 }
204
205 $self->{token} = [];
206 # $self->{escape}
207 } # _initialize_tokenizer
208
209 ## A token has:
210 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
211 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
212 ## ->{name} (DOCTYPE_TOKEN)
213 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
214 ## ->{pubid} (DOCTYPE_TOKEN)
215 ## ->{sysid} (DOCTYPE_TOKEN)
216 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
217 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
218 ## ->{name}
219 ## ->{value}
220 ## ->{has_reference} == 1 or 0
221 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
222 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
223 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
224 ## while the token is pushed back to the stack.
225
226 ## Emitted token MUST immediately be handled by the tree construction state.
227
228 ## Before each step, UA MAY check to see if either one of the scripts in
229 ## "list of scripts that will execute as soon as possible" or the first
230 ## script in the "list of scripts that will execute asynchronously",
231 ## has completed loading. If one has, then it MUST be executed
232 ## and removed from the list.
233
234 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
235 ## (This requirement was dropped from HTML5 spec, unfortunately.)
236
237 my $is_space = {
238 0x0009 => 1, # CHARACTER TABULATION (HT)
239 0x000A => 1, # LINE FEED (LF)
240 #0x000B => 0, # LINE TABULATION (VT)
241 0x000C => 1, # FORM FEED (FF)
242 #0x000D => 1, # CARRIAGE RETURN (CR)
243 0x0020 => 1, # SPACE (SP)
244 };
245
246 sub _get_next_token ($) {
247 my $self = shift;
248
249 if ($self->{self_closing}) {
250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
251 ## NOTE: The |self_closing| flag is only set by start tag token.
252 ## In addition, when a start tag token is emitted, it is always set to
253 ## |ct|.
254 delete $self->{self_closing};
255 }
256
257 if (@{$self->{token}}) {
258 $self->{self_closing} = $self->{token}->[0]->{self_closing};
259 return shift @{$self->{token}};
260 }
261
262 A: {
263 if ($self->{state} == PCDATA_STATE) {
264 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
265
266 if ($self->{nc} == 0x0026) { # &
267
268 ## NOTE: In the spec, the tokenizer is switched to the
269 ## "entity data state". In this implementation, the tokenizer
270 ## is switched to the |ENTITY_STATE|, which is an implementation
271 ## of the "consume a character reference" algorithm.
272 $self->{entity_add} = -1;
273 $self->{prev_state} = DATA_STATE;
274 $self->{state} = ENTITY_STATE;
275
276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
277 $self->{line_prev} = $self->{line};
278 $self->{column_prev} = $self->{column};
279 $self->{column}++;
280 $self->{nc}
281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
282 } else {
283 $self->{set_nc}->($self);
284 }
285
286 redo A;
287 } elsif ($self->{nc} == 0x003C) { # <
288
289 $self->{state} = TAG_OPEN_STATE;
290
291 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
292 $self->{line_prev} = $self->{line};
293 $self->{column_prev} = $self->{column};
294 $self->{column}++;
295 $self->{nc}
296 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
297 } else {
298 $self->{set_nc}->($self);
299 }
300
301 redo A;
302 } elsif ($self->{nc} == -1) {
303
304 return ({type => END_OF_FILE_TOKEN,
305 line => $self->{line}, column => $self->{column}});
306 last A; ## TODO: ok?
307 } else {
308
309 #
310 }
311
312 # Anything else
313 my $token = {type => CHARACTER_TOKEN,
314 data => chr $self->{nc},
315 line => $self->{line}, column => $self->{column},
316 };
317 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
318
319 ## Stay in the state.
320
321 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
322 $self->{line_prev} = $self->{line};
323 $self->{column_prev} = $self->{column};
324 $self->{column}++;
325 $self->{nc}
326 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
327 } else {
328 $self->{set_nc}->($self);
329 }
330
331 return ($token);
332 redo A;
333 } elsif ($self->{state} == DATA_STATE) {
334 $self->{s_kwd} = '' unless defined $self->{s_kwd};
335 if ($self->{nc} == 0x0026) { # &
336 $self->{s_kwd} = '';
337 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
338 not $self->{escape}) {
339
340 ## NOTE: In the spec, the tokenizer is switched to the
341 ## "entity data state". In this implementation, the tokenizer
342 ## is switched to the |ENTITY_STATE|, which is an implementation
343 ## of the "consume a character reference" algorithm.
344 $self->{entity_add} = -1;
345 $self->{prev_state} = DATA_STATE;
346 $self->{state} = ENTITY_STATE;
347
348 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
349 $self->{line_prev} = $self->{line};
350 $self->{column_prev} = $self->{column};
351 $self->{column}++;
352 $self->{nc}
353 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
354 } else {
355 $self->{set_nc}->($self);
356 }
357
358 redo A;
359 } else {
360
361 #
362 }
363 } elsif ($self->{nc} == 0x002D) { # -
364 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
365 if ($self->{s_kwd} eq '<!-') {
366
367 $self->{escape} = 1; # unless $self->{escape};
368 $self->{s_kwd} = '--';
369 #
370 } elsif ($self->{s_kwd} eq '-') {
371
372 $self->{s_kwd} = '--';
373 #
374 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
375
376 $self->{s_kwd} .= '-';
377 #
378 } else {
379
380 $self->{s_kwd} = '-';
381 #
382 }
383 }
384
385 #
386 } elsif ($self->{nc} == 0x0021) { # !
387 if (length $self->{s_kwd}) {
388
389 $self->{s_kwd} .= '!';
390 #
391 } else {
392
393 #$self->{s_kwd} = '';
394 #
395 }
396 #
397 } elsif ($self->{nc} == 0x003C) { # <
398 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
399 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
400 not $self->{escape})) {
401
402 $self->{state} = TAG_OPEN_STATE;
403
404 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
405 $self->{line_prev} = $self->{line};
406 $self->{column_prev} = $self->{column};
407 $self->{column}++;
408 $self->{nc}
409 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
410 } else {
411 $self->{set_nc}->($self);
412 }
413
414 redo A;
415 } else {
416
417 $self->{s_kwd} = '';
418 #
419 }
420 } elsif ($self->{nc} == 0x003E) { # >
421 if ($self->{escape} and
422 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
423 if ($self->{s_kwd} eq '--') {
424
425 delete $self->{escape};
426 #
427 } else {
428
429 #
430 }
431 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
432
433 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
434 line => $self->{line_prev},
435 column => $self->{column_prev} - 1);
436 #
437 } else {
438
439 #
440 }
441
442 $self->{s_kwd} = '';
443 #
444 } elsif ($self->{nc} == 0x005D) { # ]
445 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
446
447 $self->{s_kwd} .= ']';
448 } elsif ($self->{s_kwd} eq ']]') {
449
450 #
451 } else {
452
453 $self->{s_kwd} = '';
454 }
455 #
456 } elsif ($self->{nc} == -1) {
457
458 $self->{s_kwd} = '';
459 return ({type => END_OF_FILE_TOKEN,
460 line => $self->{line}, column => $self->{column}});
461 last A; ## TODO: ok?
462 } else {
463
464 $self->{s_kwd} = '';
465 #
466 }
467
468 # Anything else
469 my $token = {type => CHARACTER_TOKEN,
470 data => chr $self->{nc},
471 line => $self->{line}, column => $self->{column},
472 };
473 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
474 length $token->{data})) {
475 $self->{s_kwd} = '';
476 }
477
478 ## Stay in the data state.
479 if (not $self->{is_xml} and
480 $self->{content_model} == PCDATA_CONTENT_MODEL) {
481
482 $self->{state} = PCDATA_STATE;
483 } else {
484
485 ## Stay in the state.
486 }
487
488 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
489 $self->{line_prev} = $self->{line};
490 $self->{column_prev} = $self->{column};
491 $self->{column}++;
492 $self->{nc}
493 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
494 } else {
495 $self->{set_nc}->($self);
496 }
497
498 return ($token);
499 redo A;
500 } elsif ($self->{state} == TAG_OPEN_STATE) {
501 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
502 if ($self->{nc} == 0x002F) { # /
503
504
505 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
506 $self->{line_prev} = $self->{line};
507 $self->{column_prev} = $self->{column};
508 $self->{column}++;
509 $self->{nc}
510 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
511 } else {
512 $self->{set_nc}->($self);
513 }
514
515 $self->{state} = CLOSE_TAG_OPEN_STATE;
516 redo A;
517 } elsif ($self->{nc} == 0x0021) { # !
518
519 $self->{s_kwd} = '<' unless $self->{escape};
520 #
521 } else {
522
523 #
524 }
525
526 ## reconsume
527 $self->{state} = DATA_STATE;
528 $self->{s_kwd} = '';
529 return ({type => CHARACTER_TOKEN, data => '<',
530 line => $self->{line_prev},
531 column => $self->{column_prev},
532 });
533 redo A;
534 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
535 if ($self->{nc} == 0x0021) { # !
536
537 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
538
539 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
540 $self->{line_prev} = $self->{line};
541 $self->{column_prev} = $self->{column};
542 $self->{column}++;
543 $self->{nc}
544 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
545 } else {
546 $self->{set_nc}->($self);
547 }
548
549 redo A;
550 } elsif ($self->{nc} == 0x002F) { # /
551
552 $self->{state} = CLOSE_TAG_OPEN_STATE;
553
554 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
555 $self->{line_prev} = $self->{line};
556 $self->{column_prev} = $self->{column};
557 $self->{column}++;
558 $self->{nc}
559 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
560 } else {
561 $self->{set_nc}->($self);
562 }
563
564 redo A;
565 } elsif (0x0041 <= $self->{nc} and
566 $self->{nc} <= 0x005A) { # A..Z
567
568 $self->{ct}
569 = {type => START_TAG_TOKEN,
570 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
571 line => $self->{line_prev},
572 column => $self->{column_prev}};
573 $self->{state} = TAG_NAME_STATE;
574
575 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
576 $self->{line_prev} = $self->{line};
577 $self->{column_prev} = $self->{column};
578 $self->{column}++;
579 $self->{nc}
580 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
581 } else {
582 $self->{set_nc}->($self);
583 }
584
585 redo A;
586 } elsif (0x0061 <= $self->{nc} and
587 $self->{nc} <= 0x007A) { # a..z
588
589 $self->{ct} = {type => START_TAG_TOKEN,
590 tag_name => chr ($self->{nc}),
591 line => $self->{line_prev},
592 column => $self->{column_prev}};
593 $self->{state} = TAG_NAME_STATE;
594
595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596 $self->{line_prev} = $self->{line};
597 $self->{column_prev} = $self->{column};
598 $self->{column}++;
599 $self->{nc}
600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601 } else {
602 $self->{set_nc}->($self);
603 }
604
605 redo A;
606 } elsif ($self->{nc} == 0x003E) { # >
607
608 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
609 line => $self->{line_prev},
610 column => $self->{column_prev});
611 $self->{state} = DATA_STATE;
612 $self->{s_kwd} = '';
613
614 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
615 $self->{line_prev} = $self->{line};
616 $self->{column_prev} = $self->{column};
617 $self->{column}++;
618 $self->{nc}
619 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
620 } else {
621 $self->{set_nc}->($self);
622 }
623
624
625 return ({type => CHARACTER_TOKEN, data => '<>',
626 line => $self->{line_prev},
627 column => $self->{column_prev},
628 });
629
630 redo A;
631 } elsif ($self->{nc} == 0x003F) { # ?
632
633 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
634 line => $self->{line_prev},
635 column => $self->{column_prev});
636 $self->{state} = BOGUS_COMMENT_STATE;
637 $self->{ct} = {type => COMMENT_TOKEN, data => '',
638 line => $self->{line_prev},
639 column => $self->{column_prev},
640 };
641 ## $self->{nc} is intentionally left as is
642 redo A;
643 } else {
644
645 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
646 line => $self->{line_prev},
647 column => $self->{column_prev});
648 $self->{state} = DATA_STATE;
649 $self->{s_kwd} = '';
650 ## reconsume
651
652 return ({type => CHARACTER_TOKEN, data => '<',
653 line => $self->{line_prev},
654 column => $self->{column_prev},
655 });
656
657 redo A;
658 }
659 } else {
660 die "$0: $self->{content_model} in tag open";
661 }
662 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
663 ## NOTE: The "close tag open state" in the spec is implemented as
664 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
665
666 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
667 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
668 if (defined $self->{last_stag_name}) {
669 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
670 $self->{s_kwd} = '';
671 ## Reconsume.
672 redo A;
673 } else {
674 ## No start tag token has ever been emitted
675 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
676
677 $self->{state} = DATA_STATE;
678 $self->{s_kwd} = '';
679 ## Reconsume.
680 return ({type => CHARACTER_TOKEN, data => '</',
681 line => $l, column => $c,
682 });
683 redo A;
684 }
685 }
686
687 if (0x0041 <= $self->{nc} and
688 $self->{nc} <= 0x005A) { # A..Z
689
690 $self->{ct}
691 = {type => END_TAG_TOKEN,
692 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
693 line => $l, column => $c};
694 $self->{state} = TAG_NAME_STATE;
695
696 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
697 $self->{line_prev} = $self->{line};
698 $self->{column_prev} = $self->{column};
699 $self->{column}++;
700 $self->{nc}
701 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
702 } else {
703 $self->{set_nc}->($self);
704 }
705
706 redo A;
707 } elsif (0x0061 <= $self->{nc} and
708 $self->{nc} <= 0x007A) { # a..z
709
710 $self->{ct} = {type => END_TAG_TOKEN,
711 tag_name => chr ($self->{nc}),
712 line => $l, column => $c};
713 $self->{state} = TAG_NAME_STATE;
714
715 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
716 $self->{line_prev} = $self->{line};
717 $self->{column_prev} = $self->{column};
718 $self->{column}++;
719 $self->{nc}
720 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
721 } else {
722 $self->{set_nc}->($self);
723 }
724
725 redo A;
726 } elsif ($self->{nc} == 0x003E) { # >
727
728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
729 line => $self->{line_prev}, ## "<" in "</>"
730 column => $self->{column_prev} - 1);
731 $self->{state} = DATA_STATE;
732 $self->{s_kwd} = '';
733
734 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
735 $self->{line_prev} = $self->{line};
736 $self->{column_prev} = $self->{column};
737 $self->{column}++;
738 $self->{nc}
739 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
740 } else {
741 $self->{set_nc}->($self);
742 }
743
744 redo A;
745 } elsif ($self->{nc} == -1) {
746
747 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
748 $self->{s_kwd} = '';
749 $self->{state} = DATA_STATE;
750 # reconsume
751
752 return ({type => CHARACTER_TOKEN, data => '</',
753 line => $l, column => $c,
754 });
755
756 redo A;
757 } else {
758
759 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
760 $self->{state} = BOGUS_COMMENT_STATE;
761 $self->{ct} = {type => COMMENT_TOKEN, data => '',
762 line => $self->{line_prev}, # "<" of "</"
763 column => $self->{column_prev} - 1,
764 };
765 ## NOTE: $self->{nc} is intentionally left as is.
766 ## Although the "anything else" case of the spec not explicitly
767 ## states that the next input character is to be reconsumed,
768 ## it will be included to the |data| of the comment token
769 ## generated from the bogus end tag, as defined in the
770 ## "bogus comment state" entry.
771 redo A;
772 }
773 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
774 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
775 if (length $ch) {
776 my $CH = $ch;
777 $ch =~ tr/a-z/A-Z/;
778 my $nch = chr $self->{nc};
779 if ($nch eq $ch or $nch eq $CH) {
780
781 ## Stay in the state.
782 $self->{s_kwd} .= $nch;
783
784 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
785 $self->{line_prev} = $self->{line};
786 $self->{column_prev} = $self->{column};
787 $self->{column}++;
788 $self->{nc}
789 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
790 } else {
791 $self->{set_nc}->($self);
792 }
793
794 redo A;
795 } else {
796
797 $self->{state} = DATA_STATE;
798 $self->{s_kwd} = '';
799 ## Reconsume.
800 return ({type => CHARACTER_TOKEN,
801 data => '</' . $self->{s_kwd},
802 line => $self->{line_prev},
803 column => $self->{column_prev} - 1 - length $self->{s_kwd},
804 });
805 redo A;
806 }
807 } else { # after "<{tag-name}"
808 unless ($is_space->{$self->{nc}} or
809 {
810 0x003E => 1, # >
811 0x002F => 1, # /
812 -1 => 1, # EOF
813 }->{$self->{nc}}) {
814
815 ## Reconsume.
816 $self->{state} = DATA_STATE;
817 $self->{s_kwd} = '';
818 return ({type => CHARACTER_TOKEN,
819 data => '</' . $self->{s_kwd},
820 line => $self->{line_prev},
821 column => $self->{column_prev} - 1 - length $self->{s_kwd},
822 });
823 redo A;
824 } else {
825
826 $self->{ct}
827 = {type => END_TAG_TOKEN,
828 tag_name => $self->{last_stag_name},
829 line => $self->{line_prev},
830 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
831 $self->{state} = TAG_NAME_STATE;
832 ## Reconsume.
833 redo A;
834 }
835 }
836 } elsif ($self->{state} == TAG_NAME_STATE) {
837 if ($is_space->{$self->{nc}}) {
838
839 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
840
841 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
842 $self->{line_prev} = $self->{line};
843 $self->{column_prev} = $self->{column};
844 $self->{column}++;
845 $self->{nc}
846 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
847 } else {
848 $self->{set_nc}->($self);
849 }
850
851 redo A;
852 } elsif ($self->{nc} == 0x003E) { # >
853 if ($self->{ct}->{type} == START_TAG_TOKEN) {
854
855 $self->{last_stag_name} = $self->{ct}->{tag_name};
856 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
857 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
858 #if ($self->{ct}->{attributes}) {
859 # ## NOTE: This should never be reached.
860 # !!! cp (36);
861 # !!! parse-error (type => 'end tag attribute');
862 #} else {
863
864 #}
865 } else {
866 die "$0: $self->{ct}->{type}: Unknown token type";
867 }
868 $self->{state} = DATA_STATE;
869 $self->{s_kwd} = '';
870
871 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
872 $self->{line_prev} = $self->{line};
873 $self->{column_prev} = $self->{column};
874 $self->{column}++;
875 $self->{nc}
876 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
877 } else {
878 $self->{set_nc}->($self);
879 }
880
881
882 return ($self->{ct}); # start tag or end tag
883
884 redo A;
885 } elsif (0x0041 <= $self->{nc} and
886 $self->{nc} <= 0x005A) { # A..Z
887
888 $self->{ct}->{tag_name}
889 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
890 # start tag or end tag
891 ## Stay in this state
892
893 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
894 $self->{line_prev} = $self->{line};
895 $self->{column_prev} = $self->{column};
896 $self->{column}++;
897 $self->{nc}
898 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
899 } else {
900 $self->{set_nc}->($self);
901 }
902
903 redo A;
904 } elsif ($self->{nc} == -1) {
905 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
906 if ($self->{ct}->{type} == START_TAG_TOKEN) {
907
908 $self->{last_stag_name} = $self->{ct}->{tag_name};
909 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
910 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
911 #if ($self->{ct}->{attributes}) {
912 # ## NOTE: This state should never be reached.
913 # !!! cp (40);
914 # !!! parse-error (type => 'end tag attribute');
915 #} else {
916
917 #}
918 } else {
919 die "$0: $self->{ct}->{type}: Unknown token type";
920 }
921 $self->{state} = DATA_STATE;
922 $self->{s_kwd} = '';
923 # reconsume
924
925 return ($self->{ct}); # start tag or end tag
926
927 redo A;
928 } elsif ($self->{nc} == 0x002F) { # /
929
930 $self->{state} = SELF_CLOSING_START_TAG_STATE;
931
932 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
933 $self->{line_prev} = $self->{line};
934 $self->{column_prev} = $self->{column};
935 $self->{column}++;
936 $self->{nc}
937 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
938 } else {
939 $self->{set_nc}->($self);
940 }
941
942 redo A;
943 } else {
944
945 $self->{ct}->{tag_name} .= chr $self->{nc};
946 # start tag or end tag
947 ## Stay in the state
948
949 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
950 $self->{line_prev} = $self->{line};
951 $self->{column_prev} = $self->{column};
952 $self->{column}++;
953 $self->{nc}
954 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
955 } else {
956 $self->{set_nc}->($self);
957 }
958
959 redo A;
960 }
961 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
962 if ($is_space->{$self->{nc}}) {
963
964 ## Stay in the state
965
966 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
967 $self->{line_prev} = $self->{line};
968 $self->{column_prev} = $self->{column};
969 $self->{column}++;
970 $self->{nc}
971 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
972 } else {
973 $self->{set_nc}->($self);
974 }
975
976 redo A;
977 } elsif ($self->{nc} == 0x003E) { # >
978 if ($self->{ct}->{type} == START_TAG_TOKEN) {
979
980 $self->{last_stag_name} = $self->{ct}->{tag_name};
981 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
982 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
983 if ($self->{ct}->{attributes}) {
984
985 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
986 } else {
987
988 }
989 } else {
990 die "$0: $self->{ct}->{type}: Unknown token type";
991 }
992 $self->{state} = DATA_STATE;
993 $self->{s_kwd} = '';
994
995 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
996 $self->{line_prev} = $self->{line};
997 $self->{column_prev} = $self->{column};
998 $self->{column}++;
999 $self->{nc}
1000 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1001 } else {
1002 $self->{set_nc}->($self);
1003 }
1004
1005
1006 return ($self->{ct}); # start tag or end tag
1007
1008 redo A;
1009 } elsif (0x0041 <= $self->{nc} and
1010 $self->{nc} <= 0x005A) { # A..Z
1011
1012 $self->{ca}
1013 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1014 value => '',
1015 line => $self->{line}, column => $self->{column}};
1016 $self->{state} = ATTRIBUTE_NAME_STATE;
1017
1018 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1019 $self->{line_prev} = $self->{line};
1020 $self->{column_prev} = $self->{column};
1021 $self->{column}++;
1022 $self->{nc}
1023 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1024 } else {
1025 $self->{set_nc}->($self);
1026 }
1027
1028 redo A;
1029 } elsif ($self->{nc} == 0x002F) { # /
1030
1031 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1032
1033 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1034 $self->{line_prev} = $self->{line};
1035 $self->{column_prev} = $self->{column};
1036 $self->{column}++;
1037 $self->{nc}
1038 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1039 } else {
1040 $self->{set_nc}->($self);
1041 }
1042
1043 redo A;
1044 } elsif ($self->{nc} == -1) {
1045 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1046 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1047
1048 $self->{last_stag_name} = $self->{ct}->{tag_name};
1049 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1050 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1051 if ($self->{ct}->{attributes}) {
1052
1053 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1054 } else {
1055
1056 }
1057 } else {
1058 die "$0: $self->{ct}->{type}: Unknown token type";
1059 }
1060 $self->{state} = DATA_STATE;
1061 $self->{s_kwd} = '';
1062 # reconsume
1063
1064 return ($self->{ct}); # start tag or end tag
1065
1066 redo A;
1067 } else {
1068 if ({
1069 0x0022 => 1, # "
1070 0x0027 => 1, # '
1071 0x003D => 1, # =
1072 }->{$self->{nc}}) {
1073
1074 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1075 } else {
1076
1077 }
1078 $self->{ca}
1079 = {name => chr ($self->{nc}),
1080 value => '',
1081 line => $self->{line}, column => $self->{column}};
1082 $self->{state} = ATTRIBUTE_NAME_STATE;
1083
1084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1085 $self->{line_prev} = $self->{line};
1086 $self->{column_prev} = $self->{column};
1087 $self->{column}++;
1088 $self->{nc}
1089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1090 } else {
1091 $self->{set_nc}->($self);
1092 }
1093
1094 redo A;
1095 }
1096 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1097 my $before_leave = sub {
1098 if (exists $self->{ct}->{attributes} # start tag or end tag
1099 ->{$self->{ca}->{name}}) { # MUST
1100
1101 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1102 ## Discard $self->{ca} # MUST
1103 } else {
1104
1105 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1106 = $self->{ca};
1107 }
1108 }; # $before_leave
1109
1110 if ($is_space->{$self->{nc}}) {
1111
1112 $before_leave->();
1113 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1114
1115 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1116 $self->{line_prev} = $self->{line};
1117 $self->{column_prev} = $self->{column};
1118 $self->{column}++;
1119 $self->{nc}
1120 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1121 } else {
1122 $self->{set_nc}->($self);
1123 }
1124
1125 redo A;
1126 } elsif ($self->{nc} == 0x003D) { # =
1127
1128 $before_leave->();
1129 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1130
1131 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1132 $self->{line_prev} = $self->{line};
1133 $self->{column_prev} = $self->{column};
1134 $self->{column}++;
1135 $self->{nc}
1136 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1137 } else {
1138 $self->{set_nc}->($self);
1139 }
1140
1141 redo A;
1142 } elsif ($self->{nc} == 0x003E) { # >
1143 $before_leave->();
1144 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1145
1146 $self->{last_stag_name} = $self->{ct}->{tag_name};
1147 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1148
1149 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1150 if ($self->{ct}->{attributes}) {
1151 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1152 }
1153 } else {
1154 die "$0: $self->{ct}->{type}: Unknown token type";
1155 }
1156 $self->{state} = DATA_STATE;
1157 $self->{s_kwd} = '';
1158
1159 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1160 $self->{line_prev} = $self->{line};
1161 $self->{column_prev} = $self->{column};
1162 $self->{column}++;
1163 $self->{nc}
1164 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1165 } else {
1166 $self->{set_nc}->($self);
1167 }
1168
1169
1170 return ($self->{ct}); # start tag or end tag
1171
1172 redo A;
1173 } elsif (0x0041 <= $self->{nc} and
1174 $self->{nc} <= 0x005A) { # A..Z
1175
1176 $self->{ca}->{name}
1177 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1178 ## Stay in the state
1179
1180 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1181 $self->{line_prev} = $self->{line};
1182 $self->{column_prev} = $self->{column};
1183 $self->{column}++;
1184 $self->{nc}
1185 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1186 } else {
1187 $self->{set_nc}->($self);
1188 }
1189
1190 redo A;
1191 } elsif ($self->{nc} == 0x002F) { # /
1192
1193 $before_leave->();
1194 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1195
1196 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1197 $self->{line_prev} = $self->{line};
1198 $self->{column_prev} = $self->{column};
1199 $self->{column}++;
1200 $self->{nc}
1201 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1202 } else {
1203 $self->{set_nc}->($self);
1204 }
1205
1206 redo A;
1207 } elsif ($self->{nc} == -1) {
1208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1209 $before_leave->();
1210 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1211
1212 $self->{last_stag_name} = $self->{ct}->{tag_name};
1213 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1214 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1215 if ($self->{ct}->{attributes}) {
1216
1217 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1218 } else {
1219 ## NOTE: This state should never be reached.
1220
1221 }
1222 } else {
1223 die "$0: $self->{ct}->{type}: Unknown token type";
1224 }
1225 $self->{state} = DATA_STATE;
1226 $self->{s_kwd} = '';
1227 # reconsume
1228
1229 return ($self->{ct}); # start tag or end tag
1230
1231 redo A;
1232 } else {
1233 if ($self->{nc} == 0x0022 or # "
1234 $self->{nc} == 0x0027) { # '
1235
1236 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1237 } else {
1238
1239 }
1240 $self->{ca}->{name} .= chr ($self->{nc});
1241 ## Stay in the state
1242
1243 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1244 $self->{line_prev} = $self->{line};
1245 $self->{column_prev} = $self->{column};
1246 $self->{column}++;
1247 $self->{nc}
1248 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1249 } else {
1250 $self->{set_nc}->($self);
1251 }
1252
1253 redo A;
1254 }
1255 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1256 if ($is_space->{$self->{nc}}) {
1257
1258 ## Stay in the state
1259
1260 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1261 $self->{line_prev} = $self->{line};
1262 $self->{column_prev} = $self->{column};
1263 $self->{column}++;
1264 $self->{nc}
1265 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1266 } else {
1267 $self->{set_nc}->($self);
1268 }
1269
1270 redo A;
1271 } elsif ($self->{nc} == 0x003D) { # =
1272
1273 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1274
1275 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1276 $self->{line_prev} = $self->{line};
1277 $self->{column_prev} = $self->{column};
1278 $self->{column}++;
1279 $self->{nc}
1280 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1281 } else {
1282 $self->{set_nc}->($self);
1283 }
1284
1285 redo A;
1286 } elsif ($self->{nc} == 0x003E) { # >
1287 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1288
1289 $self->{last_stag_name} = $self->{ct}->{tag_name};
1290 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1291 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1292 if ($self->{ct}->{attributes}) {
1293
1294 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1295 } else {
1296 ## NOTE: This state should never be reached.
1297
1298 }
1299 } else {
1300 die "$0: $self->{ct}->{type}: Unknown token type";
1301 }
1302 $self->{state} = DATA_STATE;
1303 $self->{s_kwd} = '';
1304
1305 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1306 $self->{line_prev} = $self->{line};
1307 $self->{column_prev} = $self->{column};
1308 $self->{column}++;
1309 $self->{nc}
1310 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1311 } else {
1312 $self->{set_nc}->($self);
1313 }
1314
1315
1316 return ($self->{ct}); # start tag or end tag
1317
1318 redo A;
1319 } elsif (0x0041 <= $self->{nc} and
1320 $self->{nc} <= 0x005A) { # A..Z
1321
1322 $self->{ca}
1323 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1324 value => '',
1325 line => $self->{line}, column => $self->{column}};
1326 $self->{state} = ATTRIBUTE_NAME_STATE;
1327
1328 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1329 $self->{line_prev} = $self->{line};
1330 $self->{column_prev} = $self->{column};
1331 $self->{column}++;
1332 $self->{nc}
1333 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1334 } else {
1335 $self->{set_nc}->($self);
1336 }
1337
1338 redo A;
1339 } elsif ($self->{nc} == 0x002F) { # /
1340
1341 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1342
1343 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1344 $self->{line_prev} = $self->{line};
1345 $self->{column_prev} = $self->{column};
1346 $self->{column}++;
1347 $self->{nc}
1348 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1349 } else {
1350 $self->{set_nc}->($self);
1351 }
1352
1353 redo A;
1354 } elsif ($self->{nc} == -1) {
1355 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1356 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1357
1358 $self->{last_stag_name} = $self->{ct}->{tag_name};
1359 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1360 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1361 if ($self->{ct}->{attributes}) {
1362
1363 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1364 } else {
1365 ## NOTE: This state should never be reached.
1366
1367 }
1368 } else {
1369 die "$0: $self->{ct}->{type}: Unknown token type";
1370 }
1371 $self->{s_kwd} = '';
1372 $self->{state} = DATA_STATE;
1373 # reconsume
1374
1375 return ($self->{ct}); # start tag or end tag
1376
1377 redo A;
1378 } else {
1379 if ($self->{nc} == 0x0022 or # "
1380 $self->{nc} == 0x0027) { # '
1381
1382 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1383 } else {
1384
1385 }
1386 $self->{ca}
1387 = {name => chr ($self->{nc}),
1388 value => '',
1389 line => $self->{line}, column => $self->{column}};
1390 $self->{state} = ATTRIBUTE_NAME_STATE;
1391
1392 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1393 $self->{line_prev} = $self->{line};
1394 $self->{column_prev} = $self->{column};
1395 $self->{column}++;
1396 $self->{nc}
1397 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1398 } else {
1399 $self->{set_nc}->($self);
1400 }
1401
1402 redo A;
1403 }
1404 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1405 if ($is_space->{$self->{nc}}) {
1406
1407 ## Stay in the state
1408
1409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1410 $self->{line_prev} = $self->{line};
1411 $self->{column_prev} = $self->{column};
1412 $self->{column}++;
1413 $self->{nc}
1414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1415 } else {
1416 $self->{set_nc}->($self);
1417 }
1418
1419 redo A;
1420 } elsif ($self->{nc} == 0x0022) { # "
1421
1422 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1423
1424 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1425 $self->{line_prev} = $self->{line};
1426 $self->{column_prev} = $self->{column};
1427 $self->{column}++;
1428 $self->{nc}
1429 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1430 } else {
1431 $self->{set_nc}->($self);
1432 }
1433
1434 redo A;
1435 } elsif ($self->{nc} == 0x0026) { # &
1436
1437 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1438 ## reconsume
1439 redo A;
1440 } elsif ($self->{nc} == 0x0027) { # '
1441
1442 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1443
1444 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445 $self->{line_prev} = $self->{line};
1446 $self->{column_prev} = $self->{column};
1447 $self->{column}++;
1448 $self->{nc}
1449 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450 } else {
1451 $self->{set_nc}->($self);
1452 }
1453
1454 redo A;
1455 } elsif ($self->{nc} == 0x003E) { # >
1456 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1457 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1458
1459 $self->{last_stag_name} = $self->{ct}->{tag_name};
1460 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1461 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1462 if ($self->{ct}->{attributes}) {
1463
1464 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1465 } else {
1466 ## NOTE: This state should never be reached.
1467
1468 }
1469 } else {
1470 die "$0: $self->{ct}->{type}: Unknown token type";
1471 }
1472 $self->{state} = DATA_STATE;
1473 $self->{s_kwd} = '';
1474
1475 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1476 $self->{line_prev} = $self->{line};
1477 $self->{column_prev} = $self->{column};
1478 $self->{column}++;
1479 $self->{nc}
1480 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1481 } else {
1482 $self->{set_nc}->($self);
1483 }
1484
1485
1486 return ($self->{ct}); # start tag or end tag
1487
1488 redo A;
1489 } elsif ($self->{nc} == -1) {
1490 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1491 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1492
1493 $self->{last_stag_name} = $self->{ct}->{tag_name};
1494 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1495 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1496 if ($self->{ct}->{attributes}) {
1497
1498 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1499 } else {
1500 ## NOTE: This state should never be reached.
1501
1502 }
1503 } else {
1504 die "$0: $self->{ct}->{type}: Unknown token type";
1505 }
1506 $self->{state} = DATA_STATE;
1507 $self->{s_kwd} = '';
1508 ## reconsume
1509
1510 return ($self->{ct}); # start tag or end tag
1511
1512 redo A;
1513 } else {
1514 if ($self->{nc} == 0x003D) { # =
1515
1516 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1517 } else {
1518
1519 }
1520 $self->{ca}->{value} .= chr ($self->{nc});
1521 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1522
1523 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1524 $self->{line_prev} = $self->{line};
1525 $self->{column_prev} = $self->{column};
1526 $self->{column}++;
1527 $self->{nc}
1528 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1529 } else {
1530 $self->{set_nc}->($self);
1531 }
1532
1533 redo A;
1534 }
1535 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1536 if ($self->{nc} == 0x0022) { # "
1537
1538 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1539
1540 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1541 $self->{line_prev} = $self->{line};
1542 $self->{column_prev} = $self->{column};
1543 $self->{column}++;
1544 $self->{nc}
1545 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1546 } else {
1547 $self->{set_nc}->($self);
1548 }
1549
1550 redo A;
1551 } elsif ($self->{nc} == 0x0026) { # &
1552
1553 ## NOTE: In the spec, the tokenizer is switched to the
1554 ## "entity in attribute value state". In this implementation, the
1555 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1556 ## implementation of the "consume a character reference" algorithm.
1557 $self->{prev_state} = $self->{state};
1558 $self->{entity_add} = 0x0022; # "
1559 $self->{state} = ENTITY_STATE;
1560
1561 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562 $self->{line_prev} = $self->{line};
1563 $self->{column_prev} = $self->{column};
1564 $self->{column}++;
1565 $self->{nc}
1566 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567 } else {
1568 $self->{set_nc}->($self);
1569 }
1570
1571 redo A;
1572 } elsif ($self->{nc} == -1) {
1573 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1574 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575
1576 $self->{last_stag_name} = $self->{ct}->{tag_name};
1577 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579 if ($self->{ct}->{attributes}) {
1580
1581 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582 } else {
1583 ## NOTE: This state should never be reached.
1584
1585 }
1586 } else {
1587 die "$0: $self->{ct}->{type}: Unknown token type";
1588 }
1589 $self->{state} = DATA_STATE;
1590 $self->{s_kwd} = '';
1591 ## reconsume
1592
1593 return ($self->{ct}); # start tag or end tag
1594
1595 redo A;
1596 } else {
1597
1598 $self->{ca}->{value} .= chr ($self->{nc});
1599 $self->{read_until}->($self->{ca}->{value},
1600 q["&],
1601 length $self->{ca}->{value});
1602
1603 ## Stay in the state
1604
1605 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1606 $self->{line_prev} = $self->{line};
1607 $self->{column_prev} = $self->{column};
1608 $self->{column}++;
1609 $self->{nc}
1610 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1611 } else {
1612 $self->{set_nc}->($self);
1613 }
1614
1615 redo A;
1616 }
1617 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1618 if ($self->{nc} == 0x0027) { # '
1619
1620 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1621
1622 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1623 $self->{line_prev} = $self->{line};
1624 $self->{column_prev} = $self->{column};
1625 $self->{column}++;
1626 $self->{nc}
1627 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1628 } else {
1629 $self->{set_nc}->($self);
1630 }
1631
1632 redo A;
1633 } elsif ($self->{nc} == 0x0026) { # &
1634
1635 ## NOTE: In the spec, the tokenizer is switched to the
1636 ## "entity in attribute value state". In this implementation, the
1637 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1638 ## implementation of the "consume a character reference" algorithm.
1639 $self->{entity_add} = 0x0027; # '
1640 $self->{prev_state} = $self->{state};
1641 $self->{state} = ENTITY_STATE;
1642
1643 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1644 $self->{line_prev} = $self->{line};
1645 $self->{column_prev} = $self->{column};
1646 $self->{column}++;
1647 $self->{nc}
1648 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1649 } else {
1650 $self->{set_nc}->($self);
1651 }
1652
1653 redo A;
1654 } elsif ($self->{nc} == -1) {
1655 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1656 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1657
1658 $self->{last_stag_name} = $self->{ct}->{tag_name};
1659 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1660 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1661 if ($self->{ct}->{attributes}) {
1662
1663 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1664 } else {
1665 ## NOTE: This state should never be reached.
1666
1667 }
1668 } else {
1669 die "$0: $self->{ct}->{type}: Unknown token type";
1670 }
1671 $self->{state} = DATA_STATE;
1672 $self->{s_kwd} = '';
1673 ## reconsume
1674
1675 return ($self->{ct}); # start tag or end tag
1676
1677 redo A;
1678 } else {
1679
1680 $self->{ca}->{value} .= chr ($self->{nc});
1681 $self->{read_until}->($self->{ca}->{value},
1682 q['&],
1683 length $self->{ca}->{value});
1684
1685 ## Stay in the state
1686
1687 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1688 $self->{line_prev} = $self->{line};
1689 $self->{column_prev} = $self->{column};
1690 $self->{column}++;
1691 $self->{nc}
1692 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1693 } else {
1694 $self->{set_nc}->($self);
1695 }
1696
1697 redo A;
1698 }
1699 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1700 if ($is_space->{$self->{nc}}) {
1701
1702 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1703
1704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705 $self->{line_prev} = $self->{line};
1706 $self->{column_prev} = $self->{column};
1707 $self->{column}++;
1708 $self->{nc}
1709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710 } else {
1711 $self->{set_nc}->($self);
1712 }
1713
1714 redo A;
1715 } elsif ($self->{nc} == 0x0026) { # &
1716
1717 ## NOTE: In the spec, the tokenizer is switched to the
1718 ## "entity in attribute value state". In this implementation, the
1719 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1720 ## implementation of the "consume a character reference" algorithm.
1721 $self->{entity_add} = -1;
1722 $self->{prev_state} = $self->{state};
1723 $self->{state} = ENTITY_STATE;
1724
1725 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1726 $self->{line_prev} = $self->{line};
1727 $self->{column_prev} = $self->{column};
1728 $self->{column}++;
1729 $self->{nc}
1730 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1731 } else {
1732 $self->{set_nc}->($self);
1733 }
1734
1735 redo A;
1736 } elsif ($self->{nc} == 0x003E) { # >
1737 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1738
1739 $self->{last_stag_name} = $self->{ct}->{tag_name};
1740 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1741 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1742 if ($self->{ct}->{attributes}) {
1743
1744 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1745 } else {
1746 ## NOTE: This state should never be reached.
1747
1748 }
1749 } else {
1750 die "$0: $self->{ct}->{type}: Unknown token type";
1751 }
1752 $self->{state} = DATA_STATE;
1753 $self->{s_kwd} = '';
1754
1755 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1756 $self->{line_prev} = $self->{line};
1757 $self->{column_prev} = $self->{column};
1758 $self->{column}++;
1759 $self->{nc}
1760 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1761 } else {
1762 $self->{set_nc}->($self);
1763 }
1764
1765
1766 return ($self->{ct}); # start tag or end tag
1767
1768 redo A;
1769 } elsif ($self->{nc} == -1) {
1770 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1771 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1772
1773 $self->{last_stag_name} = $self->{ct}->{tag_name};
1774 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1775 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1776 if ($self->{ct}->{attributes}) {
1777
1778 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1779 } else {
1780 ## NOTE: This state should never be reached.
1781
1782 }
1783 } else {
1784 die "$0: $self->{ct}->{type}: Unknown token type";
1785 }
1786 $self->{state} = DATA_STATE;
1787 $self->{s_kwd} = '';
1788 ## reconsume
1789
1790 return ($self->{ct}); # start tag or end tag
1791
1792 redo A;
1793 } else {
1794 if ({
1795 0x0022 => 1, # "
1796 0x0027 => 1, # '
1797 0x003D => 1, # =
1798 }->{$self->{nc}}) {
1799
1800 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1801 } else {
1802
1803 }
1804 $self->{ca}->{value} .= chr ($self->{nc});
1805 $self->{read_until}->($self->{ca}->{value},
1806 q["'=& >],
1807 length $self->{ca}->{value});
1808
1809 ## Stay in the state
1810
1811 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1812 $self->{line_prev} = $self->{line};
1813 $self->{column_prev} = $self->{column};
1814 $self->{column}++;
1815 $self->{nc}
1816 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1817 } else {
1818 $self->{set_nc}->($self);
1819 }
1820
1821 redo A;
1822 }
1823 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1824 if ($is_space->{$self->{nc}}) {
1825
1826 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1827
1828 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1829 $self->{line_prev} = $self->{line};
1830 $self->{column_prev} = $self->{column};
1831 $self->{column}++;
1832 $self->{nc}
1833 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1834 } else {
1835 $self->{set_nc}->($self);
1836 }
1837
1838 redo A;
1839 } elsif ($self->{nc} == 0x003E) { # >
1840 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1841
1842 $self->{last_stag_name} = $self->{ct}->{tag_name};
1843 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1844 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1845 if ($self->{ct}->{attributes}) {
1846
1847 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1848 } else {
1849 ## NOTE: This state should never be reached.
1850
1851 }
1852 } else {
1853 die "$0: $self->{ct}->{type}: Unknown token type";
1854 }
1855 $self->{state} = DATA_STATE;
1856 $self->{s_kwd} = '';
1857
1858 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1859 $self->{line_prev} = $self->{line};
1860 $self->{column_prev} = $self->{column};
1861 $self->{column}++;
1862 $self->{nc}
1863 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1864 } else {
1865 $self->{set_nc}->($self);
1866 }
1867
1868
1869 return ($self->{ct}); # start tag or end tag
1870
1871 redo A;
1872 } elsif ($self->{nc} == 0x002F) { # /
1873
1874 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1875
1876 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1877 $self->{line_prev} = $self->{line};
1878 $self->{column_prev} = $self->{column};
1879 $self->{column}++;
1880 $self->{nc}
1881 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1882 } else {
1883 $self->{set_nc}->($self);
1884 }
1885
1886 redo A;
1887 } elsif ($self->{nc} == -1) {
1888 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1889 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1890
1891 $self->{last_stag_name} = $self->{ct}->{tag_name};
1892 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1893 if ($self->{ct}->{attributes}) {
1894
1895 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1896 } else {
1897 ## NOTE: This state should never be reached.
1898
1899 }
1900 } else {
1901 die "$0: $self->{ct}->{type}: Unknown token type";
1902 }
1903 $self->{state} = DATA_STATE;
1904 $self->{s_kwd} = '';
1905 ## Reconsume.
1906 return ($self->{ct}); # start tag or end tag
1907 redo A;
1908 } else {
1909
1910 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1911 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1912 ## reconsume
1913 redo A;
1914 }
1915 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1916 if ($self->{nc} == 0x003E) { # >
1917 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1918
1919 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1920 ## TODO: Different type than slash in start tag
1921 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1922 if ($self->{ct}->{attributes}) {
1923
1924 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1925 } else {
1926
1927 }
1928 ## TODO: Test |<title></title/>|
1929 } else {
1930
1931 $self->{self_closing} = 1;
1932 }
1933
1934 $self->{state} = DATA_STATE;
1935 $self->{s_kwd} = '';
1936
1937 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1938 $self->{line_prev} = $self->{line};
1939 $self->{column_prev} = $self->{column};
1940 $self->{column}++;
1941 $self->{nc}
1942 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1943 } else {
1944 $self->{set_nc}->($self);
1945 }
1946
1947
1948 return ($self->{ct}); # start tag or end tag
1949
1950 redo A;
1951 } elsif ($self->{nc} == -1) {
1952 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1953 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1954
1955 $self->{last_stag_name} = $self->{ct}->{tag_name};
1956 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1957 if ($self->{ct}->{attributes}) {
1958
1959 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1960 } else {
1961 ## NOTE: This state should never be reached.
1962
1963 }
1964 } else {
1965 die "$0: $self->{ct}->{type}: Unknown token type";
1966 }
1967 $self->{state} = DATA_STATE;
1968 $self->{s_kwd} = '';
1969 ## Reconsume.
1970 return ($self->{ct}); # start tag or end tag
1971 redo A;
1972 } else {
1973
1974 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1975 ## TODO: This error type is wrong.
1976 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1977 ## Reconsume.
1978 redo A;
1979 }
1980 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1981 ## (only happen if PCDATA state)
1982
1983 ## NOTE: Unlike spec's "bogus comment state", this implementation
1984 ## consumes characters one-by-one basis.
1985
1986 if ($self->{nc} == 0x003E) { # >
1987
1988 $self->{state} = DATA_STATE;
1989 $self->{s_kwd} = '';
1990
1991 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1992 $self->{line_prev} = $self->{line};
1993 $self->{column_prev} = $self->{column};
1994 $self->{column}++;
1995 $self->{nc}
1996 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1997 } else {
1998 $self->{set_nc}->($self);
1999 }
2000
2001
2002 return ($self->{ct}); # comment
2003 redo A;
2004 } elsif ($self->{nc} == -1) {
2005
2006 $self->{state} = DATA_STATE;
2007 $self->{s_kwd} = '';
2008 ## reconsume
2009
2010 return ($self->{ct}); # comment
2011 redo A;
2012 } else {
2013
2014 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2015 $self->{read_until}->($self->{ct}->{data},
2016 q[>],
2017 length $self->{ct}->{data});
2018
2019 ## Stay in the state.
2020
2021 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2022 $self->{line_prev} = $self->{line};
2023 $self->{column_prev} = $self->{column};
2024 $self->{column}++;
2025 $self->{nc}
2026 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2027 } else {
2028 $self->{set_nc}->($self);
2029 }
2030
2031 redo A;
2032 }
2033 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2034 ## (only happen if PCDATA state)
2035
2036 if ($self->{nc} == 0x002D) { # -
2037
2038 $self->{state} = MD_HYPHEN_STATE;
2039
2040 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2041 $self->{line_prev} = $self->{line};
2042 $self->{column_prev} = $self->{column};
2043 $self->{column}++;
2044 $self->{nc}
2045 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2046 } else {
2047 $self->{set_nc}->($self);
2048 }
2049
2050 redo A;
2051 } elsif ($self->{nc} == 0x0044 or # D
2052 $self->{nc} == 0x0064) { # d
2053 ## ASCII case-insensitive.
2054
2055 $self->{state} = MD_DOCTYPE_STATE;
2056 $self->{s_kwd} = chr $self->{nc};
2057
2058 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2059 $self->{line_prev} = $self->{line};
2060 $self->{column_prev} = $self->{column};
2061 $self->{column}++;
2062 $self->{nc}
2063 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2064 } else {
2065 $self->{set_nc}->($self);
2066 }
2067
2068 redo A;
2069 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2070 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2071 $self->{is_xml}) and
2072 $self->{nc} == 0x005B) { # [
2073
2074 $self->{state} = MD_CDATA_STATE;
2075 $self->{s_kwd} = '[';
2076
2077 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2078 $self->{line_prev} = $self->{line};
2079 $self->{column_prev} = $self->{column};
2080 $self->{column}++;
2081 $self->{nc}
2082 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2083 } else {
2084 $self->{set_nc}->($self);
2085 }
2086
2087 redo A;
2088 } else {
2089
2090 }
2091
2092 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2093 line => $self->{line_prev},
2094 column => $self->{column_prev} - 1);
2095 ## Reconsume.
2096 $self->{state} = BOGUS_COMMENT_STATE;
2097 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2098 line => $self->{line_prev},
2099 column => $self->{column_prev} - 1,
2100 };
2101 redo A;
2102 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2103 if ($self->{nc} == 0x002D) { # -
2104
2105 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2106 line => $self->{line_prev},
2107 column => $self->{column_prev} - 2,
2108 };
2109 $self->{state} = COMMENT_START_STATE;
2110
2111 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2112 $self->{line_prev} = $self->{line};
2113 $self->{column_prev} = $self->{column};
2114 $self->{column}++;
2115 $self->{nc}
2116 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2117 } else {
2118 $self->{set_nc}->($self);
2119 }
2120
2121 redo A;
2122 } else {
2123
2124 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2125 line => $self->{line_prev},
2126 column => $self->{column_prev} - 2);
2127 $self->{state} = BOGUS_COMMENT_STATE;
2128 ## Reconsume.
2129 $self->{ct} = {type => COMMENT_TOKEN,
2130 data => '-',
2131 line => $self->{line_prev},
2132 column => $self->{column_prev} - 2,
2133 };
2134 redo A;
2135 }
2136 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2137 ## ASCII case-insensitive.
2138 if ($self->{nc} == [
2139 undef,
2140 0x004F, # O
2141 0x0043, # C
2142 0x0054, # T
2143 0x0059, # Y
2144 0x0050, # P
2145 ]->[length $self->{s_kwd}] or
2146 $self->{nc} == [
2147 undef,
2148 0x006F, # o
2149 0x0063, # c
2150 0x0074, # t
2151 0x0079, # y
2152 0x0070, # p
2153 ]->[length $self->{s_kwd}]) {
2154
2155 ## Stay in the state.
2156 $self->{s_kwd} .= chr $self->{nc};
2157
2158 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2159 $self->{line_prev} = $self->{line};
2160 $self->{column_prev} = $self->{column};
2161 $self->{column}++;
2162 $self->{nc}
2163 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2164 } else {
2165 $self->{set_nc}->($self);
2166 }
2167
2168 redo A;
2169 } elsif ((length $self->{s_kwd}) == 6 and
2170 ($self->{nc} == 0x0045 or # E
2171 $self->{nc} == 0x0065)) { # e
2172
2173 $self->{state} = DOCTYPE_STATE;
2174 $self->{ct} = {type => DOCTYPE_TOKEN,
2175 quirks => 1,
2176 line => $self->{line_prev},
2177 column => $self->{column_prev} - 7,
2178 };
2179
2180 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2181 $self->{line_prev} = $self->{line};
2182 $self->{column_prev} = $self->{column};
2183 $self->{column}++;
2184 $self->{nc}
2185 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2186 } else {
2187 $self->{set_nc}->($self);
2188 }
2189
2190 redo A;
2191 } else {
2192
2193 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2194 line => $self->{line_prev},
2195 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2196 $self->{state} = BOGUS_COMMENT_STATE;
2197 ## Reconsume.
2198 $self->{ct} = {type => COMMENT_TOKEN,
2199 data => $self->{s_kwd},
2200 line => $self->{line_prev},
2201 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2202 };
2203 redo A;
2204 }
2205 } elsif ($self->{state} == MD_CDATA_STATE) {
2206 if ($self->{nc} == {
2207 '[' => 0x0043, # C
2208 '[C' => 0x0044, # D
2209 '[CD' => 0x0041, # A
2210 '[CDA' => 0x0054, # T
2211 '[CDAT' => 0x0041, # A
2212 }->{$self->{s_kwd}}) {
2213
2214 ## Stay in the state.
2215 $self->{s_kwd} .= chr $self->{nc};
2216
2217 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2218 $self->{line_prev} = $self->{line};
2219 $self->{column_prev} = $self->{column};
2220 $self->{column}++;
2221 $self->{nc}
2222 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2223 } else {
2224 $self->{set_nc}->($self);
2225 }
2226
2227 redo A;
2228 } elsif ($self->{s_kwd} eq '[CDATA' and
2229 $self->{nc} == 0x005B) { # [
2230
2231
2232 if ($self->{is_xml} and
2233 not $self->{tainted} and
2234 @{$self->{open_elements} or []} == 0) {
2235 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2236 line => $self->{line_prev},
2237 column => $self->{column_prev} - 7);
2238 $self->{tainted} = 1;
2239 }
2240
2241 $self->{ct} = {type => CHARACTER_TOKEN,
2242 data => '',
2243 line => $self->{line_prev},
2244 column => $self->{column_prev} - 7};
2245 $self->{state} = CDATA_SECTION_STATE;
2246
2247 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2248 $self->{line_prev} = $self->{line};
2249 $self->{column_prev} = $self->{column};
2250 $self->{column}++;
2251 $self->{nc}
2252 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2253 } else {
2254 $self->{set_nc}->($self);
2255 }
2256
2257 redo A;
2258 } else {
2259
2260 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2261 line => $self->{line_prev},
2262 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2263 $self->{state} = BOGUS_COMMENT_STATE;
2264 ## Reconsume.
2265 $self->{ct} = {type => COMMENT_TOKEN,
2266 data => $self->{s_kwd},
2267 line => $self->{line_prev},
2268 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2269 };
2270 redo A;
2271 }
2272 } elsif ($self->{state} == COMMENT_START_STATE) {
2273 if ($self->{nc} == 0x002D) { # -
2274
2275 $self->{state} = COMMENT_START_DASH_STATE;
2276
2277 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2278 $self->{line_prev} = $self->{line};
2279 $self->{column_prev} = $self->{column};
2280 $self->{column}++;
2281 $self->{nc}
2282 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2283 } else {
2284 $self->{set_nc}->($self);
2285 }
2286
2287 redo A;
2288 } elsif ($self->{nc} == 0x003E) { # >
2289
2290 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2291 $self->{state} = DATA_STATE;
2292 $self->{s_kwd} = '';
2293
2294 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2295 $self->{line_prev} = $self->{line};
2296 $self->{column_prev} = $self->{column};
2297 $self->{column}++;
2298 $self->{nc}
2299 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2300 } else {
2301 $self->{set_nc}->($self);
2302 }
2303
2304
2305 return ($self->{ct}); # comment
2306
2307 redo A;
2308 } elsif ($self->{nc} == -1) {
2309
2310 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2311 $self->{state} = DATA_STATE;
2312 $self->{s_kwd} = '';
2313 ## reconsume
2314
2315 return ($self->{ct}); # comment
2316
2317 redo A;
2318 } else {
2319
2320 $self->{ct}->{data} # comment
2321 .= chr ($self->{nc});
2322 $self->{state} = COMMENT_STATE;
2323
2324 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2325 $self->{line_prev} = $self->{line};
2326 $self->{column_prev} = $self->{column};
2327 $self->{column}++;
2328 $self->{nc}
2329 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2330 } else {
2331 $self->{set_nc}->($self);
2332 }
2333
2334 redo A;
2335 }
2336 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2337 if ($self->{nc} == 0x002D) { # -
2338
2339 $self->{state} = COMMENT_END_STATE;
2340
2341 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2342 $self->{line_prev} = $self->{line};
2343 $self->{column_prev} = $self->{column};
2344 $self->{column}++;
2345 $self->{nc}
2346 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2347 } else {
2348 $self->{set_nc}->($self);
2349 }
2350
2351 redo A;
2352 } elsif ($self->{nc} == 0x003E) { # >
2353
2354 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2355 $self->{state} = DATA_STATE;
2356 $self->{s_kwd} = '';
2357
2358 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2359 $self->{line_prev} = $self->{line};
2360 $self->{column_prev} = $self->{column};
2361 $self->{column}++;
2362 $self->{nc}
2363 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2364 } else {
2365 $self->{set_nc}->($self);
2366 }
2367
2368
2369 return ($self->{ct}); # comment
2370
2371 redo A;
2372 } elsif ($self->{nc} == -1) {
2373
2374 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2375 $self->{state} = DATA_STATE;
2376 $self->{s_kwd} = '';
2377 ## reconsume
2378
2379 return ($self->{ct}); # comment
2380
2381 redo A;
2382 } else {
2383
2384 $self->{ct}->{data} # comment
2385 .= '-' . chr ($self->{nc});
2386 $self->{state} = COMMENT_STATE;
2387
2388 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2389 $self->{line_prev} = $self->{line};
2390 $self->{column_prev} = $self->{column};
2391 $self->{column}++;
2392 $self->{nc}
2393 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2394 } else {
2395 $self->{set_nc}->($self);
2396 }
2397
2398 redo A;
2399 }
2400 } elsif ($self->{state} == COMMENT_STATE) {
2401 if ($self->{nc} == 0x002D) { # -
2402
2403 $self->{state} = COMMENT_END_DASH_STATE;
2404
2405 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2406 $self->{line_prev} = $self->{line};
2407 $self->{column_prev} = $self->{column};
2408 $self->{column}++;
2409 $self->{nc}
2410 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2411 } else {
2412 $self->{set_nc}->($self);
2413 }
2414
2415 redo A;
2416 } elsif ($self->{nc} == -1) {
2417
2418 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2419 $self->{state} = DATA_STATE;
2420 $self->{s_kwd} = '';
2421 ## reconsume
2422
2423 return ($self->{ct}); # comment
2424
2425 redo A;
2426 } else {
2427
2428 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2429 $self->{read_until}->($self->{ct}->{data},
2430 q[-],
2431 length $self->{ct}->{data});
2432
2433 ## Stay in the state
2434
2435 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2436 $self->{line_prev} = $self->{line};
2437 $self->{column_prev} = $self->{column};
2438 $self->{column}++;
2439 $self->{nc}
2440 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2441 } else {
2442 $self->{set_nc}->($self);
2443 }
2444
2445 redo A;
2446 }
2447 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2448 if ($self->{nc} == 0x002D) { # -
2449
2450 $self->{state} = COMMENT_END_STATE;
2451
2452 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2453 $self->{line_prev} = $self->{line};
2454 $self->{column_prev} = $self->{column};
2455 $self->{column}++;
2456 $self->{nc}
2457 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2458 } else {
2459 $self->{set_nc}->($self);
2460 }
2461
2462 redo A;
2463 } elsif ($self->{nc} == -1) {
2464
2465 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2466 $self->{s_kwd} = '';
2467 $self->{state} = DATA_STATE;
2468 $self->{s_kwd} = '';
2469 ## reconsume
2470
2471 return ($self->{ct}); # comment
2472
2473 redo A;
2474 } else {
2475
2476 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2477 $self->{state} = COMMENT_STATE;
2478
2479 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480 $self->{line_prev} = $self->{line};
2481 $self->{column_prev} = $self->{column};
2482 $self->{column}++;
2483 $self->{nc}
2484 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485 } else {
2486 $self->{set_nc}->($self);
2487 }
2488
2489 redo A;
2490 }
2491 } elsif ($self->{state} == COMMENT_END_STATE) {
2492 if ($self->{nc} == 0x003E) { # >
2493
2494 $self->{state} = DATA_STATE;
2495 $self->{s_kwd} = '';
2496
2497 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2498 $self->{line_prev} = $self->{line};
2499 $self->{column_prev} = $self->{column};
2500 $self->{column}++;
2501 $self->{nc}
2502 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2503 } else {
2504 $self->{set_nc}->($self);
2505 }
2506
2507
2508 return ($self->{ct}); # comment
2509
2510 redo A;
2511 } elsif ($self->{nc} == 0x002D) { # -
2512
2513 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2514 line => $self->{line_prev},
2515 column => $self->{column_prev});
2516 $self->{ct}->{data} .= '-'; # comment
2517 ## Stay in the state
2518
2519 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2520 $self->{line_prev} = $self->{line};
2521 $self->{column_prev} = $self->{column};
2522 $self->{column}++;
2523 $self->{nc}
2524 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2525 } else {
2526 $self->{set_nc}->($self);
2527 }
2528
2529 redo A;
2530 } elsif ($self->{nc} == -1) {
2531
2532 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2533 $self->{state} = DATA_STATE;
2534 $self->{s_kwd} = '';
2535 ## reconsume
2536
2537 return ($self->{ct}); # comment
2538
2539 redo A;
2540 } else {
2541
2542 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2543 line => $self->{line_prev},
2544 column => $self->{column_prev});
2545 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2546 $self->{state} = COMMENT_STATE;
2547
2548 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2549 $self->{line_prev} = $self->{line};
2550 $self->{column_prev} = $self->{column};
2551 $self->{column}++;
2552 $self->{nc}
2553 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2554 } else {
2555 $self->{set_nc}->($self);
2556 }
2557
2558 redo A;
2559 }
2560 } elsif ($self->{state} == DOCTYPE_STATE) {
2561 if ($is_space->{$self->{nc}}) {
2562
2563 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2564
2565 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2566 $self->{line_prev} = $self->{line};
2567 $self->{column_prev} = $self->{column};
2568 $self->{column}++;
2569 $self->{nc}
2570 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2571 } else {
2572 $self->{set_nc}->($self);
2573 }
2574
2575 redo A;
2576 } else {
2577
2578 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2579 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2580 ## reconsume
2581 redo A;
2582 }
2583 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2584 if ($is_space->{$self->{nc}}) {
2585
2586 ## Stay in the state
2587
2588 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2589 $self->{line_prev} = $self->{line};
2590 $self->{column_prev} = $self->{column};
2591 $self->{column}++;
2592 $self->{nc}
2593 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2594 } else {
2595 $self->{set_nc}->($self);
2596 }
2597
2598 redo A;
2599 } elsif ($self->{nc} == 0x003E) { # >
2600
2601 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2602 $self->{state} = DATA_STATE;
2603 $self->{s_kwd} = '';
2604
2605 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2606 $self->{line_prev} = $self->{line};
2607 $self->{column_prev} = $self->{column};
2608 $self->{column}++;
2609 $self->{nc}
2610 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2611 } else {
2612 $self->{set_nc}->($self);
2613 }
2614
2615
2616 return ($self->{ct}); # DOCTYPE (quirks)
2617
2618 redo A;
2619 } elsif ($self->{nc} == -1) {
2620
2621 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2622 $self->{state} = DATA_STATE;
2623 $self->{s_kwd} = '';
2624 ## reconsume
2625
2626 return ($self->{ct}); # DOCTYPE (quirks)
2627
2628 redo A;
2629 } else {
2630
2631 $self->{ct}->{name} = chr $self->{nc};
2632 delete $self->{ct}->{quirks};
2633 $self->{state} = DOCTYPE_NAME_STATE;
2634
2635 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2636 $self->{line_prev} = $self->{line};
2637 $self->{column_prev} = $self->{column};
2638 $self->{column}++;
2639 $self->{nc}
2640 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2641 } else {
2642 $self->{set_nc}->($self);
2643 }
2644
2645 redo A;
2646 }
2647 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2648 ## ISSUE: Redundant "First," in the spec.
2649 if ($is_space->{$self->{nc}}) {
2650
2651 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2652
2653 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2654 $self->{line_prev} = $self->{line};
2655 $self->{column_prev} = $self->{column};
2656 $self->{column}++;
2657 $self->{nc}
2658 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2659 } else {
2660 $self->{set_nc}->($self);
2661 }
2662
2663 redo A;
2664 } elsif ($self->{nc} == 0x003E) { # >
2665
2666 $self->{state} = DATA_STATE;
2667 $self->{s_kwd} = '';
2668
2669 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2670 $self->{line_prev} = $self->{line};
2671 $self->{column_prev} = $self->{column};
2672 $self->{column}++;
2673 $self->{nc}
2674 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2675 } else {
2676 $self->{set_nc}->($self);
2677 }
2678
2679
2680 return ($self->{ct}); # DOCTYPE
2681
2682 redo A;
2683 } elsif ($self->{nc} == -1) {
2684
2685 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2686 $self->{state} = DATA_STATE;
2687 $self->{s_kwd} = '';
2688 ## reconsume
2689
2690 $self->{ct}->{quirks} = 1;
2691 return ($self->{ct}); # DOCTYPE
2692
2693 redo A;
2694 } else {
2695
2696 $self->{ct}->{name}
2697 .= chr ($self->{nc}); # DOCTYPE
2698 ## Stay in the state
2699
2700 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2701 $self->{line_prev} = $self->{line};
2702 $self->{column_prev} = $self->{column};
2703 $self->{column}++;
2704 $self->{nc}
2705 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2706 } else {
2707 $self->{set_nc}->($self);
2708 }
2709
2710 redo A;
2711 }
2712 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2713 if ($is_space->{$self->{nc}}) {
2714
2715 ## Stay in the state
2716
2717 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2718 $self->{line_prev} = $self->{line};
2719 $self->{column_prev} = $self->{column};
2720 $self->{column}++;
2721 $self->{nc}
2722 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2723 } else {
2724 $self->{set_nc}->($self);
2725 }
2726
2727 redo A;
2728 } elsif ($self->{nc} == 0x003E) { # >
2729
2730 $self->{state} = DATA_STATE;
2731 $self->{s_kwd} = '';
2732
2733 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2734 $self->{line_prev} = $self->{line};
2735 $self->{column_prev} = $self->{column};
2736 $self->{column}++;
2737 $self->{nc}
2738 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2739 } else {
2740 $self->{set_nc}->($self);
2741 }
2742
2743
2744 return ($self->{ct}); # DOCTYPE
2745
2746 redo A;
2747 } elsif ($self->{nc} == -1) {
2748
2749 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2750 $self->{state} = DATA_STATE;
2751 $self->{s_kwd} = '';
2752 ## reconsume
2753
2754 $self->{ct}->{quirks} = 1;
2755 return ($self->{ct}); # DOCTYPE
2756
2757 redo A;
2758 } elsif ($self->{nc} == 0x0050 or # P
2759 $self->{nc} == 0x0070) { # p
2760 $self->{state} = PUBLIC_STATE;
2761 $self->{s_kwd} = chr $self->{nc};
2762
2763 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2764 $self->{line_prev} = $self->{line};
2765 $self->{column_prev} = $self->{column};
2766 $self->{column}++;
2767 $self->{nc}
2768 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2769 } else {
2770 $self->{set_nc}->($self);
2771 }
2772
2773 redo A;
2774 } elsif ($self->{nc} == 0x0053 or # S
2775 $self->{nc} == 0x0073) { # s
2776 $self->{state} = SYSTEM_STATE;
2777 $self->{s_kwd} = chr $self->{nc};
2778
2779 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2780 $self->{line_prev} = $self->{line};
2781 $self->{column_prev} = $self->{column};
2782 $self->{column}++;
2783 $self->{nc}
2784 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2785 } else {
2786 $self->{set_nc}->($self);
2787 }
2788
2789 redo A;
2790 } else {
2791
2792 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2793 $self->{ct}->{quirks} = 1;
2794
2795 $self->{state} = BOGUS_DOCTYPE_STATE;
2796
2797 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2798 $self->{line_prev} = $self->{line};
2799 $self->{column_prev} = $self->{column};
2800 $self->{column}++;
2801 $self->{nc}
2802 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2803 } else {
2804 $self->{set_nc}->($self);
2805 }
2806
2807 redo A;
2808 }
2809 } elsif ($self->{state} == PUBLIC_STATE) {
2810 ## ASCII case-insensitive
2811 if ($self->{nc} == [
2812 undef,
2813 0x0055, # U
2814 0x0042, # B
2815 0x004C, # L
2816 0x0049, # I
2817 ]->[length $self->{s_kwd}] or
2818 $self->{nc} == [
2819 undef,
2820 0x0075, # u
2821 0x0062, # b
2822 0x006C, # l
2823 0x0069, # i
2824 ]->[length $self->{s_kwd}]) {
2825
2826 ## Stay in the state.
2827 $self->{s_kwd} .= chr $self->{nc};
2828
2829 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2830 $self->{line_prev} = $self->{line};
2831 $self->{column_prev} = $self->{column};
2832 $self->{column}++;
2833 $self->{nc}
2834 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2835 } else {
2836 $self->{set_nc}->($self);
2837 }
2838
2839 redo A;
2840 } elsif ((length $self->{s_kwd}) == 5 and
2841 ($self->{nc} == 0x0043 or # C
2842 $self->{nc} == 0x0063)) { # c
2843
2844 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2845
2846 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2847 $self->{line_prev} = $self->{line};
2848 $self->{column_prev} = $self->{column};
2849 $self->{column}++;
2850 $self->{nc}
2851 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2852 } else {
2853 $self->{set_nc}->($self);
2854 }
2855
2856 redo A;
2857 } else {
2858
2859 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2860 line => $self->{line_prev},
2861 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2862 $self->{ct}->{quirks} = 1;
2863
2864 $self->{state} = BOGUS_DOCTYPE_STATE;
2865 ## Reconsume.
2866 redo A;
2867 }
2868 } elsif ($self->{state} == SYSTEM_STATE) {
2869 ## ASCII case-insensitive
2870 if ($self->{nc} == [
2871 undef,
2872 0x0059, # Y
2873 0x0053, # S
2874 0x0054, # T
2875 0x0045, # E
2876 ]->[length $self->{s_kwd}] or
2877 $self->{nc} == [
2878 undef,
2879 0x0079, # y
2880 0x0073, # s
2881 0x0074, # t
2882 0x0065, # e
2883 ]->[length $self->{s_kwd}]) {
2884
2885 ## Stay in the state.
2886 $self->{s_kwd} .= chr $self->{nc};
2887
2888 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2889 $self->{line_prev} = $self->{line};
2890 $self->{column_prev} = $self->{column};
2891 $self->{column}++;
2892 $self->{nc}
2893 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2894 } else {
2895 $self->{set_nc}->($self);
2896 }
2897
2898 redo A;
2899 } elsif ((length $self->{s_kwd}) == 5 and
2900 ($self->{nc} == 0x004D or # M
2901 $self->{nc} == 0x006D)) { # m
2902
2903 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2904
2905 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2906 $self->{line_prev} = $self->{line};
2907 $self->{column_prev} = $self->{column};
2908 $self->{column}++;
2909 $self->{nc}
2910 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2911 } else {
2912 $self->{set_nc}->($self);
2913 }
2914
2915 redo A;
2916 } else {
2917
2918 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2919 line => $self->{line_prev},
2920 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2921 $self->{ct}->{quirks} = 1;
2922
2923 $self->{state} = BOGUS_DOCTYPE_STATE;
2924 ## Reconsume.
2925 redo A;
2926 }
2927 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2928 if ($is_space->{$self->{nc}}) {
2929
2930 ## Stay in the state
2931
2932 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2933 $self->{line_prev} = $self->{line};
2934 $self->{column_prev} = $self->{column};
2935 $self->{column}++;
2936 $self->{nc}
2937 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2938 } else {
2939 $self->{set_nc}->($self);
2940 }
2941
2942 redo A;
2943 } elsif ($self->{nc} eq 0x0022) { # "
2944
2945 $self->{ct}->{pubid} = ''; # DOCTYPE
2946 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2947
2948 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2949 $self->{line_prev} = $self->{line};
2950 $self->{column_prev} = $self->{column};
2951 $self->{column}++;
2952 $self->{nc}
2953 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2954 } else {
2955 $self->{set_nc}->($self);
2956 }
2957
2958 redo A;
2959 } elsif ($self->{nc} eq 0x0027) { # '
2960
2961 $self->{ct}->{pubid} = ''; # DOCTYPE
2962 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2963
2964 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2965 $self->{line_prev} = $self->{line};
2966 $self->{column_prev} = $self->{column};
2967 $self->{column}++;
2968 $self->{nc}
2969 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2970 } else {
2971 $self->{set_nc}->($self);
2972 }
2973
2974 redo A;
2975 } elsif ($self->{nc} eq 0x003E) { # >
2976
2977 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2978
2979 $self->{state} = DATA_STATE;
2980 $self->{s_kwd} = '';
2981
2982 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2983 $self->{line_prev} = $self->{line};
2984 $self->{column_prev} = $self->{column};
2985 $self->{column}++;
2986 $self->{nc}
2987 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2988 } else {
2989 $self->{set_nc}->($self);
2990 }
2991
2992
2993 $self->{ct}->{quirks} = 1;
2994 return ($self->{ct}); # DOCTYPE
2995
2996 redo A;
2997 } elsif ($self->{nc} == -1) {
2998
2999 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3000
3001 $self->{state} = DATA_STATE;
3002 $self->{s_kwd} = '';
3003 ## reconsume
3004
3005 $self->{ct}->{quirks} = 1;
3006 return ($self->{ct}); # DOCTYPE
3007
3008 redo A;
3009 } else {
3010
3011 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3012 $self->{ct}->{quirks} = 1;
3013
3014 $self->{state} = BOGUS_DOCTYPE_STATE;
3015
3016 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3017 $self->{line_prev} = $self->{line};
3018 $self->{column_prev} = $self->{column};
3019 $self->{column}++;
3020 $self->{nc}
3021 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3022 } else {
3023 $self->{set_nc}->($self);
3024 }
3025
3026 redo A;
3027 }
3028 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3029 if ($self->{nc} == 0x0022) { # "
3030
3031 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3032
3033 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3034 $self->{line_prev} = $self->{line};
3035 $self->{column_prev} = $self->{column};
3036 $self->{column}++;
3037 $self->{nc}
3038 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3039 } else {
3040 $self->{set_nc}->($self);
3041 }
3042
3043 redo A;
3044 } elsif ($self->{nc} == 0x003E) { # >
3045
3046 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3047
3048 $self->{state} = DATA_STATE;
3049 $self->{s_kwd} = '';
3050
3051 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3052 $self->{line_prev} = $self->{line};
3053 $self->{column_prev} = $self->{column};
3054 $self->{column}++;
3055 $self->{nc}
3056 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3057 } else {
3058 $self->{set_nc}->($self);
3059 }
3060
3061
3062 $self->{ct}->{quirks} = 1;
3063 return ($self->{ct}); # DOCTYPE
3064
3065 redo A;
3066 } elsif ($self->{nc} == -1) {
3067
3068 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3069
3070 $self->{state} = DATA_STATE;
3071 $self->{s_kwd} = '';
3072 ## reconsume
3073
3074 $self->{ct}->{quirks} = 1;
3075 return ($self->{ct}); # DOCTYPE
3076
3077 redo A;
3078 } else {
3079
3080 $self->{ct}->{pubid} # DOCTYPE
3081 .= chr $self->{nc};
3082 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3083 length $self->{ct}->{pubid});
3084
3085 ## Stay in the state
3086
3087 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3088 $self->{line_prev} = $self->{line};
3089 $self->{column_prev} = $self->{column};
3090 $self->{column}++;
3091 $self->{nc}
3092 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3093 } else {
3094 $self->{set_nc}->($self);
3095 }
3096
3097 redo A;
3098 }
3099 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3100 if ($self->{nc} == 0x0027) { # '
3101
3102 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3103
3104 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3105 $self->{line_prev} = $self->{line};
3106 $self->{column_prev} = $self->{column};
3107 $self->{column}++;
3108 $self->{nc}
3109 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3110 } else {
3111 $self->{set_nc}->($self);
3112 }
3113
3114 redo A;
3115 } elsif ($self->{nc} == 0x003E) { # >
3116
3117 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3118
3119 $self->{state} = DATA_STATE;
3120 $self->{s_kwd} = '';
3121
3122 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3123 $self->{line_prev} = $self->{line};
3124 $self->{column_prev} = $self->{column};
3125 $self->{column}++;
3126 $self->{nc}
3127 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3128 } else {
3129 $self->{set_nc}->($self);
3130 }
3131
3132
3133 $self->{ct}->{quirks} = 1;
3134 return ($self->{ct}); # DOCTYPE
3135
3136 redo A;
3137 } elsif ($self->{nc} == -1) {
3138
3139 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3140
3141 $self->{state} = DATA_STATE;
3142 $self->{s_kwd} = '';
3143 ## reconsume
3144
3145 $self->{ct}->{quirks} = 1;
3146 return ($self->{ct}); # DOCTYPE
3147
3148 redo A;
3149 } else {
3150
3151 $self->{ct}->{pubid} # DOCTYPE
3152 .= chr $self->{nc};
3153 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3154 length $self->{ct}->{pubid});
3155
3156 ## Stay in the state
3157
3158 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3159 $self->{line_prev} = $self->{line};
3160 $self->{column_prev} = $self->{column};
3161 $self->{column}++;
3162 $self->{nc}
3163 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3164 } else {
3165 $self->{set_nc}->($self);
3166 }
3167
3168 redo A;
3169 }
3170 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3171 if ($is_space->{$self->{nc}}) {
3172
3173 ## Stay in the state
3174
3175 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3176 $self->{line_prev} = $self->{line};
3177 $self->{column_prev} = $self->{column};
3178 $self->{column}++;
3179 $self->{nc}
3180 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3181 } else {
3182 $self->{set_nc}->($self);
3183 }
3184
3185 redo A;
3186 } elsif ($self->{nc} == 0x0022) { # "
3187
3188 $self->{ct}->{sysid} = ''; # DOCTYPE
3189 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3190
3191 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3192 $self->{line_prev} = $self->{line};
3193 $self->{column_prev} = $self->{column};
3194 $self->{column}++;
3195 $self->{nc}
3196 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3197 } else {
3198 $self->{set_nc}->($self);
3199 }
3200
3201 redo A;
3202 } elsif ($self->{nc} == 0x0027) { # '
3203
3204 $self->{ct}->{sysid} = ''; # DOCTYPE
3205 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3206
3207 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3208 $self->{line_prev} = $self->{line};
3209 $self->{column_prev} = $self->{column};
3210 $self->{column}++;
3211 $self->{nc}
3212 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3213 } else {
3214 $self->{set_nc}->($self);
3215 }
3216
3217 redo A;
3218 } elsif ($self->{nc} == 0x003E) { # >
3219
3220 $self->{state} = DATA_STATE;
3221 $self->{s_kwd} = '';
3222
3223 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3224 $self->{line_prev} = $self->{line};
3225 $self->{column_prev} = $self->{column};
3226 $self->{column}++;
3227 $self->{nc}
3228 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3229 } else {
3230 $self->{set_nc}->($self);
3231 }
3232
3233
3234 return ($self->{ct}); # DOCTYPE
3235
3236 redo A;
3237 } elsif ($self->{nc} == -1) {
3238
3239 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3240
3241 $self->{state} = DATA_STATE;
3242 $self->{s_kwd} = '';
3243 ## reconsume
3244
3245 $self->{ct}->{quirks} = 1;
3246 return ($self->{ct}); # DOCTYPE
3247
3248 redo A;
3249 } else {
3250
3251 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3252 $self->{ct}->{quirks} = 1;
3253
3254 $self->{state} = BOGUS_DOCTYPE_STATE;
3255
3256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3257 $self->{line_prev} = $self->{line};
3258 $self->{column_prev} = $self->{column};
3259 $self->{column}++;
3260 $self->{nc}
3261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3262 } else {
3263 $self->{set_nc}->($self);
3264 }
3265
3266 redo A;
3267 }
3268 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3269 if ($is_space->{$self->{nc}}) {
3270
3271 ## Stay in the state
3272
3273 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3274 $self->{line_prev} = $self->{line};
3275 $self->{column_prev} = $self->{column};
3276 $self->{column}++;
3277 $self->{nc}
3278 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3279 } else {
3280 $self->{set_nc}->($self);
3281 }
3282
3283 redo A;
3284 } elsif ($self->{nc} == 0x0022) { # "
3285
3286 $self->{ct}->{sysid} = ''; # DOCTYPE
3287 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3288
3289 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3290 $self->{line_prev} = $self->{line};
3291 $self->{column_prev} = $self->{column};
3292 $self->{column}++;
3293 $self->{nc}
3294 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3295 } else {
3296 $self->{set_nc}->($self);
3297 }
3298
3299 redo A;
3300 } elsif ($self->{nc} == 0x0027) { # '
3301
3302 $self->{ct}->{sysid} = ''; # DOCTYPE
3303 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3304
3305 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3306 $self->{line_prev} = $self->{line};
3307 $self->{column_prev} = $self->{column};
3308 $self->{column}++;
3309 $self->{nc}
3310 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3311 } else {
3312 $self->{set_nc}->($self);
3313 }
3314
3315 redo A;
3316 } elsif ($self->{nc} == 0x003E) { # >
3317
3318 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3319 $self->{state} = DATA_STATE;
3320 $self->{s_kwd} = '';
3321
3322 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3323 $self->{line_prev} = $self->{line};
3324 $self->{column_prev} = $self->{column};
3325 $self->{column}++;
3326 $self->{nc}
3327 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3328 } else {
3329 $self->{set_nc}->($self);
3330 }
3331
3332
3333 $self->{ct}->{quirks} = 1;
3334 return ($self->{ct}); # DOCTYPE
3335
3336 redo A;
3337 } elsif ($self->{nc} == -1) {
3338
3339 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3340
3341 $self->{state} = DATA_STATE;
3342 $self->{s_kwd} = '';
3343 ## reconsume
3344
3345 $self->{ct}->{quirks} = 1;
3346 return ($self->{ct}); # DOCTYPE
3347
3348 redo A;
3349 } else {
3350
3351 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3352 $self->{ct}->{quirks} = 1;
3353
3354 $self->{state} = BOGUS_DOCTYPE_STATE;
3355
3356 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3357 $self->{line_prev} = $self->{line};
3358 $self->{column_prev} = $self->{column};
3359 $self->{column}++;
3360 $self->{nc}
3361 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3362 } else {
3363 $self->{set_nc}->($self);
3364 }
3365
3366 redo A;
3367 }
3368 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3369 if ($self->{nc} == 0x0022) { # "
3370
3371 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3372
3373 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3374 $self->{line_prev} = $self->{line};
3375 $self->{column_prev} = $self->{column};
3376 $self->{column}++;
3377 $self->{nc}
3378 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3379 } else {
3380 $self->{set_nc}->($self);
3381 }
3382
3383 redo A;
3384 } elsif ($self->{nc} == 0x003E) { # >
3385
3386 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3387
3388 $self->{state} = DATA_STATE;
3389 $self->{s_kwd} = '';
3390
3391 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3392 $self->{line_prev} = $self->{line};
3393 $self->{column_prev} = $self->{column};
3394 $self->{column}++;
3395 $self->{nc}
3396 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3397 } else {
3398 $self->{set_nc}->($self);
3399 }
3400
3401
3402 $self->{ct}->{quirks} = 1;
3403 return ($self->{ct}); # DOCTYPE
3404
3405 redo A;
3406 } elsif ($self->{nc} == -1) {
3407
3408 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3409
3410 $self->{state} = DATA_STATE;
3411 $self->{s_kwd} = '';
3412 ## reconsume
3413
3414 $self->{ct}->{quirks} = 1;
3415 return ($self->{ct}); # DOCTYPE
3416
3417 redo A;
3418 } else {
3419
3420 $self->{ct}->{sysid} # DOCTYPE
3421 .= chr $self->{nc};
3422 $self->{read_until}->($self->{ct}->{sysid}, q[">],
3423 length $self->{ct}->{sysid});
3424
3425 ## Stay in the state
3426
3427 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3428 $self->{line_prev} = $self->{line};
3429 $self->{column_prev} = $self->{column};
3430 $self->{column}++;
3431 $self->{nc}
3432 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3433 } else {
3434 $self->{set_nc}->($self);
3435 }
3436
3437 redo A;
3438 }
3439 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3440 if ($self->{nc} == 0x0027) { # '
3441
3442 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3443
3444 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3445 $self->{line_prev} = $self->{line};
3446 $self->{column_prev} = $self->{column};
3447 $self->{column}++;
3448 $self->{nc}
3449 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3450 } else {
3451 $self->{set_nc}->($self);
3452 }
3453
3454 redo A;
3455 } elsif ($self->{nc} == 0x003E) { # >
3456
3457 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3458
3459 $self->{state} = DATA_STATE;
3460 $self->{s_kwd} = '';
3461
3462 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3463 $self->{line_prev} = $self->{line};
3464 $self->{column_prev} = $self->{column};
3465 $self->{column}++;
3466 $self->{nc}
3467 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3468 } else {
3469 $self->{set_nc}->($self);
3470 }
3471
3472
3473 $self->{ct}->{quirks} = 1;
3474 return ($self->{ct}); # DOCTYPE
3475
3476 redo A;
3477 } elsif ($self->{nc} == -1) {
3478
3479 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3480
3481 $self->{state} = DATA_STATE;
3482 $self->{s_kwd} = '';
3483 ## reconsume
3484
3485 $self->{ct}->{quirks} = 1;
3486 return ($self->{ct}); # DOCTYPE
3487
3488 redo A;
3489 } else {
3490
3491 $self->{ct}->{sysid} # DOCTYPE
3492 .= chr $self->{nc};
3493 $self->{read_until}->($self->{ct}->{sysid}, q['>],
3494 length $self->{ct}->{sysid});
3495
3496 ## Stay in the state
3497
3498 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3499 $self->{line_prev} = $self->{line};
3500 $self->{column_prev} = $self->{column};
3501 $self->{column}++;
3502 $self->{nc}
3503 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3504 } else {
3505 $self->{set_nc}->($self);
3506 }
3507
3508 redo A;
3509 }
3510 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3511 if ($is_space->{$self->{nc}}) {
3512
3513 ## Stay in the state
3514
3515 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3516 $self->{line_prev} = $self->{line};
3517 $self->{column_prev} = $self->{column};
3518 $self->{column}++;
3519 $self->{nc}
3520 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3521 } else {
3522 $self->{set_nc}->($self);
3523 }
3524
3525 redo A;
3526 } elsif ($self->{nc} == 0x003E) { # >
3527
3528 $self->{state} = DATA_STATE;
3529 $self->{s_kwd} = '';
3530
3531 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3532 $self->{line_prev} = $self->{line};
3533 $self->{column_prev} = $self->{column};
3534 $self->{column}++;
3535 $self->{nc}
3536 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3537 } else {
3538 $self->{set_nc}->($self);
3539 }
3540
3541
3542 return ($self->{ct}); # DOCTYPE
3543
3544 redo A;
3545 } elsif ($self->{nc} == -1) {
3546
3547 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3548 $self->{state} = DATA_STATE;
3549 $self->{s_kwd} = '';
3550 ## reconsume
3551
3552 $self->{ct}->{quirks} = 1;
3553 return ($self->{ct}); # DOCTYPE
3554
3555 redo A;
3556 } else {
3557
3558 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3559 #$self->{ct}->{quirks} = 1;
3560
3561 $self->{state} = BOGUS_DOCTYPE_STATE;
3562
3563 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3564 $self->{line_prev} = $self->{line};
3565 $self->{column_prev} = $self->{column};
3566 $self->{column}++;
3567 $self->{nc}
3568 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3569 } else {
3570 $self->{set_nc}->($self);
3571 }
3572
3573 redo A;
3574 }
3575 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3576 if ($self->{nc} == 0x003E) { # >
3577
3578 $self->{state} = DATA_STATE;
3579 $self->{s_kwd} = '';
3580
3581 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3582 $self->{line_prev} = $self->{line};
3583 $self->{column_prev} = $self->{column};
3584 $self->{column}++;
3585 $self->{nc}
3586 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3587 } else {
3588 $self->{set_nc}->($self);
3589 }
3590
3591
3592 return ($self->{ct}); # DOCTYPE
3593
3594 redo A;
3595 } elsif ($self->{nc} == -1) {
3596
3597 $self->{state} = DATA_STATE;
3598 $self->{s_kwd} = '';
3599 ## reconsume
3600
3601 return ($self->{ct}); # DOCTYPE
3602
3603 redo A;
3604 } else {
3605
3606 my $s = '';
3607 $self->{read_until}->($s, q[>], 0);
3608
3609 ## Stay in the state
3610
3611 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3612 $self->{line_prev} = $self->{line};
3613 $self->{column_prev} = $self->{column};
3614 $self->{column}++;
3615 $self->{nc}
3616 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3617 } else {
3618 $self->{set_nc}->($self);
3619 }
3620
3621 redo A;
3622 }
3623 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3624 ## NOTE: "CDATA section state" in the state is jointly implemented
3625 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3626 ## and |CDATA_SECTION_MSE2_STATE|.
3627
3628 if ($self->{nc} == 0x005D) { # ]
3629
3630 $self->{state} = CDATA_SECTION_MSE1_STATE;
3631
3632 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3633 $self->{line_prev} = $self->{line};
3634 $self->{column_prev} = $self->{column};
3635 $self->{column}++;
3636 $self->{nc}
3637 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3638 } else {
3639 $self->{set_nc}->($self);
3640 }
3641
3642 redo A;
3643 } elsif ($self->{nc} == -1) {
3644 if ($self->{is_xml}) {
3645 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3646 }
3647
3648 $self->{state} = DATA_STATE;
3649 $self->{s_kwd} = '';
3650
3651 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3652 $self->{line_prev} = $self->{line};
3653 $self->{column_prev} = $self->{column};
3654 $self->{column}++;
3655 $self->{nc}
3656 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3657 } else {
3658 $self->{set_nc}->($self);
3659 }
3660
3661 if (length $self->{ct}->{data}) { # character
3662
3663 return ($self->{ct}); # character
3664 } else {
3665
3666 ## No token to emit. $self->{ct} is discarded.
3667 }
3668 redo A;
3669 } else {
3670
3671 $self->{ct}->{data} .= chr $self->{nc};
3672 $self->{read_until}->($self->{ct}->{data},
3673 q<]>,
3674 length $self->{ct}->{data});
3675
3676 ## Stay in the state.
3677
3678 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3679 $self->{line_prev} = $self->{line};
3680 $self->{column_prev} = $self->{column};
3681 $self->{column}++;
3682 $self->{nc}
3683 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3684 } else {
3685 $self->{set_nc}->($self);
3686 }
3687
3688 redo A;
3689 }
3690
3691 ## ISSUE: "text tokens" in spec.
3692 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3693 if ($self->{nc} == 0x005D) { # ]
3694
3695 $self->{state} = CDATA_SECTION_MSE2_STATE;
3696
3697 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3698 $self->{line_prev} = $self->{line};
3699 $self->{column_prev} = $self->{column};
3700 $self->{column}++;
3701 $self->{nc}
3702 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3703 } else {
3704 $self->{set_nc}->($self);
3705 }
3706
3707 redo A;
3708 } else {
3709
3710 $self->{ct}->{data} .= ']';
3711 $self->{state} = CDATA_SECTION_STATE;
3712 ## Reconsume.
3713 redo A;
3714 }
3715 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3716 if ($self->{nc} == 0x003E) { # >
3717 $self->{state} = DATA_STATE;
3718 $self->{s_kwd} = '';
3719
3720 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3721 $self->{line_prev} = $self->{line};
3722 $self->{column_prev} = $self->{column};
3723 $self->{column}++;
3724 $self->{nc}
3725 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3726 } else {
3727 $self->{set_nc}->($self);
3728 }
3729
3730 if (length $self->{ct}->{data}) { # character
3731
3732 return ($self->{ct}); # character
3733 } else {
3734
3735 ## No token to emit. $self->{ct} is discarded.
3736 }
3737 redo A;
3738 } elsif ($self->{nc} == 0x005D) { # ]
3739 # character
3740 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3741 ## Stay in the state.
3742
3743 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3744 $self->{line_prev} = $self->{line};
3745 $self->{column_prev} = $self->{column};
3746 $self->{column}++;
3747 $self->{nc}
3748 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3749 } else {
3750 $self->{set_nc}->($self);
3751 }
3752
3753 redo A;
3754 } else {
3755
3756 $self->{ct}->{data} .= ']]'; # character
3757 $self->{state} = CDATA_SECTION_STATE;
3758 ## Reconsume.
3759 redo A;
3760 }
3761 } elsif ($self->{state} == ENTITY_STATE) {
3762 if ($is_space->{$self->{nc}} or
3763 {
3764 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3765 $self->{entity_add} => 1,
3766 }->{$self->{nc}}) {
3767
3768 ## Don't consume
3769 ## No error
3770 ## Return nothing.
3771 #
3772 } elsif ($self->{nc} == 0x0023) { # #
3773
3774 $self->{state} = ENTITY_HASH_STATE;
3775 $self->{s_kwd} = '#';
3776
3777 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3778 $self->{line_prev} = $self->{line};
3779 $self->{column_prev} = $self->{column};
3780 $self->{column}++;
3781 $self->{nc}
3782 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3783 } else {
3784 $self->{set_nc}->($self);
3785 }
3786
3787 redo A;
3788 } elsif ((0x0041 <= $self->{nc} and
3789 $self->{nc} <= 0x005A) or # A..Z
3790 (0x0061 <= $self->{nc} and
3791 $self->{nc} <= 0x007A)) { # a..z
3792
3793 require Whatpm::_NamedEntityList;
3794 $self->{state} = ENTITY_NAME_STATE;
3795 $self->{s_kwd} = chr $self->{nc};
3796 $self->{entity__value} = $self->{s_kwd};
3797 $self->{entity__match} = 0;
3798
3799 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3800 $self->{line_prev} = $self->{line};
3801 $self->{column_prev} = $self->{column};
3802 $self->{column}++;
3803 $self->{nc}
3804 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3805 } else {
3806 $self->{set_nc}->($self);
3807 }
3808
3809 redo A;
3810 } else {
3811
3812 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3813 ## Return nothing.
3814 #
3815 }
3816
3817 ## NOTE: No character is consumed by the "consume a character
3818 ## reference" algorithm. In other word, there is an "&" character
3819 ## that does not introduce a character reference, which would be
3820 ## appended to the parent element or the attribute value in later
3821 ## process of the tokenizer.
3822
3823 if ($self->{prev_state} == DATA_STATE) {
3824
3825 $self->{state} = $self->{prev_state};
3826 $self->{s_kwd} = '';
3827 ## Reconsume.
3828 return ({type => CHARACTER_TOKEN, data => '&',
3829 line => $self->{line_prev},
3830 column => $self->{column_prev},
3831 });
3832 redo A;
3833 } else {
3834
3835 $self->{ca}->{value} .= '&';
3836 $self->{state} = $self->{prev_state};
3837 $self->{s_kwd} = '';
3838 ## Reconsume.
3839 redo A;
3840 }
3841 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3842 if ($self->{nc} == 0x0078 or # x
3843 $self->{nc} == 0x0058) { # X
3844
3845 $self->{state} = HEXREF_X_STATE;
3846 $self->{s_kwd} .= chr $self->{nc};
3847
3848 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3849 $self->{line_prev} = $self->{line};
3850 $self->{column_prev} = $self->{column};
3851 $self->{column}++;
3852 $self->{nc}
3853 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3854 } else {
3855 $self->{set_nc}->($self);
3856 }
3857
3858 redo A;
3859 } elsif (0x0030 <= $self->{nc} and
3860 $self->{nc} <= 0x0039) { # 0..9
3861
3862 $self->{state} = NCR_NUM_STATE;
3863 $self->{s_kwd} = $self->{nc} - 0x0030;
3864
3865 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3866 $self->{line_prev} = $self->{line};
3867 $self->{column_prev} = $self->{column};
3868 $self->{column}++;
3869 $self->{nc}
3870 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3871 } else {
3872 $self->{set_nc}->($self);
3873 }
3874
3875 redo A;
3876 } else {
3877 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3878 line => $self->{line_prev},
3879 column => $self->{column_prev} - 1);
3880
3881 ## NOTE: According to the spec algorithm, nothing is returned,
3882 ## and then "&#" is appended to the parent element or the attribute
3883 ## value in the later processing.
3884
3885 if ($self->{prev_state} == DATA_STATE) {
3886
3887 $self->{state} = $self->{prev_state};
3888 $self->{s_kwd} = '';
3889 ## Reconsume.
3890 return ({type => CHARACTER_TOKEN,
3891 data => '&#',
3892 line => $self->{line_prev},
3893 column => $self->{column_prev} - 1,
3894 });
3895 redo A;
3896 } else {
3897
3898 $self->{ca}->{value} .= '&#';
3899 $self->{state} = $self->{prev_state};
3900 $self->{s_kwd} = '';
3901 ## Reconsume.
3902 redo A;
3903 }
3904 }
3905 } elsif ($self->{state} == NCR_NUM_STATE) {
3906 if (0x0030 <= $self->{nc} and
3907 $self->{nc} <= 0x0039) { # 0..9
3908
3909 $self->{s_kwd} *= 10;
3910 $self->{s_kwd} += $self->{nc} - 0x0030;
3911
3912 ## Stay in the state.
3913
3914 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3915 $self->{line_prev} = $self->{line};
3916 $self->{column_prev} = $self->{column};
3917 $self->{column}++;
3918 $self->{nc}
3919 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3920 } else {
3921 $self->{set_nc}->($self);
3922 }
3923
3924 redo A;
3925 } elsif ($self->{nc} == 0x003B) { # ;
3926
3927
3928 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3929 $self->{line_prev} = $self->{line};
3930 $self->{column_prev} = $self->{column};
3931 $self->{column}++;
3932 $self->{nc}
3933 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3934 } else {
3935 $self->{set_nc}->($self);
3936 }
3937
3938 #
3939 } else {
3940
3941 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3942 ## Reconsume.
3943 #
3944 }
3945
3946 my $code = $self->{s_kwd};
3947 my $l = $self->{line_prev};
3948 my $c = $self->{column_prev};
3949 if ($charref_map->{$code}) {
3950
3951 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3952 text => (sprintf 'U+%04X', $code),
3953 line => $l, column => $c);
3954 $code = $charref_map->{$code};
3955 } elsif ($code > 0x10FFFF) {
3956
3957 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3958 text => (sprintf 'U-%08X', $code),
3959 line => $l, column => $c);
3960 $code = 0xFFFD;
3961 }
3962
3963 if ($self->{prev_state} == DATA_STATE) {
3964
3965 $self->{state} = $self->{prev_state};
3966 $self->{s_kwd} = '';
3967 ## Reconsume.
3968 return ({type => CHARACTER_TOKEN, data => chr $code,
3969 line => $l, column => $c,
3970 });
3971 redo A;
3972 } else {
3973
3974 $self->{ca}->{value} .= chr $code;
3975 $self->{ca}->{has_reference} = 1;
3976 $self->{state} = $self->{prev_state};
3977 $self->{s_kwd} = '';
3978 ## Reconsume.
3979 redo A;
3980 }
3981 } elsif ($self->{state} == HEXREF_X_STATE) {
3982 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3983 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3984 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3985 # 0..9, A..F, a..f
3986
3987 $self->{state} = HEXREF_HEX_STATE;
3988 $self->{s_kwd} = 0;
3989 ## Reconsume.
3990 redo A;
3991 } else {
3992 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3993 line => $self->{line_prev},
3994 column => $self->{column_prev} - 2);
3995
3996 ## NOTE: According to the spec algorithm, nothing is returned,
3997 ## and then "&#" followed by "X" or "x" is appended to the parent
3998 ## element or the attribute value in the later processing.
3999
4000 if ($self->{prev_state} == DATA_STATE) {
4001
4002 $self->{state} = $self->{prev_state};
4003 $self->{s_kwd} = '';
4004 ## Reconsume.
4005 return ({type => CHARACTER_TOKEN,
4006 data => '&' . $self->{s_kwd},
4007 line => $self->{line_prev},
4008 column => $self->{column_prev} - length $self->{s_kwd},
4009 });
4010 redo A;
4011 } else {
4012
4013 $self->{ca}->{value} .= '&' . $self->{s_kwd};
4014 $self->{state} = $self->{prev_state};
4015 $self->{s_kwd} = '';
4016 ## Reconsume.
4017 redo A;
4018 }
4019 }
4020 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4021 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4022 # 0..9
4023
4024 $self->{s_kwd} *= 0x10;
4025 $self->{s_kwd} += $self->{nc} - 0x0030;
4026 ## Stay in the state.
4027
4028 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4029 $self->{line_prev} = $self->{line};
4030 $self->{column_prev} = $self->{column};
4031 $self->{column}++;
4032 $self->{nc}
4033 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4034 } else {
4035 $self->{set_nc}->($self);
4036 }
4037
4038 redo A;
4039 } elsif (0x0061 <= $self->{nc} and
4040 $self->{nc} <= 0x0066) { # a..f
4041
4042 $self->{s_kwd} *= 0x10;
4043 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4044 ## Stay in the state.
4045
4046 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4047 $self->{line_prev} = $self->{line};
4048 $self->{column_prev} = $self->{column};
4049 $self->{column}++;
4050 $self->{nc}
4051 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4052 } else {
4053 $self->{set_nc}->($self);
4054 }
4055
4056 redo A;
4057 } elsif (0x0041 <= $self->{nc} and
4058 $self->{nc} <= 0x0046) { # A..F
4059
4060 $self->{s_kwd} *= 0x10;
4061 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4062 ## Stay in the state.
4063
4064 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4065 $self->{line_prev} = $self->{line};
4066 $self->{column_prev} = $self->{column};
4067 $self->{column}++;
4068 $self->{nc}
4069 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4070 } else {
4071 $self->{set_nc}->($self);
4072 }
4073
4074 redo A;
4075 } elsif ($self->{nc} == 0x003B) { # ;
4076
4077
4078 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4079 $self->{line_prev} = $self->{line};
4080 $self->{column_prev} = $self->{column};
4081 $self->{column}++;
4082 $self->{nc}
4083 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4084 } else {
4085 $self->{set_nc}->($self);
4086 }
4087
4088 #
4089 } else {
4090
4091 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4092 line => $self->{line},
4093 column => $self->{column});
4094 ## Reconsume.
4095 #
4096 }
4097
4098 my $code = $self->{s_kwd};
4099 my $l = $self->{line_prev};
4100 my $c = $self->{column_prev};
4101 if ($charref_map->{$code}) {
4102
4103 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4104 text => (sprintf 'U+%04X', $code),
4105 line => $l, column => $c);
4106 $code = $charref_map->{$code};
4107 } elsif ($code > 0x10FFFF) {
4108
4109 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4110 text => (sprintf 'U-%08X', $code),
4111 line => $l, column => $c);
4112 $code = 0xFFFD;
4113 }
4114
4115 if ($self->{prev_state} == DATA_STATE) {
4116
4117 $self->{state} = $self->{prev_state};
4118 $self->{s_kwd} = '';
4119 ## Reconsume.
4120 return ({type => CHARACTER_TOKEN, data => chr $code,
4121 line => $l, column => $c,
4122 });
4123 redo A;
4124 } else {
4125
4126 $self->{ca}->{value} .= chr $code;
4127 $self->{ca}->{has_reference} = 1;
4128 $self->{state} = $self->{prev_state};
4129 $self->{s_kwd} = '';
4130 ## Reconsume.
4131 redo A;
4132 }
4133 } elsif ($self->{state} == ENTITY_NAME_STATE) {
4134 if (length $self->{s_kwd} < 30 and
4135 ## NOTE: Some number greater than the maximum length of entity name
4136 ((0x0041 <= $self->{nc} and # a
4137 $self->{nc} <= 0x005A) or # x
4138 (0x0061 <= $self->{nc} and # a
4139 $self->{nc} <= 0x007A) or # z
4140 (0x0030 <= $self->{nc} and # 0
4141 $self->{nc} <= 0x0039) or # 9
4142 $self->{nc} == 0x003B)) { # ;
4143 our $EntityChar;
4144 $self->{s_kwd} .= chr $self->{nc};
4145 if (defined $EntityChar->{$self->{s_kwd}}) {
4146 if ($self->{nc} == 0x003B) { # ;
4147
4148 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4149 $self->{entity__match} = 1;
4150
4151 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4152 $self->{line_prev} = $self->{line};
4153 $self->{column_prev} = $self->{column};
4154 $self->{column}++;
4155 $self->{nc}
4156 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4157 } else {
4158 $self->{set_nc}->($self);
4159 }
4160
4161 #
4162 } else {
4163
4164 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4165 $self->{entity__match} = -1;
4166 ## Stay in the state.
4167
4168 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4169 $self->{line_prev} = $self->{line};
4170 $self->{column_prev} = $self->{column};
4171 $self->{column}++;
4172 $self->{nc}
4173 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4174 } else {
4175 $self->{set_nc}->($self);
4176 }
4177
4178 redo A;
4179 }
4180 } else {
4181
4182 $self->{entity__value} .= chr $self->{nc};
4183 $self->{entity__match} *= 2;
4184 ## Stay in the state.
4185
4186 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4187 $self->{line_prev} = $self->{line};
4188 $self->{column_prev} = $self->{column};
4189 $self->{column}++;
4190 $self->{nc}
4191 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4192 } else {
4193 $self->{set_nc}->($self);
4194 }
4195
4196 redo A;
4197 }
4198 }
4199
4200 my $data;
4201 my $has_ref;
4202 if ($self->{entity__match} > 0) {
4203
4204 $data = $self->{entity__value};
4205 $has_ref = 1;
4206 #
4207 } elsif ($self->{entity__match} < 0) {
4208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4209 if ($self->{prev_state} != DATA_STATE and # in attribute
4210 $self->{entity__match} < -1) {
4211
4212 $data = '&' . $self->{s_kwd};
4213 #
4214 } else {
4215
4216 $data = $self->{entity__value};
4217 $has_ref = 1;
4218 #
4219 }
4220 } else {
4221
4222 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4223 line => $self->{line_prev},
4224 column => $self->{column_prev} - length $self->{s_kwd});
4225 $data = '&' . $self->{s_kwd};
4226 #
4227 }
4228
4229 ## NOTE: In these cases, when a character reference is found,
4230 ## it is consumed and a character token is returned, or, otherwise,
4231 ## nothing is consumed and returned, according to the spec algorithm.
4232 ## In this implementation, anything that has been examined by the
4233 ## tokenizer is appended to the parent element or the attribute value
4234 ## as string, either literal string when no character reference or
4235 ## entity-replaced string otherwise, in this stage, since any characters
4236 ## that would not be consumed are appended in the data state or in an
4237 ## appropriate attribute value state anyway.
4238
4239 if ($self->{prev_state} == DATA_STATE) {
4240
4241 $self->{state} = $self->{prev_state};
4242 $self->{s_kwd} = '';
4243 ## Reconsume.
4244 return ({type => CHARACTER_TOKEN,
4245 data => $data,
4246 line => $self->{line_prev},
4247 column => $self->{column_prev} + 1 - length $self->{s_kwd},
4248 });
4249 redo A;
4250 } else {
4251
4252 $self->{ca}->{value} .= $data;
4253 $self->{ca}->{has_reference} = 1 if $has_ref;
4254 $self->{state} = $self->{prev_state};
4255 $self->{s_kwd} = '';
4256 ## Reconsume.
4257 redo A;
4258 }
4259 } else {
4260 die "$0: $self->{state}: Unknown state";
4261 }
4262 } # A
4263
4264 die "$0: _get_next_token: unexpected case";
4265 } # _get_next_token
4266
4267 1;
4268 ## $Date: 2008/10/14 14:38:59 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24