/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (show annotations) (download)
Tue Oct 14 14:38:59 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.4: +109 -9 lines
++ whatpm/t/ChangeLog	14 Oct 2008 14:21:51 -0000
	* XML-Parser.t: "xml/texts-1.dat" added.

	* tokenizer-test-2.dat: Test for ]]> are added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	14 Oct 2008 14:38:34 -0000
	* doctypes-1.dat: Wrong results fixed.

	* texts-1.dat: New test data file.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 14:22:43 -0000
	* Tokenizer.pm.src: Raise a parse error for XML "]]>" other than
	CDATA section end.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.4 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 );
19
20 our %EXPORT_TAGS = (
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 }
33
34 ## Token types
35
36 sub DOCTYPE_TOKEN () { 1 }
37 sub COMMENT_TOKEN () { 2 }
38 sub START_TAG_TOKEN () { 3 }
39 sub END_TAG_TOKEN () { 4 }
40 sub END_OF_FILE_TOKEN () { 5 }
41 sub CHARACTER_TOKEN () { 6 }
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
45 package Whatpm::HTML;
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48
49 ## Content model flags
50
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54
55 sub PLAINTEXT_CONTENT_MODEL () { 0 }
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59
60 ## Tokenizer states
61
62 sub DATA_STATE () { 0 }
63 #sub ENTITY_DATA_STATE () { 1 }
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## Tree constructor state constants (see Whatpm::HTML for the full
118 ## list and descriptions)
119
120 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121 sub FOREIGN_EL () { 0b1_00000000000 }
122
123 ## Character reference mappings
124
125 my $charref_map = {
126 0x0D => 0x000A,
127 0x80 => 0x20AC,
128 0x81 => 0xFFFD,
129 0x82 => 0x201A,
130 0x83 => 0x0192,
131 0x84 => 0x201E,
132 0x85 => 0x2026,
133 0x86 => 0x2020,
134 0x87 => 0x2021,
135 0x88 => 0x02C6,
136 0x89 => 0x2030,
137 0x8A => 0x0160,
138 0x8B => 0x2039,
139 0x8C => 0x0152,
140 0x8D => 0xFFFD,
141 0x8E => 0x017D,
142 0x8F => 0xFFFD,
143 0x90 => 0xFFFD,
144 0x91 => 0x2018,
145 0x92 => 0x2019,
146 0x93 => 0x201C,
147 0x94 => 0x201D,
148 0x95 => 0x2022,
149 0x96 => 0x2013,
150 0x97 => 0x2014,
151 0x98 => 0x02DC,
152 0x99 => 0x2122,
153 0x9A => 0x0161,
154 0x9B => 0x203A,
155 0x9C => 0x0153,
156 0x9D => 0xFFFD,
157 0x9E => 0x017E,
158 0x9F => 0x0178,
159 }; # $charref_map
160 $charref_map->{$_} = 0xFFFD
161 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168
169 ## Implementations MUST act as if state machine in the spec
170
171 sub _initialize_tokenizer ($) {
172 my $self = shift;
173
174 ## NOTE: Fields set by |new| constructor:
175 #$self->{level}
176 #$self->{set_nc}
177 #$self->{parse_error}
178 #$self->{is_xml} (if XML)
179
180 $self->{state} = DATA_STATE; # MUST
181 $self->{s_kwd} = ''; # state keyword
182 #$self->{entity__value}; # initialized when used
183 #$self->{entity__match}; # initialized when used
184 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185 undef $self->{ct}; # current token
186 undef $self->{ca}; # current attribute
187 undef $self->{last_stag_name}; # last emitted start tag name
188 #$self->{prev_state}; # initialized when used
189 delete $self->{self_closing};
190 $self->{char_buffer} = '';
191 $self->{char_buffer_pos} = 0;
192 $self->{nc} = -1; # next input character
193 #$self->{next_nc}
194
195 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
196 $self->{line_prev} = $self->{line};
197 $self->{column_prev} = $self->{column};
198 $self->{column}++;
199 $self->{nc}
200 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
201 } else {
202 $self->{set_nc}->($self);
203 }
204
205 $self->{token} = [];
206 # $self->{escape}
207 } # _initialize_tokenizer
208
209 ## A token has:
210 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
211 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
212 ## ->{name} (DOCTYPE_TOKEN)
213 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
214 ## ->{pubid} (DOCTYPE_TOKEN)
215 ## ->{sysid} (DOCTYPE_TOKEN)
216 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
217 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
218 ## ->{name}
219 ## ->{value}
220 ## ->{has_reference} == 1 or 0
221 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
222 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
223 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
224 ## while the token is pushed back to the stack.
225
226 ## Emitted token MUST immediately be handled by the tree construction state.
227
228 ## Before each step, UA MAY check to see if either one of the scripts in
229 ## "list of scripts that will execute as soon as possible" or the first
230 ## script in the "list of scripts that will execute asynchronously",
231 ## has completed loading. If one has, then it MUST be executed
232 ## and removed from the list.
233
234 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
235 ## (This requirement was dropped from HTML5 spec, unfortunately.)
236
237 my $is_space = {
238 0x0009 => 1, # CHARACTER TABULATION (HT)
239 0x000A => 1, # LINE FEED (LF)
240 #0x000B => 0, # LINE TABULATION (VT)
241 0x000C => 1, # FORM FEED (FF)
242 #0x000D => 1, # CARRIAGE RETURN (CR)
243 0x0020 => 1, # SPACE (SP)
244 };
245
246 sub _get_next_token ($) {
247 my $self = shift;
248
249 if ($self->{self_closing}) {
250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
251 ## NOTE: The |self_closing| flag is only set by start tag token.
252 ## In addition, when a start tag token is emitted, it is always set to
253 ## |ct|.
254 delete $self->{self_closing};
255 }
256
257 if (@{$self->{token}}) {
258 $self->{self_closing} = $self->{token}->[0]->{self_closing};
259 return shift @{$self->{token}};
260 }
261
262 A: {
263 if ($self->{state} == PCDATA_STATE) {
264 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
265
266 if ($self->{nc} == 0x0026) { # &
267
268 ## NOTE: In the spec, the tokenizer is switched to the
269 ## "entity data state". In this implementation, the tokenizer
270 ## is switched to the |ENTITY_STATE|, which is an implementation
271 ## of the "consume a character reference" algorithm.
272 $self->{entity_add} = -1;
273 $self->{prev_state} = DATA_STATE;
274 $self->{state} = ENTITY_STATE;
275
276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
277 $self->{line_prev} = $self->{line};
278 $self->{column_prev} = $self->{column};
279 $self->{column}++;
280 $self->{nc}
281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
282 } else {
283 $self->{set_nc}->($self);
284 }
285
286 redo A;
287 } elsif ($self->{nc} == 0x003C) { # <
288
289 $self->{state} = TAG_OPEN_STATE;
290
291 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
292 $self->{line_prev} = $self->{line};
293 $self->{column_prev} = $self->{column};
294 $self->{column}++;
295 $self->{nc}
296 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
297 } else {
298 $self->{set_nc}->($self);
299 }
300
301 redo A;
302 } elsif ($self->{nc} == -1) {
303
304 return ({type => END_OF_FILE_TOKEN,
305 line => $self->{line}, column => $self->{column}});
306 last A; ## TODO: ok?
307 } else {
308
309 #
310 }
311
312 # Anything else
313 my $token = {type => CHARACTER_TOKEN,
314 data => chr $self->{nc},
315 line => $self->{line}, column => $self->{column},
316 };
317 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
318
319 ## Stay in the state.
320
321 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
322 $self->{line_prev} = $self->{line};
323 $self->{column_prev} = $self->{column};
324 $self->{column}++;
325 $self->{nc}
326 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
327 } else {
328 $self->{set_nc}->($self);
329 }
330
331 return ($token);
332 redo A;
333 } elsif ($self->{state} == DATA_STATE) {
334 $self->{s_kwd} = '' unless defined $self->{s_kwd};
335 if ($self->{nc} == 0x0026) { # &
336 $self->{s_kwd} = '';
337 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
338 not $self->{escape}) {
339
340 ## NOTE: In the spec, the tokenizer is switched to the
341 ## "entity data state". In this implementation, the tokenizer
342 ## is switched to the |ENTITY_STATE|, which is an implementation
343 ## of the "consume a character reference" algorithm.
344 $self->{entity_add} = -1;
345 $self->{prev_state} = DATA_STATE;
346 $self->{state} = ENTITY_STATE;
347
348 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
349 $self->{line_prev} = $self->{line};
350 $self->{column_prev} = $self->{column};
351 $self->{column}++;
352 $self->{nc}
353 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
354 } else {
355 $self->{set_nc}->($self);
356 }
357
358 redo A;
359 } else {
360
361 #
362 }
363 } elsif ($self->{nc} == 0x002D) { # -
364 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
365 if ($self->{s_kwd} eq '<!-') {
366
367 $self->{escape} = 1; # unless $self->{escape};
368 $self->{s_kwd} = '--';
369 #
370 } elsif ($self->{s_kwd} eq '-') {
371
372 $self->{s_kwd} = '--';
373 #
374 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
375
376 $self->{s_kwd} .= '-';
377 #
378 } else {
379
380 $self->{s_kwd} = '-';
381 #
382 }
383 }
384
385 #
386 } elsif ($self->{nc} == 0x0021) { # !
387 if (length $self->{s_kwd}) {
388
389 $self->{s_kwd} .= '!';
390 #
391 } else {
392
393 #$self->{s_kwd} = '';
394 #
395 }
396 #
397 } elsif ($self->{nc} == 0x003C) { # <
398 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
399 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
400 not $self->{escape})) {
401
402 $self->{state} = TAG_OPEN_STATE;
403
404 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
405 $self->{line_prev} = $self->{line};
406 $self->{column_prev} = $self->{column};
407 $self->{column}++;
408 $self->{nc}
409 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
410 } else {
411 $self->{set_nc}->($self);
412 }
413
414 redo A;
415 } else {
416
417 $self->{s_kwd} = '';
418 #
419 }
420 } elsif ($self->{nc} == 0x003E) { # >
421 if ($self->{escape} and
422 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
423 if ($self->{s_kwd} eq '--') {
424
425 delete $self->{escape};
426 #
427 } else {
428
429 #
430 }
431 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
432
433 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
434 line => $self->{line_prev},
435 column => $self->{column_prev} - 1);
436 #
437 } else {
438
439 #
440 }
441
442 $self->{s_kwd} = '';
443 #
444 } elsif ($self->{nc} == 0x005D) { # ]
445 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
446
447 $self->{s_kwd} .= ']';
448 } elsif ($self->{s_kwd} eq ']]') {
449
450 #
451 } else {
452
453 $self->{s_kwd} = '';
454 }
455 #
456 } elsif ($self->{nc} == -1) {
457
458 $self->{s_kwd} = '';
459 return ({type => END_OF_FILE_TOKEN,
460 line => $self->{line}, column => $self->{column}});
461 last A; ## TODO: ok?
462 } else {
463
464 $self->{s_kwd} = '';
465 #
466 }
467
468 # Anything else
469 my $token = {type => CHARACTER_TOKEN,
470 data => chr $self->{nc},
471 line => $self->{line}, column => $self->{column},
472 };
473 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
474 length $token->{data})) {
475 $self->{s_kwd} = '';
476 }
477
478 ## Stay in the data state.
479 if (not $self->{is_xml} and
480 $self->{content_model} == PCDATA_CONTENT_MODEL) {
481
482 $self->{state} = PCDATA_STATE;
483 } else {
484
485 ## Stay in the state.
486 }
487
488 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
489 $self->{line_prev} = $self->{line};
490 $self->{column_prev} = $self->{column};
491 $self->{column}++;
492 $self->{nc}
493 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
494 } else {
495 $self->{set_nc}->($self);
496 }
497
498 return ($token);
499 redo A;
500 } elsif ($self->{state} == TAG_OPEN_STATE) {
501 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
502 if ($self->{nc} == 0x002F) { # /
503
504
505 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
506 $self->{line_prev} = $self->{line};
507 $self->{column_prev} = $self->{column};
508 $self->{column}++;
509 $self->{nc}
510 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
511 } else {
512 $self->{set_nc}->($self);
513 }
514
515 $self->{state} = CLOSE_TAG_OPEN_STATE;
516 redo A;
517 } elsif ($self->{nc} == 0x0021) { # !
518
519 $self->{s_kwd} = '<' unless $self->{escape};
520 #
521 } else {
522
523 #
524 }
525
526 ## reconsume
527 $self->{state} = DATA_STATE;
528 $self->{s_kwd} = '';
529 return ({type => CHARACTER_TOKEN, data => '<',
530 line => $self->{line_prev},
531 column => $self->{column_prev},
532 });
533 redo A;
534 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
535 if ($self->{nc} == 0x0021) { # !
536
537 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
538
539 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
540 $self->{line_prev} = $self->{line};
541 $self->{column_prev} = $self->{column};
542 $self->{column}++;
543 $self->{nc}
544 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
545 } else {
546 $self->{set_nc}->($self);
547 }
548
549 redo A;
550 } elsif ($self->{nc} == 0x002F) { # /
551
552 $self->{state} = CLOSE_TAG_OPEN_STATE;
553
554 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
555 $self->{line_prev} = $self->{line};
556 $self->{column_prev} = $self->{column};
557 $self->{column}++;
558 $self->{nc}
559 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
560 } else {
561 $self->{set_nc}->($self);
562 }
563
564 redo A;
565 } elsif (0x0041 <= $self->{nc} and
566 $self->{nc} <= 0x005A) { # A..Z
567
568 $self->{ct}
569 = {type => START_TAG_TOKEN,
570 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
571 line => $self->{line_prev},
572 column => $self->{column_prev}};
573 $self->{state} = TAG_NAME_STATE;
574
575 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
576 $self->{line_prev} = $self->{line};
577 $self->{column_prev} = $self->{column};
578 $self->{column}++;
579 $self->{nc}
580 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
581 } else {
582 $self->{set_nc}->($self);
583 }
584
585 redo A;
586 } elsif (0x0061 <= $self->{nc} and
587 $self->{nc} <= 0x007A) { # a..z
588
589 $self->{ct} = {type => START_TAG_TOKEN,
590 tag_name => chr ($self->{nc}),
591 line => $self->{line_prev},
592 column => $self->{column_prev}};
593 $self->{state} = TAG_NAME_STATE;
594
595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596 $self->{line_prev} = $self->{line};
597 $self->{column_prev} = $self->{column};
598 $self->{column}++;
599 $self->{nc}
600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601 } else {
602 $self->{set_nc}->($self);
603 }
604
605 redo A;
606 } elsif ($self->{nc} == 0x003E) { # >
607
608 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
609 line => $self->{line_prev},
610 column => $self->{column_prev});
611 $self->{state} = DATA_STATE;
612 $self->{s_kwd} = '';
613
614 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
615 $self->{line_prev} = $self->{line};
616 $self->{column_prev} = $self->{column};
617 $self->{column}++;
618 $self->{nc}
619 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
620 } else {
621 $self->{set_nc}->($self);
622 }
623
624
625 return ({type => CHARACTER_TOKEN, data => '<>',
626 line => $self->{line_prev},
627 column => $self->{column_prev},
628 });
629
630 redo A;
631 } elsif ($self->{nc} == 0x003F) { # ?
632
633 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
634 line => $self->{line_prev},
635 column => $self->{column_prev});
636 $self->{state} = BOGUS_COMMENT_STATE;
637 $self->{ct} = {type => COMMENT_TOKEN, data => '',
638 line => $self->{line_prev},
639 column => $self->{column_prev},
640 };
641 ## $self->{nc} is intentionally left as is
642 redo A;
643 } else {
644
645 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
646 line => $self->{line_prev},
647 column => $self->{column_prev});
648 $self->{state} = DATA_STATE;
649 $self->{s_kwd} = '';
650 ## reconsume
651
652 return ({type => CHARACTER_TOKEN, data => '<',
653 line => $self->{line_prev},
654 column => $self->{column_prev},
655 });
656
657 redo A;
658 }
659 } else {
660 die "$0: $self->{content_model} in tag open";
661 }
662 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
663 ## NOTE: The "close tag open state" in the spec is implemented as
664 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
665
666 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
667 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
668 if (defined $self->{last_stag_name}) {
669 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
670 $self->{s_kwd} = '';
671 ## Reconsume.
672 redo A;
673 } else {
674 ## No start tag token has ever been emitted
675 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
676
677 $self->{state} = DATA_STATE;
678 $self->{s_kwd} = '';
679 ## Reconsume.
680 return ({type => CHARACTER_TOKEN, data => '</',
681 line => $l, column => $c,
682 });
683 redo A;
684 }
685 }
686
687 if (0x0041 <= $self->{nc} and
688 $self->{nc} <= 0x005A) { # A..Z
689
690 $self->{ct}
691 = {type => END_TAG_TOKEN,
692 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
693 line => $l, column => $c};
694 $self->{state} = TAG_NAME_STATE;
695
696 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
697 $self->{line_prev} = $self->{line};
698 $self->{column_prev} = $self->{column};
699 $self->{column}++;
700 $self->{nc}
701 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
702 } else {
703 $self->{set_nc}->($self);
704 }
705
706 redo A;
707 } elsif (0x0061 <= $self->{nc} and
708 $self->{nc} <= 0x007A) { # a..z
709
710 $self->{ct} = {type => END_TAG_TOKEN,
711 tag_name => chr ($self->{nc}),
712 line => $l, column => $c};
713 $self->{state} = TAG_NAME_STATE;
714
715 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
716 $self->{line_prev} = $self->{line};
717 $self->{column_prev} = $self->{column};
718 $self->{column}++;
719 $self->{nc}
720 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
721 } else {
722 $self->{set_nc}->($self);
723 }
724
725 redo A;
726 } elsif ($self->{nc} == 0x003E) { # >
727
728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
729 line => $self->{line_prev}, ## "<" in "</>"
730 column => $self->{column_prev} - 1);
731 $self->{state} = DATA_STATE;
732 $self->{s_kwd} = '';
733
734 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
735 $self->{line_prev} = $self->{line};
736 $self->{column_prev} = $self->{column};
737 $self->{column}++;
738 $self->{nc}
739 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
740 } else {
741 $self->{set_nc}->($self);
742 }
743
744 redo A;
745 } elsif ($self->{nc} == -1) {
746
747 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
748 $self->{s_kwd} = '';
749 $self->{state} = DATA_STATE;
750 # reconsume
751
752 return ({type => CHARACTER_TOKEN, data => '</',
753 line => $l, column => $c,
754 });
755
756 redo A;
757 } else {
758
759 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
760 $self->{state} = BOGUS_COMMENT_STATE;
761 $self->{ct} = {type => COMMENT_TOKEN, data => '',
762 line => $self->{line_prev}, # "<" of "</"
763 column => $self->{column_prev} - 1,
764 };
765 ## NOTE: $self->{nc} is intentionally left as is.
766 ## Although the "anything else" case of the spec not explicitly
767 ## states that the next input character is to be reconsumed,
768 ## it will be included to the |data| of the comment token
769 ## generated from the bogus end tag, as defined in the
770 ## "bogus comment state" entry.
771 redo A;
772 }
773 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
774 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
775 if (length $ch) {
776 my $CH = $ch;
777 $ch =~ tr/a-z/A-Z/;
778 my $nch = chr $self->{nc};
779 if ($nch eq $ch or $nch eq $CH) {
780
781 ## Stay in the state.
782 $self->{s_kwd} .= $nch;
783
784 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
785 $self->{line_prev} = $self->{line};
786 $self->{column_prev} = $self->{column};
787 $self->{column}++;
788 $self->{nc}
789 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
790 } else {
791 $self->{set_nc}->($self);
792 }
793
794 redo A;
795 } else {
796
797 $self->{state} = DATA_STATE;
798 $self->{s_kwd} = '';
799 ## Reconsume.
800 return ({type => CHARACTER_TOKEN,
801 data => '</' . $self->{s_kwd},
802 line => $self->{line_prev},
803 column => $self->{column_prev} - 1 - length $self->{s_kwd},
804 });
805 redo A;
806 }
807 } else { # after "<{tag-name}"
808 unless ($is_space->{$self->{nc}} or
809 {
810 0x003E => 1, # >
811 0x002F => 1, # /
812 -1 => 1, # EOF
813 }->{$self->{nc}}) {
814
815 ## Reconsume.
816 $self->{state} = DATA_STATE;
817 $self->{s_kwd} = '';
818 return ({type => CHARACTER_TOKEN,
819 data => '</' . $self->{s_kwd},
820 line => $self->{line_prev},
821 column => $self->{column_prev} - 1 - length $self->{s_kwd},
822 });
823 redo A;
824 } else {
825
826 $self->{ct}
827 = {type => END_TAG_TOKEN,
828 tag_name => $self->{last_stag_name},
829 line => $self->{line_prev},
830 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
831 $self->{state} = TAG_NAME_STATE;
832 ## Reconsume.
833 redo A;
834 }
835 }
836 } elsif ($self->{state} == TAG_NAME_STATE) {
837 if ($is_space->{$self->{nc}}) {
838
839 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
840
841 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
842 $self->{line_prev} = $self->{line};
843 $self->{column_prev} = $self->{column};
844 $self->{column}++;
845 $self->{nc}
846 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
847 } else {
848 $self->{set_nc}->($self);
849 }
850
851 redo A;
852 } elsif ($self->{nc} == 0x003E) { # >
853 if ($self->{ct}->{type} == START_TAG_TOKEN) {
854
855 $self->{last_stag_name} = $self->{ct}->{tag_name};
856 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
857 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
858 #if ($self->{ct}->{attributes}) {
859 # ## NOTE: This should never be reached.
860 # !!! cp (36);
861 # !!! parse-error (type => 'end tag attribute');
862 #} else {
863
864 #}
865 } else {
866 die "$0: $self->{ct}->{type}: Unknown token type";
867 }
868 $self->{state} = DATA_STATE;
869 $self->{s_kwd} = '';
870
871 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
872 $self->{line_prev} = $self->{line};
873 $self->{column_prev} = $self->{column};
874 $self->{column}++;
875 $self->{nc}
876 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
877 } else {
878 $self->{set_nc}->($self);
879 }
880
881
882 return ($self->{ct}); # start tag or end tag
883
884 redo A;
885 } elsif (0x0041 <= $self->{nc} and
886 $self->{nc} <= 0x005A) { # A..Z
887
888 $self->{ct}->{tag_name}
889 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
890 # start tag or end tag
891 ## Stay in this state
892
893 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
894 $self->{line_prev} = $self->{line};
895 $self->{column_prev} = $self->{column};
896 $self->{column}++;
897 $self->{nc}
898 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
899 } else {
900 $self->{set_nc}->($self);
901 }
902
903 redo A;
904 } elsif ($self->{nc} == -1) {
905 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
906 if ($self->{ct}->{type} == START_TAG_TOKEN) {
907
908 $self->{last_stag_name} = $self->{ct}->{tag_name};
909 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
910 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
911 #if ($self->{ct}->{attributes}) {
912 # ## NOTE: This state should never be reached.
913 # !!! cp (40);
914 # !!! parse-error (type => 'end tag attribute');
915 #} else {
916
917 #}
918 } else {
919 die "$0: $self->{ct}->{type}: Unknown token type";
920 }
921 $self->{state} = DATA_STATE;
922 $self->{s_kwd} = '';
923 # reconsume
924
925 return ($self->{ct}); # start tag or end tag
926
927 redo A;
928 } elsif ($self->{nc} == 0x002F) { # /
929
930 $self->{state} = SELF_CLOSING_START_TAG_STATE;
931
932 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
933 $self->{line_prev} = $self->{line};
934 $self->{column_prev} = $self->{column};
935 $self->{column}++;
936 $self->{nc}
937 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
938 } else {
939 $self->{set_nc}->($self);
940 }
941
942 redo A;
943 } else {
944
945 $self->{ct}->{tag_name} .= chr $self->{nc};
946 # start tag or end tag
947 ## Stay in the state
948
949 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
950 $self->{line_prev} = $self->{line};
951 $self->{column_prev} = $self->{column};
952 $self->{column}++;
953 $self->{nc}
954 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
955 } else {
956 $self->{set_nc}->($self);
957 }
958
959 redo A;
960 }
961 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
962 if ($is_space->{$self->{nc}}) {
963
964 ## Stay in the state
965
966 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
967 $self->{line_prev} = $self->{line};
968 $self->{column_prev} = $self->{column};
969 $self->{column}++;
970 $self->{nc}
971 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
972 } else {
973 $self->{set_nc}->($self);
974 }
975
976 redo A;
977 } elsif ($self->{nc} == 0x003E) { # >
978 if ($self->{ct}->{type} == START_TAG_TOKEN) {
979
980 $self->{last_stag_name} = $self->{ct}->{tag_name};
981 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
982 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
983 if ($self->{ct}->{attributes}) {
984
985 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
986 } else {
987
988 }
989 } else {
990 die "$0: $self->{ct}->{type}: Unknown token type";
991 }
992 $self->{state} = DATA_STATE;
993 $self->{s_kwd} = '';
994
995 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
996 $self->{line_prev} = $self->{line};
997 $self->{column_prev} = $self->{column};
998 $self->{column}++;
999 $self->{nc}
1000 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1001 } else {
1002 $self->{set_nc}->($self);
1003 }
1004
1005
1006 return ($self->{ct}); # start tag or end tag
1007
1008 redo A;
1009 } elsif (0x0041 <= $self->{nc} and
1010 $self->{nc} <= 0x005A) { # A..Z
1011
1012 $self->{ca}
1013 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1014 value => '',
1015 line => $self->{line}, column => $self->{column}};
1016 $self->{state} = ATTRIBUTE_NAME_STATE;
1017
1018 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1019 $self->{line_prev} = $self->{line};
1020 $self->{column_prev} = $self->{column};
1021 $self->{column}++;
1022 $self->{nc}
1023 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1024 } else {
1025 $self->{set_nc}->($self);
1026 }
1027
1028 redo A;
1029 } elsif ($self->{nc} == 0x002F) { # /
1030
1031 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1032
1033 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1034 $self->{line_prev} = $self->{line};
1035 $self->{column_prev} = $self->{column};
1036 $self->{column}++;
1037 $self->{nc}
1038 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1039 } else {
1040 $self->{set_nc}->($self);
1041 }
1042
1043 redo A;
1044 } elsif ($self->{nc} == -1) {
1045 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1046 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1047
1048 $self->{last_stag_name} = $self->{ct}->{tag_name};
1049 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1050 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1051 if ($self->{ct}->{attributes}) {
1052
1053 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1054 } else {
1055
1056 }
1057 } else {
1058 die "$0: $self->{ct}->{type}: Unknown token type";
1059 }
1060 $self->{state} = DATA_STATE;
1061 $self->{s_kwd} = '';
1062 # reconsume
1063
1064 return ($self->{ct}); # start tag or end tag
1065
1066 redo A;
1067 } else {
1068 if ({
1069 0x0022 => 1, # "
1070 0x0027 => 1, # '
1071 0x003D => 1, # =
1072 }->{$self->{nc}}) {
1073
1074 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1075 } else {
1076
1077 }
1078 $self->{ca}
1079 = {name => chr ($self->{nc}),
1080 value => '',
1081 line => $self->{line}, column => $self->{column}};
1082 $self->{state} = ATTRIBUTE_NAME_STATE;
1083
1084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1085 $self->{line_prev} = $self->{line};
1086 $self->{column_prev} = $self->{column};
1087 $self->{column}++;
1088 $self->{nc}
1089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1090 } else {
1091 $self->{set_nc}->($self);
1092 }
1093
1094 redo A;
1095 }
1096 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1097 my $before_leave = sub {
1098 if (exists $self->{ct}->{attributes} # start tag or end tag
1099 ->{$self->{ca}->{name}}) { # MUST
1100
1101 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1102 ## Discard $self->{ca} # MUST
1103 } else {
1104
1105 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1106 = $self->{ca};
1107 }
1108 }; # $before_leave
1109
1110 if ($is_space->{$self->{nc}}) {
1111
1112 $before_leave->();
1113 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1114
1115 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1116 $self->{line_prev} = $self->{line};
1117 $self->{column_prev} = $self->{column};
1118 $self->{column}++;
1119 $self->{nc}
1120 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1121 } else {
1122 $self->{set_nc}->($self);
1123 }
1124
1125 redo A;
1126 } elsif ($self->{nc} == 0x003D) { # =
1127
1128 $before_leave->();
1129 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1130
1131 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1132 $self->{line_prev} = $self->{line};
1133 $self->{column_prev} = $self->{column};
1134 $self->{column}++;
1135 $self->{nc}
1136 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1137 } else {
1138 $self->{set_nc}->($self);
1139 }
1140
1141 redo A;
1142 } elsif ($self->{nc} == 0x003E) { # >
1143 $before_leave->();
1144 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1145
1146 $self->{last_stag_name} = $self->{ct}->{tag_name};
1147 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1148
1149 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1150 if ($self->{ct}->{attributes}) {
1151 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1152 }
1153 } else {
1154 die "$0: $self->{ct}->{type}: Unknown token type";
1155 }
1156 $self->{state} = DATA_STATE;
1157 $self->{s_kwd} = '';
1158
1159 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1160 $self->{line_prev} = $self->{line};
1161 $self->{column_prev} = $self->{column};
1162 $self->{column}++;
1163 $self->{nc}
1164 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1165 } else {
1166 $self->{set_nc}->($self);
1167 }
1168
1169
1170 return ($self->{ct}); # start tag or end tag
1171
1172 redo A;
1173 } elsif (0x0041 <= $self->{nc} and
1174 $self->{nc} <= 0x005A) { # A..Z
1175
1176 $self->{ca}->{name}
1177 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1178 ## Stay in the state
1179
1180 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1181 $self->{line_prev} = $self->{line};
1182 $self->{column_prev} = $self->{column};
1183 $self->{column}++;
1184 $self->{nc}
1185 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1186 } else {
1187 $self->{set_nc}->($self);
1188 }
1189
1190 redo A;
1191 } elsif ($self->{nc} == 0x002F) { # /
1192
1193 $before_leave->();
1194 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1195
1196 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1197 $self->{line_prev} = $self->{line};
1198 $self->{column_prev} = $self->{column};
1199 $self->{column}++;
1200 $self->{nc}
1201 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1202 } else {
1203 $self->{set_nc}->($self);
1204 }
1205
1206 redo A;
1207 } elsif ($self->{nc} == -1) {
1208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1209 $before_leave->();
1210 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1211
1212 $self->{last_stag_name} = $self->{ct}->{tag_name};
1213 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1214 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1215 if ($self->{ct}->{attributes}) {
1216
1217 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1218 } else {
1219 ## NOTE: This state should never be reached.
1220
1221 }
1222 } else {
1223 die "$0: $self->{ct}->{type}: Unknown token type";
1224 }
1225 $self->{state} = DATA_STATE;
1226 $self->{s_kwd} = '';
1227 # reconsume
1228
1229 return ($self->{ct}); # start tag or end tag
1230
1231 redo A;
1232 } else {
1233 if ($self->{nc} == 0x0022 or # "
1234 $self->{nc} == 0x0027) { # '
1235
1236 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1237 } else {
1238
1239 }
1240 $self->{ca}->{name} .= chr ($self->{nc});
1241 ## Stay in the state
1242
1243 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1244 $self->{line_prev} = $self->{line};
1245 $self->{column_prev} = $self->{column};
1246 $self->{column}++;
1247 $self->{nc}
1248 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1249 } else {
1250 $self->{set_nc}->($self);
1251 }
1252
1253 redo A;
1254 }
1255 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1256 if ($is_space->{$self->{nc}}) {
1257
1258 ## Stay in the state
1259
1260 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1261 $self->{line_prev} = $self->{line};
1262 $self->{column_prev} = $self->{column};
1263 $self->{column}++;
1264 $self->{nc}
1265 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1266 } else {
1267 $self->{set_nc}->($self);
1268 }
1269
1270 redo A;
1271 } elsif ($self->{nc} == 0x003D) { # =
1272
1273 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1274
1275 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1276 $self->{line_prev} = $self->{line};
1277 $self->{column_prev} = $self->{column};
1278 $self->{column}++;
1279 $self->{nc}
1280 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1281 } else {
1282 $self->{set_nc}->($self);
1283 }
1284
1285 redo A;
1286 } elsif ($self->{nc} == 0x003E) { # >
1287 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1288
1289 $self->{last_stag_name} = $self->{ct}->{tag_name};
1290 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1291 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1292 if ($self->{ct}->{attributes}) {
1293
1294 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1295 } else {
1296 ## NOTE: This state should never be reached.
1297
1298 }
1299 } else {
1300 die "$0: $self->{ct}->{type}: Unknown token type";
1301 }
1302 $self->{state} = DATA_STATE;
1303 $self->{s_kwd} = '';
1304
1305 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1306 $self->{line_prev} = $self->{line};
1307 $self->{column_prev} = $self->{column};
1308 $self->{column}++;
1309 $self->{nc}
1310 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1311 } else {
1312 $self->{set_nc}->($self);
1313 }
1314
1315
1316 return ($self->{ct}); # start tag or end tag
1317
1318 redo A;
1319 } elsif (0x0041 <= $self->{nc} and
1320 $self->{nc} <= 0x005A) { # A..Z
1321
1322 $self->{ca}
1323 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1324 value => '',
1325 line => $self->{line}, column => $self->{column}};
1326 $self->{state} = ATTRIBUTE_NAME_STATE;
1327
1328 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1329 $self->{line_prev} = $self->{line};
1330 $self->{column_prev} = $self->{column};
1331 $self->{column}++;
1332 $self->{nc}
1333 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1334 } else {
1335 $self->{set_nc}->($self);
1336 }
1337
1338 redo A;
1339 } elsif ($self->{nc} == 0x002F) { # /
1340
1341 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1342
1343 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1344 $self->{line_prev} = $self->{line};
1345 $self->{column_prev} = $self->{column};
1346 $self->{column}++;
1347 $self->{nc}
1348 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1349 } else {
1350 $self->{set_nc}->($self);
1351 }
1352
1353 redo A;
1354 } elsif ($self->{nc} == -1) {
1355 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1356 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1357
1358 $self->{last_stag_name} = $self->{ct}->{tag_name};
1359 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1360 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1361 if ($self->{ct}->{attributes}) {
1362
1363 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1364 } else {
1365 ## NOTE: This state should never be reached.
1366
1367 }
1368 } else {
1369 die "$0: $self->{ct}->{type}: Unknown token type";
1370 }
1371 $self->{s_kwd} = '';
1372 $self->{state} = DATA_STATE;
1373 # reconsume
1374
1375 return ($self->{ct}); # start tag or end tag
1376
1377 redo A;
1378 } else {
1379 if ($self->{nc} == 0x0022 or # "
1380 $self->{nc} == 0x0027) { # '
1381
1382 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1383 } else {
1384
1385 }
1386 $self->{ca}
1387 = {name => chr ($self->{nc}),
1388 value => '',
1389 line => $self->{line}, column => $self->{column}};
1390 $self->{state} = ATTRIBUTE_NAME_STATE;
1391
1392 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1393 $self->{line_prev} = $self->{line};
1394 $self->{column_prev} = $self->{column};
1395 $self->{column}++;
1396 $self->{nc}
1397 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1398 } else {
1399 $self->{set_nc}->($self);
1400 }
1401
1402 redo A;
1403 }
1404 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1405 if ($is_space->{$self->{nc}}) {
1406
1407 ## Stay in the state
1408
1409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1410 $self->{line_prev} = $self->{line};
1411 $self->{column_prev} = $self->{column};
1412 $self->{column}++;
1413 $self->{nc}
1414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1415 } else {
1416 $self->{set_nc}->($self);
1417 }
1418
1419 redo A;
1420 } elsif ($self->{nc} == 0x0022) { # "
1421
1422 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1423
1424 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1425 $self->{line_prev} = $self->{line};
1426 $self->{column_prev} = $self->{column};
1427 $self->{column}++;
1428 $self->{nc}
1429 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1430 } else {
1431 $self->{set_nc}->($self);
1432 }
1433
1434 redo A;
1435 } elsif ($self->{nc} == 0x0026) { # &
1436
1437 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1438 ## reconsume
1439 redo A;
1440 } elsif ($self->{nc} == 0x0027) { # '
1441
1442 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1443
1444 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445 $self->{line_prev} = $self->{line};
1446 $self->{column_prev} = $self->{column};
1447 $self->{column}++;
1448 $self->{nc}
1449 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450 } else {
1451 $self->{set_nc}->($self);
1452 }
1453
1454 redo A;
1455 } elsif ($self->{nc} == 0x003E) { # >
1456 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1457 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1458
1459 $self->{last_stag_name} = $self->{ct}->{tag_name};
1460 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1461 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1462 if ($self->{ct}->{attributes}) {
1463
1464 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1465 } else {
1466 ## NOTE: This state should never be reached.
1467
1468 }
1469 } else {
1470 die "$0: $self->{ct}->{type}: Unknown token type";
1471 }
1472 $self->{state} = DATA_STATE;
1473 $self->{s_kwd} = '';
1474
1475 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1476 $self->{line_prev} = $self->{line};
1477 $self->{column_prev} = $self->{column};
1478 $self->{column}++;
1479 $self->{nc}
1480 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1481 } else {
1482 $self->{set_nc}->($self);
1483 }
1484
1485
1486 return ($self->{ct}); # start tag or end tag
1487
1488 redo A;
1489 } elsif ($self->{nc} == -1) {
1490 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1491 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1492
1493 $self->{last_stag_name} = $self->{ct}->{tag_name};
1494 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1495 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1496 if ($self->{ct}->{attributes}) {
1497
1498 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1499 } else {
1500 ## NOTE: This state should never be reached.
1501
1502 }
1503 } else {
1504 die "$0: $self->{ct}->{type}: Unknown token type";
1505 }
1506 $self->{state} = DATA_STATE;
1507 $self->{s_kwd} = '';
1508 ## reconsume
1509
1510 return ($self->{ct}); # start tag or end tag
1511
1512 redo A;
1513 } else {
1514 if ($self->{nc} == 0x003D) { # =
1515
1516 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1517 } else {
1518
1519 }
1520 $self->{ca}->{value} .= chr ($self->{nc});
1521 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1522
1523 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1524 $self->{line_prev} = $self->{line};
1525 $self->{column_prev} = $self->{column};
1526 $self->{column}++;
1527 $self->{nc}
1528 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1529 } else {
1530 $self->{set_nc}->($self);
1531 }
1532
1533 redo A;
1534 }
1535 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1536 if ($self->{nc} == 0x0022) { # "
1537
1538 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1539
1540 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1541 $self->{line_prev} = $self->{line};
1542 $self->{column_prev} = $self->{column};
1543 $self->{column}++;
1544 $self->{nc}
1545 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1546 } else {
1547 $self->{set_nc}->($self);
1548 }
1549
1550 redo A;
1551 } elsif ($self->{nc} == 0x0026) { # &
1552
1553 ## NOTE: In the spec, the tokenizer is switched to the
1554 ## "entity in attribute value state". In this implementation, the
1555 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1556 ## implementation of the "consume a character reference" algorithm.
1557 $self->{prev_state} = $self->{state};
1558 $self->{entity_add} = 0x0022; # "
1559 $self->{state} = ENTITY_STATE;
1560
1561 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562 $self->{line_prev} = $self->{line};
1563 $self->{column_prev} = $self->{column};
1564 $self->{column}++;
1565 $self->{nc}
1566 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567 } else {
1568 $self->{set_nc}->($self);
1569 }
1570
1571 redo A;
1572 } elsif ($self->{nc} == -1) {
1573 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1574 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575
1576 $self->{last_stag_name} = $self->{ct}->{tag_name};
1577 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579 if ($self->{ct}->{attributes}) {
1580
1581 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582 } else {
1583 ## NOTE: This state should never be reached.
1584
1585 }
1586 } else {
1587 die "$0: $self->{ct}->{type}: Unknown token type";
1588 }
1589 $self->{state} = DATA_STATE;
1590 $self->{s_kwd} = '';
1591 ## reconsume
1592
1593 return ($self->{ct}); # start tag or end tag
1594
1595 redo A;
1596 } else {
1597
1598 $self->{ca}->{value} .= chr ($self->{nc});
1599 $self->{read_until}->($self->{ca}->{value},
1600 q["&],
1601 length $self->{ca}->{value});
1602
1603 ## Stay in the state
1604
1605 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1606 $self->{line_prev} = $self->{line};
1607 $self->{column_prev} = $self->{column};
1608 $self->{column}++;
1609 $self->{nc}
1610 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1611 } else {
1612 $self->{set_nc}->($self);
1613 }
1614
1615 redo A;
1616 }
1617 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1618 if ($self->{nc} == 0x0027) { # '
1619
1620 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1621
1622 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1623 $self->{line_prev} = $self->{line};
1624 $self->{column_prev} = $self->{column};
1625 $self->{column}++;
1626 $self->{nc}
1627 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1628 } else {
1629 $self->{set_nc}->($self);
1630 }
1631
1632 redo A;
1633 } elsif ($self->{nc} == 0x0026) { # &
1634
1635 ## NOTE: In the spec, the tokenizer is switched to the
1636 ## "entity in attribute value state". In this implementation, the
1637 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1638 ## implementation of the "consume a character reference" algorithm.
1639 $self->{entity_add} = 0x0027; # '
1640 $self->{prev_state} = $self->{state};
1641 $self->{state} = ENTITY_STATE;
1642
1643 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1644 $self->{line_prev} = $self->{line};
1645 $self->{column_prev} = $self->{column};
1646 $self->{column}++;
1647 $self->{nc}
1648 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1649 } else {
1650 $self->{set_nc}->($self);
1651 }
1652
1653 redo A;
1654 } elsif ($self->{nc} == -1) {
1655 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1656 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1657
1658 $self->{last_stag_name} = $self->{ct}->{tag_name};
1659 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1660 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1661 if ($self->{ct}->{attributes}) {
1662
1663 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1664 } else {
1665 ## NOTE: This state should never be reached.
1666
1667 }
1668 } else {
1669 die "$0: $self->{ct}->{type}: Unknown token type";
1670 }
1671 $self->{state} = DATA_STATE;
1672 $self->{s_kwd} = '';
1673 ## reconsume
1674
1675 return ($self->{ct}); # start tag or end tag
1676
1677 redo A;
1678 } else {
1679
1680 $self->{ca}->{value} .= chr ($self->{nc});
1681 $self->{read_until}->($self->{ca}->{value},
1682 q['&],
1683 length $self->{ca}->{value});
1684
1685 ## Stay in the state
1686
1687 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1688 $self->{line_prev} = $self->{line};
1689 $self->{column_prev} = $self->{column};
1690 $self->{column}++;
1691 $self->{nc}
1692 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1693 } else {
1694 $self->{set_nc}->($self);
1695 }
1696
1697 redo A;
1698 }
1699 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1700 if ($is_space->{$self->{nc}}) {
1701
1702 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1703
1704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705 $self->{line_prev} = $self->{line};
1706 $self->{column_prev} = $self->{column};
1707 $self->{column}++;
1708 $self->{nc}
1709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710 } else {
1711 $self->{set_nc}->($self);
1712 }
1713
1714 redo A;
1715 } elsif ($self->{nc} == 0x0026) { # &
1716
1717 ## NOTE: In the spec, the tokenizer is switched to the
1718 ## "entity in attribute value state". In this implementation, the
1719 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1720 ## implementation of the "consume a character reference" algorithm.
1721 $self->{entity_add} = -1;
1722 $self->{prev_state} = $self->{state};
1723 $self->{state} = ENTITY_STATE;
1724
1725 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1726 $self->{line_prev} = $self->{line};
1727 $self->{column_prev} = $self->{column};
1728 $self->{column}++;
1729 $self->{nc}
1730 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1731 } else {
1732 $self->{set_nc}->($self);
1733 }
1734
1735 redo A;
1736 } elsif ($self->{nc} == 0x003E) { # >
1737 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1738
1739 $self->{last_stag_name} = $self->{ct}->{tag_name};
1740 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1741 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1742 if ($self->{ct}->{attributes}) {
1743
1744 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1745 } else {
1746 ## NOTE: This state should never be reached.
1747
1748 }
1749 } else {
1750 die "$0: $self->{ct}->{type}: Unknown token type";
1751 }
1752 $self->{state} = DATA_STATE;
1753 $self->{s_kwd} = '';
1754
1755 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1756 $self->{line_prev} = $self->{line};
1757 $self->{column_prev} = $self->{column};
1758 $self->{column}++;
1759 $self->{nc}
1760 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1761 } else {
1762 $self->{set_nc}->($self);
1763 }
1764
1765
1766 return ($self->{ct}); # start tag or end tag
1767
1768 redo A;
1769 } elsif ($self->{nc} == -1) {
1770 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1771 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1772
1773 $self->{last_stag_name} = $self->{ct}->{tag_name};
1774 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1775 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1776 if ($self->{ct}->{attributes}) {
1777
1778 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1779 } else {
1780 ## NOTE: This state should never be reached.
1781
1782 }
1783 } else {
1784 die "$0: $self->{ct}->{type}: Unknown token type";
1785 }
1786 $self->{state} = DATA_STATE;
1787 $self->{s_kwd} = '';
1788 ## reconsume
1789
1790 return ($self->{ct}); # start tag or end tag
1791
1792 redo A;
1793 } else {
1794 if ({
1795 0x0022 => 1, # "
1796 0x0027 => 1, # '
1797 0x003D => 1, # =
1798 }->{$self->{nc}}) {
1799
1800 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1801 } else {
1802
1803 }
1804 $self->{ca}->{value} .= chr ($self->{nc});
1805 $self->{read_until}->($self->{ca}->{value},
1806 q["'=& >],
1807 length $self->{ca}->{value});
1808
1809 ## Stay in the state
1810
1811 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1812 $self->{line_prev} = $self->{line};
1813 $self->{column_prev} = $self->{column};
1814 $self->{column}++;
1815 $self->{nc}
1816 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1817 } else {
1818 $self->{set_nc}->($self);
1819 }
1820
1821 redo A;
1822 }
1823 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1824 if ($is_space->{$self->{nc}}) {
1825
1826 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1827
1828 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1829 $self->{line_prev} = $self->{line};
1830 $self->{column_prev} = $self->{column};
1831 $self->{column}++;
1832 $self->{nc}
1833 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1834 } else {
1835 $self->{set_nc}->($self);
1836 }
1837
1838 redo A;
1839 } elsif ($self->{nc} == 0x003E) { # >
1840 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1841
1842 $self->{last_stag_name} = $self->{ct}->{tag_name};
1843 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1844 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1845 if ($self->{ct}->{attributes}) {
1846
1847 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1848 } else {
1849 ## NOTE: This state should never be reached.
1850
1851 }
1852 } else {
1853 die "$0: $self->{ct}->{type}: Unknown token type";
1854 }
1855 $self->{state} = DATA_STATE;
1856 $self->{s_kwd} = '';
1857
1858 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1859 $self->{line_prev} = $self->{line};
1860 $self->{column_prev} = $self->{column};
1861 $self->{column}++;
1862 $self->{nc}
1863 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1864 } else {
1865 $self->{set_nc}->($self);
1866 }
1867
1868
1869 return ($self->{ct}); # start tag or end tag
1870
1871 redo A;
1872 } elsif ($self->{nc} == 0x002F) { # /
1873
1874 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1875
1876 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1877 $self->{line_prev} = $self->{line};
1878 $self->{column_prev} = $self->{column};
1879 $self->{column}++;
1880 $self->{nc}
1881 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1882 } else {
1883 $self->{set_nc}->($self);
1884 }
1885
1886 redo A;
1887 } elsif ($self->{nc} == -1) {
1888 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1889 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1890
1891 $self->{last_stag_name} = $self->{ct}->{tag_name};
1892 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1893 if ($self->{ct}->{attributes}) {
1894
1895 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1896 } else {
1897 ## NOTE: This state should never be reached.
1898
1899 }
1900 } else {
1901 die "$0: $self->{ct}->{type}: Unknown token type";
1902 }
1903 $self->{state} = DATA_STATE;
1904 $self->{s_kwd} = '';
1905 ## Reconsume.
1906 return ($self->{ct}); # start tag or end tag
1907 redo A;
1908 } else {
1909
1910 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1911 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1912 ## reconsume
1913 redo A;
1914 }
1915 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1916 if ($self->{nc} == 0x003E) { # >
1917 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1918
1919 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1920 ## TODO: Different type than slash in start tag
1921 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1922 if ($self->{ct}->{attributes}) {
1923
1924 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1925 } else {
1926
1927 }
1928 ## TODO: Test |<title></title/>|
1929 } else {
1930
1931 $self->{self_closing} = 1;
1932 }
1933
1934 $self->{state} = DATA_STATE;
1935 $self->{s_kwd} = '';
1936
1937 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1938 $self->{line_prev} = $self->{line};
1939 $self->{column_prev} = $self->{column};
1940 $self->{column}++;
1941 $self->{nc}
1942 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1943 } else {
1944 $self->{set_nc}->($self);
1945 }
1946
1947
1948 return ($self->{ct}); # start tag or end tag
1949
1950 redo A;
1951 } elsif ($self->{nc} == -1) {
1952 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1953 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1954
1955 $self->{last_stag_name} = $self->{ct}->{tag_name};
1956 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1957 if ($self->{ct}->{attributes}) {
1958
1959 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1960 } else {
1961 ## NOTE: This state should never be reached.
1962
1963 }
1964 } else {
1965 die "$0: $self->{ct}->{type}: Unknown token type";
1966 }
1967 $self->{state} = DATA_STATE;
1968 $self->{s_kwd} = '';
1969 ## Reconsume.
1970 return ($self->{ct}); # start tag or end tag
1971 redo A;
1972 } else {
1973
1974 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1975 ## TODO: This error type is wrong.
1976 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1977 ## Reconsume.
1978 redo A;
1979 }
1980 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1981 ## (only happen if PCDATA state)
1982
1983 ## NOTE: Unlike spec's "bogus comment state", this implementation
1984 ## consumes characters one-by-one basis.
1985
1986 if ($self->{nc} == 0x003E) { # >
1987
1988 $self->{state} = DATA_STATE;
1989 $self->{s_kwd} = '';
1990
1991 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1992 $self->{line_prev} = $self->{line};
1993 $self->{column_prev} = $self->{column};
1994 $self->{column}++;
1995 $self->{nc}
1996 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1997 } else {
1998 $self->{set_nc}->($self);
1999 }
2000
2001
2002 return ($self->{ct}); # comment
2003 redo A;
2004 } elsif ($self->{nc} == -1) {
2005
2006 $self->{state} = DATA_STATE;
2007 $self->{s_kwd} = '';
2008 ## reconsume
2009
2010 return ($self->{ct}); # comment
2011 redo A;
2012 } else {
2013
2014 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2015 $self->{read_until}->($self->{ct}->{data},
2016 q[>],
2017 length $self->{ct}->{data});
2018
2019 ## Stay in the state.
2020
2021 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2022 $self->{line_prev} = $self->{line};
2023 $self->{column_prev} = $self->{column};
2024 $self->{column}++;
2025 $self->{nc}
2026 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2027 } else {
2028 $self->{set_nc}->($self);
2029 }
2030
2031 redo A;
2032 }
2033 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2034 ## (only happen if PCDATA state)
2035
2036 if ($self->{nc} == 0x002D) { # -
2037
2038 $self->{state} = MD_HYPHEN_STATE;
2039
2040 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2041 $self->{line_prev} = $self->{line};
2042 $self->{column_prev} = $self->{column};
2043 $self->{column}++;
2044 $self->{nc}
2045 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2046 } else {
2047 $self->{set_nc}->($self);
2048 }
2049
2050 redo A;
2051 } elsif ($self->{nc} == 0x0044 or # D
2052 $self->{nc} == 0x0064) { # d
2053 ## ASCII case-insensitive.
2054
2055 $self->{state} = MD_DOCTYPE_STATE;
2056 $self->{s_kwd} = chr $self->{nc};
2057
2058 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2059 $self->{line_prev} = $self->{line};
2060 $self->{column_prev} = $self->{column};
2061 $self->{column}++;
2062 $self->{nc}
2063 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2064 } else {
2065 $self->{set_nc}->($self);
2066 }
2067
2068 redo A;
2069 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2070 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2071 $self->{is_xml}) and
2072 $self->{nc} == 0x005B) { # [
2073
2074 $self->{state} = MD_CDATA_STATE;
2075 $self->{s_kwd} = '[';
2076
2077 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2078 $self->{line_prev} = $self->{line};
2079 $self->{column_prev} = $self->{column};
2080 $self->{column}++;
2081 $self->{nc}
2082 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2083 } else {
2084 $self->{set_nc}->($self);
2085 }
2086
2087 redo A;
2088 } else {
2089
2090 }
2091
2092 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2093 line => $self->{line_prev},
2094 column => $self->{column_prev} - 1);
2095 ## Reconsume.
2096 $self->{state} = BOGUS_COMMENT_STATE;
2097 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2098 line => $self->{line_prev},
2099 column => $self->{column_prev} - 1,
2100 };
2101 redo A;
2102 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2103 if ($self->{nc} == 0x002D) { # -
2104
2105 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2106 line => $self->{line_prev},
2107 column => $self->{column_prev} - 2,
2108 };
2109 $self->{state} = COMMENT_START_STATE;
2110
2111 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2112 $self->{line_prev} = $self->{line};
2113 $self->{column_prev} = $self->{column};
2114 $self->{column}++;
2115 $self->{nc}
2116 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2117 } else {
2118 $self->{set_nc}->($self);
2119 }
2120
2121 redo A;
2122 } else {
2123
2124 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2125 line => $self->{line_prev},
2126 column => $self->{column_prev} - 2);
2127 $self->{state} = BOGUS_COMMENT_STATE;
2128 ## Reconsume.
2129 $self->{ct} = {type => COMMENT_TOKEN,
2130 data => '-',
2131 line => $self->{line_prev},
2132 column => $self->{column_prev} - 2,
2133 };
2134 redo A;
2135 }
2136 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2137 ## ASCII case-insensitive.
2138 if ($self->{nc} == [
2139 undef,
2140 0x004F, # O
2141 0x0043, # C
2142 0x0054, # T
2143 0x0059, # Y
2144 0x0050, # P
2145 ]->[length $self->{s_kwd}] or
2146 $self->{nc} == [
2147 undef,
2148 0x006F, # o
2149 0x0063, # c
2150 0x0074, # t
2151 0x0079, # y
2152 0x0070, # p
2153 ]->[length $self->{s_kwd}]) {
2154
2155 ## Stay in the state.
2156 $self->{s_kwd} .= chr $self->{nc};
2157
2158 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2159 $self->{line_prev} = $self->{line};
2160 $self->{column_prev} = $self->{column};
2161 $self->{column}++;
2162 $self->{nc}
2163 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2164 } else {
2165 $self->{set_nc}->($self);
2166 }
2167
2168 redo A;
2169 } elsif ((length $self->{s_kwd}) == 6 and
2170 ($self->{nc} == 0x0045 or # E
2171 $self->{nc} == 0x0065)) { # e
2172
2173 $self->{state} = DOCTYPE_STATE;
2174 $self->{ct} = {type => DOCTYPE_TOKEN,
2175 quirks => 1,
2176 line => $self->{line_prev},
2177 column => $self->{column_prev} - 7,
2178 };
2179
2180 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2181 $self->{line_prev} = $self->{line};
2182 $self->{column_prev} = $self->{column};
2183 $self->{column}++;
2184 $self->{nc}
2185 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2186 } else {
2187 $self->{set_nc}->($self);
2188 }
2189
2190 redo A;
2191 } else {
2192
2193 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2194 line => $self->{line_prev},
2195 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2196 $self->{state} = BOGUS_COMMENT_STATE;
2197 ## Reconsume.
2198 $self->{ct} = {type => COMMENT_TOKEN,
2199 data => $self->{s_kwd},
2200 line => $self->{line_prev},
2201 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2202 };
2203 redo A;
2204 }
2205 } elsif ($self->{state} == MD_CDATA_STATE) {
2206 if ($self->{nc} == {
2207 '[' => 0x0043, # C
2208 '[C' => 0x0044, # D
2209 '[CD' => 0x0041, # A
2210 '[CDA' => 0x0054, # T
2211 '[CDAT' => 0x0041, # A
2212 }->{$self->{s_kwd}}) {
2213
2214 ## Stay in the state.
2215 $self->{s_kwd} .= chr $self->{nc};
2216
2217 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2218 $self->{line_prev} = $self->{line};
2219 $self->{column_prev} = $self->{column};
2220 $self->{column}++;
2221 $self->{nc}
2222 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2223 } else {
2224 $self->{set_nc}->($self);
2225 }
2226
2227 redo A;
2228 } elsif ($self->{s_kwd} eq '[CDATA' and
2229 $self->{nc} == 0x005B) { # [
2230
2231 $self->{ct} = {type => CHARACTER_TOKEN,
2232 data => '',
2233 line => $self->{line_prev},
2234 column => $self->{column_prev} - 7};
2235 $self->{state} = CDATA_SECTION_STATE;
2236
2237 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2238 $self->{line_prev} = $self->{line};
2239 $self->{column_prev} = $self->{column};
2240 $self->{column}++;
2241 $self->{nc}
2242 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2243 } else {
2244 $self->{set_nc}->($self);
2245 }
2246
2247 redo A;
2248 } else {
2249
2250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2251 line => $self->{line_prev},
2252 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2253 $self->{state} = BOGUS_COMMENT_STATE;
2254 ## Reconsume.
2255 $self->{ct} = {type => COMMENT_TOKEN,
2256 data => $self->{s_kwd},
2257 line => $self->{line_prev},
2258 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2259 };
2260 redo A;
2261 }
2262 } elsif ($self->{state} == COMMENT_START_STATE) {
2263 if ($self->{nc} == 0x002D) { # -
2264
2265 $self->{state} = COMMENT_START_DASH_STATE;
2266
2267 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2268 $self->{line_prev} = $self->{line};
2269 $self->{column_prev} = $self->{column};
2270 $self->{column}++;
2271 $self->{nc}
2272 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2273 } else {
2274 $self->{set_nc}->($self);
2275 }
2276
2277 redo A;
2278 } elsif ($self->{nc} == 0x003E) { # >
2279
2280 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2281 $self->{state} = DATA_STATE;
2282 $self->{s_kwd} = '';
2283
2284 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2285 $self->{line_prev} = $self->{line};
2286 $self->{column_prev} = $self->{column};
2287 $self->{column}++;
2288 $self->{nc}
2289 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2290 } else {
2291 $self->{set_nc}->($self);
2292 }
2293
2294
2295 return ($self->{ct}); # comment
2296
2297 redo A;
2298 } elsif ($self->{nc} == -1) {
2299
2300 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2301 $self->{state} = DATA_STATE;
2302 $self->{s_kwd} = '';
2303 ## reconsume
2304
2305 return ($self->{ct}); # comment
2306
2307 redo A;
2308 } else {
2309
2310 $self->{ct}->{data} # comment
2311 .= chr ($self->{nc});
2312 $self->{state} = COMMENT_STATE;
2313
2314 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2315 $self->{line_prev} = $self->{line};
2316 $self->{column_prev} = $self->{column};
2317 $self->{column}++;
2318 $self->{nc}
2319 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2320 } else {
2321 $self->{set_nc}->($self);
2322 }
2323
2324 redo A;
2325 }
2326 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2327 if ($self->{nc} == 0x002D) { # -
2328
2329 $self->{state} = COMMENT_END_STATE;
2330
2331 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2332 $self->{line_prev} = $self->{line};
2333 $self->{column_prev} = $self->{column};
2334 $self->{column}++;
2335 $self->{nc}
2336 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2337 } else {
2338 $self->{set_nc}->($self);
2339 }
2340
2341 redo A;
2342 } elsif ($self->{nc} == 0x003E) { # >
2343
2344 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2345 $self->{state} = DATA_STATE;
2346 $self->{s_kwd} = '';
2347
2348 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2349 $self->{line_prev} = $self->{line};
2350 $self->{column_prev} = $self->{column};
2351 $self->{column}++;
2352 $self->{nc}
2353 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2354 } else {
2355 $self->{set_nc}->($self);
2356 }
2357
2358
2359 return ($self->{ct}); # comment
2360
2361 redo A;
2362 } elsif ($self->{nc} == -1) {
2363
2364 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2365 $self->{state} = DATA_STATE;
2366 $self->{s_kwd} = '';
2367 ## reconsume
2368
2369 return ($self->{ct}); # comment
2370
2371 redo A;
2372 } else {
2373
2374 $self->{ct}->{data} # comment
2375 .= '-' . chr ($self->{nc});
2376 $self->{state} = COMMENT_STATE;
2377
2378 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2379 $self->{line_prev} = $self->{line};
2380 $self->{column_prev} = $self->{column};
2381 $self->{column}++;
2382 $self->{nc}
2383 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2384 } else {
2385 $self->{set_nc}->($self);
2386 }
2387
2388 redo A;
2389 }
2390 } elsif ($self->{state} == COMMENT_STATE) {
2391 if ($self->{nc} == 0x002D) { # -
2392
2393 $self->{state} = COMMENT_END_DASH_STATE;
2394
2395 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2396 $self->{line_prev} = $self->{line};
2397 $self->{column_prev} = $self->{column};
2398 $self->{column}++;
2399 $self->{nc}
2400 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2401 } else {
2402 $self->{set_nc}->($self);
2403 }
2404
2405 redo A;
2406 } elsif ($self->{nc} == -1) {
2407
2408 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2409 $self->{state} = DATA_STATE;
2410 $self->{s_kwd} = '';
2411 ## reconsume
2412
2413 return ($self->{ct}); # comment
2414
2415 redo A;
2416 } else {
2417
2418 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2419 $self->{read_until}->($self->{ct}->{data},
2420 q[-],
2421 length $self->{ct}->{data});
2422
2423 ## Stay in the state
2424
2425 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2426 $self->{line_prev} = $self->{line};
2427 $self->{column_prev} = $self->{column};
2428 $self->{column}++;
2429 $self->{nc}
2430 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2431 } else {
2432 $self->{set_nc}->($self);
2433 }
2434
2435 redo A;
2436 }
2437 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2438 if ($self->{nc} == 0x002D) { # -
2439
2440 $self->{state} = COMMENT_END_STATE;
2441
2442 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2443 $self->{line_prev} = $self->{line};
2444 $self->{column_prev} = $self->{column};
2445 $self->{column}++;
2446 $self->{nc}
2447 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2448 } else {
2449 $self->{set_nc}->($self);
2450 }
2451
2452 redo A;
2453 } elsif ($self->{nc} == -1) {
2454
2455 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2456 $self->{s_kwd} = '';
2457 $self->{state} = DATA_STATE;
2458 $self->{s_kwd} = '';
2459 ## reconsume
2460
2461 return ($self->{ct}); # comment
2462
2463 redo A;
2464 } else {
2465
2466 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2467 $self->{state} = COMMENT_STATE;
2468
2469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2470 $self->{line_prev} = $self->{line};
2471 $self->{column_prev} = $self->{column};
2472 $self->{column}++;
2473 $self->{nc}
2474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2475 } else {
2476 $self->{set_nc}->($self);
2477 }
2478
2479 redo A;
2480 }
2481 } elsif ($self->{state} == COMMENT_END_STATE) {
2482 if ($self->{nc} == 0x003E) { # >
2483
2484 $self->{state} = DATA_STATE;
2485 $self->{s_kwd} = '';
2486
2487 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2488 $self->{line_prev} = $self->{line};
2489 $self->{column_prev} = $self->{column};
2490 $self->{column}++;
2491 $self->{nc}
2492 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2493 } else {
2494 $self->{set_nc}->($self);
2495 }
2496
2497
2498 return ($self->{ct}); # comment
2499
2500 redo A;
2501 } elsif ($self->{nc} == 0x002D) { # -
2502
2503 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2504 line => $self->{line_prev},
2505 column => $self->{column_prev});
2506 $self->{ct}->{data} .= '-'; # comment
2507 ## Stay in the state
2508
2509 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2510 $self->{line_prev} = $self->{line};
2511 $self->{column_prev} = $self->{column};
2512 $self->{column}++;
2513 $self->{nc}
2514 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2515 } else {
2516 $self->{set_nc}->($self);
2517 }
2518
2519 redo A;
2520 } elsif ($self->{nc} == -1) {
2521
2522 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2523 $self->{state} = DATA_STATE;
2524 $self->{s_kwd} = '';
2525 ## reconsume
2526
2527 return ($self->{ct}); # comment
2528
2529 redo A;
2530 } else {
2531
2532 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2533 line => $self->{line_prev},
2534 column => $self->{column_prev});
2535 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2536 $self->{state} = COMMENT_STATE;
2537
2538 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2539 $self->{line_prev} = $self->{line};
2540 $self->{column_prev} = $self->{column};
2541 $self->{column}++;
2542 $self->{nc}
2543 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2544 } else {
2545 $self->{set_nc}->($self);
2546 }
2547
2548 redo A;
2549 }
2550 } elsif ($self->{state} == DOCTYPE_STATE) {
2551 if ($is_space->{$self->{nc}}) {
2552
2553 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2554
2555 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2556 $self->{line_prev} = $self->{line};
2557 $self->{column_prev} = $self->{column};
2558 $self->{column}++;
2559 $self->{nc}
2560 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2561 } else {
2562 $self->{set_nc}->($self);
2563 }
2564
2565 redo A;
2566 } else {
2567
2568 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2569 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2570 ## reconsume
2571 redo A;
2572 }
2573 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2574 if ($is_space->{$self->{nc}}) {
2575
2576 ## Stay in the state
2577
2578 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2579 $self->{line_prev} = $self->{line};
2580 $self->{column_prev} = $self->{column};
2581 $self->{column}++;
2582 $self->{nc}
2583 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2584 } else {
2585 $self->{set_nc}->($self);
2586 }
2587
2588 redo A;
2589 } elsif ($self->{nc} == 0x003E) { # >
2590
2591 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2592 $self->{state} = DATA_STATE;
2593 $self->{s_kwd} = '';
2594
2595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2596 $self->{line_prev} = $self->{line};
2597 $self->{column_prev} = $self->{column};
2598 $self->{column}++;
2599 $self->{nc}
2600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2601 } else {
2602 $self->{set_nc}->($self);
2603 }
2604
2605
2606 return ($self->{ct}); # DOCTYPE (quirks)
2607
2608 redo A;
2609 } elsif ($self->{nc} == -1) {
2610
2611 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2612 $self->{state} = DATA_STATE;
2613 $self->{s_kwd} = '';
2614 ## reconsume
2615
2616 return ($self->{ct}); # DOCTYPE (quirks)
2617
2618 redo A;
2619 } else {
2620
2621 $self->{ct}->{name} = chr $self->{nc};
2622 delete $self->{ct}->{quirks};
2623 $self->{state} = DOCTYPE_NAME_STATE;
2624
2625 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2626 $self->{line_prev} = $self->{line};
2627 $self->{column_prev} = $self->{column};
2628 $self->{column}++;
2629 $self->{nc}
2630 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2631 } else {
2632 $self->{set_nc}->($self);
2633 }
2634
2635 redo A;
2636 }
2637 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2638 ## ISSUE: Redundant "First," in the spec.
2639 if ($is_space->{$self->{nc}}) {
2640
2641 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2642
2643 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2644 $self->{line_prev} = $self->{line};
2645 $self->{column_prev} = $self->{column};
2646 $self->{column}++;
2647 $self->{nc}
2648 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2649 } else {
2650 $self->{set_nc}->($self);
2651 }
2652
2653 redo A;
2654 } elsif ($self->{nc} == 0x003E) { # >
2655
2656 $self->{state} = DATA_STATE;
2657 $self->{s_kwd} = '';
2658
2659 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2660 $self->{line_prev} = $self->{line};
2661 $self->{column_prev} = $self->{column};
2662 $self->{column}++;
2663 $self->{nc}
2664 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2665 } else {
2666 $self->{set_nc}->($self);
2667 }
2668
2669
2670 return ($self->{ct}); # DOCTYPE
2671
2672 redo A;
2673 } elsif ($self->{nc} == -1) {
2674
2675 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2676 $self->{state} = DATA_STATE;
2677 $self->{s_kwd} = '';
2678 ## reconsume
2679
2680 $self->{ct}->{quirks} = 1;
2681 return ($self->{ct}); # DOCTYPE
2682
2683 redo A;
2684 } else {
2685
2686 $self->{ct}->{name}
2687 .= chr ($self->{nc}); # DOCTYPE
2688 ## Stay in the state
2689
2690 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2691 $self->{line_prev} = $self->{line};
2692 $self->{column_prev} = $self->{column};
2693 $self->{column}++;
2694 $self->{nc}
2695 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2696 } else {
2697 $self->{set_nc}->($self);
2698 }
2699
2700 redo A;
2701 }
2702 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2703 if ($is_space->{$self->{nc}}) {
2704
2705 ## Stay in the state
2706
2707 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2708 $self->{line_prev} = $self->{line};
2709 $self->{column_prev} = $self->{column};
2710 $self->{column}++;
2711 $self->{nc}
2712 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2713 } else {
2714 $self->{set_nc}->($self);
2715 }
2716
2717 redo A;
2718 } elsif ($self->{nc} == 0x003E) { # >
2719
2720 $self->{state} = DATA_STATE;
2721 $self->{s_kwd} = '';
2722
2723 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2724 $self->{line_prev} = $self->{line};
2725 $self->{column_prev} = $self->{column};
2726 $self->{column}++;
2727 $self->{nc}
2728 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2729 } else {
2730 $self->{set_nc}->($self);
2731 }
2732
2733
2734 return ($self->{ct}); # DOCTYPE
2735
2736 redo A;
2737 } elsif ($self->{nc} == -1) {
2738
2739 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2740 $self->{state} = DATA_STATE;
2741 $self->{s_kwd} = '';
2742 ## reconsume
2743
2744 $self->{ct}->{quirks} = 1;
2745 return ($self->{ct}); # DOCTYPE
2746
2747 redo A;
2748 } elsif ($self->{nc} == 0x0050 or # P
2749 $self->{nc} == 0x0070) { # p
2750 $self->{state} = PUBLIC_STATE;
2751 $self->{s_kwd} = chr $self->{nc};
2752
2753 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2754 $self->{line_prev} = $self->{line};
2755 $self->{column_prev} = $self->{column};
2756 $self->{column}++;
2757 $self->{nc}
2758 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2759 } else {
2760 $self->{set_nc}->($self);
2761 }
2762
2763 redo A;
2764 } elsif ($self->{nc} == 0x0053 or # S
2765 $self->{nc} == 0x0073) { # s
2766 $self->{state} = SYSTEM_STATE;
2767 $self->{s_kwd} = chr $self->{nc};
2768
2769 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2770 $self->{line_prev} = $self->{line};
2771 $self->{column_prev} = $self->{column};
2772 $self->{column}++;
2773 $self->{nc}
2774 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2775 } else {
2776 $self->{set_nc}->($self);
2777 }
2778
2779 redo A;
2780 } else {
2781
2782 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2783 $self->{ct}->{quirks} = 1;
2784
2785 $self->{state} = BOGUS_DOCTYPE_STATE;
2786
2787 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788 $self->{line_prev} = $self->{line};
2789 $self->{column_prev} = $self->{column};
2790 $self->{column}++;
2791 $self->{nc}
2792 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793 } else {
2794 $self->{set_nc}->($self);
2795 }
2796
2797 redo A;
2798 }
2799 } elsif ($self->{state} == PUBLIC_STATE) {
2800 ## ASCII case-insensitive
2801 if ($self->{nc} == [
2802 undef,
2803 0x0055, # U
2804 0x0042, # B
2805 0x004C, # L
2806 0x0049, # I
2807 ]->[length $self->{s_kwd}] or
2808 $self->{nc} == [
2809 undef,
2810 0x0075, # u
2811 0x0062, # b
2812 0x006C, # l
2813 0x0069, # i
2814 ]->[length $self->{s_kwd}]) {
2815
2816 ## Stay in the state.
2817 $self->{s_kwd} .= chr $self->{nc};
2818
2819 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2820 $self->{line_prev} = $self->{line};
2821 $self->{column_prev} = $self->{column};
2822 $self->{column}++;
2823 $self->{nc}
2824 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2825 } else {
2826 $self->{set_nc}->($self);
2827 }
2828
2829 redo A;
2830 } elsif ((length $self->{s_kwd}) == 5 and
2831 ($self->{nc} == 0x0043 or # C
2832 $self->{nc} == 0x0063)) { # c
2833
2834 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2835
2836 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2837 $self->{line_prev} = $self->{line};
2838 $self->{column_prev} = $self->{column};
2839 $self->{column}++;
2840 $self->{nc}
2841 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2842 } else {
2843 $self->{set_nc}->($self);
2844 }
2845
2846 redo A;
2847 } else {
2848
2849 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2850 line => $self->{line_prev},
2851 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2852 $self->{ct}->{quirks} = 1;
2853
2854 $self->{state} = BOGUS_DOCTYPE_STATE;
2855 ## Reconsume.
2856 redo A;
2857 }
2858 } elsif ($self->{state} == SYSTEM_STATE) {
2859 ## ASCII case-insensitive
2860 if ($self->{nc} == [
2861 undef,
2862 0x0059, # Y
2863 0x0053, # S
2864 0x0054, # T
2865 0x0045, # E
2866 ]->[length $self->{s_kwd}] or
2867 $self->{nc} == [
2868 undef,
2869 0x0079, # y
2870 0x0073, # s
2871 0x0074, # t
2872 0x0065, # e
2873 ]->[length $self->{s_kwd}]) {
2874
2875 ## Stay in the state.
2876 $self->{s_kwd} .= chr $self->{nc};
2877
2878 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2879 $self->{line_prev} = $self->{line};
2880 $self->{column_prev} = $self->{column};
2881 $self->{column}++;
2882 $self->{nc}
2883 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2884 } else {
2885 $self->{set_nc}->($self);
2886 }
2887
2888 redo A;
2889 } elsif ((length $self->{s_kwd}) == 5 and
2890 ($self->{nc} == 0x004D or # M
2891 $self->{nc} == 0x006D)) { # m
2892
2893 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2894
2895 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2896 $self->{line_prev} = $self->{line};
2897 $self->{column_prev} = $self->{column};
2898 $self->{column}++;
2899 $self->{nc}
2900 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2901 } else {
2902 $self->{set_nc}->($self);
2903 }
2904
2905 redo A;
2906 } else {
2907
2908 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2909 line => $self->{line_prev},
2910 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2911 $self->{ct}->{quirks} = 1;
2912
2913 $self->{state} = BOGUS_DOCTYPE_STATE;
2914 ## Reconsume.
2915 redo A;
2916 }
2917 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2918 if ($is_space->{$self->{nc}}) {
2919
2920 ## Stay in the state
2921
2922 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2923 $self->{line_prev} = $self->{line};
2924 $self->{column_prev} = $self->{column};
2925 $self->{column}++;
2926 $self->{nc}
2927 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2928 } else {
2929 $self->{set_nc}->($self);
2930 }
2931
2932 redo A;
2933 } elsif ($self->{nc} eq 0x0022) { # "
2934
2935 $self->{ct}->{pubid} = ''; # DOCTYPE
2936 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2937
2938 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2939 $self->{line_prev} = $self->{line};
2940 $self->{column_prev} = $self->{column};
2941 $self->{column}++;
2942 $self->{nc}
2943 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2944 } else {
2945 $self->{set_nc}->($self);
2946 }
2947
2948 redo A;
2949 } elsif ($self->{nc} eq 0x0027) { # '
2950
2951 $self->{ct}->{pubid} = ''; # DOCTYPE
2952 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2953
2954 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2955 $self->{line_prev} = $self->{line};
2956 $self->{column_prev} = $self->{column};
2957 $self->{column}++;
2958 $self->{nc}
2959 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2960 } else {
2961 $self->{set_nc}->($self);
2962 }
2963
2964 redo A;
2965 } elsif ($self->{nc} eq 0x003E) { # >
2966
2967 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2968
2969 $self->{state} = DATA_STATE;
2970 $self->{s_kwd} = '';
2971
2972 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2973 $self->{line_prev} = $self->{line};
2974 $self->{column_prev} = $self->{column};
2975 $self->{column}++;
2976 $self->{nc}
2977 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2978 } else {
2979 $self->{set_nc}->($self);
2980 }
2981
2982
2983 $self->{ct}->{quirks} = 1;
2984 return ($self->{ct}); # DOCTYPE
2985
2986 redo A;
2987 } elsif ($self->{nc} == -1) {
2988
2989 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2990
2991 $self->{state} = DATA_STATE;
2992 $self->{s_kwd} = '';
2993 ## reconsume
2994
2995 $self->{ct}->{quirks} = 1;
2996 return ($self->{ct}); # DOCTYPE
2997
2998 redo A;
2999 } else {
3000
3001 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3002 $self->{ct}->{quirks} = 1;
3003
3004 $self->{state} = BOGUS_DOCTYPE_STATE;
3005
3006 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3007 $self->{line_prev} = $self->{line};
3008 $self->{column_prev} = $self->{column};
3009 $self->{column}++;
3010 $self->{nc}
3011 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3012 } else {
3013 $self->{set_nc}->($self);
3014 }
3015
3016 redo A;
3017 }
3018 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3019 if ($self->{nc} == 0x0022) { # "
3020
3021 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3022
3023 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3024 $self->{line_prev} = $self->{line};
3025 $self->{column_prev} = $self->{column};
3026 $self->{column}++;
3027 $self->{nc}
3028 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3029 } else {
3030 $self->{set_nc}->($self);
3031 }
3032
3033 redo A;
3034 } elsif ($self->{nc} == 0x003E) { # >
3035
3036 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3037
3038 $self->{state} = DATA_STATE;
3039 $self->{s_kwd} = '';
3040
3041 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3042 $self->{line_prev} = $self->{line};
3043 $self->{column_prev} = $self->{column};
3044 $self->{column}++;
3045 $self->{nc}
3046 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3047 } else {
3048 $self->{set_nc}->($self);
3049 }
3050
3051
3052 $self->{ct}->{quirks} = 1;
3053 return ($self->{ct}); # DOCTYPE
3054
3055 redo A;
3056 } elsif ($self->{nc} == -1) {
3057
3058 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3059
3060 $self->{state} = DATA_STATE;
3061 $self->{s_kwd} = '';
3062 ## reconsume
3063
3064 $self->{ct}->{quirks} = 1;
3065 return ($self->{ct}); # DOCTYPE
3066
3067 redo A;
3068 } else {
3069
3070 $self->{ct}->{pubid} # DOCTYPE
3071 .= chr $self->{nc};
3072 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3073 length $self->{ct}->{pubid});
3074
3075 ## Stay in the state
3076
3077 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3078 $self->{line_prev} = $self->{line};
3079 $self->{column_prev} = $self->{column};
3080 $self->{column}++;
3081 $self->{nc}
3082 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3083 } else {
3084 $self->{set_nc}->($self);
3085 }
3086
3087 redo A;
3088 }
3089 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3090 if ($self->{nc} == 0x0027) { # '
3091
3092 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3093
3094 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3095 $self->{line_prev} = $self->{line};
3096 $self->{column_prev} = $self->{column};
3097 $self->{column}++;
3098 $self->{nc}
3099 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3100 } else {
3101 $self->{set_nc}->($self);
3102 }
3103
3104 redo A;
3105 } elsif ($self->{nc} == 0x003E) { # >
3106
3107 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3108
3109 $self->{state} = DATA_STATE;
3110 $self->{s_kwd} = '';
3111
3112 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3113 $self->{line_prev} = $self->{line};
3114 $self->{column_prev} = $self->{column};
3115 $self->{column}++;
3116 $self->{nc}
3117 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3118 } else {
3119 $self->{set_nc}->($self);
3120 }
3121
3122
3123 $self->{ct}->{quirks} = 1;
3124 return ($self->{ct}); # DOCTYPE
3125
3126 redo A;
3127 } elsif ($self->{nc} == -1) {
3128
3129 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3130
3131 $self->{state} = DATA_STATE;
3132 $self->{s_kwd} = '';
3133 ## reconsume
3134
3135 $self->{ct}->{quirks} = 1;
3136 return ($self->{ct}); # DOCTYPE
3137
3138 redo A;
3139 } else {
3140
3141 $self->{ct}->{pubid} # DOCTYPE
3142 .= chr $self->{nc};
3143 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3144 length $self->{ct}->{pubid});
3145
3146 ## Stay in the state
3147
3148 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3149 $self->{line_prev} = $self->{line};
3150 $self->{column_prev} = $self->{column};
3151 $self->{column}++;
3152 $self->{nc}
3153 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3154 } else {
3155 $self->{set_nc}->($self);
3156 }
3157
3158 redo A;
3159 }
3160 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3161 if ($is_space->{$self->{nc}}) {
3162
3163 ## Stay in the state
3164
3165 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3166 $self->{line_prev} = $self->{line};
3167 $self->{column_prev} = $self->{column};
3168 $self->{column}++;
3169 $self->{nc}
3170 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3171 } else {
3172 $self->{set_nc}->($self);
3173 }
3174
3175 redo A;
3176 } elsif ($self->{nc} == 0x0022) { # "
3177
3178 $self->{ct}->{sysid} = ''; # DOCTYPE
3179 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3180
3181 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3182 $self->{line_prev} = $self->{line};
3183 $self->{column_prev} = $self->{column};
3184 $self->{column}++;
3185 $self->{nc}
3186 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3187 } else {
3188 $self->{set_nc}->($self);
3189 }
3190
3191 redo A;
3192 } elsif ($self->{nc} == 0x0027) { # '
3193
3194 $self->{ct}->{sysid} = ''; # DOCTYPE
3195 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3196
3197 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3198 $self->{line_prev} = $self->{line};
3199 $self->{column_prev} = $self->{column};
3200 $self->{column}++;
3201 $self->{nc}
3202 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3203 } else {
3204 $self->{set_nc}->($self);
3205 }
3206
3207 redo A;
3208 } elsif ($self->{nc} == 0x003E) { # >
3209
3210 $self->{state} = DATA_STATE;
3211 $self->{s_kwd} = '';
3212
3213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3214 $self->{line_prev} = $self->{line};
3215 $self->{column_prev} = $self->{column};
3216 $self->{column}++;
3217 $self->{nc}
3218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3219 } else {
3220 $self->{set_nc}->($self);
3221 }
3222
3223
3224 return ($self->{ct}); # DOCTYPE
3225
3226 redo A;
3227 } elsif ($self->{nc} == -1) {
3228
3229 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3230
3231 $self->{state} = DATA_STATE;
3232 $self->{s_kwd} = '';
3233 ## reconsume
3234
3235 $self->{ct}->{quirks} = 1;
3236 return ($self->{ct}); # DOCTYPE
3237
3238 redo A;
3239 } else {
3240
3241 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3242 $self->{ct}->{quirks} = 1;
3243
3244 $self->{state} = BOGUS_DOCTYPE_STATE;
3245
3246 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3247 $self->{line_prev} = $self->{line};
3248 $self->{column_prev} = $self->{column};
3249 $self->{column}++;
3250 $self->{nc}
3251 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3252 } else {
3253 $self->{set_nc}->($self);
3254 }
3255
3256 redo A;
3257 }
3258 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3259 if ($is_space->{$self->{nc}}) {
3260
3261 ## Stay in the state
3262
3263 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3264 $self->{line_prev} = $self->{line};
3265 $self->{column_prev} = $self->{column};
3266 $self->{column}++;
3267 $self->{nc}
3268 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3269 } else {
3270 $self->{set_nc}->($self);
3271 }
3272
3273 redo A;
3274 } elsif ($self->{nc} == 0x0022) { # "
3275
3276 $self->{ct}->{sysid} = ''; # DOCTYPE
3277 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3278
3279 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3280 $self->{line_prev} = $self->{line};
3281 $self->{column_prev} = $self->{column};
3282 $self->{column}++;
3283 $self->{nc}
3284 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3285 } else {
3286 $self->{set_nc}->($self);
3287 }
3288
3289 redo A;
3290 } elsif ($self->{nc} == 0x0027) { # '
3291
3292 $self->{ct}->{sysid} = ''; # DOCTYPE
3293 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3294
3295 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3296 $self->{line_prev} = $self->{line};
3297 $self->{column_prev} = $self->{column};
3298 $self->{column}++;
3299 $self->{nc}
3300 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3301 } else {
3302 $self->{set_nc}->($self);
3303 }
3304
3305 redo A;
3306 } elsif ($self->{nc} == 0x003E) { # >
3307
3308 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3309 $self->{state} = DATA_STATE;
3310 $self->{s_kwd} = '';
3311
3312 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3313 $self->{line_prev} = $self->{line};
3314 $self->{column_prev} = $self->{column};
3315 $self->{column}++;
3316 $self->{nc}
3317 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3318 } else {
3319 $self->{set_nc}->($self);
3320 }
3321
3322
3323 $self->{ct}->{quirks} = 1;
3324 return ($self->{ct}); # DOCTYPE
3325
3326 redo A;
3327 } elsif ($self->{nc} == -1) {
3328
3329 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3330
3331 $self->{state} = DATA_STATE;
3332 $self->{s_kwd} = '';
3333 ## reconsume
3334
3335 $self->{ct}->{quirks} = 1;
3336 return ($self->{ct}); # DOCTYPE
3337
3338 redo A;
3339 } else {
3340
3341 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3342 $self->{ct}->{quirks} = 1;
3343
3344 $self->{state} = BOGUS_DOCTYPE_STATE;
3345
3346 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3347 $self->{line_prev} = $self->{line};
3348 $self->{column_prev} = $self->{column};
3349 $self->{column}++;
3350 $self->{nc}
3351 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3352 } else {
3353 $self->{set_nc}->($self);
3354 }
3355
3356 redo A;
3357 }
3358 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3359 if ($self->{nc} == 0x0022) { # "
3360
3361 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3362
3363 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3364 $self->{line_prev} = $self->{line};
3365 $self->{column_prev} = $self->{column};
3366 $self->{column}++;
3367 $self->{nc}
3368 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3369 } else {
3370 $self->{set_nc}->($self);
3371 }
3372
3373 redo A;
3374 } elsif ($self->{nc} == 0x003E) { # >
3375
3376 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3377
3378 $self->{state} = DATA_STATE;
3379 $self->{s_kwd} = '';
3380
3381 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3382 $self->{line_prev} = $self->{line};
3383 $self->{column_prev} = $self->{column};
3384 $self->{column}++;
3385 $self->{nc}
3386 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3387 } else {
3388 $self->{set_nc}->($self);
3389 }
3390
3391
3392 $self->{ct}->{quirks} = 1;
3393 return ($self->{ct}); # DOCTYPE
3394
3395 redo A;
3396 } elsif ($self->{nc} == -1) {
3397
3398 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3399
3400 $self->{state} = DATA_STATE;
3401 $self->{s_kwd} = '';
3402 ## reconsume
3403
3404 $self->{ct}->{quirks} = 1;
3405 return ($self->{ct}); # DOCTYPE
3406
3407 redo A;
3408 } else {
3409
3410 $self->{ct}->{sysid} # DOCTYPE
3411 .= chr $self->{nc};
3412 $self->{read_until}->($self->{ct}->{sysid}, q[">],
3413 length $self->{ct}->{sysid});
3414
3415 ## Stay in the state
3416
3417 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3418 $self->{line_prev} = $self->{line};
3419 $self->{column_prev} = $self->{column};
3420 $self->{column}++;
3421 $self->{nc}
3422 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3423 } else {
3424 $self->{set_nc}->($self);
3425 }
3426
3427 redo A;
3428 }
3429 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3430 if ($self->{nc} == 0x0027) { # '
3431
3432 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3433
3434 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3435 $self->{line_prev} = $self->{line};
3436 $self->{column_prev} = $self->{column};
3437 $self->{column}++;
3438 $self->{nc}
3439 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3440 } else {
3441 $self->{set_nc}->($self);
3442 }
3443
3444 redo A;
3445 } elsif ($self->{nc} == 0x003E) { # >
3446
3447 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3448
3449 $self->{state} = DATA_STATE;
3450 $self->{s_kwd} = '';
3451
3452 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3453 $self->{line_prev} = $self->{line};
3454 $self->{column_prev} = $self->{column};
3455 $self->{column}++;
3456 $self->{nc}
3457 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3458 } else {
3459 $self->{set_nc}->($self);
3460 }
3461
3462
3463 $self->{ct}->{quirks} = 1;
3464 return ($self->{ct}); # DOCTYPE
3465
3466 redo A;
3467 } elsif ($self->{nc} == -1) {
3468
3469 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3470
3471 $self->{state} = DATA_STATE;
3472 $self->{s_kwd} = '';
3473 ## reconsume
3474
3475 $self->{ct}->{quirks} = 1;
3476 return ($self->{ct}); # DOCTYPE
3477
3478 redo A;
3479 } else {
3480
3481 $self->{ct}->{sysid} # DOCTYPE
3482 .= chr $self->{nc};
3483 $self->{read_until}->($self->{ct}->{sysid}, q['>],
3484 length $self->{ct}->{sysid});
3485
3486 ## Stay in the state
3487
3488 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3489 $self->{line_prev} = $self->{line};
3490 $self->{column_prev} = $self->{column};
3491 $self->{column}++;
3492 $self->{nc}
3493 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3494 } else {
3495 $self->{set_nc}->($self);
3496 }
3497
3498 redo A;
3499 }
3500 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3501 if ($is_space->{$self->{nc}}) {
3502
3503 ## Stay in the state
3504
3505 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3506 $self->{line_prev} = $self->{line};
3507 $self->{column_prev} = $self->{column};
3508 $self->{column}++;
3509 $self->{nc}
3510 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3511 } else {
3512 $self->{set_nc}->($self);
3513 }
3514
3515 redo A;
3516 } elsif ($self->{nc} == 0x003E) { # >
3517
3518 $self->{state} = DATA_STATE;
3519 $self->{s_kwd} = '';
3520
3521 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3522 $self->{line_prev} = $self->{line};
3523 $self->{column_prev} = $self->{column};
3524 $self->{column}++;
3525 $self->{nc}
3526 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3527 } else {
3528 $self->{set_nc}->($self);
3529 }
3530
3531
3532 return ($self->{ct}); # DOCTYPE
3533
3534 redo A;
3535 } elsif ($self->{nc} == -1) {
3536
3537 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3538 $self->{state} = DATA_STATE;
3539 $self->{s_kwd} = '';
3540 ## reconsume
3541
3542 $self->{ct}->{quirks} = 1;
3543 return ($self->{ct}); # DOCTYPE
3544
3545 redo A;
3546 } else {
3547
3548 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3549 #$self->{ct}->{quirks} = 1;
3550
3551 $self->{state} = BOGUS_DOCTYPE_STATE;
3552
3553 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3554 $self->{line_prev} = $self->{line};
3555 $self->{column_prev} = $self->{column};
3556 $self->{column}++;
3557 $self->{nc}
3558 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3559 } else {
3560 $self->{set_nc}->($self);
3561 }
3562
3563 redo A;
3564 }
3565 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3566 if ($self->{nc} == 0x003E) { # >
3567
3568 $self->{state} = DATA_STATE;
3569 $self->{s_kwd} = '';
3570
3571 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3572 $self->{line_prev} = $self->{line};
3573 $self->{column_prev} = $self->{column};
3574 $self->{column}++;
3575 $self->{nc}
3576 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3577 } else {
3578 $self->{set_nc}->($self);
3579 }
3580
3581
3582 return ($self->{ct}); # DOCTYPE
3583
3584 redo A;
3585 } elsif ($self->{nc} == -1) {
3586
3587 $self->{state} = DATA_STATE;
3588 $self->{s_kwd} = '';
3589 ## reconsume
3590
3591 return ($self->{ct}); # DOCTYPE
3592
3593 redo A;
3594 } else {
3595
3596 my $s = '';
3597 $self->{read_until}->($s, q[>], 0);
3598
3599 ## Stay in the state
3600
3601 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3602 $self->{line_prev} = $self->{line};
3603 $self->{column_prev} = $self->{column};
3604 $self->{column}++;
3605 $self->{nc}
3606 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3607 } else {
3608 $self->{set_nc}->($self);
3609 }
3610
3611 redo A;
3612 }
3613 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3614 ## NOTE: "CDATA section state" in the state is jointly implemented
3615 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3616 ## and |CDATA_SECTION_MSE2_STATE|.
3617
3618 if ($self->{nc} == 0x005D) { # ]
3619
3620 $self->{state} = CDATA_SECTION_MSE1_STATE;
3621
3622 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3623 $self->{line_prev} = $self->{line};
3624 $self->{column_prev} = $self->{column};
3625 $self->{column}++;
3626 $self->{nc}
3627 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3628 } else {
3629 $self->{set_nc}->($self);
3630 }
3631
3632 redo A;
3633 } elsif ($self->{nc} == -1) {
3634 $self->{state} = DATA_STATE;
3635 $self->{s_kwd} = '';
3636
3637 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3638 $self->{line_prev} = $self->{line};
3639 $self->{column_prev} = $self->{column};
3640 $self->{column}++;
3641 $self->{nc}
3642 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3643 } else {
3644 $self->{set_nc}->($self);
3645 }
3646
3647 if (length $self->{ct}->{data}) { # character
3648
3649 return ($self->{ct}); # character
3650 } else {
3651
3652 ## No token to emit. $self->{ct} is discarded.
3653 }
3654 redo A;
3655 } else {
3656
3657 $self->{ct}->{data} .= chr $self->{nc};
3658 $self->{read_until}->($self->{ct}->{data},
3659 q<]>,
3660 length $self->{ct}->{data});
3661
3662 ## Stay in the state.
3663
3664 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3665 $self->{line_prev} = $self->{line};
3666 $self->{column_prev} = $self->{column};
3667 $self->{column}++;
3668 $self->{nc}
3669 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3670 } else {
3671 $self->{set_nc}->($self);
3672 }
3673
3674 redo A;
3675 }
3676
3677 ## ISSUE: "text tokens" in spec.
3678 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3679 if ($self->{nc} == 0x005D) { # ]
3680
3681 $self->{state} = CDATA_SECTION_MSE2_STATE;
3682
3683 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3684 $self->{line_prev} = $self->{line};
3685 $self->{column_prev} = $self->{column};
3686 $self->{column}++;
3687 $self->{nc}
3688 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3689 } else {
3690 $self->{set_nc}->($self);
3691 }
3692
3693 redo A;
3694 } else {
3695
3696 $self->{ct}->{data} .= ']';
3697 $self->{state} = CDATA_SECTION_STATE;
3698 ## Reconsume.
3699 redo A;
3700 }
3701 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3702 if ($self->{nc} == 0x003E) { # >
3703 $self->{state} = DATA_STATE;
3704 $self->{s_kwd} = '';
3705
3706 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3707 $self->{line_prev} = $self->{line};
3708 $self->{column_prev} = $self->{column};
3709 $self->{column}++;
3710 $self->{nc}
3711 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3712 } else {
3713 $self->{set_nc}->($self);
3714 }
3715
3716 if (length $self->{ct}->{data}) { # character
3717
3718 return ($self->{ct}); # character
3719 } else {
3720
3721 ## No token to emit. $self->{ct} is discarded.
3722 }
3723 redo A;
3724 } elsif ($self->{nc} == 0x005D) { # ]
3725 # character
3726 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3727 ## Stay in the state.
3728
3729 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3730 $self->{line_prev} = $self->{line};
3731 $self->{column_prev} = $self->{column};
3732 $self->{column}++;
3733 $self->{nc}
3734 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3735 } else {
3736 $self->{set_nc}->($self);
3737 }
3738
3739 redo A;
3740 } else {
3741
3742 $self->{ct}->{data} .= ']]'; # character
3743 $self->{state} = CDATA_SECTION_STATE;
3744 ## Reconsume.
3745 redo A;
3746 }
3747 } elsif ($self->{state} == ENTITY_STATE) {
3748 if ($is_space->{$self->{nc}} or
3749 {
3750 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3751 $self->{entity_add} => 1,
3752 }->{$self->{nc}}) {
3753
3754 ## Don't consume
3755 ## No error
3756 ## Return nothing.
3757 #
3758 } elsif ($self->{nc} == 0x0023) { # #
3759
3760 $self->{state} = ENTITY_HASH_STATE;
3761 $self->{s_kwd} = '#';
3762
3763 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3764 $self->{line_prev} = $self->{line};
3765 $self->{column_prev} = $self->{column};
3766 $self->{column}++;
3767 $self->{nc}
3768 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3769 } else {
3770 $self->{set_nc}->($self);
3771 }
3772
3773 redo A;
3774 } elsif ((0x0041 <= $self->{nc} and
3775 $self->{nc} <= 0x005A) or # A..Z
3776 (0x0061 <= $self->{nc} and
3777 $self->{nc} <= 0x007A)) { # a..z
3778
3779 require Whatpm::_NamedEntityList;
3780 $self->{state} = ENTITY_NAME_STATE;
3781 $self->{s_kwd} = chr $self->{nc};
3782 $self->{entity__value} = $self->{s_kwd};
3783 $self->{entity__match} = 0;
3784
3785 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3786 $self->{line_prev} = $self->{line};
3787 $self->{column_prev} = $self->{column};
3788 $self->{column}++;
3789 $self->{nc}
3790 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3791 } else {
3792 $self->{set_nc}->($self);
3793 }
3794
3795 redo A;
3796 } else {
3797
3798 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3799 ## Return nothing.
3800 #
3801 }
3802
3803 ## NOTE: No character is consumed by the "consume a character
3804 ## reference" algorithm. In other word, there is an "&" character
3805 ## that does not introduce a character reference, which would be
3806 ## appended to the parent element or the attribute value in later
3807 ## process of the tokenizer.
3808
3809 if ($self->{prev_state} == DATA_STATE) {
3810
3811 $self->{state} = $self->{prev_state};
3812 $self->{s_kwd} = '';
3813 ## Reconsume.
3814 return ({type => CHARACTER_TOKEN, data => '&',
3815 line => $self->{line_prev},
3816 column => $self->{column_prev},
3817 });
3818 redo A;
3819 } else {
3820
3821 $self->{ca}->{value} .= '&';
3822 $self->{state} = $self->{prev_state};
3823 $self->{s_kwd} = '';
3824 ## Reconsume.
3825 redo A;
3826 }
3827 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3828 if ($self->{nc} == 0x0078 or # x
3829 $self->{nc} == 0x0058) { # X
3830
3831 $self->{state} = HEXREF_X_STATE;
3832 $self->{s_kwd} .= chr $self->{nc};
3833
3834 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3835 $self->{line_prev} = $self->{line};
3836 $self->{column_prev} = $self->{column};
3837 $self->{column}++;
3838 $self->{nc}
3839 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3840 } else {
3841 $self->{set_nc}->($self);
3842 }
3843
3844 redo A;
3845 } elsif (0x0030 <= $self->{nc} and
3846 $self->{nc} <= 0x0039) { # 0..9
3847
3848 $self->{state} = NCR_NUM_STATE;
3849 $self->{s_kwd} = $self->{nc} - 0x0030;
3850
3851 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3852 $self->{line_prev} = $self->{line};
3853 $self->{column_prev} = $self->{column};
3854 $self->{column}++;
3855 $self->{nc}
3856 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3857 } else {
3858 $self->{set_nc}->($self);
3859 }
3860
3861 redo A;
3862 } else {
3863 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3864 line => $self->{line_prev},
3865 column => $self->{column_prev} - 1);
3866
3867 ## NOTE: According to the spec algorithm, nothing is returned,
3868 ## and then "&#" is appended to the parent element or the attribute
3869 ## value in the later processing.
3870
3871 if ($self->{prev_state} == DATA_STATE) {
3872
3873 $self->{state} = $self->{prev_state};
3874 $self->{s_kwd} = '';
3875 ## Reconsume.
3876 return ({type => CHARACTER_TOKEN,
3877 data => '&#',
3878 line => $self->{line_prev},
3879 column => $self->{column_prev} - 1,
3880 });
3881 redo A;
3882 } else {
3883
3884 $self->{ca}->{value} .= '&#';
3885 $self->{state} = $self->{prev_state};
3886 $self->{s_kwd} = '';
3887 ## Reconsume.
3888 redo A;
3889 }
3890 }
3891 } elsif ($self->{state} == NCR_NUM_STATE) {
3892 if (0x0030 <= $self->{nc} and
3893 $self->{nc} <= 0x0039) { # 0..9
3894
3895 $self->{s_kwd} *= 10;
3896 $self->{s_kwd} += $self->{nc} - 0x0030;
3897
3898 ## Stay in the state.
3899
3900 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3901 $self->{line_prev} = $self->{line};
3902 $self->{column_prev} = $self->{column};
3903 $self->{column}++;
3904 $self->{nc}
3905 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3906 } else {
3907 $self->{set_nc}->($self);
3908 }
3909
3910 redo A;
3911 } elsif ($self->{nc} == 0x003B) { # ;
3912
3913
3914 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3915 $self->{line_prev} = $self->{line};
3916 $self->{column_prev} = $self->{column};
3917 $self->{column}++;
3918 $self->{nc}
3919 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3920 } else {
3921 $self->{set_nc}->($self);
3922 }
3923
3924 #
3925 } else {
3926
3927 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3928 ## Reconsume.
3929 #
3930 }
3931
3932 my $code = $self->{s_kwd};
3933 my $l = $self->{line_prev};
3934 my $c = $self->{column_prev};
3935 if ($charref_map->{$code}) {
3936
3937 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3938 text => (sprintf 'U+%04X', $code),
3939 line => $l, column => $c);
3940 $code = $charref_map->{$code};
3941 } elsif ($code > 0x10FFFF) {
3942
3943 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3944 text => (sprintf 'U-%08X', $code),
3945 line => $l, column => $c);
3946 $code = 0xFFFD;
3947 }
3948
3949 if ($self->{prev_state} == DATA_STATE) {
3950
3951 $self->{state} = $self->{prev_state};
3952 $self->{s_kwd} = '';
3953 ## Reconsume.
3954 return ({type => CHARACTER_TOKEN, data => chr $code,
3955 line => $l, column => $c,
3956 });
3957 redo A;
3958 } else {
3959
3960 $self->{ca}->{value} .= chr $code;
3961 $self->{ca}->{has_reference} = 1;
3962 $self->{state} = $self->{prev_state};
3963 $self->{s_kwd} = '';
3964 ## Reconsume.
3965 redo A;
3966 }
3967 } elsif ($self->{state} == HEXREF_X_STATE) {
3968 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3969 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3970 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3971 # 0..9, A..F, a..f
3972
3973 $self->{state} = HEXREF_HEX_STATE;
3974 $self->{s_kwd} = 0;
3975 ## Reconsume.
3976 redo A;
3977 } else {
3978 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3979 line => $self->{line_prev},
3980 column => $self->{column_prev} - 2);
3981
3982 ## NOTE: According to the spec algorithm, nothing is returned,
3983 ## and then "&#" followed by "X" or "x" is appended to the parent
3984 ## element or the attribute value in the later processing.
3985
3986 if ($self->{prev_state} == DATA_STATE) {
3987
3988 $self->{state} = $self->{prev_state};
3989 $self->{s_kwd} = '';
3990 ## Reconsume.
3991 return ({type => CHARACTER_TOKEN,
3992 data => '&' . $self->{s_kwd},
3993 line => $self->{line_prev},
3994 column => $self->{column_prev} - length $self->{s_kwd},
3995 });
3996 redo A;
3997 } else {
3998
3999 $self->{ca}->{value} .= '&' . $self->{s_kwd};
4000 $self->{state} = $self->{prev_state};
4001 $self->{s_kwd} = '';
4002 ## Reconsume.
4003 redo A;
4004 }
4005 }
4006 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4007 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4008 # 0..9
4009
4010 $self->{s_kwd} *= 0x10;
4011 $self->{s_kwd} += $self->{nc} - 0x0030;
4012 ## Stay in the state.
4013
4014 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4015 $self->{line_prev} = $self->{line};
4016 $self->{column_prev} = $self->{column};
4017 $self->{column}++;
4018 $self->{nc}
4019 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4020 } else {
4021 $self->{set_nc}->($self);
4022 }
4023
4024 redo A;
4025 } elsif (0x0061 <= $self->{nc} and
4026 $self->{nc} <= 0x0066) { # a..f
4027
4028 $self->{s_kwd} *= 0x10;
4029 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4030 ## Stay in the state.
4031
4032 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4033 $self->{line_prev} = $self->{line};
4034 $self->{column_prev} = $self->{column};
4035 $self->{column}++;
4036 $self->{nc}
4037 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4038 } else {
4039 $self->{set_nc}->($self);
4040 }
4041
4042 redo A;
4043 } elsif (0x0041 <= $self->{nc} and
4044 $self->{nc} <= 0x0046) { # A..F
4045
4046 $self->{s_kwd} *= 0x10;
4047 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4048 ## Stay in the state.
4049
4050 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4051 $self->{line_prev} = $self->{line};
4052 $self->{column_prev} = $self->{column};
4053 $self->{column}++;
4054 $self->{nc}
4055 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4056 } else {
4057 $self->{set_nc}->($self);
4058 }
4059
4060 redo A;
4061 } elsif ($self->{nc} == 0x003B) { # ;
4062
4063
4064 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4065 $self->{line_prev} = $self->{line};
4066 $self->{column_prev} = $self->{column};
4067 $self->{column}++;
4068 $self->{nc}
4069 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4070 } else {
4071 $self->{set_nc}->($self);
4072 }
4073
4074 #
4075 } else {
4076
4077 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4078 line => $self->{line},
4079 column => $self->{column});
4080 ## Reconsume.
4081 #
4082 }
4083
4084 my $code = $self->{s_kwd};
4085 my $l = $self->{line_prev};
4086 my $c = $self->{column_prev};
4087 if ($charref_map->{$code}) {
4088
4089 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4090 text => (sprintf 'U+%04X', $code),
4091 line => $l, column => $c);
4092 $code = $charref_map->{$code};
4093 } elsif ($code > 0x10FFFF) {
4094
4095 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4096 text => (sprintf 'U-%08X', $code),
4097 line => $l, column => $c);
4098 $code = 0xFFFD;
4099 }
4100
4101 if ($self->{prev_state} == DATA_STATE) {
4102
4103 $self->{state} = $self->{prev_state};
4104 $self->{s_kwd} = '';
4105 ## Reconsume.
4106 return ({type => CHARACTER_TOKEN, data => chr $code,
4107 line => $l, column => $c,
4108 });
4109 redo A;
4110 } else {
4111
4112 $self->{ca}->{value} .= chr $code;
4113 $self->{ca}->{has_reference} = 1;
4114 $self->{state} = $self->{prev_state};
4115 $self->{s_kwd} = '';
4116 ## Reconsume.
4117 redo A;
4118 }
4119 } elsif ($self->{state} == ENTITY_NAME_STATE) {
4120 if (length $self->{s_kwd} < 30 and
4121 ## NOTE: Some number greater than the maximum length of entity name
4122 ((0x0041 <= $self->{nc} and # a
4123 $self->{nc} <= 0x005A) or # x
4124 (0x0061 <= $self->{nc} and # a
4125 $self->{nc} <= 0x007A) or # z
4126 (0x0030 <= $self->{nc} and # 0
4127 $self->{nc} <= 0x0039) or # 9
4128 $self->{nc} == 0x003B)) { # ;
4129 our $EntityChar;
4130 $self->{s_kwd} .= chr $self->{nc};
4131 if (defined $EntityChar->{$self->{s_kwd}}) {
4132 if ($self->{nc} == 0x003B) { # ;
4133
4134 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4135 $self->{entity__match} = 1;
4136
4137 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4138 $self->{line_prev} = $self->{line};
4139 $self->{column_prev} = $self->{column};
4140 $self->{column}++;
4141 $self->{nc}
4142 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4143 } else {
4144 $self->{set_nc}->($self);
4145 }
4146
4147 #
4148 } else {
4149
4150 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4151 $self->{entity__match} = -1;
4152 ## Stay in the state.
4153
4154 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4155 $self->{line_prev} = $self->{line};
4156 $self->{column_prev} = $self->{column};
4157 $self->{column}++;
4158 $self->{nc}
4159 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4160 } else {
4161 $self->{set_nc}->($self);
4162 }
4163
4164 redo A;
4165 }
4166 } else {
4167
4168 $self->{entity__value} .= chr $self->{nc};
4169 $self->{entity__match} *= 2;
4170 ## Stay in the state.
4171
4172 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4173 $self->{line_prev} = $self->{line};
4174 $self->{column_prev} = $self->{column};
4175 $self->{column}++;
4176 $self->{nc}
4177 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4178 } else {
4179 $self->{set_nc}->($self);
4180 }
4181
4182 redo A;
4183 }
4184 }
4185
4186 my $data;
4187 my $has_ref;
4188 if ($self->{entity__match} > 0) {
4189
4190 $data = $self->{entity__value};
4191 $has_ref = 1;
4192 #
4193 } elsif ($self->{entity__match} < 0) {
4194 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4195 if ($self->{prev_state} != DATA_STATE and # in attribute
4196 $self->{entity__match} < -1) {
4197
4198 $data = '&' . $self->{s_kwd};
4199 #
4200 } else {
4201
4202 $data = $self->{entity__value};
4203 $has_ref = 1;
4204 #
4205 }
4206 } else {
4207
4208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4209 line => $self->{line_prev},
4210 column => $self->{column_prev} - length $self->{s_kwd});
4211 $data = '&' . $self->{s_kwd};
4212 #
4213 }
4214
4215 ## NOTE: In these cases, when a character reference is found,
4216 ## it is consumed and a character token is returned, or, otherwise,
4217 ## nothing is consumed and returned, according to the spec algorithm.
4218 ## In this implementation, anything that has been examined by the
4219 ## tokenizer is appended to the parent element or the attribute value
4220 ## as string, either literal string when no character reference or
4221 ## entity-replaced string otherwise, in this stage, since any characters
4222 ## that would not be consumed are appended in the data state or in an
4223 ## appropriate attribute value state anyway.
4224
4225 if ($self->{prev_state} == DATA_STATE) {
4226
4227 $self->{state} = $self->{prev_state};
4228 $self->{s_kwd} = '';
4229 ## Reconsume.
4230 return ({type => CHARACTER_TOKEN,
4231 data => $data,
4232 line => $self->{line_prev},
4233 column => $self->{column_prev} + 1 - length $self->{s_kwd},
4234 });
4235 redo A;
4236 } else {
4237
4238 $self->{ca}->{value} .= $data;
4239 $self->{ca}->{has_reference} = 1 if $has_ref;
4240 $self->{state} = $self->{prev_state};
4241 $self->{s_kwd} = '';
4242 ## Reconsume.
4243 redo A;
4244 }
4245 } else {
4246 die "$0: $self->{state}: Unknown state";
4247 }
4248 } # A
4249
4250 die "$0: _get_next_token: unexpected case";
4251 } # _get_next_token
4252
4253 1;
4254 ## $Date: 2008/10/14 11:46:57 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24