/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (show annotations) (download)
Tue Oct 14 15:25:50 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.6: +6 -2 lines
++ whatpm/t/ChangeLog	14 Oct 2008 15:23:30 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/charref-1.dat" added.

++ whatpm/t/xml/ChangeLog	14 Oct 2008 15:23:49 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* charref-1.dat: New test data file.

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 15:24:42 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: Mark CHARACTER_TOKEN with character reference
	as such, for the support of XML parse error.

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 15:25:35 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Raise a parse error for white space character
	generated by a character reference outside of the root element.

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.6 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 );
19
20 our %EXPORT_TAGS = (
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 }
33
34 ## Token types
35
36 sub DOCTYPE_TOKEN () { 1 }
37 sub COMMENT_TOKEN () { 2 }
38 sub START_TAG_TOKEN () { 3 }
39 sub END_TAG_TOKEN () { 4 }
40 sub END_OF_FILE_TOKEN () { 5 }
41 sub CHARACTER_TOKEN () { 6 }
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
45 package Whatpm::HTML;
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48
49 ## Content model flags
50
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54
55 sub PLAINTEXT_CONTENT_MODEL () { 0 }
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59
60 ## Tokenizer states
61
62 sub DATA_STATE () { 0 }
63 #sub ENTITY_DATA_STATE () { 1 }
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## Tree constructor state constants (see Whatpm::HTML for the full
118 ## list and descriptions)
119
120 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121 sub FOREIGN_EL () { 0b1_00000000000 }
122
123 ## Character reference mappings
124
125 my $charref_map = {
126 0x0D => 0x000A,
127 0x80 => 0x20AC,
128 0x81 => 0xFFFD,
129 0x82 => 0x201A,
130 0x83 => 0x0192,
131 0x84 => 0x201E,
132 0x85 => 0x2026,
133 0x86 => 0x2020,
134 0x87 => 0x2021,
135 0x88 => 0x02C6,
136 0x89 => 0x2030,
137 0x8A => 0x0160,
138 0x8B => 0x2039,
139 0x8C => 0x0152,
140 0x8D => 0xFFFD,
141 0x8E => 0x017D,
142 0x8F => 0xFFFD,
143 0x90 => 0xFFFD,
144 0x91 => 0x2018,
145 0x92 => 0x2019,
146 0x93 => 0x201C,
147 0x94 => 0x201D,
148 0x95 => 0x2022,
149 0x96 => 0x2013,
150 0x97 => 0x2014,
151 0x98 => 0x02DC,
152 0x99 => 0x2122,
153 0x9A => 0x0161,
154 0x9B => 0x203A,
155 0x9C => 0x0153,
156 0x9D => 0xFFFD,
157 0x9E => 0x017E,
158 0x9F => 0x0178,
159 }; # $charref_map
160 $charref_map->{$_} = 0xFFFD
161 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168
169 ## Implementations MUST act as if state machine in the spec
170
171 sub _initialize_tokenizer ($) {
172 my $self = shift;
173
174 ## NOTE: Fields set by |new| constructor:
175 #$self->{level}
176 #$self->{set_nc}
177 #$self->{parse_error}
178 #$self->{is_xml} (if XML)
179
180 $self->{state} = DATA_STATE; # MUST
181 $self->{s_kwd} = ''; # state keyword
182 #$self->{entity__value}; # initialized when used
183 #$self->{entity__match}; # initialized when used
184 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185 undef $self->{ct}; # current token
186 undef $self->{ca}; # current attribute
187 undef $self->{last_stag_name}; # last emitted start tag name
188 #$self->{prev_state}; # initialized when used
189 delete $self->{self_closing};
190 $self->{char_buffer} = '';
191 $self->{char_buffer_pos} = 0;
192 $self->{nc} = -1; # next input character
193 #$self->{next_nc}
194
195 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
196 $self->{line_prev} = $self->{line};
197 $self->{column_prev} = $self->{column};
198 $self->{column}++;
199 $self->{nc}
200 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
201 } else {
202 $self->{set_nc}->($self);
203 }
204
205 $self->{token} = [];
206 # $self->{escape}
207 } # _initialize_tokenizer
208
209 ## A token has:
210 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
211 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
212 ## ->{name} (DOCTYPE_TOKEN)
213 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
214 ## ->{pubid} (DOCTYPE_TOKEN)
215 ## ->{sysid} (DOCTYPE_TOKEN)
216 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
217 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
218 ## ->{name}
219 ## ->{value}
220 ## ->{has_reference} == 1 or 0
221 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
222 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
223 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
224 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
225 ## while the token is pushed back to the stack.
226
227 ## Emitted token MUST immediately be handled by the tree construction state.
228
229 ## Before each step, UA MAY check to see if either one of the scripts in
230 ## "list of scripts that will execute as soon as possible" or the first
231 ## script in the "list of scripts that will execute asynchronously",
232 ## has completed loading. If one has, then it MUST be executed
233 ## and removed from the list.
234
235 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
236 ## (This requirement was dropped from HTML5 spec, unfortunately.)
237
238 my $is_space = {
239 0x0009 => 1, # CHARACTER TABULATION (HT)
240 0x000A => 1, # LINE FEED (LF)
241 #0x000B => 0, # LINE TABULATION (VT)
242 0x000C => 1, # FORM FEED (FF)
243 #0x000D => 1, # CARRIAGE RETURN (CR)
244 0x0020 => 1, # SPACE (SP)
245 };
246
247 sub _get_next_token ($) {
248 my $self = shift;
249
250 if ($self->{self_closing}) {
251 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
252 ## NOTE: The |self_closing| flag is only set by start tag token.
253 ## In addition, when a start tag token is emitted, it is always set to
254 ## |ct|.
255 delete $self->{self_closing};
256 }
257
258 if (@{$self->{token}}) {
259 $self->{self_closing} = $self->{token}->[0]->{self_closing};
260 return shift @{$self->{token}};
261 }
262
263 A: {
264 if ($self->{state} == PCDATA_STATE) {
265 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
266
267 if ($self->{nc} == 0x0026) { # &
268
269 ## NOTE: In the spec, the tokenizer is switched to the
270 ## "entity data state". In this implementation, the tokenizer
271 ## is switched to the |ENTITY_STATE|, which is an implementation
272 ## of the "consume a character reference" algorithm.
273 $self->{entity_add} = -1;
274 $self->{prev_state} = DATA_STATE;
275 $self->{state} = ENTITY_STATE;
276
277 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
278 $self->{line_prev} = $self->{line};
279 $self->{column_prev} = $self->{column};
280 $self->{column}++;
281 $self->{nc}
282 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
283 } else {
284 $self->{set_nc}->($self);
285 }
286
287 redo A;
288 } elsif ($self->{nc} == 0x003C) { # <
289
290 $self->{state} = TAG_OPEN_STATE;
291
292 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
293 $self->{line_prev} = $self->{line};
294 $self->{column_prev} = $self->{column};
295 $self->{column}++;
296 $self->{nc}
297 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
298 } else {
299 $self->{set_nc}->($self);
300 }
301
302 redo A;
303 } elsif ($self->{nc} == -1) {
304
305 return ({type => END_OF_FILE_TOKEN,
306 line => $self->{line}, column => $self->{column}});
307 last A; ## TODO: ok?
308 } else {
309
310 #
311 }
312
313 # Anything else
314 my $token = {type => CHARACTER_TOKEN,
315 data => chr $self->{nc},
316 line => $self->{line}, column => $self->{column},
317 };
318 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
319
320 ## Stay in the state.
321
322 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
323 $self->{line_prev} = $self->{line};
324 $self->{column_prev} = $self->{column};
325 $self->{column}++;
326 $self->{nc}
327 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
328 } else {
329 $self->{set_nc}->($self);
330 }
331
332 return ($token);
333 redo A;
334 } elsif ($self->{state} == DATA_STATE) {
335 $self->{s_kwd} = '' unless defined $self->{s_kwd};
336 if ($self->{nc} == 0x0026) { # &
337 $self->{s_kwd} = '';
338 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
339 not $self->{escape}) {
340
341 ## NOTE: In the spec, the tokenizer is switched to the
342 ## "entity data state". In this implementation, the tokenizer
343 ## is switched to the |ENTITY_STATE|, which is an implementation
344 ## of the "consume a character reference" algorithm.
345 $self->{entity_add} = -1;
346 $self->{prev_state} = DATA_STATE;
347 $self->{state} = ENTITY_STATE;
348
349 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
350 $self->{line_prev} = $self->{line};
351 $self->{column_prev} = $self->{column};
352 $self->{column}++;
353 $self->{nc}
354 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
355 } else {
356 $self->{set_nc}->($self);
357 }
358
359 redo A;
360 } else {
361
362 #
363 }
364 } elsif ($self->{nc} == 0x002D) { # -
365 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
366 if ($self->{s_kwd} eq '<!-') {
367
368 $self->{escape} = 1; # unless $self->{escape};
369 $self->{s_kwd} = '--';
370 #
371 } elsif ($self->{s_kwd} eq '-') {
372
373 $self->{s_kwd} = '--';
374 #
375 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
376
377 $self->{s_kwd} .= '-';
378 #
379 } else {
380
381 $self->{s_kwd} = '-';
382 #
383 }
384 }
385
386 #
387 } elsif ($self->{nc} == 0x0021) { # !
388 if (length $self->{s_kwd}) {
389
390 $self->{s_kwd} .= '!';
391 #
392 } else {
393
394 #$self->{s_kwd} = '';
395 #
396 }
397 #
398 } elsif ($self->{nc} == 0x003C) { # <
399 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
400 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
401 not $self->{escape})) {
402
403 $self->{state} = TAG_OPEN_STATE;
404
405 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
406 $self->{line_prev} = $self->{line};
407 $self->{column_prev} = $self->{column};
408 $self->{column}++;
409 $self->{nc}
410 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
411 } else {
412 $self->{set_nc}->($self);
413 }
414
415 redo A;
416 } else {
417
418 $self->{s_kwd} = '';
419 #
420 }
421 } elsif ($self->{nc} == 0x003E) { # >
422 if ($self->{escape} and
423 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
424 if ($self->{s_kwd} eq '--') {
425
426 delete $self->{escape};
427 #
428 } else {
429
430 #
431 }
432 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
433
434 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
435 line => $self->{line_prev},
436 column => $self->{column_prev} - 1);
437 #
438 } else {
439
440 #
441 }
442
443 $self->{s_kwd} = '';
444 #
445 } elsif ($self->{nc} == 0x005D) { # ]
446 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
447
448 $self->{s_kwd} .= ']';
449 } elsif ($self->{s_kwd} eq ']]') {
450
451 #
452 } else {
453
454 $self->{s_kwd} = '';
455 }
456 #
457 } elsif ($self->{nc} == -1) {
458
459 $self->{s_kwd} = '';
460 return ({type => END_OF_FILE_TOKEN,
461 line => $self->{line}, column => $self->{column}});
462 last A; ## TODO: ok?
463 } else {
464
465 $self->{s_kwd} = '';
466 #
467 }
468
469 # Anything else
470 my $token = {type => CHARACTER_TOKEN,
471 data => chr $self->{nc},
472 line => $self->{line}, column => $self->{column},
473 };
474 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
475 length $token->{data})) {
476 $self->{s_kwd} = '';
477 }
478
479 ## Stay in the data state.
480 if (not $self->{is_xml} and
481 $self->{content_model} == PCDATA_CONTENT_MODEL) {
482
483 $self->{state} = PCDATA_STATE;
484 } else {
485
486 ## Stay in the state.
487 }
488
489 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
490 $self->{line_prev} = $self->{line};
491 $self->{column_prev} = $self->{column};
492 $self->{column}++;
493 $self->{nc}
494 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
495 } else {
496 $self->{set_nc}->($self);
497 }
498
499 return ($token);
500 redo A;
501 } elsif ($self->{state} == TAG_OPEN_STATE) {
502 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
503 if ($self->{nc} == 0x002F) { # /
504
505
506 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
507 $self->{line_prev} = $self->{line};
508 $self->{column_prev} = $self->{column};
509 $self->{column}++;
510 $self->{nc}
511 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
512 } else {
513 $self->{set_nc}->($self);
514 }
515
516 $self->{state} = CLOSE_TAG_OPEN_STATE;
517 redo A;
518 } elsif ($self->{nc} == 0x0021) { # !
519
520 $self->{s_kwd} = '<' unless $self->{escape};
521 #
522 } else {
523
524 #
525 }
526
527 ## reconsume
528 $self->{state} = DATA_STATE;
529 $self->{s_kwd} = '';
530 return ({type => CHARACTER_TOKEN, data => '<',
531 line => $self->{line_prev},
532 column => $self->{column_prev},
533 });
534 redo A;
535 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
536 if ($self->{nc} == 0x0021) { # !
537
538 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
539
540 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
541 $self->{line_prev} = $self->{line};
542 $self->{column_prev} = $self->{column};
543 $self->{column}++;
544 $self->{nc}
545 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
546 } else {
547 $self->{set_nc}->($self);
548 }
549
550 redo A;
551 } elsif ($self->{nc} == 0x002F) { # /
552
553 $self->{state} = CLOSE_TAG_OPEN_STATE;
554
555 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
556 $self->{line_prev} = $self->{line};
557 $self->{column_prev} = $self->{column};
558 $self->{column}++;
559 $self->{nc}
560 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
561 } else {
562 $self->{set_nc}->($self);
563 }
564
565 redo A;
566 } elsif (0x0041 <= $self->{nc} and
567 $self->{nc} <= 0x005A) { # A..Z
568
569 $self->{ct}
570 = {type => START_TAG_TOKEN,
571 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
572 line => $self->{line_prev},
573 column => $self->{column_prev}};
574 $self->{state} = TAG_NAME_STATE;
575
576 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577 $self->{line_prev} = $self->{line};
578 $self->{column_prev} = $self->{column};
579 $self->{column}++;
580 $self->{nc}
581 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582 } else {
583 $self->{set_nc}->($self);
584 }
585
586 redo A;
587 } elsif (0x0061 <= $self->{nc} and
588 $self->{nc} <= 0x007A) { # a..z
589
590 $self->{ct} = {type => START_TAG_TOKEN,
591 tag_name => chr ($self->{nc}),
592 line => $self->{line_prev},
593 column => $self->{column_prev}};
594 $self->{state} = TAG_NAME_STATE;
595
596 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
597 $self->{line_prev} = $self->{line};
598 $self->{column_prev} = $self->{column};
599 $self->{column}++;
600 $self->{nc}
601 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
602 } else {
603 $self->{set_nc}->($self);
604 }
605
606 redo A;
607 } elsif ($self->{nc} == 0x003E) { # >
608
609 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
610 line => $self->{line_prev},
611 column => $self->{column_prev});
612 $self->{state} = DATA_STATE;
613 $self->{s_kwd} = '';
614
615 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
616 $self->{line_prev} = $self->{line};
617 $self->{column_prev} = $self->{column};
618 $self->{column}++;
619 $self->{nc}
620 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
621 } else {
622 $self->{set_nc}->($self);
623 }
624
625
626 return ({type => CHARACTER_TOKEN, data => '<>',
627 line => $self->{line_prev},
628 column => $self->{column_prev},
629 });
630
631 redo A;
632 } elsif ($self->{nc} == 0x003F) { # ?
633
634 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
635 line => $self->{line_prev},
636 column => $self->{column_prev});
637 $self->{state} = BOGUS_COMMENT_STATE;
638 $self->{ct} = {type => COMMENT_TOKEN, data => '',
639 line => $self->{line_prev},
640 column => $self->{column_prev},
641 };
642 ## $self->{nc} is intentionally left as is
643 redo A;
644 } else {
645
646 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
647 line => $self->{line_prev},
648 column => $self->{column_prev});
649 $self->{state} = DATA_STATE;
650 $self->{s_kwd} = '';
651 ## reconsume
652
653 return ({type => CHARACTER_TOKEN, data => '<',
654 line => $self->{line_prev},
655 column => $self->{column_prev},
656 });
657
658 redo A;
659 }
660 } else {
661 die "$0: $self->{content_model} in tag open";
662 }
663 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
664 ## NOTE: The "close tag open state" in the spec is implemented as
665 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
666
667 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
668 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
669 if (defined $self->{last_stag_name}) {
670 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
671 $self->{s_kwd} = '';
672 ## Reconsume.
673 redo A;
674 } else {
675 ## No start tag token has ever been emitted
676 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
677
678 $self->{state} = DATA_STATE;
679 $self->{s_kwd} = '';
680 ## Reconsume.
681 return ({type => CHARACTER_TOKEN, data => '</',
682 line => $l, column => $c,
683 });
684 redo A;
685 }
686 }
687
688 if (0x0041 <= $self->{nc} and
689 $self->{nc} <= 0x005A) { # A..Z
690
691 $self->{ct}
692 = {type => END_TAG_TOKEN,
693 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
694 line => $l, column => $c};
695 $self->{state} = TAG_NAME_STATE;
696
697 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
698 $self->{line_prev} = $self->{line};
699 $self->{column_prev} = $self->{column};
700 $self->{column}++;
701 $self->{nc}
702 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
703 } else {
704 $self->{set_nc}->($self);
705 }
706
707 redo A;
708 } elsif (0x0061 <= $self->{nc} and
709 $self->{nc} <= 0x007A) { # a..z
710
711 $self->{ct} = {type => END_TAG_TOKEN,
712 tag_name => chr ($self->{nc}),
713 line => $l, column => $c};
714 $self->{state} = TAG_NAME_STATE;
715
716 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
717 $self->{line_prev} = $self->{line};
718 $self->{column_prev} = $self->{column};
719 $self->{column}++;
720 $self->{nc}
721 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
722 } else {
723 $self->{set_nc}->($self);
724 }
725
726 redo A;
727 } elsif ($self->{nc} == 0x003E) { # >
728
729 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
730 line => $self->{line_prev}, ## "<" in "</>"
731 column => $self->{column_prev} - 1);
732 $self->{state} = DATA_STATE;
733 $self->{s_kwd} = '';
734
735 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
736 $self->{line_prev} = $self->{line};
737 $self->{column_prev} = $self->{column};
738 $self->{column}++;
739 $self->{nc}
740 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
741 } else {
742 $self->{set_nc}->($self);
743 }
744
745 redo A;
746 } elsif ($self->{nc} == -1) {
747
748 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
749 $self->{s_kwd} = '';
750 $self->{state} = DATA_STATE;
751 # reconsume
752
753 return ({type => CHARACTER_TOKEN, data => '</',
754 line => $l, column => $c,
755 });
756
757 redo A;
758 } else {
759
760 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
761 $self->{state} = BOGUS_COMMENT_STATE;
762 $self->{ct} = {type => COMMENT_TOKEN, data => '',
763 line => $self->{line_prev}, # "<" of "</"
764 column => $self->{column_prev} - 1,
765 };
766 ## NOTE: $self->{nc} is intentionally left as is.
767 ## Although the "anything else" case of the spec not explicitly
768 ## states that the next input character is to be reconsumed,
769 ## it will be included to the |data| of the comment token
770 ## generated from the bogus end tag, as defined in the
771 ## "bogus comment state" entry.
772 redo A;
773 }
774 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
775 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
776 if (length $ch) {
777 my $CH = $ch;
778 $ch =~ tr/a-z/A-Z/;
779 my $nch = chr $self->{nc};
780 if ($nch eq $ch or $nch eq $CH) {
781
782 ## Stay in the state.
783 $self->{s_kwd} .= $nch;
784
785 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
786 $self->{line_prev} = $self->{line};
787 $self->{column_prev} = $self->{column};
788 $self->{column}++;
789 $self->{nc}
790 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
791 } else {
792 $self->{set_nc}->($self);
793 }
794
795 redo A;
796 } else {
797
798 $self->{state} = DATA_STATE;
799 $self->{s_kwd} = '';
800 ## Reconsume.
801 return ({type => CHARACTER_TOKEN,
802 data => '</' . $self->{s_kwd},
803 line => $self->{line_prev},
804 column => $self->{column_prev} - 1 - length $self->{s_kwd},
805 });
806 redo A;
807 }
808 } else { # after "<{tag-name}"
809 unless ($is_space->{$self->{nc}} or
810 {
811 0x003E => 1, # >
812 0x002F => 1, # /
813 -1 => 1, # EOF
814 }->{$self->{nc}}) {
815
816 ## Reconsume.
817 $self->{state} = DATA_STATE;
818 $self->{s_kwd} = '';
819 return ({type => CHARACTER_TOKEN,
820 data => '</' . $self->{s_kwd},
821 line => $self->{line_prev},
822 column => $self->{column_prev} - 1 - length $self->{s_kwd},
823 });
824 redo A;
825 } else {
826
827 $self->{ct}
828 = {type => END_TAG_TOKEN,
829 tag_name => $self->{last_stag_name},
830 line => $self->{line_prev},
831 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
832 $self->{state} = TAG_NAME_STATE;
833 ## Reconsume.
834 redo A;
835 }
836 }
837 } elsif ($self->{state} == TAG_NAME_STATE) {
838 if ($is_space->{$self->{nc}}) {
839
840 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
841
842 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
843 $self->{line_prev} = $self->{line};
844 $self->{column_prev} = $self->{column};
845 $self->{column}++;
846 $self->{nc}
847 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
848 } else {
849 $self->{set_nc}->($self);
850 }
851
852 redo A;
853 } elsif ($self->{nc} == 0x003E) { # >
854 if ($self->{ct}->{type} == START_TAG_TOKEN) {
855
856 $self->{last_stag_name} = $self->{ct}->{tag_name};
857 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
858 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
859 #if ($self->{ct}->{attributes}) {
860 # ## NOTE: This should never be reached.
861 # !!! cp (36);
862 # !!! parse-error (type => 'end tag attribute');
863 #} else {
864
865 #}
866 } else {
867 die "$0: $self->{ct}->{type}: Unknown token type";
868 }
869 $self->{state} = DATA_STATE;
870 $self->{s_kwd} = '';
871
872 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
873 $self->{line_prev} = $self->{line};
874 $self->{column_prev} = $self->{column};
875 $self->{column}++;
876 $self->{nc}
877 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
878 } else {
879 $self->{set_nc}->($self);
880 }
881
882
883 return ($self->{ct}); # start tag or end tag
884
885 redo A;
886 } elsif (0x0041 <= $self->{nc} and
887 $self->{nc} <= 0x005A) { # A..Z
888
889 $self->{ct}->{tag_name}
890 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
891 # start tag or end tag
892 ## Stay in this state
893
894 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
895 $self->{line_prev} = $self->{line};
896 $self->{column_prev} = $self->{column};
897 $self->{column}++;
898 $self->{nc}
899 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
900 } else {
901 $self->{set_nc}->($self);
902 }
903
904 redo A;
905 } elsif ($self->{nc} == -1) {
906 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
907 if ($self->{ct}->{type} == START_TAG_TOKEN) {
908
909 $self->{last_stag_name} = $self->{ct}->{tag_name};
910 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
911 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
912 #if ($self->{ct}->{attributes}) {
913 # ## NOTE: This state should never be reached.
914 # !!! cp (40);
915 # !!! parse-error (type => 'end tag attribute');
916 #} else {
917
918 #}
919 } else {
920 die "$0: $self->{ct}->{type}: Unknown token type";
921 }
922 $self->{state} = DATA_STATE;
923 $self->{s_kwd} = '';
924 # reconsume
925
926 return ($self->{ct}); # start tag or end tag
927
928 redo A;
929 } elsif ($self->{nc} == 0x002F) { # /
930
931 $self->{state} = SELF_CLOSING_START_TAG_STATE;
932
933 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
934 $self->{line_prev} = $self->{line};
935 $self->{column_prev} = $self->{column};
936 $self->{column}++;
937 $self->{nc}
938 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
939 } else {
940 $self->{set_nc}->($self);
941 }
942
943 redo A;
944 } else {
945
946 $self->{ct}->{tag_name} .= chr $self->{nc};
947 # start tag or end tag
948 ## Stay in the state
949
950 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
951 $self->{line_prev} = $self->{line};
952 $self->{column_prev} = $self->{column};
953 $self->{column}++;
954 $self->{nc}
955 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
956 } else {
957 $self->{set_nc}->($self);
958 }
959
960 redo A;
961 }
962 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
963 if ($is_space->{$self->{nc}}) {
964
965 ## Stay in the state
966
967 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
968 $self->{line_prev} = $self->{line};
969 $self->{column_prev} = $self->{column};
970 $self->{column}++;
971 $self->{nc}
972 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
973 } else {
974 $self->{set_nc}->($self);
975 }
976
977 redo A;
978 } elsif ($self->{nc} == 0x003E) { # >
979 if ($self->{ct}->{type} == START_TAG_TOKEN) {
980
981 $self->{last_stag_name} = $self->{ct}->{tag_name};
982 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
983 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
984 if ($self->{ct}->{attributes}) {
985
986 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
987 } else {
988
989 }
990 } else {
991 die "$0: $self->{ct}->{type}: Unknown token type";
992 }
993 $self->{state} = DATA_STATE;
994 $self->{s_kwd} = '';
995
996 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
997 $self->{line_prev} = $self->{line};
998 $self->{column_prev} = $self->{column};
999 $self->{column}++;
1000 $self->{nc}
1001 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1002 } else {
1003 $self->{set_nc}->($self);
1004 }
1005
1006
1007 return ($self->{ct}); # start tag or end tag
1008
1009 redo A;
1010 } elsif (0x0041 <= $self->{nc} and
1011 $self->{nc} <= 0x005A) { # A..Z
1012
1013 $self->{ca}
1014 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1015 value => '',
1016 line => $self->{line}, column => $self->{column}};
1017 $self->{state} = ATTRIBUTE_NAME_STATE;
1018
1019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020 $self->{line_prev} = $self->{line};
1021 $self->{column_prev} = $self->{column};
1022 $self->{column}++;
1023 $self->{nc}
1024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025 } else {
1026 $self->{set_nc}->($self);
1027 }
1028
1029 redo A;
1030 } elsif ($self->{nc} == 0x002F) { # /
1031
1032 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1033
1034 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1035 $self->{line_prev} = $self->{line};
1036 $self->{column_prev} = $self->{column};
1037 $self->{column}++;
1038 $self->{nc}
1039 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1040 } else {
1041 $self->{set_nc}->($self);
1042 }
1043
1044 redo A;
1045 } elsif ($self->{nc} == -1) {
1046 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1047 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1048
1049 $self->{last_stag_name} = $self->{ct}->{tag_name};
1050 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1051 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1052 if ($self->{ct}->{attributes}) {
1053
1054 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1055 } else {
1056
1057 }
1058 } else {
1059 die "$0: $self->{ct}->{type}: Unknown token type";
1060 }
1061 $self->{state} = DATA_STATE;
1062 $self->{s_kwd} = '';
1063 # reconsume
1064
1065 return ($self->{ct}); # start tag or end tag
1066
1067 redo A;
1068 } else {
1069 if ({
1070 0x0022 => 1, # "
1071 0x0027 => 1, # '
1072 0x003D => 1, # =
1073 }->{$self->{nc}}) {
1074
1075 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1076 } else {
1077
1078 }
1079 $self->{ca}
1080 = {name => chr ($self->{nc}),
1081 value => '',
1082 line => $self->{line}, column => $self->{column}};
1083 $self->{state} = ATTRIBUTE_NAME_STATE;
1084
1085 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1086 $self->{line_prev} = $self->{line};
1087 $self->{column_prev} = $self->{column};
1088 $self->{column}++;
1089 $self->{nc}
1090 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1091 } else {
1092 $self->{set_nc}->($self);
1093 }
1094
1095 redo A;
1096 }
1097 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1098 my $before_leave = sub {
1099 if (exists $self->{ct}->{attributes} # start tag or end tag
1100 ->{$self->{ca}->{name}}) { # MUST
1101
1102 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1103 ## Discard $self->{ca} # MUST
1104 } else {
1105
1106 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1107 = $self->{ca};
1108 }
1109 }; # $before_leave
1110
1111 if ($is_space->{$self->{nc}}) {
1112
1113 $before_leave->();
1114 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1115
1116 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1117 $self->{line_prev} = $self->{line};
1118 $self->{column_prev} = $self->{column};
1119 $self->{column}++;
1120 $self->{nc}
1121 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1122 } else {
1123 $self->{set_nc}->($self);
1124 }
1125
1126 redo A;
1127 } elsif ($self->{nc} == 0x003D) { # =
1128
1129 $before_leave->();
1130 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1131
1132 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1133 $self->{line_prev} = $self->{line};
1134 $self->{column_prev} = $self->{column};
1135 $self->{column}++;
1136 $self->{nc}
1137 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1138 } else {
1139 $self->{set_nc}->($self);
1140 }
1141
1142 redo A;
1143 } elsif ($self->{nc} == 0x003E) { # >
1144 $before_leave->();
1145 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1146
1147 $self->{last_stag_name} = $self->{ct}->{tag_name};
1148 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1149
1150 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1151 if ($self->{ct}->{attributes}) {
1152 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1153 }
1154 } else {
1155 die "$0: $self->{ct}->{type}: Unknown token type";
1156 }
1157 $self->{state} = DATA_STATE;
1158 $self->{s_kwd} = '';
1159
1160 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1161 $self->{line_prev} = $self->{line};
1162 $self->{column_prev} = $self->{column};
1163 $self->{column}++;
1164 $self->{nc}
1165 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1166 } else {
1167 $self->{set_nc}->($self);
1168 }
1169
1170
1171 return ($self->{ct}); # start tag or end tag
1172
1173 redo A;
1174 } elsif (0x0041 <= $self->{nc} and
1175 $self->{nc} <= 0x005A) { # A..Z
1176
1177 $self->{ca}->{name}
1178 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1179 ## Stay in the state
1180
1181 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1182 $self->{line_prev} = $self->{line};
1183 $self->{column_prev} = $self->{column};
1184 $self->{column}++;
1185 $self->{nc}
1186 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1187 } else {
1188 $self->{set_nc}->($self);
1189 }
1190
1191 redo A;
1192 } elsif ($self->{nc} == 0x002F) { # /
1193
1194 $before_leave->();
1195 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1196
1197 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1198 $self->{line_prev} = $self->{line};
1199 $self->{column_prev} = $self->{column};
1200 $self->{column}++;
1201 $self->{nc}
1202 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1203 } else {
1204 $self->{set_nc}->($self);
1205 }
1206
1207 redo A;
1208 } elsif ($self->{nc} == -1) {
1209 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1210 $before_leave->();
1211 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1212
1213 $self->{last_stag_name} = $self->{ct}->{tag_name};
1214 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1215 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1216 if ($self->{ct}->{attributes}) {
1217
1218 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1219 } else {
1220 ## NOTE: This state should never be reached.
1221
1222 }
1223 } else {
1224 die "$0: $self->{ct}->{type}: Unknown token type";
1225 }
1226 $self->{state} = DATA_STATE;
1227 $self->{s_kwd} = '';
1228 # reconsume
1229
1230 return ($self->{ct}); # start tag or end tag
1231
1232 redo A;
1233 } else {
1234 if ($self->{nc} == 0x0022 or # "
1235 $self->{nc} == 0x0027) { # '
1236
1237 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1238 } else {
1239
1240 }
1241 $self->{ca}->{name} .= chr ($self->{nc});
1242 ## Stay in the state
1243
1244 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1245 $self->{line_prev} = $self->{line};
1246 $self->{column_prev} = $self->{column};
1247 $self->{column}++;
1248 $self->{nc}
1249 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1250 } else {
1251 $self->{set_nc}->($self);
1252 }
1253
1254 redo A;
1255 }
1256 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1257 if ($is_space->{$self->{nc}}) {
1258
1259 ## Stay in the state
1260
1261 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1262 $self->{line_prev} = $self->{line};
1263 $self->{column_prev} = $self->{column};
1264 $self->{column}++;
1265 $self->{nc}
1266 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1267 } else {
1268 $self->{set_nc}->($self);
1269 }
1270
1271 redo A;
1272 } elsif ($self->{nc} == 0x003D) { # =
1273
1274 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1275
1276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1277 $self->{line_prev} = $self->{line};
1278 $self->{column_prev} = $self->{column};
1279 $self->{column}++;
1280 $self->{nc}
1281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1282 } else {
1283 $self->{set_nc}->($self);
1284 }
1285
1286 redo A;
1287 } elsif ($self->{nc} == 0x003E) { # >
1288 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1289
1290 $self->{last_stag_name} = $self->{ct}->{tag_name};
1291 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1292 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1293 if ($self->{ct}->{attributes}) {
1294
1295 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1296 } else {
1297 ## NOTE: This state should never be reached.
1298
1299 }
1300 } else {
1301 die "$0: $self->{ct}->{type}: Unknown token type";
1302 }
1303 $self->{state} = DATA_STATE;
1304 $self->{s_kwd} = '';
1305
1306 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1307 $self->{line_prev} = $self->{line};
1308 $self->{column_prev} = $self->{column};
1309 $self->{column}++;
1310 $self->{nc}
1311 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1312 } else {
1313 $self->{set_nc}->($self);
1314 }
1315
1316
1317 return ($self->{ct}); # start tag or end tag
1318
1319 redo A;
1320 } elsif (0x0041 <= $self->{nc} and
1321 $self->{nc} <= 0x005A) { # A..Z
1322
1323 $self->{ca}
1324 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1325 value => '',
1326 line => $self->{line}, column => $self->{column}};
1327 $self->{state} = ATTRIBUTE_NAME_STATE;
1328
1329 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1330 $self->{line_prev} = $self->{line};
1331 $self->{column_prev} = $self->{column};
1332 $self->{column}++;
1333 $self->{nc}
1334 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1335 } else {
1336 $self->{set_nc}->($self);
1337 }
1338
1339 redo A;
1340 } elsif ($self->{nc} == 0x002F) { # /
1341
1342 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1343
1344 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1345 $self->{line_prev} = $self->{line};
1346 $self->{column_prev} = $self->{column};
1347 $self->{column}++;
1348 $self->{nc}
1349 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1350 } else {
1351 $self->{set_nc}->($self);
1352 }
1353
1354 redo A;
1355 } elsif ($self->{nc} == -1) {
1356 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1357 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1358
1359 $self->{last_stag_name} = $self->{ct}->{tag_name};
1360 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1361 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1362 if ($self->{ct}->{attributes}) {
1363
1364 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1365 } else {
1366 ## NOTE: This state should never be reached.
1367
1368 }
1369 } else {
1370 die "$0: $self->{ct}->{type}: Unknown token type";
1371 }
1372 $self->{s_kwd} = '';
1373 $self->{state} = DATA_STATE;
1374 # reconsume
1375
1376 return ($self->{ct}); # start tag or end tag
1377
1378 redo A;
1379 } else {
1380 if ($self->{nc} == 0x0022 or # "
1381 $self->{nc} == 0x0027) { # '
1382
1383 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1384 } else {
1385
1386 }
1387 $self->{ca}
1388 = {name => chr ($self->{nc}),
1389 value => '',
1390 line => $self->{line}, column => $self->{column}};
1391 $self->{state} = ATTRIBUTE_NAME_STATE;
1392
1393 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1394 $self->{line_prev} = $self->{line};
1395 $self->{column_prev} = $self->{column};
1396 $self->{column}++;
1397 $self->{nc}
1398 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1399 } else {
1400 $self->{set_nc}->($self);
1401 }
1402
1403 redo A;
1404 }
1405 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1406 if ($is_space->{$self->{nc}}) {
1407
1408 ## Stay in the state
1409
1410 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1411 $self->{line_prev} = $self->{line};
1412 $self->{column_prev} = $self->{column};
1413 $self->{column}++;
1414 $self->{nc}
1415 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1416 } else {
1417 $self->{set_nc}->($self);
1418 }
1419
1420 redo A;
1421 } elsif ($self->{nc} == 0x0022) { # "
1422
1423 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1424
1425 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1426 $self->{line_prev} = $self->{line};
1427 $self->{column_prev} = $self->{column};
1428 $self->{column}++;
1429 $self->{nc}
1430 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1431 } else {
1432 $self->{set_nc}->($self);
1433 }
1434
1435 redo A;
1436 } elsif ($self->{nc} == 0x0026) { # &
1437
1438 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1439 ## reconsume
1440 redo A;
1441 } elsif ($self->{nc} == 0x0027) { # '
1442
1443 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1444
1445 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1446 $self->{line_prev} = $self->{line};
1447 $self->{column_prev} = $self->{column};
1448 $self->{column}++;
1449 $self->{nc}
1450 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1451 } else {
1452 $self->{set_nc}->($self);
1453 }
1454
1455 redo A;
1456 } elsif ($self->{nc} == 0x003E) { # >
1457 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1458 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1459
1460 $self->{last_stag_name} = $self->{ct}->{tag_name};
1461 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1462 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1463 if ($self->{ct}->{attributes}) {
1464
1465 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1466 } else {
1467 ## NOTE: This state should never be reached.
1468
1469 }
1470 } else {
1471 die "$0: $self->{ct}->{type}: Unknown token type";
1472 }
1473 $self->{state} = DATA_STATE;
1474 $self->{s_kwd} = '';
1475
1476 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1477 $self->{line_prev} = $self->{line};
1478 $self->{column_prev} = $self->{column};
1479 $self->{column}++;
1480 $self->{nc}
1481 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1482 } else {
1483 $self->{set_nc}->($self);
1484 }
1485
1486
1487 return ($self->{ct}); # start tag or end tag
1488
1489 redo A;
1490 } elsif ($self->{nc} == -1) {
1491 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1492 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1493
1494 $self->{last_stag_name} = $self->{ct}->{tag_name};
1495 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1496 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1497 if ($self->{ct}->{attributes}) {
1498
1499 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1500 } else {
1501 ## NOTE: This state should never be reached.
1502
1503 }
1504 } else {
1505 die "$0: $self->{ct}->{type}: Unknown token type";
1506 }
1507 $self->{state} = DATA_STATE;
1508 $self->{s_kwd} = '';
1509 ## reconsume
1510
1511 return ($self->{ct}); # start tag or end tag
1512
1513 redo A;
1514 } else {
1515 if ($self->{nc} == 0x003D) { # =
1516
1517 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1518 } else {
1519
1520 }
1521 $self->{ca}->{value} .= chr ($self->{nc});
1522 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1523
1524 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1525 $self->{line_prev} = $self->{line};
1526 $self->{column_prev} = $self->{column};
1527 $self->{column}++;
1528 $self->{nc}
1529 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1530 } else {
1531 $self->{set_nc}->($self);
1532 }
1533
1534 redo A;
1535 }
1536 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1537 if ($self->{nc} == 0x0022) { # "
1538
1539 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1540
1541 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1542 $self->{line_prev} = $self->{line};
1543 $self->{column_prev} = $self->{column};
1544 $self->{column}++;
1545 $self->{nc}
1546 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1547 } else {
1548 $self->{set_nc}->($self);
1549 }
1550
1551 redo A;
1552 } elsif ($self->{nc} == 0x0026) { # &
1553
1554 ## NOTE: In the spec, the tokenizer is switched to the
1555 ## "entity in attribute value state". In this implementation, the
1556 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1557 ## implementation of the "consume a character reference" algorithm.
1558 $self->{prev_state} = $self->{state};
1559 $self->{entity_add} = 0x0022; # "
1560 $self->{state} = ENTITY_STATE;
1561
1562 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1563 $self->{line_prev} = $self->{line};
1564 $self->{column_prev} = $self->{column};
1565 $self->{column}++;
1566 $self->{nc}
1567 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1568 } else {
1569 $self->{set_nc}->($self);
1570 }
1571
1572 redo A;
1573 } elsif ($self->{nc} == -1) {
1574 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1575 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1576
1577 $self->{last_stag_name} = $self->{ct}->{tag_name};
1578 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1579 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1580 if ($self->{ct}->{attributes}) {
1581
1582 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1583 } else {
1584 ## NOTE: This state should never be reached.
1585
1586 }
1587 } else {
1588 die "$0: $self->{ct}->{type}: Unknown token type";
1589 }
1590 $self->{state} = DATA_STATE;
1591 $self->{s_kwd} = '';
1592 ## reconsume
1593
1594 return ($self->{ct}); # start tag or end tag
1595
1596 redo A;
1597 } else {
1598
1599 $self->{ca}->{value} .= chr ($self->{nc});
1600 $self->{read_until}->($self->{ca}->{value},
1601 q["&],
1602 length $self->{ca}->{value});
1603
1604 ## Stay in the state
1605
1606 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1607 $self->{line_prev} = $self->{line};
1608 $self->{column_prev} = $self->{column};
1609 $self->{column}++;
1610 $self->{nc}
1611 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1612 } else {
1613 $self->{set_nc}->($self);
1614 }
1615
1616 redo A;
1617 }
1618 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1619 if ($self->{nc} == 0x0027) { # '
1620
1621 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1622
1623 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1624 $self->{line_prev} = $self->{line};
1625 $self->{column_prev} = $self->{column};
1626 $self->{column}++;
1627 $self->{nc}
1628 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1629 } else {
1630 $self->{set_nc}->($self);
1631 }
1632
1633 redo A;
1634 } elsif ($self->{nc} == 0x0026) { # &
1635
1636 ## NOTE: In the spec, the tokenizer is switched to the
1637 ## "entity in attribute value state". In this implementation, the
1638 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1639 ## implementation of the "consume a character reference" algorithm.
1640 $self->{entity_add} = 0x0027; # '
1641 $self->{prev_state} = $self->{state};
1642 $self->{state} = ENTITY_STATE;
1643
1644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1645 $self->{line_prev} = $self->{line};
1646 $self->{column_prev} = $self->{column};
1647 $self->{column}++;
1648 $self->{nc}
1649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1650 } else {
1651 $self->{set_nc}->($self);
1652 }
1653
1654 redo A;
1655 } elsif ($self->{nc} == -1) {
1656 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1657 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1658
1659 $self->{last_stag_name} = $self->{ct}->{tag_name};
1660 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1661 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1662 if ($self->{ct}->{attributes}) {
1663
1664 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1665 } else {
1666 ## NOTE: This state should never be reached.
1667
1668 }
1669 } else {
1670 die "$0: $self->{ct}->{type}: Unknown token type";
1671 }
1672 $self->{state} = DATA_STATE;
1673 $self->{s_kwd} = '';
1674 ## reconsume
1675
1676 return ($self->{ct}); # start tag or end tag
1677
1678 redo A;
1679 } else {
1680
1681 $self->{ca}->{value} .= chr ($self->{nc});
1682 $self->{read_until}->($self->{ca}->{value},
1683 q['&],
1684 length $self->{ca}->{value});
1685
1686 ## Stay in the state
1687
1688 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1689 $self->{line_prev} = $self->{line};
1690 $self->{column_prev} = $self->{column};
1691 $self->{column}++;
1692 $self->{nc}
1693 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1694 } else {
1695 $self->{set_nc}->($self);
1696 }
1697
1698 redo A;
1699 }
1700 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1701 if ($is_space->{$self->{nc}}) {
1702
1703 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1704
1705 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1706 $self->{line_prev} = $self->{line};
1707 $self->{column_prev} = $self->{column};
1708 $self->{column}++;
1709 $self->{nc}
1710 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1711 } else {
1712 $self->{set_nc}->($self);
1713 }
1714
1715 redo A;
1716 } elsif ($self->{nc} == 0x0026) { # &
1717
1718 ## NOTE: In the spec, the tokenizer is switched to the
1719 ## "entity in attribute value state". In this implementation, the
1720 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1721 ## implementation of the "consume a character reference" algorithm.
1722 $self->{entity_add} = -1;
1723 $self->{prev_state} = $self->{state};
1724 $self->{state} = ENTITY_STATE;
1725
1726 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1727 $self->{line_prev} = $self->{line};
1728 $self->{column_prev} = $self->{column};
1729 $self->{column}++;
1730 $self->{nc}
1731 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1732 } else {
1733 $self->{set_nc}->($self);
1734 }
1735
1736 redo A;
1737 } elsif ($self->{nc} == 0x003E) { # >
1738 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1739
1740 $self->{last_stag_name} = $self->{ct}->{tag_name};
1741 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1742 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1743 if ($self->{ct}->{attributes}) {
1744
1745 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1746 } else {
1747 ## NOTE: This state should never be reached.
1748
1749 }
1750 } else {
1751 die "$0: $self->{ct}->{type}: Unknown token type";
1752 }
1753 $self->{state} = DATA_STATE;
1754 $self->{s_kwd} = '';
1755
1756 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1757 $self->{line_prev} = $self->{line};
1758 $self->{column_prev} = $self->{column};
1759 $self->{column}++;
1760 $self->{nc}
1761 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1762 } else {
1763 $self->{set_nc}->($self);
1764 }
1765
1766
1767 return ($self->{ct}); # start tag or end tag
1768
1769 redo A;
1770 } elsif ($self->{nc} == -1) {
1771 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1772 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1773
1774 $self->{last_stag_name} = $self->{ct}->{tag_name};
1775 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1776 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1777 if ($self->{ct}->{attributes}) {
1778
1779 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1780 } else {
1781 ## NOTE: This state should never be reached.
1782
1783 }
1784 } else {
1785 die "$0: $self->{ct}->{type}: Unknown token type";
1786 }
1787 $self->{state} = DATA_STATE;
1788 $self->{s_kwd} = '';
1789 ## reconsume
1790
1791 return ($self->{ct}); # start tag or end tag
1792
1793 redo A;
1794 } else {
1795 if ({
1796 0x0022 => 1, # "
1797 0x0027 => 1, # '
1798 0x003D => 1, # =
1799 }->{$self->{nc}}) {
1800
1801 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1802 } else {
1803
1804 }
1805 $self->{ca}->{value} .= chr ($self->{nc});
1806 $self->{read_until}->($self->{ca}->{value},
1807 q["'=& >],
1808 length $self->{ca}->{value});
1809
1810 ## Stay in the state
1811
1812 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1813 $self->{line_prev} = $self->{line};
1814 $self->{column_prev} = $self->{column};
1815 $self->{column}++;
1816 $self->{nc}
1817 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1818 } else {
1819 $self->{set_nc}->($self);
1820 }
1821
1822 redo A;
1823 }
1824 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1825 if ($is_space->{$self->{nc}}) {
1826
1827 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1828
1829 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1830 $self->{line_prev} = $self->{line};
1831 $self->{column_prev} = $self->{column};
1832 $self->{column}++;
1833 $self->{nc}
1834 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1835 } else {
1836 $self->{set_nc}->($self);
1837 }
1838
1839 redo A;
1840 } elsif ($self->{nc} == 0x003E) { # >
1841 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1842
1843 $self->{last_stag_name} = $self->{ct}->{tag_name};
1844 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1845 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1846 if ($self->{ct}->{attributes}) {
1847
1848 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1849 } else {
1850 ## NOTE: This state should never be reached.
1851
1852 }
1853 } else {
1854 die "$0: $self->{ct}->{type}: Unknown token type";
1855 }
1856 $self->{state} = DATA_STATE;
1857 $self->{s_kwd} = '';
1858
1859 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1860 $self->{line_prev} = $self->{line};
1861 $self->{column_prev} = $self->{column};
1862 $self->{column}++;
1863 $self->{nc}
1864 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1865 } else {
1866 $self->{set_nc}->($self);
1867 }
1868
1869
1870 return ($self->{ct}); # start tag or end tag
1871
1872 redo A;
1873 } elsif ($self->{nc} == 0x002F) { # /
1874
1875 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1876
1877 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1878 $self->{line_prev} = $self->{line};
1879 $self->{column_prev} = $self->{column};
1880 $self->{column}++;
1881 $self->{nc}
1882 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1883 } else {
1884 $self->{set_nc}->($self);
1885 }
1886
1887 redo A;
1888 } elsif ($self->{nc} == -1) {
1889 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1890 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1891
1892 $self->{last_stag_name} = $self->{ct}->{tag_name};
1893 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1894 if ($self->{ct}->{attributes}) {
1895
1896 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1897 } else {
1898 ## NOTE: This state should never be reached.
1899
1900 }
1901 } else {
1902 die "$0: $self->{ct}->{type}: Unknown token type";
1903 }
1904 $self->{state} = DATA_STATE;
1905 $self->{s_kwd} = '';
1906 ## Reconsume.
1907 return ($self->{ct}); # start tag or end tag
1908 redo A;
1909 } else {
1910
1911 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1912 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1913 ## reconsume
1914 redo A;
1915 }
1916 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1917 if ($self->{nc} == 0x003E) { # >
1918 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1919
1920 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1921 ## TODO: Different type than slash in start tag
1922 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1923 if ($self->{ct}->{attributes}) {
1924
1925 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1926 } else {
1927
1928 }
1929 ## TODO: Test |<title></title/>|
1930 } else {
1931
1932 $self->{self_closing} = 1;
1933 }
1934
1935 $self->{state} = DATA_STATE;
1936 $self->{s_kwd} = '';
1937
1938 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1939 $self->{line_prev} = $self->{line};
1940 $self->{column_prev} = $self->{column};
1941 $self->{column}++;
1942 $self->{nc}
1943 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1944 } else {
1945 $self->{set_nc}->($self);
1946 }
1947
1948
1949 return ($self->{ct}); # start tag or end tag
1950
1951 redo A;
1952 } elsif ($self->{nc} == -1) {
1953 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1954 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1955
1956 $self->{last_stag_name} = $self->{ct}->{tag_name};
1957 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1958 if ($self->{ct}->{attributes}) {
1959
1960 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1961 } else {
1962 ## NOTE: This state should never be reached.
1963
1964 }
1965 } else {
1966 die "$0: $self->{ct}->{type}: Unknown token type";
1967 }
1968 $self->{state} = DATA_STATE;
1969 $self->{s_kwd} = '';
1970 ## Reconsume.
1971 return ($self->{ct}); # start tag or end tag
1972 redo A;
1973 } else {
1974
1975 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1976 ## TODO: This error type is wrong.
1977 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1978 ## Reconsume.
1979 redo A;
1980 }
1981 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1982 ## (only happen if PCDATA state)
1983
1984 ## NOTE: Unlike spec's "bogus comment state", this implementation
1985 ## consumes characters one-by-one basis.
1986
1987 if ($self->{nc} == 0x003E) { # >
1988
1989 $self->{state} = DATA_STATE;
1990 $self->{s_kwd} = '';
1991
1992 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1993 $self->{line_prev} = $self->{line};
1994 $self->{column_prev} = $self->{column};
1995 $self->{column}++;
1996 $self->{nc}
1997 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1998 } else {
1999 $self->{set_nc}->($self);
2000 }
2001
2002
2003 return ($self->{ct}); # comment
2004 redo A;
2005 } elsif ($self->{nc} == -1) {
2006
2007 $self->{state} = DATA_STATE;
2008 $self->{s_kwd} = '';
2009 ## reconsume
2010
2011 return ($self->{ct}); # comment
2012 redo A;
2013 } else {
2014
2015 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2016 $self->{read_until}->($self->{ct}->{data},
2017 q[>],
2018 length $self->{ct}->{data});
2019
2020 ## Stay in the state.
2021
2022 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2023 $self->{line_prev} = $self->{line};
2024 $self->{column_prev} = $self->{column};
2025 $self->{column}++;
2026 $self->{nc}
2027 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2028 } else {
2029 $self->{set_nc}->($self);
2030 }
2031
2032 redo A;
2033 }
2034 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2035 ## (only happen if PCDATA state)
2036
2037 if ($self->{nc} == 0x002D) { # -
2038
2039 $self->{state} = MD_HYPHEN_STATE;
2040
2041 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2042 $self->{line_prev} = $self->{line};
2043 $self->{column_prev} = $self->{column};
2044 $self->{column}++;
2045 $self->{nc}
2046 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2047 } else {
2048 $self->{set_nc}->($self);
2049 }
2050
2051 redo A;
2052 } elsif ($self->{nc} == 0x0044 or # D
2053 $self->{nc} == 0x0064) { # d
2054 ## ASCII case-insensitive.
2055
2056 $self->{state} = MD_DOCTYPE_STATE;
2057 $self->{s_kwd} = chr $self->{nc};
2058
2059 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2060 $self->{line_prev} = $self->{line};
2061 $self->{column_prev} = $self->{column};
2062 $self->{column}++;
2063 $self->{nc}
2064 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2065 } else {
2066 $self->{set_nc}->($self);
2067 }
2068
2069 redo A;
2070 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2071 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2072 $self->{is_xml}) and
2073 $self->{nc} == 0x005B) { # [
2074
2075 $self->{state} = MD_CDATA_STATE;
2076 $self->{s_kwd} = '[';
2077
2078 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2079 $self->{line_prev} = $self->{line};
2080 $self->{column_prev} = $self->{column};
2081 $self->{column}++;
2082 $self->{nc}
2083 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2084 } else {
2085 $self->{set_nc}->($self);
2086 }
2087
2088 redo A;
2089 } else {
2090
2091 }
2092
2093 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2094 line => $self->{line_prev},
2095 column => $self->{column_prev} - 1);
2096 ## Reconsume.
2097 $self->{state} = BOGUS_COMMENT_STATE;
2098 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2099 line => $self->{line_prev},
2100 column => $self->{column_prev} - 1,
2101 };
2102 redo A;
2103 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2104 if ($self->{nc} == 0x002D) { # -
2105
2106 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2107 line => $self->{line_prev},
2108 column => $self->{column_prev} - 2,
2109 };
2110 $self->{state} = COMMENT_START_STATE;
2111
2112 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2113 $self->{line_prev} = $self->{line};
2114 $self->{column_prev} = $self->{column};
2115 $self->{column}++;
2116 $self->{nc}
2117 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2118 } else {
2119 $self->{set_nc}->($self);
2120 }
2121
2122 redo A;
2123 } else {
2124
2125 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2126 line => $self->{line_prev},
2127 column => $self->{column_prev} - 2);
2128 $self->{state} = BOGUS_COMMENT_STATE;
2129 ## Reconsume.
2130 $self->{ct} = {type => COMMENT_TOKEN,
2131 data => '-',
2132 line => $self->{line_prev},
2133 column => $self->{column_prev} - 2,
2134 };
2135 redo A;
2136 }
2137 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2138 ## ASCII case-insensitive.
2139 if ($self->{nc} == [
2140 undef,
2141 0x004F, # O
2142 0x0043, # C
2143 0x0054, # T
2144 0x0059, # Y
2145 0x0050, # P
2146 ]->[length $self->{s_kwd}] or
2147 $self->{nc} == [
2148 undef,
2149 0x006F, # o
2150 0x0063, # c
2151 0x0074, # t
2152 0x0079, # y
2153 0x0070, # p
2154 ]->[length $self->{s_kwd}]) {
2155
2156 ## Stay in the state.
2157 $self->{s_kwd} .= chr $self->{nc};
2158
2159 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2160 $self->{line_prev} = $self->{line};
2161 $self->{column_prev} = $self->{column};
2162 $self->{column}++;
2163 $self->{nc}
2164 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2165 } else {
2166 $self->{set_nc}->($self);
2167 }
2168
2169 redo A;
2170 } elsif ((length $self->{s_kwd}) == 6 and
2171 ($self->{nc} == 0x0045 or # E
2172 $self->{nc} == 0x0065)) { # e
2173
2174 $self->{state} = DOCTYPE_STATE;
2175 $self->{ct} = {type => DOCTYPE_TOKEN,
2176 quirks => 1,
2177 line => $self->{line_prev},
2178 column => $self->{column_prev} - 7,
2179 };
2180
2181 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2182 $self->{line_prev} = $self->{line};
2183 $self->{column_prev} = $self->{column};
2184 $self->{column}++;
2185 $self->{nc}
2186 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2187 } else {
2188 $self->{set_nc}->($self);
2189 }
2190
2191 redo A;
2192 } else {
2193
2194 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2195 line => $self->{line_prev},
2196 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2197 $self->{state} = BOGUS_COMMENT_STATE;
2198 ## Reconsume.
2199 $self->{ct} = {type => COMMENT_TOKEN,
2200 data => $self->{s_kwd},
2201 line => $self->{line_prev},
2202 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2203 };
2204 redo A;
2205 }
2206 } elsif ($self->{state} == MD_CDATA_STATE) {
2207 if ($self->{nc} == {
2208 '[' => 0x0043, # C
2209 '[C' => 0x0044, # D
2210 '[CD' => 0x0041, # A
2211 '[CDA' => 0x0054, # T
2212 '[CDAT' => 0x0041, # A
2213 }->{$self->{s_kwd}}) {
2214
2215 ## Stay in the state.
2216 $self->{s_kwd} .= chr $self->{nc};
2217
2218 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2219 $self->{line_prev} = $self->{line};
2220 $self->{column_prev} = $self->{column};
2221 $self->{column}++;
2222 $self->{nc}
2223 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2224 } else {
2225 $self->{set_nc}->($self);
2226 }
2227
2228 redo A;
2229 } elsif ($self->{s_kwd} eq '[CDATA' and
2230 $self->{nc} == 0x005B) { # [
2231
2232
2233 if ($self->{is_xml} and
2234 not $self->{tainted} and
2235 @{$self->{open_elements} or []} == 0) {
2236 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2237 line => $self->{line_prev},
2238 column => $self->{column_prev} - 7);
2239 $self->{tainted} = 1;
2240 }
2241
2242 $self->{ct} = {type => CHARACTER_TOKEN,
2243 data => '',
2244 line => $self->{line_prev},
2245 column => $self->{column_prev} - 7};
2246 $self->{state} = CDATA_SECTION_STATE;
2247
2248 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2249 $self->{line_prev} = $self->{line};
2250 $self->{column_prev} = $self->{column};
2251 $self->{column}++;
2252 $self->{nc}
2253 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2254 } else {
2255 $self->{set_nc}->($self);
2256 }
2257
2258 redo A;
2259 } else {
2260
2261 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2262 line => $self->{line_prev},
2263 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2264 $self->{state} = BOGUS_COMMENT_STATE;
2265 ## Reconsume.
2266 $self->{ct} = {type => COMMENT_TOKEN,
2267 data => $self->{s_kwd},
2268 line => $self->{line_prev},
2269 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2270 };
2271 redo A;
2272 }
2273 } elsif ($self->{state} == COMMENT_START_STATE) {
2274 if ($self->{nc} == 0x002D) { # -
2275
2276 $self->{state} = COMMENT_START_DASH_STATE;
2277
2278 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2279 $self->{line_prev} = $self->{line};
2280 $self->{column_prev} = $self->{column};
2281 $self->{column}++;
2282 $self->{nc}
2283 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2284 } else {
2285 $self->{set_nc}->($self);
2286 }
2287
2288 redo A;
2289 } elsif ($self->{nc} == 0x003E) { # >
2290
2291 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2292 $self->{state} = DATA_STATE;
2293 $self->{s_kwd} = '';
2294
2295 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2296 $self->{line_prev} = $self->{line};
2297 $self->{column_prev} = $self->{column};
2298 $self->{column}++;
2299 $self->{nc}
2300 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2301 } else {
2302 $self->{set_nc}->($self);
2303 }
2304
2305
2306 return ($self->{ct}); # comment
2307
2308 redo A;
2309 } elsif ($self->{nc} == -1) {
2310
2311 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2312 $self->{state} = DATA_STATE;
2313 $self->{s_kwd} = '';
2314 ## reconsume
2315
2316 return ($self->{ct}); # comment
2317
2318 redo A;
2319 } else {
2320
2321 $self->{ct}->{data} # comment
2322 .= chr ($self->{nc});
2323 $self->{state} = COMMENT_STATE;
2324
2325 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2326 $self->{line_prev} = $self->{line};
2327 $self->{column_prev} = $self->{column};
2328 $self->{column}++;
2329 $self->{nc}
2330 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2331 } else {
2332 $self->{set_nc}->($self);
2333 }
2334
2335 redo A;
2336 }
2337 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2338 if ($self->{nc} == 0x002D) { # -
2339
2340 $self->{state} = COMMENT_END_STATE;
2341
2342 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2343 $self->{line_prev} = $self->{line};
2344 $self->{column_prev} = $self->{column};
2345 $self->{column}++;
2346 $self->{nc}
2347 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2348 } else {
2349 $self->{set_nc}->($self);
2350 }
2351
2352 redo A;
2353 } elsif ($self->{nc} == 0x003E) { # >
2354
2355 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2356 $self->{state} = DATA_STATE;
2357 $self->{s_kwd} = '';
2358
2359 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2360 $self->{line_prev} = $self->{line};
2361 $self->{column_prev} = $self->{column};
2362 $self->{column}++;
2363 $self->{nc}
2364 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2365 } else {
2366 $self->{set_nc}->($self);
2367 }
2368
2369
2370 return ($self->{ct}); # comment
2371
2372 redo A;
2373 } elsif ($self->{nc} == -1) {
2374
2375 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2376 $self->{state} = DATA_STATE;
2377 $self->{s_kwd} = '';
2378 ## reconsume
2379
2380 return ($self->{ct}); # comment
2381
2382 redo A;
2383 } else {
2384
2385 $self->{ct}->{data} # comment
2386 .= '-' . chr ($self->{nc});
2387 $self->{state} = COMMENT_STATE;
2388
2389 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390 $self->{line_prev} = $self->{line};
2391 $self->{column_prev} = $self->{column};
2392 $self->{column}++;
2393 $self->{nc}
2394 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395 } else {
2396 $self->{set_nc}->($self);
2397 }
2398
2399 redo A;
2400 }
2401 } elsif ($self->{state} == COMMENT_STATE) {
2402 if ($self->{nc} == 0x002D) { # -
2403
2404 $self->{state} = COMMENT_END_DASH_STATE;
2405
2406 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2407 $self->{line_prev} = $self->{line};
2408 $self->{column_prev} = $self->{column};
2409 $self->{column}++;
2410 $self->{nc}
2411 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2412 } else {
2413 $self->{set_nc}->($self);
2414 }
2415
2416 redo A;
2417 } elsif ($self->{nc} == -1) {
2418
2419 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2420 $self->{state} = DATA_STATE;
2421 $self->{s_kwd} = '';
2422 ## reconsume
2423
2424 return ($self->{ct}); # comment
2425
2426 redo A;
2427 } else {
2428
2429 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2430 $self->{read_until}->($self->{ct}->{data},
2431 q[-],
2432 length $self->{ct}->{data});
2433
2434 ## Stay in the state
2435
2436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2437 $self->{line_prev} = $self->{line};
2438 $self->{column_prev} = $self->{column};
2439 $self->{column}++;
2440 $self->{nc}
2441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2442 } else {
2443 $self->{set_nc}->($self);
2444 }
2445
2446 redo A;
2447 }
2448 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2449 if ($self->{nc} == 0x002D) { # -
2450
2451 $self->{state} = COMMENT_END_STATE;
2452
2453 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2454 $self->{line_prev} = $self->{line};
2455 $self->{column_prev} = $self->{column};
2456 $self->{column}++;
2457 $self->{nc}
2458 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2459 } else {
2460 $self->{set_nc}->($self);
2461 }
2462
2463 redo A;
2464 } elsif ($self->{nc} == -1) {
2465
2466 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2467 $self->{s_kwd} = '';
2468 $self->{state} = DATA_STATE;
2469 $self->{s_kwd} = '';
2470 ## reconsume
2471
2472 return ($self->{ct}); # comment
2473
2474 redo A;
2475 } else {
2476
2477 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2478 $self->{state} = COMMENT_STATE;
2479
2480 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2481 $self->{line_prev} = $self->{line};
2482 $self->{column_prev} = $self->{column};
2483 $self->{column}++;
2484 $self->{nc}
2485 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2486 } else {
2487 $self->{set_nc}->($self);
2488 }
2489
2490 redo A;
2491 }
2492 } elsif ($self->{state} == COMMENT_END_STATE) {
2493 if ($self->{nc} == 0x003E) { # >
2494
2495 $self->{state} = DATA_STATE;
2496 $self->{s_kwd} = '';
2497
2498 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2499 $self->{line_prev} = $self->{line};
2500 $self->{column_prev} = $self->{column};
2501 $self->{column}++;
2502 $self->{nc}
2503 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2504 } else {
2505 $self->{set_nc}->($self);
2506 }
2507
2508
2509 return ($self->{ct}); # comment
2510
2511 redo A;
2512 } elsif ($self->{nc} == 0x002D) { # -
2513
2514 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2515 line => $self->{line_prev},
2516 column => $self->{column_prev});
2517 $self->{ct}->{data} .= '-'; # comment
2518 ## Stay in the state
2519
2520 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2521 $self->{line_prev} = $self->{line};
2522 $self->{column_prev} = $self->{column};
2523 $self->{column}++;
2524 $self->{nc}
2525 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2526 } else {
2527 $self->{set_nc}->($self);
2528 }
2529
2530 redo A;
2531 } elsif ($self->{nc} == -1) {
2532
2533 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2534 $self->{state} = DATA_STATE;
2535 $self->{s_kwd} = '';
2536 ## reconsume
2537
2538 return ($self->{ct}); # comment
2539
2540 redo A;
2541 } else {
2542
2543 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2544 line => $self->{line_prev},
2545 column => $self->{column_prev});
2546 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2547 $self->{state} = COMMENT_STATE;
2548
2549 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2550 $self->{line_prev} = $self->{line};
2551 $self->{column_prev} = $self->{column};
2552 $self->{column}++;
2553 $self->{nc}
2554 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2555 } else {
2556 $self->{set_nc}->($self);
2557 }
2558
2559 redo A;
2560 }
2561 } elsif ($self->{state} == DOCTYPE_STATE) {
2562 if ($is_space->{$self->{nc}}) {
2563
2564 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2565
2566 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2567 $self->{line_prev} = $self->{line};
2568 $self->{column_prev} = $self->{column};
2569 $self->{column}++;
2570 $self->{nc}
2571 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2572 } else {
2573 $self->{set_nc}->($self);
2574 }
2575
2576 redo A;
2577 } else {
2578
2579 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2580 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2581 ## reconsume
2582 redo A;
2583 }
2584 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2585 if ($is_space->{$self->{nc}}) {
2586
2587 ## Stay in the state
2588
2589 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2590 $self->{line_prev} = $self->{line};
2591 $self->{column_prev} = $self->{column};
2592 $self->{column}++;
2593 $self->{nc}
2594 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2595 } else {
2596 $self->{set_nc}->($self);
2597 }
2598
2599 redo A;
2600 } elsif ($self->{nc} == 0x003E) { # >
2601
2602 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2603 $self->{state} = DATA_STATE;
2604 $self->{s_kwd} = '';
2605
2606 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2607 $self->{line_prev} = $self->{line};
2608 $self->{column_prev} = $self->{column};
2609 $self->{column}++;
2610 $self->{nc}
2611 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2612 } else {
2613 $self->{set_nc}->($self);
2614 }
2615
2616
2617 return ($self->{ct}); # DOCTYPE (quirks)
2618
2619 redo A;
2620 } elsif ($self->{nc} == -1) {
2621
2622 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2623 $self->{state} = DATA_STATE;
2624 $self->{s_kwd} = '';
2625 ## reconsume
2626
2627 return ($self->{ct}); # DOCTYPE (quirks)
2628
2629 redo A;
2630 } else {
2631
2632 $self->{ct}->{name} = chr $self->{nc};
2633 delete $self->{ct}->{quirks};
2634 $self->{state} = DOCTYPE_NAME_STATE;
2635
2636 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2637 $self->{line_prev} = $self->{line};
2638 $self->{column_prev} = $self->{column};
2639 $self->{column}++;
2640 $self->{nc}
2641 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2642 } else {
2643 $self->{set_nc}->($self);
2644 }
2645
2646 redo A;
2647 }
2648 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2649 ## ISSUE: Redundant "First," in the spec.
2650 if ($is_space->{$self->{nc}}) {
2651
2652 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2653
2654 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2655 $self->{line_prev} = $self->{line};
2656 $self->{column_prev} = $self->{column};
2657 $self->{column}++;
2658 $self->{nc}
2659 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2660 } else {
2661 $self->{set_nc}->($self);
2662 }
2663
2664 redo A;
2665 } elsif ($self->{nc} == 0x003E) { # >
2666
2667 $self->{state} = DATA_STATE;
2668 $self->{s_kwd} = '';
2669
2670 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2671 $self->{line_prev} = $self->{line};
2672 $self->{column_prev} = $self->{column};
2673 $self->{column}++;
2674 $self->{nc}
2675 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2676 } else {
2677 $self->{set_nc}->($self);
2678 }
2679
2680
2681 return ($self->{ct}); # DOCTYPE
2682
2683 redo A;
2684 } elsif ($self->{nc} == -1) {
2685
2686 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2687 $self->{state} = DATA_STATE;
2688 $self->{s_kwd} = '';
2689 ## reconsume
2690
2691 $self->{ct}->{quirks} = 1;
2692 return ($self->{ct}); # DOCTYPE
2693
2694 redo A;
2695 } else {
2696
2697 $self->{ct}->{name}
2698 .= chr ($self->{nc}); # DOCTYPE
2699 ## Stay in the state
2700
2701 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2702 $self->{line_prev} = $self->{line};
2703 $self->{column_prev} = $self->{column};
2704 $self->{column}++;
2705 $self->{nc}
2706 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2707 } else {
2708 $self->{set_nc}->($self);
2709 }
2710
2711 redo A;
2712 }
2713 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2714 if ($is_space->{$self->{nc}}) {
2715
2716 ## Stay in the state
2717
2718 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2719 $self->{line_prev} = $self->{line};
2720 $self->{column_prev} = $self->{column};
2721 $self->{column}++;
2722 $self->{nc}
2723 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2724 } else {
2725 $self->{set_nc}->($self);
2726 }
2727
2728 redo A;
2729 } elsif ($self->{nc} == 0x003E) { # >
2730
2731 $self->{state} = DATA_STATE;
2732 $self->{s_kwd} = '';
2733
2734 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2735 $self->{line_prev} = $self->{line};
2736 $self->{column_prev} = $self->{column};
2737 $self->{column}++;
2738 $self->{nc}
2739 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2740 } else {
2741 $self->{set_nc}->($self);
2742 }
2743
2744
2745 return ($self->{ct}); # DOCTYPE
2746
2747 redo A;
2748 } elsif ($self->{nc} == -1) {
2749
2750 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2751 $self->{state} = DATA_STATE;
2752 $self->{s_kwd} = '';
2753 ## reconsume
2754
2755 $self->{ct}->{quirks} = 1;
2756 return ($self->{ct}); # DOCTYPE
2757
2758 redo A;
2759 } elsif ($self->{nc} == 0x0050 or # P
2760 $self->{nc} == 0x0070) { # p
2761 $self->{state} = PUBLIC_STATE;
2762 $self->{s_kwd} = chr $self->{nc};
2763
2764 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2765 $self->{line_prev} = $self->{line};
2766 $self->{column_prev} = $self->{column};
2767 $self->{column}++;
2768 $self->{nc}
2769 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2770 } else {
2771 $self->{set_nc}->($self);
2772 }
2773
2774 redo A;
2775 } elsif ($self->{nc} == 0x0053 or # S
2776 $self->{nc} == 0x0073) { # s
2777 $self->{state} = SYSTEM_STATE;
2778 $self->{s_kwd} = chr $self->{nc};
2779
2780 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2781 $self->{line_prev} = $self->{line};
2782 $self->{column_prev} = $self->{column};
2783 $self->{column}++;
2784 $self->{nc}
2785 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2786 } else {
2787 $self->{set_nc}->($self);
2788 }
2789
2790 redo A;
2791 } else {
2792
2793 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2794 $self->{ct}->{quirks} = 1;
2795
2796 $self->{state} = BOGUS_DOCTYPE_STATE;
2797
2798 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2799 $self->{line_prev} = $self->{line};
2800 $self->{column_prev} = $self->{column};
2801 $self->{column}++;
2802 $self->{nc}
2803 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2804 } else {
2805 $self->{set_nc}->($self);
2806 }
2807
2808 redo A;
2809 }
2810 } elsif ($self->{state} == PUBLIC_STATE) {
2811 ## ASCII case-insensitive
2812 if ($self->{nc} == [
2813 undef,
2814 0x0055, # U
2815 0x0042, # B
2816 0x004C, # L
2817 0x0049, # I
2818 ]->[length $self->{s_kwd}] or
2819 $self->{nc} == [
2820 undef,
2821 0x0075, # u
2822 0x0062, # b
2823 0x006C, # l
2824 0x0069, # i
2825 ]->[length $self->{s_kwd}]) {
2826
2827 ## Stay in the state.
2828 $self->{s_kwd} .= chr $self->{nc};
2829
2830 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2831 $self->{line_prev} = $self->{line};
2832 $self->{column_prev} = $self->{column};
2833 $self->{column}++;
2834 $self->{nc}
2835 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2836 } else {
2837 $self->{set_nc}->($self);
2838 }
2839
2840 redo A;
2841 } elsif ((length $self->{s_kwd}) == 5 and
2842 ($self->{nc} == 0x0043 or # C
2843 $self->{nc} == 0x0063)) { # c
2844
2845 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2846
2847 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2848 $self->{line_prev} = $self->{line};
2849 $self->{column_prev} = $self->{column};
2850 $self->{column}++;
2851 $self->{nc}
2852 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2853 } else {
2854 $self->{set_nc}->($self);
2855 }
2856
2857 redo A;
2858 } else {
2859
2860 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2861 line => $self->{line_prev},
2862 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2863 $self->{ct}->{quirks} = 1;
2864
2865 $self->{state} = BOGUS_DOCTYPE_STATE;
2866 ## Reconsume.
2867 redo A;
2868 }
2869 } elsif ($self->{state} == SYSTEM_STATE) {
2870 ## ASCII case-insensitive
2871 if ($self->{nc} == [
2872 undef,
2873 0x0059, # Y
2874 0x0053, # S
2875 0x0054, # T
2876 0x0045, # E
2877 ]->[length $self->{s_kwd}] or
2878 $self->{nc} == [
2879 undef,
2880 0x0079, # y
2881 0x0073, # s
2882 0x0074, # t
2883 0x0065, # e
2884 ]->[length $self->{s_kwd}]) {
2885
2886 ## Stay in the state.
2887 $self->{s_kwd} .= chr $self->{nc};
2888
2889 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2890 $self->{line_prev} = $self->{line};
2891 $self->{column_prev} = $self->{column};
2892 $self->{column}++;
2893 $self->{nc}
2894 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2895 } else {
2896 $self->{set_nc}->($self);
2897 }
2898
2899 redo A;
2900 } elsif ((length $self->{s_kwd}) == 5 and
2901 ($self->{nc} == 0x004D or # M
2902 $self->{nc} == 0x006D)) { # m
2903
2904 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2905
2906 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2907 $self->{line_prev} = $self->{line};
2908 $self->{column_prev} = $self->{column};
2909 $self->{column}++;
2910 $self->{nc}
2911 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2912 } else {
2913 $self->{set_nc}->($self);
2914 }
2915
2916 redo A;
2917 } else {
2918
2919 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2920 line => $self->{line_prev},
2921 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2922 $self->{ct}->{quirks} = 1;
2923
2924 $self->{state} = BOGUS_DOCTYPE_STATE;
2925 ## Reconsume.
2926 redo A;
2927 }
2928 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2929 if ($is_space->{$self->{nc}}) {
2930
2931 ## Stay in the state
2932
2933 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2934 $self->{line_prev} = $self->{line};
2935 $self->{column_prev} = $self->{column};
2936 $self->{column}++;
2937 $self->{nc}
2938 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2939 } else {
2940 $self->{set_nc}->($self);
2941 }
2942
2943 redo A;
2944 } elsif ($self->{nc} eq 0x0022) { # "
2945
2946 $self->{ct}->{pubid} = ''; # DOCTYPE
2947 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2948
2949 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2950 $self->{line_prev} = $self->{line};
2951 $self->{column_prev} = $self->{column};
2952 $self->{column}++;
2953 $self->{nc}
2954 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2955 } else {
2956 $self->{set_nc}->($self);
2957 }
2958
2959 redo A;
2960 } elsif ($self->{nc} eq 0x0027) { # '
2961
2962 $self->{ct}->{pubid} = ''; # DOCTYPE
2963 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2964
2965 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2966 $self->{line_prev} = $self->{line};
2967 $self->{column_prev} = $self->{column};
2968 $self->{column}++;
2969 $self->{nc}
2970 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2971 } else {
2972 $self->{set_nc}->($self);
2973 }
2974
2975 redo A;
2976 } elsif ($self->{nc} eq 0x003E) { # >
2977
2978 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2979
2980 $self->{state} = DATA_STATE;
2981 $self->{s_kwd} = '';
2982
2983 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2984 $self->{line_prev} = $self->{line};
2985 $self->{column_prev} = $self->{column};
2986 $self->{column}++;
2987 $self->{nc}
2988 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2989 } else {
2990 $self->{set_nc}->($self);
2991 }
2992
2993
2994 $self->{ct}->{quirks} = 1;
2995 return ($self->{ct}); # DOCTYPE
2996
2997 redo A;
2998 } elsif ($self->{nc} == -1) {
2999
3000 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3001
3002 $self->{state} = DATA_STATE;
3003 $self->{s_kwd} = '';
3004 ## reconsume
3005
3006 $self->{ct}->{quirks} = 1;
3007 return ($self->{ct}); # DOCTYPE
3008
3009 redo A;
3010 } else {
3011
3012 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3013 $self->{ct}->{quirks} = 1;
3014
3015 $self->{state} = BOGUS_DOCTYPE_STATE;
3016
3017 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3018 $self->{line_prev} = $self->{line};
3019 $self->{column_prev} = $self->{column};
3020 $self->{column}++;
3021 $self->{nc}
3022 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3023 } else {
3024 $self->{set_nc}->($self);
3025 }
3026
3027 redo A;
3028 }
3029 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3030 if ($self->{nc} == 0x0022) { # "
3031
3032 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3033
3034 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3035 $self->{line_prev} = $self->{line};
3036 $self->{column_prev} = $self->{column};
3037 $self->{column}++;
3038 $self->{nc}
3039 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3040 } else {
3041 $self->{set_nc}->($self);
3042 }
3043
3044 redo A;
3045 } elsif ($self->{nc} == 0x003E) { # >
3046
3047 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3048
3049 $self->{state} = DATA_STATE;
3050 $self->{s_kwd} = '';
3051
3052 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3053 $self->{line_prev} = $self->{line};
3054 $self->{column_prev} = $self->{column};
3055 $self->{column}++;
3056 $self->{nc}
3057 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3058 } else {
3059 $self->{set_nc}->($self);
3060 }
3061
3062
3063 $self->{ct}->{quirks} = 1;
3064 return ($self->{ct}); # DOCTYPE
3065
3066 redo A;
3067 } elsif ($self->{nc} == -1) {
3068
3069 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3070
3071 $self->{state} = DATA_STATE;
3072 $self->{s_kwd} = '';
3073 ## reconsume
3074
3075 $self->{ct}->{quirks} = 1;
3076 return ($self->{ct}); # DOCTYPE
3077
3078 redo A;
3079 } else {
3080
3081 $self->{ct}->{pubid} # DOCTYPE
3082 .= chr $self->{nc};
3083 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3084 length $self->{ct}->{pubid});
3085
3086 ## Stay in the state
3087
3088 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3089 $self->{line_prev} = $self->{line};
3090 $self->{column_prev} = $self->{column};
3091 $self->{column}++;
3092 $self->{nc}
3093 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3094 } else {
3095 $self->{set_nc}->($self);
3096 }
3097
3098 redo A;
3099 }
3100 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3101 if ($self->{nc} == 0x0027) { # '
3102
3103 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3104
3105 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106 $self->{line_prev} = $self->{line};
3107 $self->{column_prev} = $self->{column};
3108 $self->{column}++;
3109 $self->{nc}
3110 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111 } else {
3112 $self->{set_nc}->($self);
3113 }
3114
3115 redo A;
3116 } elsif ($self->{nc} == 0x003E) { # >
3117
3118 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3119
3120 $self->{state} = DATA_STATE;
3121 $self->{s_kwd} = '';
3122
3123 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3124 $self->{line_prev} = $self->{line};
3125 $self->{column_prev} = $self->{column};
3126 $self->{column}++;
3127 $self->{nc}
3128 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3129 } else {
3130 $self->{set_nc}->($self);
3131 }
3132
3133
3134 $self->{ct}->{quirks} = 1;
3135 return ($self->{ct}); # DOCTYPE
3136
3137 redo A;
3138 } elsif ($self->{nc} == -1) {
3139
3140 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3141
3142 $self->{state} = DATA_STATE;
3143 $self->{s_kwd} = '';
3144 ## reconsume
3145
3146 $self->{ct}->{quirks} = 1;
3147 return ($self->{ct}); # DOCTYPE
3148
3149 redo A;
3150 } else {
3151
3152 $self->{ct}->{pubid} # DOCTYPE
3153 .= chr $self->{nc};
3154 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3155 length $self->{ct}->{pubid});
3156
3157 ## Stay in the state
3158
3159 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3160 $self->{line_prev} = $self->{line};
3161 $self->{column_prev} = $self->{column};
3162 $self->{column}++;
3163 $self->{nc}
3164 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3165 } else {
3166 $self->{set_nc}->($self);
3167 }
3168
3169 redo A;
3170 }
3171 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3172 if ($is_space->{$self->{nc}}) {
3173
3174 ## Stay in the state
3175
3176 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3177 $self->{line_prev} = $self->{line};
3178 $self->{column_prev} = $self->{column};
3179 $self->{column}++;
3180 $self->{nc}
3181 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3182 } else {
3183 $self->{set_nc}->($self);
3184 }
3185
3186 redo A;
3187 } elsif ($self->{nc} == 0x0022) { # "
3188
3189 $self->{ct}->{sysid} = ''; # DOCTYPE
3190 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3191
3192 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3193 $self->{line_prev} = $self->{line};
3194 $self->{column_prev} = $self->{column};
3195 $self->{column}++;
3196 $self->{nc}
3197 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3198 } else {
3199 $self->{set_nc}->($self);
3200 }
3201
3202 redo A;
3203 } elsif ($self->{nc} == 0x0027) { # '
3204
3205 $self->{ct}->{sysid} = ''; # DOCTYPE
3206 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3207
3208 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3209 $self->{line_prev} = $self->{line};
3210 $self->{column_prev} = $self->{column};
3211 $self->{column}++;
3212 $self->{nc}
3213 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3214 } else {
3215 $self->{set_nc}->($self);
3216 }
3217
3218 redo A;
3219 } elsif ($self->{nc} == 0x003E) { # >
3220
3221 $self->{state} = DATA_STATE;
3222 $self->{s_kwd} = '';
3223
3224 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3225 $self->{line_prev} = $self->{line};
3226 $self->{column_prev} = $self->{column};
3227 $self->{column}++;
3228 $self->{nc}
3229 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3230 } else {
3231 $self->{set_nc}->($self);
3232 }
3233
3234
3235 return ($self->{ct}); # DOCTYPE
3236
3237 redo A;
3238 } elsif ($self->{nc} == -1) {
3239
3240 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3241
3242 $self->{state} = DATA_STATE;
3243 $self->{s_kwd} = '';
3244 ## reconsume
3245
3246 $self->{ct}->{quirks} = 1;
3247 return ($self->{ct}); # DOCTYPE
3248
3249 redo A;
3250 } else {
3251
3252 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3253 $self->{ct}->{quirks} = 1;
3254
3255 $self->{state} = BOGUS_DOCTYPE_STATE;
3256
3257 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3258 $self->{line_prev} = $self->{line};
3259 $self->{column_prev} = $self->{column};
3260 $self->{column}++;
3261 $self->{nc}
3262 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3263 } else {
3264 $self->{set_nc}->($self);
3265 }
3266
3267 redo A;
3268 }
3269 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3270 if ($is_space->{$self->{nc}}) {
3271
3272 ## Stay in the state
3273
3274 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3275 $self->{line_prev} = $self->{line};
3276 $self->{column_prev} = $self->{column};
3277 $self->{column}++;
3278 $self->{nc}
3279 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3280 } else {
3281 $self->{set_nc}->($self);
3282 }
3283
3284 redo A;
3285 } elsif ($self->{nc} == 0x0022) { # "
3286
3287 $self->{ct}->{sysid} = ''; # DOCTYPE
3288 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3289
3290 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3291 $self->{line_prev} = $self->{line};
3292 $self->{column_prev} = $self->{column};
3293 $self->{column}++;
3294 $self->{nc}
3295 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3296 } else {
3297 $self->{set_nc}->($self);
3298 }
3299
3300 redo A;
3301 } elsif ($self->{nc} == 0x0027) { # '
3302
3303 $self->{ct}->{sysid} = ''; # DOCTYPE
3304 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3305
3306 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3307 $self->{line_prev} = $self->{line};
3308 $self->{column_prev} = $self->{column};
3309 $self->{column}++;
3310 $self->{nc}
3311 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3312 } else {
3313 $self->{set_nc}->($self);
3314 }
3315
3316 redo A;
3317 } elsif ($self->{nc} == 0x003E) { # >
3318
3319 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3320 $self->{state} = DATA_STATE;
3321 $self->{s_kwd} = '';
3322
3323 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3324 $self->{line_prev} = $self->{line};
3325 $self->{column_prev} = $self->{column};
3326 $self->{column}++;
3327 $self->{nc}
3328 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3329 } else {
3330 $self->{set_nc}->($self);
3331 }
3332
3333
3334 $self->{ct}->{quirks} = 1;
3335 return ($self->{ct}); # DOCTYPE
3336
3337 redo A;
3338 } elsif ($self->{nc} == -1) {
3339
3340 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3341
3342 $self->{state} = DATA_STATE;
3343 $self->{s_kwd} = '';
3344 ## reconsume
3345
3346 $self->{ct}->{quirks} = 1;
3347 return ($self->{ct}); # DOCTYPE
3348
3349 redo A;
3350 } else {
3351
3352 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3353 $self->{ct}->{quirks} = 1;
3354
3355 $self->{state} = BOGUS_DOCTYPE_STATE;
3356
3357 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3358 $self->{line_prev} = $self->{line};
3359 $self->{column_prev} = $self->{column};
3360 $self->{column}++;
3361 $self->{nc}
3362 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3363 } else {
3364 $self->{set_nc}->($self);
3365 }
3366
3367 redo A;
3368 }
3369 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3370 if ($self->{nc} == 0x0022) { # "
3371
3372 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3373
3374 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3375 $self->{line_prev} = $self->{line};
3376 $self->{column_prev} = $self->{column};
3377 $self->{column}++;
3378 $self->{nc}
3379 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3380 } else {
3381 $self->{set_nc}->($self);
3382 }
3383
3384 redo A;
3385 } elsif ($self->{nc} == 0x003E) { # >
3386
3387 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3388
3389 $self->{state} = DATA_STATE;
3390 $self->{s_kwd} = '';
3391
3392 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3393 $self->{line_prev} = $self->{line};
3394 $self->{column_prev} = $self->{column};
3395 $self->{column}++;
3396 $self->{nc}
3397 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3398 } else {
3399 $self->{set_nc}->($self);
3400 }
3401
3402
3403 $self->{ct}->{quirks} = 1;
3404 return ($self->{ct}); # DOCTYPE
3405
3406 redo A;
3407 } elsif ($self->{nc} == -1) {
3408
3409 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3410
3411 $self->{state} = DATA_STATE;
3412 $self->{s_kwd} = '';
3413 ## reconsume
3414
3415 $self->{ct}->{quirks} = 1;
3416 return ($self->{ct}); # DOCTYPE
3417
3418 redo A;
3419 } else {
3420
3421 $self->{ct}->{sysid} # DOCTYPE
3422 .= chr $self->{nc};
3423 $self->{read_until}->($self->{ct}->{sysid}, q[">],
3424 length $self->{ct}->{sysid});
3425
3426 ## Stay in the state
3427
3428 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3429 $self->{line_prev} = $self->{line};
3430 $self->{column_prev} = $self->{column};
3431 $self->{column}++;
3432 $self->{nc}
3433 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3434 } else {
3435 $self->{set_nc}->($self);
3436 }
3437
3438 redo A;
3439 }
3440 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3441 if ($self->{nc} == 0x0027) { # '
3442
3443 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3444
3445 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3446 $self->{line_prev} = $self->{line};
3447 $self->{column_prev} = $self->{column};
3448 $self->{column}++;
3449 $self->{nc}
3450 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3451 } else {
3452 $self->{set_nc}->($self);
3453 }
3454
3455 redo A;
3456 } elsif ($self->{nc} == 0x003E) { # >
3457
3458 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3459
3460 $self->{state} = DATA_STATE;
3461 $self->{s_kwd} = '';
3462
3463 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3464 $self->{line_prev} = $self->{line};
3465 $self->{column_prev} = $self->{column};
3466 $self->{column}++;
3467 $self->{nc}
3468 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3469 } else {
3470 $self->{set_nc}->($self);
3471 }
3472
3473
3474 $self->{ct}->{quirks} = 1;
3475 return ($self->{ct}); # DOCTYPE
3476
3477 redo A;
3478 } elsif ($self->{nc} == -1) {
3479
3480 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3481
3482 $self->{state} = DATA_STATE;
3483 $self->{s_kwd} = '';
3484 ## reconsume
3485
3486 $self->{ct}->{quirks} = 1;
3487 return ($self->{ct}); # DOCTYPE
3488
3489 redo A;
3490 } else {
3491
3492 $self->{ct}->{sysid} # DOCTYPE
3493 .= chr $self->{nc};
3494 $self->{read_until}->($self->{ct}->{sysid}, q['>],
3495 length $self->{ct}->{sysid});
3496
3497 ## Stay in the state
3498
3499 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3500 $self->{line_prev} = $self->{line};
3501 $self->{column_prev} = $self->{column};
3502 $self->{column}++;
3503 $self->{nc}
3504 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3505 } else {
3506 $self->{set_nc}->($self);
3507 }
3508
3509 redo A;
3510 }
3511 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3512 if ($is_space->{$self->{nc}}) {
3513
3514 ## Stay in the state
3515
3516 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3517 $self->{line_prev} = $self->{line};
3518 $self->{column_prev} = $self->{column};
3519 $self->{column}++;
3520 $self->{nc}
3521 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3522 } else {
3523 $self->{set_nc}->($self);
3524 }
3525
3526 redo A;
3527 } elsif ($self->{nc} == 0x003E) { # >
3528
3529 $self->{state} = DATA_STATE;
3530 $self->{s_kwd} = '';
3531
3532 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3533 $self->{line_prev} = $self->{line};
3534 $self->{column_prev} = $self->{column};
3535 $self->{column}++;
3536 $self->{nc}
3537 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3538 } else {
3539 $self->{set_nc}->($self);
3540 }
3541
3542
3543 return ($self->{ct}); # DOCTYPE
3544
3545 redo A;
3546 } elsif ($self->{nc} == -1) {
3547
3548 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3549 $self->{state} = DATA_STATE;
3550 $self->{s_kwd} = '';
3551 ## reconsume
3552
3553 $self->{ct}->{quirks} = 1;
3554 return ($self->{ct}); # DOCTYPE
3555
3556 redo A;
3557 } else {
3558
3559 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3560 #$self->{ct}->{quirks} = 1;
3561
3562 $self->{state} = BOGUS_DOCTYPE_STATE;
3563
3564 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3565 $self->{line_prev} = $self->{line};
3566 $self->{column_prev} = $self->{column};
3567 $self->{column}++;
3568 $self->{nc}
3569 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3570 } else {
3571 $self->{set_nc}->($self);
3572 }
3573
3574 redo A;
3575 }
3576 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3577 if ($self->{nc} == 0x003E) { # >
3578
3579 $self->{state} = DATA_STATE;
3580 $self->{s_kwd} = '';
3581
3582 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3583 $self->{line_prev} = $self->{line};
3584 $self->{column_prev} = $self->{column};
3585 $self->{column}++;
3586 $self->{nc}
3587 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3588 } else {
3589 $self->{set_nc}->($self);
3590 }
3591
3592
3593 return ($self->{ct}); # DOCTYPE
3594
3595 redo A;
3596 } elsif ($self->{nc} == -1) {
3597
3598 $self->{state} = DATA_STATE;
3599 $self->{s_kwd} = '';
3600 ## reconsume
3601
3602 return ($self->{ct}); # DOCTYPE
3603
3604 redo A;
3605 } else {
3606
3607 my $s = '';
3608 $self->{read_until}->($s, q[>], 0);
3609
3610 ## Stay in the state
3611
3612 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3613 $self->{line_prev} = $self->{line};
3614 $self->{column_prev} = $self->{column};
3615 $self->{column}++;
3616 $self->{nc}
3617 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3618 } else {
3619 $self->{set_nc}->($self);
3620 }
3621
3622 redo A;
3623 }
3624 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3625 ## NOTE: "CDATA section state" in the state is jointly implemented
3626 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3627 ## and |CDATA_SECTION_MSE2_STATE|.
3628
3629 if ($self->{nc} == 0x005D) { # ]
3630
3631 $self->{state} = CDATA_SECTION_MSE1_STATE;
3632
3633 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3634 $self->{line_prev} = $self->{line};
3635 $self->{column_prev} = $self->{column};
3636 $self->{column}++;
3637 $self->{nc}
3638 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3639 } else {
3640 $self->{set_nc}->($self);
3641 }
3642
3643 redo A;
3644 } elsif ($self->{nc} == -1) {
3645 if ($self->{is_xml}) {
3646 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3647 }
3648
3649 $self->{state} = DATA_STATE;
3650 $self->{s_kwd} = '';
3651
3652 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3653 $self->{line_prev} = $self->{line};
3654 $self->{column_prev} = $self->{column};
3655 $self->{column}++;
3656 $self->{nc}
3657 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3658 } else {
3659 $self->{set_nc}->($self);
3660 }
3661
3662 if (length $self->{ct}->{data}) { # character
3663
3664 return ($self->{ct}); # character
3665 } else {
3666
3667 ## No token to emit. $self->{ct} is discarded.
3668 }
3669 redo A;
3670 } else {
3671
3672 $self->{ct}->{data} .= chr $self->{nc};
3673 $self->{read_until}->($self->{ct}->{data},
3674 q<]>,
3675 length $self->{ct}->{data});
3676
3677 ## Stay in the state.
3678
3679 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3680 $self->{line_prev} = $self->{line};
3681 $self->{column_prev} = $self->{column};
3682 $self->{column}++;
3683 $self->{nc}
3684 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3685 } else {
3686 $self->{set_nc}->($self);
3687 }
3688
3689 redo A;
3690 }
3691
3692 ## ISSUE: "text tokens" in spec.
3693 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3694 if ($self->{nc} == 0x005D) { # ]
3695
3696 $self->{state} = CDATA_SECTION_MSE2_STATE;
3697
3698 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3699 $self->{line_prev} = $self->{line};
3700 $self->{column_prev} = $self->{column};
3701 $self->{column}++;
3702 $self->{nc}
3703 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3704 } else {
3705 $self->{set_nc}->($self);
3706 }
3707
3708 redo A;
3709 } else {
3710
3711 $self->{ct}->{data} .= ']';
3712 $self->{state} = CDATA_SECTION_STATE;
3713 ## Reconsume.
3714 redo A;
3715 }
3716 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3717 if ($self->{nc} == 0x003E) { # >
3718 $self->{state} = DATA_STATE;
3719 $self->{s_kwd} = '';
3720
3721 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3722 $self->{line_prev} = $self->{line};
3723 $self->{column_prev} = $self->{column};
3724 $self->{column}++;
3725 $self->{nc}
3726 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3727 } else {
3728 $self->{set_nc}->($self);
3729 }
3730
3731 if (length $self->{ct}->{data}) { # character
3732
3733 return ($self->{ct}); # character
3734 } else {
3735
3736 ## No token to emit. $self->{ct} is discarded.
3737 }
3738 redo A;
3739 } elsif ($self->{nc} == 0x005D) { # ]
3740 # character
3741 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3742 ## Stay in the state.
3743
3744 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745 $self->{line_prev} = $self->{line};
3746 $self->{column_prev} = $self->{column};
3747 $self->{column}++;
3748 $self->{nc}
3749 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750 } else {
3751 $self->{set_nc}->($self);
3752 }
3753
3754 redo A;
3755 } else {
3756
3757 $self->{ct}->{data} .= ']]'; # character
3758 $self->{state} = CDATA_SECTION_STATE;
3759 ## Reconsume.
3760 redo A;
3761 }
3762 } elsif ($self->{state} == ENTITY_STATE) {
3763 if ($is_space->{$self->{nc}} or
3764 {
3765 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3766 $self->{entity_add} => 1,
3767 }->{$self->{nc}}) {
3768
3769 ## Don't consume
3770 ## No error
3771 ## Return nothing.
3772 #
3773 } elsif ($self->{nc} == 0x0023) { # #
3774
3775 $self->{state} = ENTITY_HASH_STATE;
3776 $self->{s_kwd} = '#';
3777
3778 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3779 $self->{line_prev} = $self->{line};
3780 $self->{column_prev} = $self->{column};
3781 $self->{column}++;
3782 $self->{nc}
3783 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3784 } else {
3785 $self->{set_nc}->($self);
3786 }
3787
3788 redo A;
3789 } elsif ((0x0041 <= $self->{nc} and
3790 $self->{nc} <= 0x005A) or # A..Z
3791 (0x0061 <= $self->{nc} and
3792 $self->{nc} <= 0x007A)) { # a..z
3793
3794 require Whatpm::_NamedEntityList;
3795 $self->{state} = ENTITY_NAME_STATE;
3796 $self->{s_kwd} = chr $self->{nc};
3797 $self->{entity__value} = $self->{s_kwd};
3798 $self->{entity__match} = 0;
3799
3800 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3801 $self->{line_prev} = $self->{line};
3802 $self->{column_prev} = $self->{column};
3803 $self->{column}++;
3804 $self->{nc}
3805 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3806 } else {
3807 $self->{set_nc}->($self);
3808 }
3809
3810 redo A;
3811 } else {
3812
3813 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3814 ## Return nothing.
3815 #
3816 }
3817
3818 ## NOTE: No character is consumed by the "consume a character
3819 ## reference" algorithm. In other word, there is an "&" character
3820 ## that does not introduce a character reference, which would be
3821 ## appended to the parent element or the attribute value in later
3822 ## process of the tokenizer.
3823
3824 if ($self->{prev_state} == DATA_STATE) {
3825
3826 $self->{state} = $self->{prev_state};
3827 $self->{s_kwd} = '';
3828 ## Reconsume.
3829 return ({type => CHARACTER_TOKEN, data => '&',
3830 line => $self->{line_prev},
3831 column => $self->{column_prev},
3832 });
3833 redo A;
3834 } else {
3835
3836 $self->{ca}->{value} .= '&';
3837 $self->{state} = $self->{prev_state};
3838 $self->{s_kwd} = '';
3839 ## Reconsume.
3840 redo A;
3841 }
3842 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3843 if ($self->{nc} == 0x0078 or # x
3844 $self->{nc} == 0x0058) { # X
3845
3846 $self->{state} = HEXREF_X_STATE;
3847 $self->{s_kwd} .= chr $self->{nc};
3848
3849 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3850 $self->{line_prev} = $self->{line};
3851 $self->{column_prev} = $self->{column};
3852 $self->{column}++;
3853 $self->{nc}
3854 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3855 } else {
3856 $self->{set_nc}->($self);
3857 }
3858
3859 redo A;
3860 } elsif (0x0030 <= $self->{nc} and
3861 $self->{nc} <= 0x0039) { # 0..9
3862
3863 $self->{state} = NCR_NUM_STATE;
3864 $self->{s_kwd} = $self->{nc} - 0x0030;
3865
3866 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3867 $self->{line_prev} = $self->{line};
3868 $self->{column_prev} = $self->{column};
3869 $self->{column}++;
3870 $self->{nc}
3871 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3872 } else {
3873 $self->{set_nc}->($self);
3874 }
3875
3876 redo A;
3877 } else {
3878 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3879 line => $self->{line_prev},
3880 column => $self->{column_prev} - 1);
3881
3882 ## NOTE: According to the spec algorithm, nothing is returned,
3883 ## and then "&#" is appended to the parent element or the attribute
3884 ## value in the later processing.
3885
3886 if ($self->{prev_state} == DATA_STATE) {
3887
3888 $self->{state} = $self->{prev_state};
3889 $self->{s_kwd} = '';
3890 ## Reconsume.
3891 return ({type => CHARACTER_TOKEN,
3892 data => '&#',
3893 line => $self->{line_prev},
3894 column => $self->{column_prev} - 1,
3895 });
3896 redo A;
3897 } else {
3898
3899 $self->{ca}->{value} .= '&#';
3900 $self->{state} = $self->{prev_state};
3901 $self->{s_kwd} = '';
3902 ## Reconsume.
3903 redo A;
3904 }
3905 }
3906 } elsif ($self->{state} == NCR_NUM_STATE) {
3907 if (0x0030 <= $self->{nc} and
3908 $self->{nc} <= 0x0039) { # 0..9
3909
3910 $self->{s_kwd} *= 10;
3911 $self->{s_kwd} += $self->{nc} - 0x0030;
3912
3913 ## Stay in the state.
3914
3915 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3916 $self->{line_prev} = $self->{line};
3917 $self->{column_prev} = $self->{column};
3918 $self->{column}++;
3919 $self->{nc}
3920 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3921 } else {
3922 $self->{set_nc}->($self);
3923 }
3924
3925 redo A;
3926 } elsif ($self->{nc} == 0x003B) { # ;
3927
3928
3929 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3930 $self->{line_prev} = $self->{line};
3931 $self->{column_prev} = $self->{column};
3932 $self->{column}++;
3933 $self->{nc}
3934 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3935 } else {
3936 $self->{set_nc}->($self);
3937 }
3938
3939 #
3940 } else {
3941
3942 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3943 ## Reconsume.
3944 #
3945 }
3946
3947 my $code = $self->{s_kwd};
3948 my $l = $self->{line_prev};
3949 my $c = $self->{column_prev};
3950 if ($charref_map->{$code}) {
3951
3952 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3953 text => (sprintf 'U+%04X', $code),
3954 line => $l, column => $c);
3955 $code = $charref_map->{$code};
3956 } elsif ($code > 0x10FFFF) {
3957
3958 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3959 text => (sprintf 'U-%08X', $code),
3960 line => $l, column => $c);
3961 $code = 0xFFFD;
3962 }
3963
3964 if ($self->{prev_state} == DATA_STATE) {
3965
3966 $self->{state} = $self->{prev_state};
3967 $self->{s_kwd} = '';
3968 ## Reconsume.
3969 return ({type => CHARACTER_TOKEN, data => chr $code,
3970 has_reference => 1,
3971 line => $l, column => $c,
3972 });
3973 redo A;
3974 } else {
3975
3976 $self->{ca}->{value} .= chr $code;
3977 $self->{ca}->{has_reference} = 1;
3978 $self->{state} = $self->{prev_state};
3979 $self->{s_kwd} = '';
3980 ## Reconsume.
3981 redo A;
3982 }
3983 } elsif ($self->{state} == HEXREF_X_STATE) {
3984 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3985 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3986 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3987 # 0..9, A..F, a..f
3988
3989 $self->{state} = HEXREF_HEX_STATE;
3990 $self->{s_kwd} = 0;
3991 ## Reconsume.
3992 redo A;
3993 } else {
3994 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3995 line => $self->{line_prev},
3996 column => $self->{column_prev} - 2);
3997
3998 ## NOTE: According to the spec algorithm, nothing is returned,
3999 ## and then "&#" followed by "X" or "x" is appended to the parent
4000 ## element or the attribute value in the later processing.
4001
4002 if ($self->{prev_state} == DATA_STATE) {
4003
4004 $self->{state} = $self->{prev_state};
4005 $self->{s_kwd} = '';
4006 ## Reconsume.
4007 return ({type => CHARACTER_TOKEN,
4008 data => '&' . $self->{s_kwd},
4009 line => $self->{line_prev},
4010 column => $self->{column_prev} - length $self->{s_kwd},
4011 });
4012 redo A;
4013 } else {
4014
4015 $self->{ca}->{value} .= '&' . $self->{s_kwd};
4016 $self->{state} = $self->{prev_state};
4017 $self->{s_kwd} = '';
4018 ## Reconsume.
4019 redo A;
4020 }
4021 }
4022 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4023 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4024 # 0..9
4025
4026 $self->{s_kwd} *= 0x10;
4027 $self->{s_kwd} += $self->{nc} - 0x0030;
4028 ## Stay in the state.
4029
4030 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4031 $self->{line_prev} = $self->{line};
4032 $self->{column_prev} = $self->{column};
4033 $self->{column}++;
4034 $self->{nc}
4035 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4036 } else {
4037 $self->{set_nc}->($self);
4038 }
4039
4040 redo A;
4041 } elsif (0x0061 <= $self->{nc} and
4042 $self->{nc} <= 0x0066) { # a..f
4043
4044 $self->{s_kwd} *= 0x10;
4045 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4046 ## Stay in the state.
4047
4048 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4049 $self->{line_prev} = $self->{line};
4050 $self->{column_prev} = $self->{column};
4051 $self->{column}++;
4052 $self->{nc}
4053 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4054 } else {
4055 $self->{set_nc}->($self);
4056 }
4057
4058 redo A;
4059 } elsif (0x0041 <= $self->{nc} and
4060 $self->{nc} <= 0x0046) { # A..F
4061
4062 $self->{s_kwd} *= 0x10;
4063 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4064 ## Stay in the state.
4065
4066 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4067 $self->{line_prev} = $self->{line};
4068 $self->{column_prev} = $self->{column};
4069 $self->{column}++;
4070 $self->{nc}
4071 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4072 } else {
4073 $self->{set_nc}->($self);
4074 }
4075
4076 redo A;
4077 } elsif ($self->{nc} == 0x003B) { # ;
4078
4079
4080 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4081 $self->{line_prev} = $self->{line};
4082 $self->{column_prev} = $self->{column};
4083 $self->{column}++;
4084 $self->{nc}
4085 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4086 } else {
4087 $self->{set_nc}->($self);
4088 }
4089
4090 #
4091 } else {
4092
4093 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4094 line => $self->{line},
4095 column => $self->{column});
4096 ## Reconsume.
4097 #
4098 }
4099
4100 my $code = $self->{s_kwd};
4101 my $l = $self->{line_prev};
4102 my $c = $self->{column_prev};
4103 if ($charref_map->{$code}) {
4104
4105 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4106 text => (sprintf 'U+%04X', $code),
4107 line => $l, column => $c);
4108 $code = $charref_map->{$code};
4109 } elsif ($code > 0x10FFFF) {
4110
4111 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4112 text => (sprintf 'U-%08X', $code),
4113 line => $l, column => $c);
4114 $code = 0xFFFD;
4115 }
4116
4117 if ($self->{prev_state} == DATA_STATE) {
4118
4119 $self->{state} = $self->{prev_state};
4120 $self->{s_kwd} = '';
4121 ## Reconsume.
4122 return ({type => CHARACTER_TOKEN, data => chr $code,
4123 has_reference => 1,
4124 line => $l, column => $c,
4125 });
4126 redo A;
4127 } else {
4128
4129 $self->{ca}->{value} .= chr $code;
4130 $self->{ca}->{has_reference} = 1;
4131 $self->{state} = $self->{prev_state};
4132 $self->{s_kwd} = '';
4133 ## Reconsume.
4134 redo A;
4135 }
4136 } elsif ($self->{state} == ENTITY_NAME_STATE) {
4137 if (length $self->{s_kwd} < 30 and
4138 ## NOTE: Some number greater than the maximum length of entity name
4139 ((0x0041 <= $self->{nc} and # a
4140 $self->{nc} <= 0x005A) or # x
4141 (0x0061 <= $self->{nc} and # a
4142 $self->{nc} <= 0x007A) or # z
4143 (0x0030 <= $self->{nc} and # 0
4144 $self->{nc} <= 0x0039) or # 9
4145 $self->{nc} == 0x003B)) { # ;
4146 our $EntityChar;
4147 $self->{s_kwd} .= chr $self->{nc};
4148 if (defined $EntityChar->{$self->{s_kwd}}) {
4149 if ($self->{nc} == 0x003B) { # ;
4150
4151 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4152 $self->{entity__match} = 1;
4153
4154 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4155 $self->{line_prev} = $self->{line};
4156 $self->{column_prev} = $self->{column};
4157 $self->{column}++;
4158 $self->{nc}
4159 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4160 } else {
4161 $self->{set_nc}->($self);
4162 }
4163
4164 #
4165 } else {
4166
4167 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4168 $self->{entity__match} = -1;
4169 ## Stay in the state.
4170
4171 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4172 $self->{line_prev} = $self->{line};
4173 $self->{column_prev} = $self->{column};
4174 $self->{column}++;
4175 $self->{nc}
4176 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4177 } else {
4178 $self->{set_nc}->($self);
4179 }
4180
4181 redo A;
4182 }
4183 } else {
4184
4185 $self->{entity__value} .= chr $self->{nc};
4186 $self->{entity__match} *= 2;
4187 ## Stay in the state.
4188
4189 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4190 $self->{line_prev} = $self->{line};
4191 $self->{column_prev} = $self->{column};
4192 $self->{column}++;
4193 $self->{nc}
4194 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4195 } else {
4196 $self->{set_nc}->($self);
4197 }
4198
4199 redo A;
4200 }
4201 }
4202
4203 my $data;
4204 my $has_ref;
4205 if ($self->{entity__match} > 0) {
4206
4207 $data = $self->{entity__value};
4208 $has_ref = 1;
4209 #
4210 } elsif ($self->{entity__match} < 0) {
4211 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4212 if ($self->{prev_state} != DATA_STATE and # in attribute
4213 $self->{entity__match} < -1) {
4214
4215 $data = '&' . $self->{s_kwd};
4216 #
4217 } else {
4218
4219 $data = $self->{entity__value};
4220 $has_ref = 1;
4221 #
4222 }
4223 } else {
4224
4225 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4226 line => $self->{line_prev},
4227 column => $self->{column_prev} - length $self->{s_kwd});
4228 $data = '&' . $self->{s_kwd};
4229 #
4230 }
4231
4232 ## NOTE: In these cases, when a character reference is found,
4233 ## it is consumed and a character token is returned, or, otherwise,
4234 ## nothing is consumed and returned, according to the spec algorithm.
4235 ## In this implementation, anything that has been examined by the
4236 ## tokenizer is appended to the parent element or the attribute value
4237 ## as string, either literal string when no character reference or
4238 ## entity-replaced string otherwise, in this stage, since any characters
4239 ## that would not be consumed are appended in the data state or in an
4240 ## appropriate attribute value state anyway.
4241
4242 if ($self->{prev_state} == DATA_STATE) {
4243
4244 $self->{state} = $self->{prev_state};
4245 $self->{s_kwd} = '';
4246 ## Reconsume.
4247 return ({type => CHARACTER_TOKEN,
4248 data => $data,
4249 has_reference => $has_ref,
4250 line => $self->{line_prev},
4251 column => $self->{column_prev} + 1 - length $self->{s_kwd},
4252 });
4253 redo A;
4254 } else {
4255
4256 $self->{ca}->{value} .= $data;
4257 $self->{ca}->{has_reference} = 1 if $has_ref;
4258 $self->{state} = $self->{prev_state};
4259 $self->{s_kwd} = '';
4260 ## Reconsume.
4261 redo A;
4262 }
4263 } else {
4264 die "$0: $self->{state}: Unknown state";
4265 }
4266 } # A
4267
4268 die "$0: _get_next_token: unexpected case";
4269 } # _get_next_token
4270
4271 1;
4272 ## $Date: 2008/10/14 14:57:52 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24