/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (show annotations) (download)
Tue Oct 14 11:46:57 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.3: +10 -8 lines
++ whatpm/t/ChangeLog	14 Oct 2008 11:46:38 -0000
	* XML-Parser.t: "xml/elements-1.dat" and "xml/doctypes-1.dat"
	added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	14 Oct 2008 11:46:52 -0000
	* elements-1.dat: New test data file.

	* doctypes-1.dat: New test data file.

	* attrs-1.dat: New test data on attribute name cases are added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 11:40:58 -0000
	* Tokenizer.pm.src: Support for case-insensitive XML attribute
	names.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.3 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 );
19
20 our %EXPORT_TAGS = (
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 }
33
34 ## Token types
35
36 sub DOCTYPE_TOKEN () { 1 }
37 sub COMMENT_TOKEN () { 2 }
38 sub START_TAG_TOKEN () { 3 }
39 sub END_TAG_TOKEN () { 4 }
40 sub END_OF_FILE_TOKEN () { 5 }
41 sub CHARACTER_TOKEN () { 6 }
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
45 package Whatpm::HTML;
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48
49 ## Content model flags
50
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54
55 sub PLAINTEXT_CONTENT_MODEL () { 0 }
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59
60 ## Tokenizer states
61
62 sub DATA_STATE () { 0 }
63 #sub ENTITY_DATA_STATE () { 1 }
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## Tree constructor state constants (see Whatpm::HTML for the full
118 ## list and descriptions)
119
120 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121 sub FOREIGN_EL () { 0b1_00000000000 }
122
123 ## Character reference mappings
124
125 my $charref_map = {
126 0x0D => 0x000A,
127 0x80 => 0x20AC,
128 0x81 => 0xFFFD,
129 0x82 => 0x201A,
130 0x83 => 0x0192,
131 0x84 => 0x201E,
132 0x85 => 0x2026,
133 0x86 => 0x2020,
134 0x87 => 0x2021,
135 0x88 => 0x02C6,
136 0x89 => 0x2030,
137 0x8A => 0x0160,
138 0x8B => 0x2039,
139 0x8C => 0x0152,
140 0x8D => 0xFFFD,
141 0x8E => 0x017D,
142 0x8F => 0xFFFD,
143 0x90 => 0xFFFD,
144 0x91 => 0x2018,
145 0x92 => 0x2019,
146 0x93 => 0x201C,
147 0x94 => 0x201D,
148 0x95 => 0x2022,
149 0x96 => 0x2013,
150 0x97 => 0x2014,
151 0x98 => 0x02DC,
152 0x99 => 0x2122,
153 0x9A => 0x0161,
154 0x9B => 0x203A,
155 0x9C => 0x0153,
156 0x9D => 0xFFFD,
157 0x9E => 0x017E,
158 0x9F => 0x0178,
159 }; # $charref_map
160 $charref_map->{$_} = 0xFFFD
161 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168
169 ## Implementations MUST act as if state machine in the spec
170
171 sub _initialize_tokenizer ($) {
172 my $self = shift;
173
174 ## NOTE: Fields set by |new| constructor:
175 #$self->{level}
176 #$self->{set_nc}
177 #$self->{parse_error}
178 #$self->{is_xml} (if XML)
179
180 $self->{state} = DATA_STATE; # MUST
181 #$self->{s_kwd}; # state keyword - initialized when used
182 #$self->{entity__value}; # initialized when used
183 #$self->{entity__match}; # initialized when used
184 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185 undef $self->{ct}; # current token
186 undef $self->{ca}; # current attribute
187 undef $self->{last_stag_name}; # last emitted start tag name
188 #$self->{prev_state}; # initialized when used
189 delete $self->{self_closing};
190 $self->{char_buffer} = '';
191 $self->{char_buffer_pos} = 0;
192 $self->{nc} = -1; # next input character
193 #$self->{next_nc}
194
195 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
196 $self->{line_prev} = $self->{line};
197 $self->{column_prev} = $self->{column};
198 $self->{column}++;
199 $self->{nc}
200 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
201 } else {
202 $self->{set_nc}->($self);
203 }
204
205 $self->{token} = [];
206 # $self->{escape}
207 } # _initialize_tokenizer
208
209 ## A token has:
210 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
211 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
212 ## ->{name} (DOCTYPE_TOKEN)
213 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
214 ## ->{pubid} (DOCTYPE_TOKEN)
215 ## ->{sysid} (DOCTYPE_TOKEN)
216 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
217 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
218 ## ->{name}
219 ## ->{value}
220 ## ->{has_reference} == 1 or 0
221 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
222 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
223 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
224 ## while the token is pushed back to the stack.
225
226 ## Emitted token MUST immediately be handled by the tree construction state.
227
228 ## Before each step, UA MAY check to see if either one of the scripts in
229 ## "list of scripts that will execute as soon as possible" or the first
230 ## script in the "list of scripts that will execute asynchronously",
231 ## has completed loading. If one has, then it MUST be executed
232 ## and removed from the list.
233
234 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
235 ## (This requirement was dropped from HTML5 spec, unfortunately.)
236
237 my $is_space = {
238 0x0009 => 1, # CHARACTER TABULATION (HT)
239 0x000A => 1, # LINE FEED (LF)
240 #0x000B => 0, # LINE TABULATION (VT)
241 0x000C => 1, # FORM FEED (FF)
242 #0x000D => 1, # CARRIAGE RETURN (CR)
243 0x0020 => 1, # SPACE (SP)
244 };
245
246 sub _get_next_token ($) {
247 my $self = shift;
248
249 if ($self->{self_closing}) {
250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
251 ## NOTE: The |self_closing| flag is only set by start tag token.
252 ## In addition, when a start tag token is emitted, it is always set to
253 ## |ct|.
254 delete $self->{self_closing};
255 }
256
257 if (@{$self->{token}}) {
258 $self->{self_closing} = $self->{token}->[0]->{self_closing};
259 return shift @{$self->{token}};
260 }
261
262 A: {
263 if ($self->{state} == PCDATA_STATE) {
264 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
265
266 if ($self->{nc} == 0x0026) { # &
267
268 ## NOTE: In the spec, the tokenizer is switched to the
269 ## "entity data state". In this implementation, the tokenizer
270 ## is switched to the |ENTITY_STATE|, which is an implementation
271 ## of the "consume a character reference" algorithm.
272 $self->{entity_add} = -1;
273 $self->{prev_state} = DATA_STATE;
274 $self->{state} = ENTITY_STATE;
275
276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
277 $self->{line_prev} = $self->{line};
278 $self->{column_prev} = $self->{column};
279 $self->{column}++;
280 $self->{nc}
281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
282 } else {
283 $self->{set_nc}->($self);
284 }
285
286 redo A;
287 } elsif ($self->{nc} == 0x003C) { # <
288
289 $self->{state} = TAG_OPEN_STATE;
290
291 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
292 $self->{line_prev} = $self->{line};
293 $self->{column_prev} = $self->{column};
294 $self->{column}++;
295 $self->{nc}
296 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
297 } else {
298 $self->{set_nc}->($self);
299 }
300
301 redo A;
302 } elsif ($self->{nc} == -1) {
303
304 return ({type => END_OF_FILE_TOKEN,
305 line => $self->{line}, column => $self->{column}});
306 last A; ## TODO: ok?
307 } else {
308
309 #
310 }
311
312 # Anything else
313 my $token = {type => CHARACTER_TOKEN,
314 data => chr $self->{nc},
315 line => $self->{line}, column => $self->{column},
316 };
317 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
318
319 ## Stay in the state.
320
321 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
322 $self->{line_prev} = $self->{line};
323 $self->{column_prev} = $self->{column};
324 $self->{column}++;
325 $self->{nc}
326 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
327 } else {
328 $self->{set_nc}->($self);
329 }
330
331 return ($token);
332 redo A;
333 } elsif ($self->{state} == DATA_STATE) {
334 $self->{s_kwd} = '' unless defined $self->{s_kwd};
335 if ($self->{nc} == 0x0026) { # &
336 $self->{s_kwd} = '';
337 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
338 not $self->{escape}) {
339
340 ## NOTE: In the spec, the tokenizer is switched to the
341 ## "entity data state". In this implementation, the tokenizer
342 ## is switched to the |ENTITY_STATE|, which is an implementation
343 ## of the "consume a character reference" algorithm.
344 $self->{entity_add} = -1;
345 $self->{prev_state} = DATA_STATE;
346 $self->{state} = ENTITY_STATE;
347
348 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
349 $self->{line_prev} = $self->{line};
350 $self->{column_prev} = $self->{column};
351 $self->{column}++;
352 $self->{nc}
353 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
354 } else {
355 $self->{set_nc}->($self);
356 }
357
358 redo A;
359 } else {
360
361 #
362 }
363 } elsif ($self->{nc} == 0x002D) { # -
364 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
365 $self->{s_kwd} .= '-';
366
367 if ($self->{s_kwd} eq '<!--') {
368
369 $self->{escape} = 1; # unless $self->{escape};
370 $self->{s_kwd} = '--';
371 #
372 } elsif ($self->{s_kwd} eq '---') {
373
374 $self->{s_kwd} = '--';
375 #
376 } else {
377
378 #
379 }
380 }
381
382 #
383 } elsif ($self->{nc} == 0x0021) { # !
384 if (length $self->{s_kwd}) {
385
386 $self->{s_kwd} .= '!';
387 #
388 } else {
389
390 #$self->{s_kwd} = '';
391 #
392 }
393 #
394 } elsif ($self->{nc} == 0x003C) { # <
395 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
396 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
397 not $self->{escape})) {
398
399 $self->{state} = TAG_OPEN_STATE;
400
401 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
402 $self->{line_prev} = $self->{line};
403 $self->{column_prev} = $self->{column};
404 $self->{column}++;
405 $self->{nc}
406 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
407 } else {
408 $self->{set_nc}->($self);
409 }
410
411 redo A;
412 } else {
413
414 $self->{s_kwd} = '';
415 #
416 }
417 } elsif ($self->{nc} == 0x003E) { # >
418 if ($self->{escape} and
419 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
420 if ($self->{s_kwd} eq '--') {
421
422 delete $self->{escape};
423 } else {
424
425 }
426 } else {
427
428 }
429
430 $self->{s_kwd} = '';
431 #
432 } elsif ($self->{nc} == -1) {
433
434 $self->{s_kwd} = '';
435 return ({type => END_OF_FILE_TOKEN,
436 line => $self->{line}, column => $self->{column}});
437 last A; ## TODO: ok?
438 } else {
439
440 $self->{s_kwd} = '';
441 #
442 }
443
444 # Anything else
445 my $token = {type => CHARACTER_TOKEN,
446 data => chr $self->{nc},
447 line => $self->{line}, column => $self->{column},
448 };
449 if ($self->{read_until}->($token->{data}, q[-!<>&],
450 length $token->{data})) {
451 $self->{s_kwd} = '';
452 }
453
454 ## Stay in the data state.
455 if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
456
457 $self->{state} = PCDATA_STATE;
458 } else {
459
460 ## Stay in the state.
461 }
462
463 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
464 $self->{line_prev} = $self->{line};
465 $self->{column_prev} = $self->{column};
466 $self->{column}++;
467 $self->{nc}
468 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
469 } else {
470 $self->{set_nc}->($self);
471 }
472
473 return ($token);
474 redo A;
475 } elsif ($self->{state} == TAG_OPEN_STATE) {
476 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
477 if ($self->{nc} == 0x002F) { # /
478
479
480 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
481 $self->{line_prev} = $self->{line};
482 $self->{column_prev} = $self->{column};
483 $self->{column}++;
484 $self->{nc}
485 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
486 } else {
487 $self->{set_nc}->($self);
488 }
489
490 $self->{state} = CLOSE_TAG_OPEN_STATE;
491 redo A;
492 } elsif ($self->{nc} == 0x0021) { # !
493
494 $self->{s_kwd} = '<' unless $self->{escape};
495 #
496 } else {
497
498 #
499 }
500
501 ## reconsume
502 $self->{state} = DATA_STATE;
503 return ({type => CHARACTER_TOKEN, data => '<',
504 line => $self->{line_prev},
505 column => $self->{column_prev},
506 });
507 redo A;
508 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
509 if ($self->{nc} == 0x0021) { # !
510
511 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
512
513 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
514 $self->{line_prev} = $self->{line};
515 $self->{column_prev} = $self->{column};
516 $self->{column}++;
517 $self->{nc}
518 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
519 } else {
520 $self->{set_nc}->($self);
521 }
522
523 redo A;
524 } elsif ($self->{nc} == 0x002F) { # /
525
526 $self->{state} = CLOSE_TAG_OPEN_STATE;
527
528 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
529 $self->{line_prev} = $self->{line};
530 $self->{column_prev} = $self->{column};
531 $self->{column}++;
532 $self->{nc}
533 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
534 } else {
535 $self->{set_nc}->($self);
536 }
537
538 redo A;
539 } elsif (0x0041 <= $self->{nc} and
540 $self->{nc} <= 0x005A) { # A..Z
541
542 $self->{ct}
543 = {type => START_TAG_TOKEN,
544 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
545 line => $self->{line_prev},
546 column => $self->{column_prev}};
547 $self->{state} = TAG_NAME_STATE;
548
549 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
550 $self->{line_prev} = $self->{line};
551 $self->{column_prev} = $self->{column};
552 $self->{column}++;
553 $self->{nc}
554 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
555 } else {
556 $self->{set_nc}->($self);
557 }
558
559 redo A;
560 } elsif (0x0061 <= $self->{nc} and
561 $self->{nc} <= 0x007A) { # a..z
562
563 $self->{ct} = {type => START_TAG_TOKEN,
564 tag_name => chr ($self->{nc}),
565 line => $self->{line_prev},
566 column => $self->{column_prev}};
567 $self->{state} = TAG_NAME_STATE;
568
569 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
570 $self->{line_prev} = $self->{line};
571 $self->{column_prev} = $self->{column};
572 $self->{column}++;
573 $self->{nc}
574 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
575 } else {
576 $self->{set_nc}->($self);
577 }
578
579 redo A;
580 } elsif ($self->{nc} == 0x003E) { # >
581
582 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
583 line => $self->{line_prev},
584 column => $self->{column_prev});
585 $self->{state} = DATA_STATE;
586
587 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
588 $self->{line_prev} = $self->{line};
589 $self->{column_prev} = $self->{column};
590 $self->{column}++;
591 $self->{nc}
592 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
593 } else {
594 $self->{set_nc}->($self);
595 }
596
597
598 return ({type => CHARACTER_TOKEN, data => '<>',
599 line => $self->{line_prev},
600 column => $self->{column_prev},
601 });
602
603 redo A;
604 } elsif ($self->{nc} == 0x003F) { # ?
605
606 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
607 line => $self->{line_prev},
608 column => $self->{column_prev});
609 $self->{state} = BOGUS_COMMENT_STATE;
610 $self->{ct} = {type => COMMENT_TOKEN, data => '',
611 line => $self->{line_prev},
612 column => $self->{column_prev},
613 };
614 ## $self->{nc} is intentionally left as is
615 redo A;
616 } else {
617
618 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
619 line => $self->{line_prev},
620 column => $self->{column_prev});
621 $self->{state} = DATA_STATE;
622 ## reconsume
623
624 return ({type => CHARACTER_TOKEN, data => '<',
625 line => $self->{line_prev},
626 column => $self->{column_prev},
627 });
628
629 redo A;
630 }
631 } else {
632 die "$0: $self->{content_model} in tag open";
633 }
634 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
635 ## NOTE: The "close tag open state" in the spec is implemented as
636 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
637
638 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
639 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
640 if (defined $self->{last_stag_name}) {
641 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
642 $self->{s_kwd} = '';
643 ## Reconsume.
644 redo A;
645 } else {
646 ## No start tag token has ever been emitted
647 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
648
649 $self->{state} = DATA_STATE;
650 ## Reconsume.
651 return ({type => CHARACTER_TOKEN, data => '</',
652 line => $l, column => $c,
653 });
654 redo A;
655 }
656 }
657
658 if (0x0041 <= $self->{nc} and
659 $self->{nc} <= 0x005A) { # A..Z
660
661 $self->{ct}
662 = {type => END_TAG_TOKEN,
663 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
664 line => $l, column => $c};
665 $self->{state} = TAG_NAME_STATE;
666
667 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
668 $self->{line_prev} = $self->{line};
669 $self->{column_prev} = $self->{column};
670 $self->{column}++;
671 $self->{nc}
672 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
673 } else {
674 $self->{set_nc}->($self);
675 }
676
677 redo A;
678 } elsif (0x0061 <= $self->{nc} and
679 $self->{nc} <= 0x007A) { # a..z
680
681 $self->{ct} = {type => END_TAG_TOKEN,
682 tag_name => chr ($self->{nc}),
683 line => $l, column => $c};
684 $self->{state} = TAG_NAME_STATE;
685
686 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
687 $self->{line_prev} = $self->{line};
688 $self->{column_prev} = $self->{column};
689 $self->{column}++;
690 $self->{nc}
691 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
692 } else {
693 $self->{set_nc}->($self);
694 }
695
696 redo A;
697 } elsif ($self->{nc} == 0x003E) { # >
698
699 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
700 line => $self->{line_prev}, ## "<" in "</>"
701 column => $self->{column_prev} - 1);
702 $self->{state} = DATA_STATE;
703
704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705 $self->{line_prev} = $self->{line};
706 $self->{column_prev} = $self->{column};
707 $self->{column}++;
708 $self->{nc}
709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710 } else {
711 $self->{set_nc}->($self);
712 }
713
714 redo A;
715 } elsif ($self->{nc} == -1) {
716
717 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
718 $self->{state} = DATA_STATE;
719 # reconsume
720
721 return ({type => CHARACTER_TOKEN, data => '</',
722 line => $l, column => $c,
723 });
724
725 redo A;
726 } else {
727
728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
729 $self->{state} = BOGUS_COMMENT_STATE;
730 $self->{ct} = {type => COMMENT_TOKEN, data => '',
731 line => $self->{line_prev}, # "<" of "</"
732 column => $self->{column_prev} - 1,
733 };
734 ## NOTE: $self->{nc} is intentionally left as is.
735 ## Although the "anything else" case of the spec not explicitly
736 ## states that the next input character is to be reconsumed,
737 ## it will be included to the |data| of the comment token
738 ## generated from the bogus end tag, as defined in the
739 ## "bogus comment state" entry.
740 redo A;
741 }
742 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
743 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
744 if (length $ch) {
745 my $CH = $ch;
746 $ch =~ tr/a-z/A-Z/;
747 my $nch = chr $self->{nc};
748 if ($nch eq $ch or $nch eq $CH) {
749
750 ## Stay in the state.
751 $self->{s_kwd} .= $nch;
752
753 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
754 $self->{line_prev} = $self->{line};
755 $self->{column_prev} = $self->{column};
756 $self->{column}++;
757 $self->{nc}
758 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
759 } else {
760 $self->{set_nc}->($self);
761 }
762
763 redo A;
764 } else {
765
766 $self->{state} = DATA_STATE;
767 ## Reconsume.
768 return ({type => CHARACTER_TOKEN,
769 data => '</' . $self->{s_kwd},
770 line => $self->{line_prev},
771 column => $self->{column_prev} - 1 - length $self->{s_kwd},
772 });
773 redo A;
774 }
775 } else { # after "<{tag-name}"
776 unless ($is_space->{$self->{nc}} or
777 {
778 0x003E => 1, # >
779 0x002F => 1, # /
780 -1 => 1, # EOF
781 }->{$self->{nc}}) {
782
783 ## Reconsume.
784 $self->{state} = DATA_STATE;
785 return ({type => CHARACTER_TOKEN,
786 data => '</' . $self->{s_kwd},
787 line => $self->{line_prev},
788 column => $self->{column_prev} - 1 - length $self->{s_kwd},
789 });
790 redo A;
791 } else {
792
793 $self->{ct}
794 = {type => END_TAG_TOKEN,
795 tag_name => $self->{last_stag_name},
796 line => $self->{line_prev},
797 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
798 $self->{state} = TAG_NAME_STATE;
799 ## Reconsume.
800 redo A;
801 }
802 }
803 } elsif ($self->{state} == TAG_NAME_STATE) {
804 if ($is_space->{$self->{nc}}) {
805
806 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
807
808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
809 $self->{line_prev} = $self->{line};
810 $self->{column_prev} = $self->{column};
811 $self->{column}++;
812 $self->{nc}
813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
814 } else {
815 $self->{set_nc}->($self);
816 }
817
818 redo A;
819 } elsif ($self->{nc} == 0x003E) { # >
820 if ($self->{ct}->{type} == START_TAG_TOKEN) {
821
822 $self->{last_stag_name} = $self->{ct}->{tag_name};
823 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
824 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
825 #if ($self->{ct}->{attributes}) {
826 # ## NOTE: This should never be reached.
827 # !!! cp (36);
828 # !!! parse-error (type => 'end tag attribute');
829 #} else {
830
831 #}
832 } else {
833 die "$0: $self->{ct}->{type}: Unknown token type";
834 }
835 $self->{state} = DATA_STATE;
836
837 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
838 $self->{line_prev} = $self->{line};
839 $self->{column_prev} = $self->{column};
840 $self->{column}++;
841 $self->{nc}
842 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
843 } else {
844 $self->{set_nc}->($self);
845 }
846
847
848 return ($self->{ct}); # start tag or end tag
849
850 redo A;
851 } elsif (0x0041 <= $self->{nc} and
852 $self->{nc} <= 0x005A) { # A..Z
853
854 $self->{ct}->{tag_name}
855 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
856 # start tag or end tag
857 ## Stay in this state
858
859 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
860 $self->{line_prev} = $self->{line};
861 $self->{column_prev} = $self->{column};
862 $self->{column}++;
863 $self->{nc}
864 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
865 } else {
866 $self->{set_nc}->($self);
867 }
868
869 redo A;
870 } elsif ($self->{nc} == -1) {
871 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
872 if ($self->{ct}->{type} == START_TAG_TOKEN) {
873
874 $self->{last_stag_name} = $self->{ct}->{tag_name};
875 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
876 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
877 #if ($self->{ct}->{attributes}) {
878 # ## NOTE: This state should never be reached.
879 # !!! cp (40);
880 # !!! parse-error (type => 'end tag attribute');
881 #} else {
882
883 #}
884 } else {
885 die "$0: $self->{ct}->{type}: Unknown token type";
886 }
887 $self->{state} = DATA_STATE;
888 # reconsume
889
890 return ($self->{ct}); # start tag or end tag
891
892 redo A;
893 } elsif ($self->{nc} == 0x002F) { # /
894
895 $self->{state} = SELF_CLOSING_START_TAG_STATE;
896
897 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
898 $self->{line_prev} = $self->{line};
899 $self->{column_prev} = $self->{column};
900 $self->{column}++;
901 $self->{nc}
902 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
903 } else {
904 $self->{set_nc}->($self);
905 }
906
907 redo A;
908 } else {
909
910 $self->{ct}->{tag_name} .= chr $self->{nc};
911 # start tag or end tag
912 ## Stay in the state
913
914 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
915 $self->{line_prev} = $self->{line};
916 $self->{column_prev} = $self->{column};
917 $self->{column}++;
918 $self->{nc}
919 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
920 } else {
921 $self->{set_nc}->($self);
922 }
923
924 redo A;
925 }
926 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
927 if ($is_space->{$self->{nc}}) {
928
929 ## Stay in the state
930
931 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
932 $self->{line_prev} = $self->{line};
933 $self->{column_prev} = $self->{column};
934 $self->{column}++;
935 $self->{nc}
936 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
937 } else {
938 $self->{set_nc}->($self);
939 }
940
941 redo A;
942 } elsif ($self->{nc} == 0x003E) { # >
943 if ($self->{ct}->{type} == START_TAG_TOKEN) {
944
945 $self->{last_stag_name} = $self->{ct}->{tag_name};
946 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
947 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
948 if ($self->{ct}->{attributes}) {
949
950 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
951 } else {
952
953 }
954 } else {
955 die "$0: $self->{ct}->{type}: Unknown token type";
956 }
957 $self->{state} = DATA_STATE;
958
959 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
960 $self->{line_prev} = $self->{line};
961 $self->{column_prev} = $self->{column};
962 $self->{column}++;
963 $self->{nc}
964 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
965 } else {
966 $self->{set_nc}->($self);
967 }
968
969
970 return ($self->{ct}); # start tag or end tag
971
972 redo A;
973 } elsif (0x0041 <= $self->{nc} and
974 $self->{nc} <= 0x005A) { # A..Z
975
976 $self->{ca}
977 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
978 value => '',
979 line => $self->{line}, column => $self->{column}};
980 $self->{state} = ATTRIBUTE_NAME_STATE;
981
982 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
983 $self->{line_prev} = $self->{line};
984 $self->{column_prev} = $self->{column};
985 $self->{column}++;
986 $self->{nc}
987 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
988 } else {
989 $self->{set_nc}->($self);
990 }
991
992 redo A;
993 } elsif ($self->{nc} == 0x002F) { # /
994
995 $self->{state} = SELF_CLOSING_START_TAG_STATE;
996
997 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
998 $self->{line_prev} = $self->{line};
999 $self->{column_prev} = $self->{column};
1000 $self->{column}++;
1001 $self->{nc}
1002 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1003 } else {
1004 $self->{set_nc}->($self);
1005 }
1006
1007 redo A;
1008 } elsif ($self->{nc} == -1) {
1009 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1010 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1011
1012 $self->{last_stag_name} = $self->{ct}->{tag_name};
1013 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1014 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1015 if ($self->{ct}->{attributes}) {
1016
1017 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1018 } else {
1019
1020 }
1021 } else {
1022 die "$0: $self->{ct}->{type}: Unknown token type";
1023 }
1024 $self->{state} = DATA_STATE;
1025 # reconsume
1026
1027 return ($self->{ct}); # start tag or end tag
1028
1029 redo A;
1030 } else {
1031 if ({
1032 0x0022 => 1, # "
1033 0x0027 => 1, # '
1034 0x003D => 1, # =
1035 }->{$self->{nc}}) {
1036
1037 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1038 } else {
1039
1040 }
1041 $self->{ca}
1042 = {name => chr ($self->{nc}),
1043 value => '',
1044 line => $self->{line}, column => $self->{column}};
1045 $self->{state} = ATTRIBUTE_NAME_STATE;
1046
1047 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1048 $self->{line_prev} = $self->{line};
1049 $self->{column_prev} = $self->{column};
1050 $self->{column}++;
1051 $self->{nc}
1052 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1053 } else {
1054 $self->{set_nc}->($self);
1055 }
1056
1057 redo A;
1058 }
1059 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1060 my $before_leave = sub {
1061 if (exists $self->{ct}->{attributes} # start tag or end tag
1062 ->{$self->{ca}->{name}}) { # MUST
1063
1064 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1065 ## Discard $self->{ca} # MUST
1066 } else {
1067
1068 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1069 = $self->{ca};
1070 }
1071 }; # $before_leave
1072
1073 if ($is_space->{$self->{nc}}) {
1074
1075 $before_leave->();
1076 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1077
1078 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1079 $self->{line_prev} = $self->{line};
1080 $self->{column_prev} = $self->{column};
1081 $self->{column}++;
1082 $self->{nc}
1083 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1084 } else {
1085 $self->{set_nc}->($self);
1086 }
1087
1088 redo A;
1089 } elsif ($self->{nc} == 0x003D) { # =
1090
1091 $before_leave->();
1092 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1093
1094 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1095 $self->{line_prev} = $self->{line};
1096 $self->{column_prev} = $self->{column};
1097 $self->{column}++;
1098 $self->{nc}
1099 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1100 } else {
1101 $self->{set_nc}->($self);
1102 }
1103
1104 redo A;
1105 } elsif ($self->{nc} == 0x003E) { # >
1106 $before_leave->();
1107 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1108
1109 $self->{last_stag_name} = $self->{ct}->{tag_name};
1110 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1111
1112 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1113 if ($self->{ct}->{attributes}) {
1114 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1115 }
1116 } else {
1117 die "$0: $self->{ct}->{type}: Unknown token type";
1118 }
1119 $self->{state} = DATA_STATE;
1120
1121 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1122 $self->{line_prev} = $self->{line};
1123 $self->{column_prev} = $self->{column};
1124 $self->{column}++;
1125 $self->{nc}
1126 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1127 } else {
1128 $self->{set_nc}->($self);
1129 }
1130
1131
1132 return ($self->{ct}); # start tag or end tag
1133
1134 redo A;
1135 } elsif (0x0041 <= $self->{nc} and
1136 $self->{nc} <= 0x005A) { # A..Z
1137
1138 $self->{ca}->{name}
1139 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1140 ## Stay in the state
1141
1142 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1143 $self->{line_prev} = $self->{line};
1144 $self->{column_prev} = $self->{column};
1145 $self->{column}++;
1146 $self->{nc}
1147 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1148 } else {
1149 $self->{set_nc}->($self);
1150 }
1151
1152 redo A;
1153 } elsif ($self->{nc} == 0x002F) { # /
1154
1155 $before_leave->();
1156 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1157
1158 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1159 $self->{line_prev} = $self->{line};
1160 $self->{column_prev} = $self->{column};
1161 $self->{column}++;
1162 $self->{nc}
1163 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1164 } else {
1165 $self->{set_nc}->($self);
1166 }
1167
1168 redo A;
1169 } elsif ($self->{nc} == -1) {
1170 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1171 $before_leave->();
1172 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1173
1174 $self->{last_stag_name} = $self->{ct}->{tag_name};
1175 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1176 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1177 if ($self->{ct}->{attributes}) {
1178
1179 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1180 } else {
1181 ## NOTE: This state should never be reached.
1182
1183 }
1184 } else {
1185 die "$0: $self->{ct}->{type}: Unknown token type";
1186 }
1187 $self->{state} = DATA_STATE;
1188 # reconsume
1189
1190 return ($self->{ct}); # start tag or end tag
1191
1192 redo A;
1193 } else {
1194 if ($self->{nc} == 0x0022 or # "
1195 $self->{nc} == 0x0027) { # '
1196
1197 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1198 } else {
1199
1200 }
1201 $self->{ca}->{name} .= chr ($self->{nc});
1202 ## Stay in the state
1203
1204 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1205 $self->{line_prev} = $self->{line};
1206 $self->{column_prev} = $self->{column};
1207 $self->{column}++;
1208 $self->{nc}
1209 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1210 } else {
1211 $self->{set_nc}->($self);
1212 }
1213
1214 redo A;
1215 }
1216 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1217 if ($is_space->{$self->{nc}}) {
1218
1219 ## Stay in the state
1220
1221 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1222 $self->{line_prev} = $self->{line};
1223 $self->{column_prev} = $self->{column};
1224 $self->{column}++;
1225 $self->{nc}
1226 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1227 } else {
1228 $self->{set_nc}->($self);
1229 }
1230
1231 redo A;
1232 } elsif ($self->{nc} == 0x003D) { # =
1233
1234 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1235
1236 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1237 $self->{line_prev} = $self->{line};
1238 $self->{column_prev} = $self->{column};
1239 $self->{column}++;
1240 $self->{nc}
1241 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1242 } else {
1243 $self->{set_nc}->($self);
1244 }
1245
1246 redo A;
1247 } elsif ($self->{nc} == 0x003E) { # >
1248 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1249
1250 $self->{last_stag_name} = $self->{ct}->{tag_name};
1251 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1252 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1253 if ($self->{ct}->{attributes}) {
1254
1255 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1256 } else {
1257 ## NOTE: This state should never be reached.
1258
1259 }
1260 } else {
1261 die "$0: $self->{ct}->{type}: Unknown token type";
1262 }
1263 $self->{state} = DATA_STATE;
1264
1265 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1266 $self->{line_prev} = $self->{line};
1267 $self->{column_prev} = $self->{column};
1268 $self->{column}++;
1269 $self->{nc}
1270 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1271 } else {
1272 $self->{set_nc}->($self);
1273 }
1274
1275
1276 return ($self->{ct}); # start tag or end tag
1277
1278 redo A;
1279 } elsif (0x0041 <= $self->{nc} and
1280 $self->{nc} <= 0x005A) { # A..Z
1281
1282 $self->{ca}
1283 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1284 value => '',
1285 line => $self->{line}, column => $self->{column}};
1286 $self->{state} = ATTRIBUTE_NAME_STATE;
1287
1288 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1289 $self->{line_prev} = $self->{line};
1290 $self->{column_prev} = $self->{column};
1291 $self->{column}++;
1292 $self->{nc}
1293 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1294 } else {
1295 $self->{set_nc}->($self);
1296 }
1297
1298 redo A;
1299 } elsif ($self->{nc} == 0x002F) { # /
1300
1301 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1302
1303 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1304 $self->{line_prev} = $self->{line};
1305 $self->{column_prev} = $self->{column};
1306 $self->{column}++;
1307 $self->{nc}
1308 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1309 } else {
1310 $self->{set_nc}->($self);
1311 }
1312
1313 redo A;
1314 } elsif ($self->{nc} == -1) {
1315 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1316 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1317
1318 $self->{last_stag_name} = $self->{ct}->{tag_name};
1319 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1320 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1321 if ($self->{ct}->{attributes}) {
1322
1323 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1324 } else {
1325 ## NOTE: This state should never be reached.
1326
1327 }
1328 } else {
1329 die "$0: $self->{ct}->{type}: Unknown token type";
1330 }
1331 $self->{state} = DATA_STATE;
1332 # reconsume
1333
1334 return ($self->{ct}); # start tag or end tag
1335
1336 redo A;
1337 } else {
1338 if ($self->{nc} == 0x0022 or # "
1339 $self->{nc} == 0x0027) { # '
1340
1341 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1342 } else {
1343
1344 }
1345 $self->{ca}
1346 = {name => chr ($self->{nc}),
1347 value => '',
1348 line => $self->{line}, column => $self->{column}};
1349 $self->{state} = ATTRIBUTE_NAME_STATE;
1350
1351 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1352 $self->{line_prev} = $self->{line};
1353 $self->{column_prev} = $self->{column};
1354 $self->{column}++;
1355 $self->{nc}
1356 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1357 } else {
1358 $self->{set_nc}->($self);
1359 }
1360
1361 redo A;
1362 }
1363 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1364 if ($is_space->{$self->{nc}}) {
1365
1366 ## Stay in the state
1367
1368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1369 $self->{line_prev} = $self->{line};
1370 $self->{column_prev} = $self->{column};
1371 $self->{column}++;
1372 $self->{nc}
1373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1374 } else {
1375 $self->{set_nc}->($self);
1376 }
1377
1378 redo A;
1379 } elsif ($self->{nc} == 0x0022) { # "
1380
1381 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1382
1383 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1384 $self->{line_prev} = $self->{line};
1385 $self->{column_prev} = $self->{column};
1386 $self->{column}++;
1387 $self->{nc}
1388 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1389 } else {
1390 $self->{set_nc}->($self);
1391 }
1392
1393 redo A;
1394 } elsif ($self->{nc} == 0x0026) { # &
1395
1396 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1397 ## reconsume
1398 redo A;
1399 } elsif ($self->{nc} == 0x0027) { # '
1400
1401 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1402
1403 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1404 $self->{line_prev} = $self->{line};
1405 $self->{column_prev} = $self->{column};
1406 $self->{column}++;
1407 $self->{nc}
1408 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1409 } else {
1410 $self->{set_nc}->($self);
1411 }
1412
1413 redo A;
1414 } elsif ($self->{nc} == 0x003E) { # >
1415 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1416 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1417
1418 $self->{last_stag_name} = $self->{ct}->{tag_name};
1419 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1420 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1421 if ($self->{ct}->{attributes}) {
1422
1423 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1424 } else {
1425 ## NOTE: This state should never be reached.
1426
1427 }
1428 } else {
1429 die "$0: $self->{ct}->{type}: Unknown token type";
1430 }
1431 $self->{state} = DATA_STATE;
1432
1433 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1434 $self->{line_prev} = $self->{line};
1435 $self->{column_prev} = $self->{column};
1436 $self->{column}++;
1437 $self->{nc}
1438 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1439 } else {
1440 $self->{set_nc}->($self);
1441 }
1442
1443
1444 return ($self->{ct}); # start tag or end tag
1445
1446 redo A;
1447 } elsif ($self->{nc} == -1) {
1448 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1449 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1450
1451 $self->{last_stag_name} = $self->{ct}->{tag_name};
1452 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1453 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1454 if ($self->{ct}->{attributes}) {
1455
1456 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1457 } else {
1458 ## NOTE: This state should never be reached.
1459
1460 }
1461 } else {
1462 die "$0: $self->{ct}->{type}: Unknown token type";
1463 }
1464 $self->{state} = DATA_STATE;
1465 ## reconsume
1466
1467 return ($self->{ct}); # start tag or end tag
1468
1469 redo A;
1470 } else {
1471 if ($self->{nc} == 0x003D) { # =
1472
1473 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1474 } else {
1475
1476 }
1477 $self->{ca}->{value} .= chr ($self->{nc});
1478 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1479
1480 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1481 $self->{line_prev} = $self->{line};
1482 $self->{column_prev} = $self->{column};
1483 $self->{column}++;
1484 $self->{nc}
1485 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1486 } else {
1487 $self->{set_nc}->($self);
1488 }
1489
1490 redo A;
1491 }
1492 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1493 if ($self->{nc} == 0x0022) { # "
1494
1495 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1496
1497 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1498 $self->{line_prev} = $self->{line};
1499 $self->{column_prev} = $self->{column};
1500 $self->{column}++;
1501 $self->{nc}
1502 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1503 } else {
1504 $self->{set_nc}->($self);
1505 }
1506
1507 redo A;
1508 } elsif ($self->{nc} == 0x0026) { # &
1509
1510 ## NOTE: In the spec, the tokenizer is switched to the
1511 ## "entity in attribute value state". In this implementation, the
1512 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1513 ## implementation of the "consume a character reference" algorithm.
1514 $self->{prev_state} = $self->{state};
1515 $self->{entity_add} = 0x0022; # "
1516 $self->{state} = ENTITY_STATE;
1517
1518 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1519 $self->{line_prev} = $self->{line};
1520 $self->{column_prev} = $self->{column};
1521 $self->{column}++;
1522 $self->{nc}
1523 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1524 } else {
1525 $self->{set_nc}->($self);
1526 }
1527
1528 redo A;
1529 } elsif ($self->{nc} == -1) {
1530 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1531 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1532
1533 $self->{last_stag_name} = $self->{ct}->{tag_name};
1534 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1535 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1536 if ($self->{ct}->{attributes}) {
1537
1538 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1539 } else {
1540 ## NOTE: This state should never be reached.
1541
1542 }
1543 } else {
1544 die "$0: $self->{ct}->{type}: Unknown token type";
1545 }
1546 $self->{state} = DATA_STATE;
1547 ## reconsume
1548
1549 return ($self->{ct}); # start tag or end tag
1550
1551 redo A;
1552 } else {
1553
1554 $self->{ca}->{value} .= chr ($self->{nc});
1555 $self->{read_until}->($self->{ca}->{value},
1556 q["&],
1557 length $self->{ca}->{value});
1558
1559 ## Stay in the state
1560
1561 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562 $self->{line_prev} = $self->{line};
1563 $self->{column_prev} = $self->{column};
1564 $self->{column}++;
1565 $self->{nc}
1566 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567 } else {
1568 $self->{set_nc}->($self);
1569 }
1570
1571 redo A;
1572 }
1573 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1574 if ($self->{nc} == 0x0027) { # '
1575
1576 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1577
1578 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1579 $self->{line_prev} = $self->{line};
1580 $self->{column_prev} = $self->{column};
1581 $self->{column}++;
1582 $self->{nc}
1583 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1584 } else {
1585 $self->{set_nc}->($self);
1586 }
1587
1588 redo A;
1589 } elsif ($self->{nc} == 0x0026) { # &
1590
1591 ## NOTE: In the spec, the tokenizer is switched to the
1592 ## "entity in attribute value state". In this implementation, the
1593 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1594 ## implementation of the "consume a character reference" algorithm.
1595 $self->{entity_add} = 0x0027; # '
1596 $self->{prev_state} = $self->{state};
1597 $self->{state} = ENTITY_STATE;
1598
1599 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1600 $self->{line_prev} = $self->{line};
1601 $self->{column_prev} = $self->{column};
1602 $self->{column}++;
1603 $self->{nc}
1604 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1605 } else {
1606 $self->{set_nc}->($self);
1607 }
1608
1609 redo A;
1610 } elsif ($self->{nc} == -1) {
1611 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1612 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1613
1614 $self->{last_stag_name} = $self->{ct}->{tag_name};
1615 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1616 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1617 if ($self->{ct}->{attributes}) {
1618
1619 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1620 } else {
1621 ## NOTE: This state should never be reached.
1622
1623 }
1624 } else {
1625 die "$0: $self->{ct}->{type}: Unknown token type";
1626 }
1627 $self->{state} = DATA_STATE;
1628 ## reconsume
1629
1630 return ($self->{ct}); # start tag or end tag
1631
1632 redo A;
1633 } else {
1634
1635 $self->{ca}->{value} .= chr ($self->{nc});
1636 $self->{read_until}->($self->{ca}->{value},
1637 q['&],
1638 length $self->{ca}->{value});
1639
1640 ## Stay in the state
1641
1642 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1643 $self->{line_prev} = $self->{line};
1644 $self->{column_prev} = $self->{column};
1645 $self->{column}++;
1646 $self->{nc}
1647 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1648 } else {
1649 $self->{set_nc}->($self);
1650 }
1651
1652 redo A;
1653 }
1654 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1655 if ($is_space->{$self->{nc}}) {
1656
1657 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1658
1659 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1660 $self->{line_prev} = $self->{line};
1661 $self->{column_prev} = $self->{column};
1662 $self->{column}++;
1663 $self->{nc}
1664 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1665 } else {
1666 $self->{set_nc}->($self);
1667 }
1668
1669 redo A;
1670 } elsif ($self->{nc} == 0x0026) { # &
1671
1672 ## NOTE: In the spec, the tokenizer is switched to the
1673 ## "entity in attribute value state". In this implementation, the
1674 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1675 ## implementation of the "consume a character reference" algorithm.
1676 $self->{entity_add} = -1;
1677 $self->{prev_state} = $self->{state};
1678 $self->{state} = ENTITY_STATE;
1679
1680 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1681 $self->{line_prev} = $self->{line};
1682 $self->{column_prev} = $self->{column};
1683 $self->{column}++;
1684 $self->{nc}
1685 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1686 } else {
1687 $self->{set_nc}->($self);
1688 }
1689
1690 redo A;
1691 } elsif ($self->{nc} == 0x003E) { # >
1692 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1693
1694 $self->{last_stag_name} = $self->{ct}->{tag_name};
1695 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1696 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1697 if ($self->{ct}->{attributes}) {
1698
1699 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1700 } else {
1701 ## NOTE: This state should never be reached.
1702
1703 }
1704 } else {
1705 die "$0: $self->{ct}->{type}: Unknown token type";
1706 }
1707 $self->{state} = DATA_STATE;
1708
1709 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1710 $self->{line_prev} = $self->{line};
1711 $self->{column_prev} = $self->{column};
1712 $self->{column}++;
1713 $self->{nc}
1714 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1715 } else {
1716 $self->{set_nc}->($self);
1717 }
1718
1719
1720 return ($self->{ct}); # start tag or end tag
1721
1722 redo A;
1723 } elsif ($self->{nc} == -1) {
1724 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1725 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1726
1727 $self->{last_stag_name} = $self->{ct}->{tag_name};
1728 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1729 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1730 if ($self->{ct}->{attributes}) {
1731
1732 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1733 } else {
1734 ## NOTE: This state should never be reached.
1735
1736 }
1737 } else {
1738 die "$0: $self->{ct}->{type}: Unknown token type";
1739 }
1740 $self->{state} = DATA_STATE;
1741 ## reconsume
1742
1743 return ($self->{ct}); # start tag or end tag
1744
1745 redo A;
1746 } else {
1747 if ({
1748 0x0022 => 1, # "
1749 0x0027 => 1, # '
1750 0x003D => 1, # =
1751 }->{$self->{nc}}) {
1752
1753 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1754 } else {
1755
1756 }
1757 $self->{ca}->{value} .= chr ($self->{nc});
1758 $self->{read_until}->($self->{ca}->{value},
1759 q["'=& >],
1760 length $self->{ca}->{value});
1761
1762 ## Stay in the state
1763
1764 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1765 $self->{line_prev} = $self->{line};
1766 $self->{column_prev} = $self->{column};
1767 $self->{column}++;
1768 $self->{nc}
1769 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1770 } else {
1771 $self->{set_nc}->($self);
1772 }
1773
1774 redo A;
1775 }
1776 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1777 if ($is_space->{$self->{nc}}) {
1778
1779 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1780
1781 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1782 $self->{line_prev} = $self->{line};
1783 $self->{column_prev} = $self->{column};
1784 $self->{column}++;
1785 $self->{nc}
1786 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1787 } else {
1788 $self->{set_nc}->($self);
1789 }
1790
1791 redo A;
1792 } elsif ($self->{nc} == 0x003E) { # >
1793 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1794
1795 $self->{last_stag_name} = $self->{ct}->{tag_name};
1796 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1797 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1798 if ($self->{ct}->{attributes}) {
1799
1800 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1801 } else {
1802 ## NOTE: This state should never be reached.
1803
1804 }
1805 } else {
1806 die "$0: $self->{ct}->{type}: Unknown token type";
1807 }
1808 $self->{state} = DATA_STATE;
1809
1810 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1811 $self->{line_prev} = $self->{line};
1812 $self->{column_prev} = $self->{column};
1813 $self->{column}++;
1814 $self->{nc}
1815 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1816 } else {
1817 $self->{set_nc}->($self);
1818 }
1819
1820
1821 return ($self->{ct}); # start tag or end tag
1822
1823 redo A;
1824 } elsif ($self->{nc} == 0x002F) { # /
1825
1826 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1827
1828 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1829 $self->{line_prev} = $self->{line};
1830 $self->{column_prev} = $self->{column};
1831 $self->{column}++;
1832 $self->{nc}
1833 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1834 } else {
1835 $self->{set_nc}->($self);
1836 }
1837
1838 redo A;
1839 } elsif ($self->{nc} == -1) {
1840 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1841 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1842
1843 $self->{last_stag_name} = $self->{ct}->{tag_name};
1844 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1845 if ($self->{ct}->{attributes}) {
1846
1847 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1848 } else {
1849 ## NOTE: This state should never be reached.
1850
1851 }
1852 } else {
1853 die "$0: $self->{ct}->{type}: Unknown token type";
1854 }
1855 $self->{state} = DATA_STATE;
1856 ## Reconsume.
1857 return ($self->{ct}); # start tag or end tag
1858 redo A;
1859 } else {
1860
1861 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1862 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1863 ## reconsume
1864 redo A;
1865 }
1866 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1867 if ($self->{nc} == 0x003E) { # >
1868 if ($self->{ct}->{type} == END_TAG_TOKEN) {
1869
1870 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1871 ## TODO: Different type than slash in start tag
1872 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1873 if ($self->{ct}->{attributes}) {
1874
1875 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1876 } else {
1877
1878 }
1879 ## TODO: Test |<title></title/>|
1880 } else {
1881
1882 $self->{self_closing} = 1;
1883 }
1884
1885 $self->{state} = DATA_STATE;
1886
1887 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1888 $self->{line_prev} = $self->{line};
1889 $self->{column_prev} = $self->{column};
1890 $self->{column}++;
1891 $self->{nc}
1892 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1893 } else {
1894 $self->{set_nc}->($self);
1895 }
1896
1897
1898 return ($self->{ct}); # start tag or end tag
1899
1900 redo A;
1901 } elsif ($self->{nc} == -1) {
1902 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1903 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1904
1905 $self->{last_stag_name} = $self->{ct}->{tag_name};
1906 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1907 if ($self->{ct}->{attributes}) {
1908
1909 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1910 } else {
1911 ## NOTE: This state should never be reached.
1912
1913 }
1914 } else {
1915 die "$0: $self->{ct}->{type}: Unknown token type";
1916 }
1917 $self->{state} = DATA_STATE;
1918 ## Reconsume.
1919 return ($self->{ct}); # start tag or end tag
1920 redo A;
1921 } else {
1922
1923 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1924 ## TODO: This error type is wrong.
1925 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1926 ## Reconsume.
1927 redo A;
1928 }
1929 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1930 ## (only happen if PCDATA state)
1931
1932 ## NOTE: Unlike spec's "bogus comment state", this implementation
1933 ## consumes characters one-by-one basis.
1934
1935 if ($self->{nc} == 0x003E) { # >
1936
1937 $self->{state} = DATA_STATE;
1938
1939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1940 $self->{line_prev} = $self->{line};
1941 $self->{column_prev} = $self->{column};
1942 $self->{column}++;
1943 $self->{nc}
1944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1945 } else {
1946 $self->{set_nc}->($self);
1947 }
1948
1949
1950 return ($self->{ct}); # comment
1951 redo A;
1952 } elsif ($self->{nc} == -1) {
1953
1954 $self->{state} = DATA_STATE;
1955 ## reconsume
1956
1957 return ($self->{ct}); # comment
1958 redo A;
1959 } else {
1960
1961 $self->{ct}->{data} .= chr ($self->{nc}); # comment
1962 $self->{read_until}->($self->{ct}->{data},
1963 q[>],
1964 length $self->{ct}->{data});
1965
1966 ## Stay in the state.
1967
1968 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1969 $self->{line_prev} = $self->{line};
1970 $self->{column_prev} = $self->{column};
1971 $self->{column}++;
1972 $self->{nc}
1973 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1974 } else {
1975 $self->{set_nc}->($self);
1976 }
1977
1978 redo A;
1979 }
1980 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1981 ## (only happen if PCDATA state)
1982
1983 if ($self->{nc} == 0x002D) { # -
1984
1985 $self->{state} = MD_HYPHEN_STATE;
1986
1987 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1988 $self->{line_prev} = $self->{line};
1989 $self->{column_prev} = $self->{column};
1990 $self->{column}++;
1991 $self->{nc}
1992 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1993 } else {
1994 $self->{set_nc}->($self);
1995 }
1996
1997 redo A;
1998 } elsif ($self->{nc} == 0x0044 or # D
1999 $self->{nc} == 0x0064) { # d
2000 ## ASCII case-insensitive.
2001
2002 $self->{state} = MD_DOCTYPE_STATE;
2003 $self->{s_kwd} = chr $self->{nc};
2004
2005 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2006 $self->{line_prev} = $self->{line};
2007 $self->{column_prev} = $self->{column};
2008 $self->{column}++;
2009 $self->{nc}
2010 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2011 } else {
2012 $self->{set_nc}->($self);
2013 }
2014
2015 redo A;
2016 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2017 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2018 $self->{is_xml}) and
2019 $self->{nc} == 0x005B) { # [
2020
2021 $self->{state} = MD_CDATA_STATE;
2022 $self->{s_kwd} = '[';
2023
2024 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2025 $self->{line_prev} = $self->{line};
2026 $self->{column_prev} = $self->{column};
2027 $self->{column}++;
2028 $self->{nc}
2029 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2030 } else {
2031 $self->{set_nc}->($self);
2032 }
2033
2034 redo A;
2035 } else {
2036
2037 }
2038
2039 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2040 line => $self->{line_prev},
2041 column => $self->{column_prev} - 1);
2042 ## Reconsume.
2043 $self->{state} = BOGUS_COMMENT_STATE;
2044 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2045 line => $self->{line_prev},
2046 column => $self->{column_prev} - 1,
2047 };
2048 redo A;
2049 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2050 if ($self->{nc} == 0x002D) { # -
2051
2052 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2053 line => $self->{line_prev},
2054 column => $self->{column_prev} - 2,
2055 };
2056 $self->{state} = COMMENT_START_STATE;
2057
2058 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2059 $self->{line_prev} = $self->{line};
2060 $self->{column_prev} = $self->{column};
2061 $self->{column}++;
2062 $self->{nc}
2063 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2064 } else {
2065 $self->{set_nc}->($self);
2066 }
2067
2068 redo A;
2069 } else {
2070
2071 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2072 line => $self->{line_prev},
2073 column => $self->{column_prev} - 2);
2074 $self->{state} = BOGUS_COMMENT_STATE;
2075 ## Reconsume.
2076 $self->{ct} = {type => COMMENT_TOKEN,
2077 data => '-',
2078 line => $self->{line_prev},
2079 column => $self->{column_prev} - 2,
2080 };
2081 redo A;
2082 }
2083 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2084 ## ASCII case-insensitive.
2085 if ($self->{nc} == [
2086 undef,
2087 0x004F, # O
2088 0x0043, # C
2089 0x0054, # T
2090 0x0059, # Y
2091 0x0050, # P
2092 ]->[length $self->{s_kwd}] or
2093 $self->{nc} == [
2094 undef,
2095 0x006F, # o
2096 0x0063, # c
2097 0x0074, # t
2098 0x0079, # y
2099 0x0070, # p
2100 ]->[length $self->{s_kwd}]) {
2101
2102 ## Stay in the state.
2103 $self->{s_kwd} .= chr $self->{nc};
2104
2105 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2106 $self->{line_prev} = $self->{line};
2107 $self->{column_prev} = $self->{column};
2108 $self->{column}++;
2109 $self->{nc}
2110 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2111 } else {
2112 $self->{set_nc}->($self);
2113 }
2114
2115 redo A;
2116 } elsif ((length $self->{s_kwd}) == 6 and
2117 ($self->{nc} == 0x0045 or # E
2118 $self->{nc} == 0x0065)) { # e
2119
2120 $self->{state} = DOCTYPE_STATE;
2121 $self->{ct} = {type => DOCTYPE_TOKEN,
2122 quirks => 1,
2123 line => $self->{line_prev},
2124 column => $self->{column_prev} - 7,
2125 };
2126
2127 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2128 $self->{line_prev} = $self->{line};
2129 $self->{column_prev} = $self->{column};
2130 $self->{column}++;
2131 $self->{nc}
2132 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2133 } else {
2134 $self->{set_nc}->($self);
2135 }
2136
2137 redo A;
2138 } else {
2139
2140 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2141 line => $self->{line_prev},
2142 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2143 $self->{state} = BOGUS_COMMENT_STATE;
2144 ## Reconsume.
2145 $self->{ct} = {type => COMMENT_TOKEN,
2146 data => $self->{s_kwd},
2147 line => $self->{line_prev},
2148 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2149 };
2150 redo A;
2151 }
2152 } elsif ($self->{state} == MD_CDATA_STATE) {
2153 if ($self->{nc} == {
2154 '[' => 0x0043, # C
2155 '[C' => 0x0044, # D
2156 '[CD' => 0x0041, # A
2157 '[CDA' => 0x0054, # T
2158 '[CDAT' => 0x0041, # A
2159 }->{$self->{s_kwd}}) {
2160
2161 ## Stay in the state.
2162 $self->{s_kwd} .= chr $self->{nc};
2163
2164 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2165 $self->{line_prev} = $self->{line};
2166 $self->{column_prev} = $self->{column};
2167 $self->{column}++;
2168 $self->{nc}
2169 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2170 } else {
2171 $self->{set_nc}->($self);
2172 }
2173
2174 redo A;
2175 } elsif ($self->{s_kwd} eq '[CDATA' and
2176 $self->{nc} == 0x005B) { # [
2177
2178 $self->{ct} = {type => CHARACTER_TOKEN,
2179 data => '',
2180 line => $self->{line_prev},
2181 column => $self->{column_prev} - 7};
2182 $self->{state} = CDATA_SECTION_STATE;
2183
2184 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2185 $self->{line_prev} = $self->{line};
2186 $self->{column_prev} = $self->{column};
2187 $self->{column}++;
2188 $self->{nc}
2189 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2190 } else {
2191 $self->{set_nc}->($self);
2192 }
2193
2194 redo A;
2195 } else {
2196
2197 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2198 line => $self->{line_prev},
2199 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2200 $self->{state} = BOGUS_COMMENT_STATE;
2201 ## Reconsume.
2202 $self->{ct} = {type => COMMENT_TOKEN,
2203 data => $self->{s_kwd},
2204 line => $self->{line_prev},
2205 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2206 };
2207 redo A;
2208 }
2209 } elsif ($self->{state} == COMMENT_START_STATE) {
2210 if ($self->{nc} == 0x002D) { # -
2211
2212 $self->{state} = COMMENT_START_DASH_STATE;
2213
2214 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2215 $self->{line_prev} = $self->{line};
2216 $self->{column_prev} = $self->{column};
2217 $self->{column}++;
2218 $self->{nc}
2219 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2220 } else {
2221 $self->{set_nc}->($self);
2222 }
2223
2224 redo A;
2225 } elsif ($self->{nc} == 0x003E) { # >
2226
2227 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2228 $self->{state} = DATA_STATE;
2229
2230 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2231 $self->{line_prev} = $self->{line};
2232 $self->{column_prev} = $self->{column};
2233 $self->{column}++;
2234 $self->{nc}
2235 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2236 } else {
2237 $self->{set_nc}->($self);
2238 }
2239
2240
2241 return ($self->{ct}); # comment
2242
2243 redo A;
2244 } elsif ($self->{nc} == -1) {
2245
2246 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2247 $self->{state} = DATA_STATE;
2248 ## reconsume
2249
2250 return ($self->{ct}); # comment
2251
2252 redo A;
2253 } else {
2254
2255 $self->{ct}->{data} # comment
2256 .= chr ($self->{nc});
2257 $self->{state} = COMMENT_STATE;
2258
2259 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2260 $self->{line_prev} = $self->{line};
2261 $self->{column_prev} = $self->{column};
2262 $self->{column}++;
2263 $self->{nc}
2264 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2265 } else {
2266 $self->{set_nc}->($self);
2267 }
2268
2269 redo A;
2270 }
2271 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2272 if ($self->{nc} == 0x002D) { # -
2273
2274 $self->{state} = COMMENT_END_STATE;
2275
2276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2277 $self->{line_prev} = $self->{line};
2278 $self->{column_prev} = $self->{column};
2279 $self->{column}++;
2280 $self->{nc}
2281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2282 } else {
2283 $self->{set_nc}->($self);
2284 }
2285
2286 redo A;
2287 } elsif ($self->{nc} == 0x003E) { # >
2288
2289 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2290 $self->{state} = DATA_STATE;
2291
2292 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2293 $self->{line_prev} = $self->{line};
2294 $self->{column_prev} = $self->{column};
2295 $self->{column}++;
2296 $self->{nc}
2297 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2298 } else {
2299 $self->{set_nc}->($self);
2300 }
2301
2302
2303 return ($self->{ct}); # comment
2304
2305 redo A;
2306 } elsif ($self->{nc} == -1) {
2307
2308 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2309 $self->{state} = DATA_STATE;
2310 ## reconsume
2311
2312 return ($self->{ct}); # comment
2313
2314 redo A;
2315 } else {
2316
2317 $self->{ct}->{data} # comment
2318 .= '-' . chr ($self->{nc});
2319 $self->{state} = COMMENT_STATE;
2320
2321 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2322 $self->{line_prev} = $self->{line};
2323 $self->{column_prev} = $self->{column};
2324 $self->{column}++;
2325 $self->{nc}
2326 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2327 } else {
2328 $self->{set_nc}->($self);
2329 }
2330
2331 redo A;
2332 }
2333 } elsif ($self->{state} == COMMENT_STATE) {
2334 if ($self->{nc} == 0x002D) { # -
2335
2336 $self->{state} = COMMENT_END_DASH_STATE;
2337
2338 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2339 $self->{line_prev} = $self->{line};
2340 $self->{column_prev} = $self->{column};
2341 $self->{column}++;
2342 $self->{nc}
2343 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2344 } else {
2345 $self->{set_nc}->($self);
2346 }
2347
2348 redo A;
2349 } elsif ($self->{nc} == -1) {
2350
2351 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2352 $self->{state} = DATA_STATE;
2353 ## reconsume
2354
2355 return ($self->{ct}); # comment
2356
2357 redo A;
2358 } else {
2359
2360 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2361 $self->{read_until}->($self->{ct}->{data},
2362 q[-],
2363 length $self->{ct}->{data});
2364
2365 ## Stay in the state
2366
2367 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2368 $self->{line_prev} = $self->{line};
2369 $self->{column_prev} = $self->{column};
2370 $self->{column}++;
2371 $self->{nc}
2372 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2373 } else {
2374 $self->{set_nc}->($self);
2375 }
2376
2377 redo A;
2378 }
2379 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2380 if ($self->{nc} == 0x002D) { # -
2381
2382 $self->{state} = COMMENT_END_STATE;
2383
2384 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2385 $self->{line_prev} = $self->{line};
2386 $self->{column_prev} = $self->{column};
2387 $self->{column}++;
2388 $self->{nc}
2389 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2390 } else {
2391 $self->{set_nc}->($self);
2392 }
2393
2394 redo A;
2395 } elsif ($self->{nc} == -1) {
2396
2397 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2398 $self->{state} = DATA_STATE;
2399 ## reconsume
2400
2401 return ($self->{ct}); # comment
2402
2403 redo A;
2404 } else {
2405
2406 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2407 $self->{state} = COMMENT_STATE;
2408
2409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2410 $self->{line_prev} = $self->{line};
2411 $self->{column_prev} = $self->{column};
2412 $self->{column}++;
2413 $self->{nc}
2414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2415 } else {
2416 $self->{set_nc}->($self);
2417 }
2418
2419 redo A;
2420 }
2421 } elsif ($self->{state} == COMMENT_END_STATE) {
2422 if ($self->{nc} == 0x003E) { # >
2423
2424 $self->{state} = DATA_STATE;
2425
2426 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2427 $self->{line_prev} = $self->{line};
2428 $self->{column_prev} = $self->{column};
2429 $self->{column}++;
2430 $self->{nc}
2431 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2432 } else {
2433 $self->{set_nc}->($self);
2434 }
2435
2436
2437 return ($self->{ct}); # comment
2438
2439 redo A;
2440 } elsif ($self->{nc} == 0x002D) { # -
2441
2442 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2443 line => $self->{line_prev},
2444 column => $self->{column_prev});
2445 $self->{ct}->{data} .= '-'; # comment
2446 ## Stay in the state
2447
2448 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2449 $self->{line_prev} = $self->{line};
2450 $self->{column_prev} = $self->{column};
2451 $self->{column}++;
2452 $self->{nc}
2453 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2454 } else {
2455 $self->{set_nc}->($self);
2456 }
2457
2458 redo A;
2459 } elsif ($self->{nc} == -1) {
2460
2461 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2462 $self->{state} = DATA_STATE;
2463 ## reconsume
2464
2465 return ($self->{ct}); # comment
2466
2467 redo A;
2468 } else {
2469
2470 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2471 line => $self->{line_prev},
2472 column => $self->{column_prev});
2473 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2474 $self->{state} = COMMENT_STATE;
2475
2476 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2477 $self->{line_prev} = $self->{line};
2478 $self->{column_prev} = $self->{column};
2479 $self->{column}++;
2480 $self->{nc}
2481 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2482 } else {
2483 $self->{set_nc}->($self);
2484 }
2485
2486 redo A;
2487 }
2488 } elsif ($self->{state} == DOCTYPE_STATE) {
2489 if ($is_space->{$self->{nc}}) {
2490
2491 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2492
2493 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2494 $self->{line_prev} = $self->{line};
2495 $self->{column_prev} = $self->{column};
2496 $self->{column}++;
2497 $self->{nc}
2498 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2499 } else {
2500 $self->{set_nc}->($self);
2501 }
2502
2503 redo A;
2504 } else {
2505
2506 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2507 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2508 ## reconsume
2509 redo A;
2510 }
2511 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2512 if ($is_space->{$self->{nc}}) {
2513
2514 ## Stay in the state
2515
2516 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2517 $self->{line_prev} = $self->{line};
2518 $self->{column_prev} = $self->{column};
2519 $self->{column}++;
2520 $self->{nc}
2521 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2522 } else {
2523 $self->{set_nc}->($self);
2524 }
2525
2526 redo A;
2527 } elsif ($self->{nc} == 0x003E) { # >
2528
2529 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2530 $self->{state} = DATA_STATE;
2531
2532 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2533 $self->{line_prev} = $self->{line};
2534 $self->{column_prev} = $self->{column};
2535 $self->{column}++;
2536 $self->{nc}
2537 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2538 } else {
2539 $self->{set_nc}->($self);
2540 }
2541
2542
2543 return ($self->{ct}); # DOCTYPE (quirks)
2544
2545 redo A;
2546 } elsif ($self->{nc} == -1) {
2547
2548 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2549 $self->{state} = DATA_STATE;
2550 ## reconsume
2551
2552 return ($self->{ct}); # DOCTYPE (quirks)
2553
2554 redo A;
2555 } else {
2556
2557 $self->{ct}->{name} = chr $self->{nc};
2558 delete $self->{ct}->{quirks};
2559 $self->{state} = DOCTYPE_NAME_STATE;
2560
2561 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2562 $self->{line_prev} = $self->{line};
2563 $self->{column_prev} = $self->{column};
2564 $self->{column}++;
2565 $self->{nc}
2566 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2567 } else {
2568 $self->{set_nc}->($self);
2569 }
2570
2571 redo A;
2572 }
2573 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2574 ## ISSUE: Redundant "First," in the spec.
2575 if ($is_space->{$self->{nc}}) {
2576
2577 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2578
2579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2580 $self->{line_prev} = $self->{line};
2581 $self->{column_prev} = $self->{column};
2582 $self->{column}++;
2583 $self->{nc}
2584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2585 } else {
2586 $self->{set_nc}->($self);
2587 }
2588
2589 redo A;
2590 } elsif ($self->{nc} == 0x003E) { # >
2591
2592 $self->{state} = DATA_STATE;
2593
2594 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2595 $self->{line_prev} = $self->{line};
2596 $self->{column_prev} = $self->{column};
2597 $self->{column}++;
2598 $self->{nc}
2599 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2600 } else {
2601 $self->{set_nc}->($self);
2602 }
2603
2604
2605 return ($self->{ct}); # DOCTYPE
2606
2607 redo A;
2608 } elsif ($self->{nc} == -1) {
2609
2610 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2611 $self->{state} = DATA_STATE;
2612 ## reconsume
2613
2614 $self->{ct}->{quirks} = 1;
2615 return ($self->{ct}); # DOCTYPE
2616
2617 redo A;
2618 } else {
2619
2620 $self->{ct}->{name}
2621 .= chr ($self->{nc}); # DOCTYPE
2622 ## Stay in the state
2623
2624 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2625 $self->{line_prev} = $self->{line};
2626 $self->{column_prev} = $self->{column};
2627 $self->{column}++;
2628 $self->{nc}
2629 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2630 } else {
2631 $self->{set_nc}->($self);
2632 }
2633
2634 redo A;
2635 }
2636 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2637 if ($is_space->{$self->{nc}}) {
2638
2639 ## Stay in the state
2640
2641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2642 $self->{line_prev} = $self->{line};
2643 $self->{column_prev} = $self->{column};
2644 $self->{column}++;
2645 $self->{nc}
2646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2647 } else {
2648 $self->{set_nc}->($self);
2649 }
2650
2651 redo A;
2652 } elsif ($self->{nc} == 0x003E) { # >
2653
2654 $self->{state} = DATA_STATE;
2655
2656 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2657 $self->{line_prev} = $self->{line};
2658 $self->{column_prev} = $self->{column};
2659 $self->{column}++;
2660 $self->{nc}
2661 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2662 } else {
2663 $self->{set_nc}->($self);
2664 }
2665
2666
2667 return ($self->{ct}); # DOCTYPE
2668
2669 redo A;
2670 } elsif ($self->{nc} == -1) {
2671
2672 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2673 $self->{state} = DATA_STATE;
2674 ## reconsume
2675
2676 $self->{ct}->{quirks} = 1;
2677 return ($self->{ct}); # DOCTYPE
2678
2679 redo A;
2680 } elsif ($self->{nc} == 0x0050 or # P
2681 $self->{nc} == 0x0070) { # p
2682 $self->{state} = PUBLIC_STATE;
2683 $self->{s_kwd} = chr $self->{nc};
2684
2685 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2686 $self->{line_prev} = $self->{line};
2687 $self->{column_prev} = $self->{column};
2688 $self->{column}++;
2689 $self->{nc}
2690 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2691 } else {
2692 $self->{set_nc}->($self);
2693 }
2694
2695 redo A;
2696 } elsif ($self->{nc} == 0x0053 or # S
2697 $self->{nc} == 0x0073) { # s
2698 $self->{state} = SYSTEM_STATE;
2699 $self->{s_kwd} = chr $self->{nc};
2700
2701 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2702 $self->{line_prev} = $self->{line};
2703 $self->{column_prev} = $self->{column};
2704 $self->{column}++;
2705 $self->{nc}
2706 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2707 } else {
2708 $self->{set_nc}->($self);
2709 }
2710
2711 redo A;
2712 } else {
2713
2714 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2715 $self->{ct}->{quirks} = 1;
2716
2717 $self->{state} = BOGUS_DOCTYPE_STATE;
2718
2719 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2720 $self->{line_prev} = $self->{line};
2721 $self->{column_prev} = $self->{column};
2722 $self->{column}++;
2723 $self->{nc}
2724 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2725 } else {
2726 $self->{set_nc}->($self);
2727 }
2728
2729 redo A;
2730 }
2731 } elsif ($self->{state} == PUBLIC_STATE) {
2732 ## ASCII case-insensitive
2733 if ($self->{nc} == [
2734 undef,
2735 0x0055, # U
2736 0x0042, # B
2737 0x004C, # L
2738 0x0049, # I
2739 ]->[length $self->{s_kwd}] or
2740 $self->{nc} == [
2741 undef,
2742 0x0075, # u
2743 0x0062, # b
2744 0x006C, # l
2745 0x0069, # i
2746 ]->[length $self->{s_kwd}]) {
2747
2748 ## Stay in the state.
2749 $self->{s_kwd} .= chr $self->{nc};
2750
2751 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2752 $self->{line_prev} = $self->{line};
2753 $self->{column_prev} = $self->{column};
2754 $self->{column}++;
2755 $self->{nc}
2756 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2757 } else {
2758 $self->{set_nc}->($self);
2759 }
2760
2761 redo A;
2762 } elsif ((length $self->{s_kwd}) == 5 and
2763 ($self->{nc} == 0x0043 or # C
2764 $self->{nc} == 0x0063)) { # c
2765
2766 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2767
2768 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2769 $self->{line_prev} = $self->{line};
2770 $self->{column_prev} = $self->{column};
2771 $self->{column}++;
2772 $self->{nc}
2773 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2774 } else {
2775 $self->{set_nc}->($self);
2776 }
2777
2778 redo A;
2779 } else {
2780
2781 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2782 line => $self->{line_prev},
2783 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2784 $self->{ct}->{quirks} = 1;
2785
2786 $self->{state} = BOGUS_DOCTYPE_STATE;
2787 ## Reconsume.
2788 redo A;
2789 }
2790 } elsif ($self->{state} == SYSTEM_STATE) {
2791 ## ASCII case-insensitive
2792 if ($self->{nc} == [
2793 undef,
2794 0x0059, # Y
2795 0x0053, # S
2796 0x0054, # T
2797 0x0045, # E
2798 ]->[length $self->{s_kwd}] or
2799 $self->{nc} == [
2800 undef,
2801 0x0079, # y
2802 0x0073, # s
2803 0x0074, # t
2804 0x0065, # e
2805 ]->[length $self->{s_kwd}]) {
2806
2807 ## Stay in the state.
2808 $self->{s_kwd} .= chr $self->{nc};
2809
2810 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2811 $self->{line_prev} = $self->{line};
2812 $self->{column_prev} = $self->{column};
2813 $self->{column}++;
2814 $self->{nc}
2815 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2816 } else {
2817 $self->{set_nc}->($self);
2818 }
2819
2820 redo A;
2821 } elsif ((length $self->{s_kwd}) == 5 and
2822 ($self->{nc} == 0x004D or # M
2823 $self->{nc} == 0x006D)) { # m
2824
2825 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2826
2827 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2828 $self->{line_prev} = $self->{line};
2829 $self->{column_prev} = $self->{column};
2830 $self->{column}++;
2831 $self->{nc}
2832 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2833 } else {
2834 $self->{set_nc}->($self);
2835 }
2836
2837 redo A;
2838 } else {
2839
2840 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2841 line => $self->{line_prev},
2842 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2843 $self->{ct}->{quirks} = 1;
2844
2845 $self->{state} = BOGUS_DOCTYPE_STATE;
2846 ## Reconsume.
2847 redo A;
2848 }
2849 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2850 if ($is_space->{$self->{nc}}) {
2851
2852 ## Stay in the state
2853
2854 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2855 $self->{line_prev} = $self->{line};
2856 $self->{column_prev} = $self->{column};
2857 $self->{column}++;
2858 $self->{nc}
2859 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2860 } else {
2861 $self->{set_nc}->($self);
2862 }
2863
2864 redo A;
2865 } elsif ($self->{nc} eq 0x0022) { # "
2866
2867 $self->{ct}->{pubid} = ''; # DOCTYPE
2868 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2869
2870 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2871 $self->{line_prev} = $self->{line};
2872 $self->{column_prev} = $self->{column};
2873 $self->{column}++;
2874 $self->{nc}
2875 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2876 } else {
2877 $self->{set_nc}->($self);
2878 }
2879
2880 redo A;
2881 } elsif ($self->{nc} eq 0x0027) { # '
2882
2883 $self->{ct}->{pubid} = ''; # DOCTYPE
2884 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2885
2886 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2887 $self->{line_prev} = $self->{line};
2888 $self->{column_prev} = $self->{column};
2889 $self->{column}++;
2890 $self->{nc}
2891 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2892 } else {
2893 $self->{set_nc}->($self);
2894 }
2895
2896 redo A;
2897 } elsif ($self->{nc} eq 0x003E) { # >
2898
2899 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2900
2901 $self->{state} = DATA_STATE;
2902
2903 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2904 $self->{line_prev} = $self->{line};
2905 $self->{column_prev} = $self->{column};
2906 $self->{column}++;
2907 $self->{nc}
2908 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2909 } else {
2910 $self->{set_nc}->($self);
2911 }
2912
2913
2914 $self->{ct}->{quirks} = 1;
2915 return ($self->{ct}); # DOCTYPE
2916
2917 redo A;
2918 } elsif ($self->{nc} == -1) {
2919
2920 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2921
2922 $self->{state} = DATA_STATE;
2923 ## reconsume
2924
2925 $self->{ct}->{quirks} = 1;
2926 return ($self->{ct}); # DOCTYPE
2927
2928 redo A;
2929 } else {
2930
2931 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
2932 $self->{ct}->{quirks} = 1;
2933
2934 $self->{state} = BOGUS_DOCTYPE_STATE;
2935
2936 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2937 $self->{line_prev} = $self->{line};
2938 $self->{column_prev} = $self->{column};
2939 $self->{column}++;
2940 $self->{nc}
2941 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2942 } else {
2943 $self->{set_nc}->($self);
2944 }
2945
2946 redo A;
2947 }
2948 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2949 if ($self->{nc} == 0x0022) { # "
2950
2951 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2952
2953 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2954 $self->{line_prev} = $self->{line};
2955 $self->{column_prev} = $self->{column};
2956 $self->{column}++;
2957 $self->{nc}
2958 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2959 } else {
2960 $self->{set_nc}->($self);
2961 }
2962
2963 redo A;
2964 } elsif ($self->{nc} == 0x003E) { # >
2965
2966 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2967
2968 $self->{state} = DATA_STATE;
2969
2970 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2971 $self->{line_prev} = $self->{line};
2972 $self->{column_prev} = $self->{column};
2973 $self->{column}++;
2974 $self->{nc}
2975 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2976 } else {
2977 $self->{set_nc}->($self);
2978 }
2979
2980
2981 $self->{ct}->{quirks} = 1;
2982 return ($self->{ct}); # DOCTYPE
2983
2984 redo A;
2985 } elsif ($self->{nc} == -1) {
2986
2987 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2988
2989 $self->{state} = DATA_STATE;
2990 ## reconsume
2991
2992 $self->{ct}->{quirks} = 1;
2993 return ($self->{ct}); # DOCTYPE
2994
2995 redo A;
2996 } else {
2997
2998 $self->{ct}->{pubid} # DOCTYPE
2999 .= chr $self->{nc};
3000 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3001 length $self->{ct}->{pubid});
3002
3003 ## Stay in the state
3004
3005 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3006 $self->{line_prev} = $self->{line};
3007 $self->{column_prev} = $self->{column};
3008 $self->{column}++;
3009 $self->{nc}
3010 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3011 } else {
3012 $self->{set_nc}->($self);
3013 }
3014
3015 redo A;
3016 }
3017 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3018 if ($self->{nc} == 0x0027) { # '
3019
3020 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3021
3022 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3023 $self->{line_prev} = $self->{line};
3024 $self->{column_prev} = $self->{column};
3025 $self->{column}++;
3026 $self->{nc}
3027 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3028 } else {
3029 $self->{set_nc}->($self);
3030 }
3031
3032 redo A;
3033 } elsif ($self->{nc} == 0x003E) { # >
3034
3035 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3036
3037 $self->{state} = DATA_STATE;
3038
3039 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3040 $self->{line_prev} = $self->{line};
3041 $self->{column_prev} = $self->{column};
3042 $self->{column}++;
3043 $self->{nc}
3044 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3045 } else {
3046 $self->{set_nc}->($self);
3047 }
3048
3049
3050 $self->{ct}->{quirks} = 1;
3051 return ($self->{ct}); # DOCTYPE
3052
3053 redo A;
3054 } elsif ($self->{nc} == -1) {
3055
3056 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3057
3058 $self->{state} = DATA_STATE;
3059 ## reconsume
3060
3061 $self->{ct}->{quirks} = 1;
3062 return ($self->{ct}); # DOCTYPE
3063
3064 redo A;
3065 } else {
3066
3067 $self->{ct}->{pubid} # DOCTYPE
3068 .= chr $self->{nc};
3069 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3070 length $self->{ct}->{pubid});
3071
3072 ## Stay in the state
3073
3074 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3075 $self->{line_prev} = $self->{line};
3076 $self->{column_prev} = $self->{column};
3077 $self->{column}++;
3078 $self->{nc}
3079 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3080 } else {
3081 $self->{set_nc}->($self);
3082 }
3083
3084 redo A;
3085 }
3086 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3087 if ($is_space->{$self->{nc}}) {
3088
3089 ## Stay in the state
3090
3091 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3092 $self->{line_prev} = $self->{line};
3093 $self->{column_prev} = $self->{column};
3094 $self->{column}++;
3095 $self->{nc}
3096 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3097 } else {
3098 $self->{set_nc}->($self);
3099 }
3100
3101 redo A;
3102 } elsif ($self->{nc} == 0x0022) { # "
3103
3104 $self->{ct}->{sysid} = ''; # DOCTYPE
3105 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3106
3107 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3108 $self->{line_prev} = $self->{line};
3109 $self->{column_prev} = $self->{column};
3110 $self->{column}++;
3111 $self->{nc}
3112 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3113 } else {
3114 $self->{set_nc}->($self);
3115 }
3116
3117 redo A;
3118 } elsif ($self->{nc} == 0x0027) { # '
3119
3120 $self->{ct}->{sysid} = ''; # DOCTYPE
3121 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3122
3123 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3124 $self->{line_prev} = $self->{line};
3125 $self->{column_prev} = $self->{column};
3126 $self->{column}++;
3127 $self->{nc}
3128 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3129 } else {
3130 $self->{set_nc}->($self);
3131 }
3132
3133 redo A;
3134 } elsif ($self->{nc} == 0x003E) { # >
3135
3136 $self->{state} = DATA_STATE;
3137
3138 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3139 $self->{line_prev} = $self->{line};
3140 $self->{column_prev} = $self->{column};
3141 $self->{column}++;
3142 $self->{nc}
3143 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3144 } else {
3145 $self->{set_nc}->($self);
3146 }
3147
3148
3149 return ($self->{ct}); # DOCTYPE
3150
3151 redo A;
3152 } elsif ($self->{nc} == -1) {
3153
3154 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3155
3156 $self->{state} = DATA_STATE;
3157 ## reconsume
3158
3159 $self->{ct}->{quirks} = 1;
3160 return ($self->{ct}); # DOCTYPE
3161
3162 redo A;
3163 } else {
3164
3165 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3166 $self->{ct}->{quirks} = 1;
3167
3168 $self->{state} = BOGUS_DOCTYPE_STATE;
3169
3170 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3171 $self->{line_prev} = $self->{line};
3172 $self->{column_prev} = $self->{column};
3173 $self->{column}++;
3174 $self->{nc}
3175 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3176 } else {
3177 $self->{set_nc}->($self);
3178 }
3179
3180 redo A;
3181 }
3182 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3183 if ($is_space->{$self->{nc}}) {
3184
3185 ## Stay in the state
3186
3187 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3188 $self->{line_prev} = $self->{line};
3189 $self->{column_prev} = $self->{column};
3190 $self->{column}++;
3191 $self->{nc}
3192 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3193 } else {
3194 $self->{set_nc}->($self);
3195 }
3196
3197 redo A;
3198 } elsif ($self->{nc} == 0x0022) { # "
3199
3200 $self->{ct}->{sysid} = ''; # DOCTYPE
3201 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3202
3203 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3204 $self->{line_prev} = $self->{line};
3205 $self->{column_prev} = $self->{column};
3206 $self->{column}++;
3207 $self->{nc}
3208 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3209 } else {
3210 $self->{set_nc}->($self);
3211 }
3212
3213 redo A;
3214 } elsif ($self->{nc} == 0x0027) { # '
3215
3216 $self->{ct}->{sysid} = ''; # DOCTYPE
3217 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3218
3219 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3220 $self->{line_prev} = $self->{line};
3221 $self->{column_prev} = $self->{column};
3222 $self->{column}++;
3223 $self->{nc}
3224 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3225 } else {
3226 $self->{set_nc}->($self);
3227 }
3228
3229 redo A;
3230 } elsif ($self->{nc} == 0x003E) { # >
3231
3232 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3233 $self->{state} = DATA_STATE;
3234
3235 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3236 $self->{line_prev} = $self->{line};
3237 $self->{column_prev} = $self->{column};
3238 $self->{column}++;
3239 $self->{nc}
3240 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3241 } else {
3242 $self->{set_nc}->($self);
3243 }
3244
3245
3246 $self->{ct}->{quirks} = 1;
3247 return ($self->{ct}); # DOCTYPE
3248
3249 redo A;
3250 } elsif ($self->{nc} == -1) {
3251
3252 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3253
3254 $self->{state} = DATA_STATE;
3255 ## reconsume
3256
3257 $self->{ct}->{quirks} = 1;
3258 return ($self->{ct}); # DOCTYPE
3259
3260 redo A;
3261 } else {
3262
3263 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3264 $self->{ct}->{quirks} = 1;
3265
3266 $self->{state} = BOGUS_DOCTYPE_STATE;
3267
3268 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3269 $self->{line_prev} = $self->{line};
3270 $self->{column_prev} = $self->{column};
3271 $self->{column}++;
3272 $self->{nc}
3273 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3274 } else {
3275 $self->{set_nc}->($self);
3276 }
3277
3278 redo A;
3279 }
3280 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3281 if ($self->{nc} == 0x0022) { # "
3282
3283 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3284
3285 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3286 $self->{line_prev} = $self->{line};
3287 $self->{column_prev} = $self->{column};
3288 $self->{column}++;
3289 $self->{nc}
3290 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3291 } else {
3292 $self->{set_nc}->($self);
3293 }
3294
3295 redo A;
3296 } elsif ($self->{nc} == 0x003E) { # >
3297
3298 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3299
3300 $self->{state} = DATA_STATE;
3301
3302 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3303 $self->{line_prev} = $self->{line};
3304 $self->{column_prev} = $self->{column};
3305 $self->{column}++;
3306 $self->{nc}
3307 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3308 } else {
3309 $self->{set_nc}->($self);
3310 }
3311
3312
3313 $self->{ct}->{quirks} = 1;
3314 return ($self->{ct}); # DOCTYPE
3315
3316 redo A;
3317 } elsif ($self->{nc} == -1) {
3318
3319 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3320
3321 $self->{state} = DATA_STATE;
3322 ## reconsume
3323
3324 $self->{ct}->{quirks} = 1;
3325 return ($self->{ct}); # DOCTYPE
3326
3327 redo A;
3328 } else {
3329
3330 $self->{ct}->{sysid} # DOCTYPE
3331 .= chr $self->{nc};
3332 $self->{read_until}->($self->{ct}->{sysid}, q[">],
3333 length $self->{ct}->{sysid});
3334
3335 ## Stay in the state
3336
3337 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3338 $self->{line_prev} = $self->{line};
3339 $self->{column_prev} = $self->{column};
3340 $self->{column}++;
3341 $self->{nc}
3342 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3343 } else {
3344 $self->{set_nc}->($self);
3345 }
3346
3347 redo A;
3348 }
3349 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3350 if ($self->{nc} == 0x0027) { # '
3351
3352 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3353
3354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3355 $self->{line_prev} = $self->{line};
3356 $self->{column_prev} = $self->{column};
3357 $self->{column}++;
3358 $self->{nc}
3359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3360 } else {
3361 $self->{set_nc}->($self);
3362 }
3363
3364 redo A;
3365 } elsif ($self->{nc} == 0x003E) { # >
3366
3367 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3368
3369 $self->{state} = DATA_STATE;
3370
3371 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3372 $self->{line_prev} = $self->{line};
3373 $self->{column_prev} = $self->{column};
3374 $self->{column}++;
3375 $self->{nc}
3376 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3377 } else {
3378 $self->{set_nc}->($self);
3379 }
3380
3381
3382 $self->{ct}->{quirks} = 1;
3383 return ($self->{ct}); # DOCTYPE
3384
3385 redo A;
3386 } elsif ($self->{nc} == -1) {
3387
3388 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3389
3390 $self->{state} = DATA_STATE;
3391 ## reconsume
3392
3393 $self->{ct}->{quirks} = 1;
3394 return ($self->{ct}); # DOCTYPE
3395
3396 redo A;
3397 } else {
3398
3399 $self->{ct}->{sysid} # DOCTYPE
3400 .= chr $self->{nc};
3401 $self->{read_until}->($self->{ct}->{sysid}, q['>],
3402 length $self->{ct}->{sysid});
3403
3404 ## Stay in the state
3405
3406 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3407 $self->{line_prev} = $self->{line};
3408 $self->{column_prev} = $self->{column};
3409 $self->{column}++;
3410 $self->{nc}
3411 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3412 } else {
3413 $self->{set_nc}->($self);
3414 }
3415
3416 redo A;
3417 }
3418 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3419 if ($is_space->{$self->{nc}}) {
3420
3421 ## Stay in the state
3422
3423 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3424 $self->{line_prev} = $self->{line};
3425 $self->{column_prev} = $self->{column};
3426 $self->{column}++;
3427 $self->{nc}
3428 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3429 } else {
3430 $self->{set_nc}->($self);
3431 }
3432
3433 redo A;
3434 } elsif ($self->{nc} == 0x003E) { # >
3435
3436 $self->{state} = DATA_STATE;
3437
3438 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3439 $self->{line_prev} = $self->{line};
3440 $self->{column_prev} = $self->{column};
3441 $self->{column}++;
3442 $self->{nc}
3443 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3444 } else {
3445 $self->{set_nc}->($self);
3446 }
3447
3448
3449 return ($self->{ct}); # DOCTYPE
3450
3451 redo A;
3452 } elsif ($self->{nc} == -1) {
3453
3454 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3455 $self->{state} = DATA_STATE;
3456 ## reconsume
3457
3458 $self->{ct}->{quirks} = 1;
3459 return ($self->{ct}); # DOCTYPE
3460
3461 redo A;
3462 } else {
3463
3464 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3465 #$self->{ct}->{quirks} = 1;
3466
3467 $self->{state} = BOGUS_DOCTYPE_STATE;
3468
3469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3470 $self->{line_prev} = $self->{line};
3471 $self->{column_prev} = $self->{column};
3472 $self->{column}++;
3473 $self->{nc}
3474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3475 } else {
3476 $self->{set_nc}->($self);
3477 }
3478
3479 redo A;
3480 }
3481 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3482 if ($self->{nc} == 0x003E) { # >
3483
3484 $self->{state} = DATA_STATE;
3485
3486 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3487 $self->{line_prev} = $self->{line};
3488 $self->{column_prev} = $self->{column};
3489 $self->{column}++;
3490 $self->{nc}
3491 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3492 } else {
3493 $self->{set_nc}->($self);
3494 }
3495
3496
3497 return ($self->{ct}); # DOCTYPE
3498
3499 redo A;
3500 } elsif ($self->{nc} == -1) {
3501
3502 $self->{state} = DATA_STATE;
3503 ## reconsume
3504
3505 return ($self->{ct}); # DOCTYPE
3506
3507 redo A;
3508 } else {
3509
3510 my $s = '';
3511 $self->{read_until}->($s, q[>], 0);
3512
3513 ## Stay in the state
3514
3515 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3516 $self->{line_prev} = $self->{line};
3517 $self->{column_prev} = $self->{column};
3518 $self->{column}++;
3519 $self->{nc}
3520 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3521 } else {
3522 $self->{set_nc}->($self);
3523 }
3524
3525 redo A;
3526 }
3527 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3528 ## NOTE: "CDATA section state" in the state is jointly implemented
3529 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3530 ## and |CDATA_SECTION_MSE2_STATE|.
3531
3532 if ($self->{nc} == 0x005D) { # ]
3533
3534 $self->{state} = CDATA_SECTION_MSE1_STATE;
3535
3536 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3537 $self->{line_prev} = $self->{line};
3538 $self->{column_prev} = $self->{column};
3539 $self->{column}++;
3540 $self->{nc}
3541 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3542 } else {
3543 $self->{set_nc}->($self);
3544 }
3545
3546 redo A;
3547 } elsif ($self->{nc} == -1) {
3548 $self->{state} = DATA_STATE;
3549
3550 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3551 $self->{line_prev} = $self->{line};
3552 $self->{column_prev} = $self->{column};
3553 $self->{column}++;
3554 $self->{nc}
3555 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3556 } else {
3557 $self->{set_nc}->($self);
3558 }
3559
3560 if (length $self->{ct}->{data}) { # character
3561
3562 return ($self->{ct}); # character
3563 } else {
3564
3565 ## No token to emit. $self->{ct} is discarded.
3566 }
3567 redo A;
3568 } else {
3569
3570 $self->{ct}->{data} .= chr $self->{nc};
3571 $self->{read_until}->($self->{ct}->{data},
3572 q<]>,
3573 length $self->{ct}->{data});
3574
3575 ## Stay in the state.
3576
3577 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3578 $self->{line_prev} = $self->{line};
3579 $self->{column_prev} = $self->{column};
3580 $self->{column}++;
3581 $self->{nc}
3582 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3583 } else {
3584 $self->{set_nc}->($self);
3585 }
3586
3587 redo A;
3588 }
3589
3590 ## ISSUE: "text tokens" in spec.
3591 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3592 if ($self->{nc} == 0x005D) { # ]
3593
3594 $self->{state} = CDATA_SECTION_MSE2_STATE;
3595
3596 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3597 $self->{line_prev} = $self->{line};
3598 $self->{column_prev} = $self->{column};
3599 $self->{column}++;
3600 $self->{nc}
3601 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3602 } else {
3603 $self->{set_nc}->($self);
3604 }
3605
3606 redo A;
3607 } else {
3608
3609 $self->{ct}->{data} .= ']';
3610 $self->{state} = CDATA_SECTION_STATE;
3611 ## Reconsume.
3612 redo A;
3613 }
3614 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3615 if ($self->{nc} == 0x003E) { # >
3616 $self->{state} = DATA_STATE;
3617
3618 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3619 $self->{line_prev} = $self->{line};
3620 $self->{column_prev} = $self->{column};
3621 $self->{column}++;
3622 $self->{nc}
3623 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3624 } else {
3625 $self->{set_nc}->($self);
3626 }
3627
3628 if (length $self->{ct}->{data}) { # character
3629
3630 return ($self->{ct}); # character
3631 } else {
3632
3633 ## No token to emit. $self->{ct} is discarded.
3634 }
3635 redo A;
3636 } elsif ($self->{nc} == 0x005D) { # ]
3637 # character
3638 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3639 ## Stay in the state.
3640
3641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3642 $self->{line_prev} = $self->{line};
3643 $self->{column_prev} = $self->{column};
3644 $self->{column}++;
3645 $self->{nc}
3646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3647 } else {
3648 $self->{set_nc}->($self);
3649 }
3650
3651 redo A;
3652 } else {
3653
3654 $self->{ct}->{data} .= ']]'; # character
3655 $self->{state} = CDATA_SECTION_STATE;
3656 ## Reconsume.
3657 redo A;
3658 }
3659 } elsif ($self->{state} == ENTITY_STATE) {
3660 if ($is_space->{$self->{nc}} or
3661 {
3662 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3663 $self->{entity_add} => 1,
3664 }->{$self->{nc}}) {
3665
3666 ## Don't consume
3667 ## No error
3668 ## Return nothing.
3669 #
3670 } elsif ($self->{nc} == 0x0023) { # #
3671
3672 $self->{state} = ENTITY_HASH_STATE;
3673 $self->{s_kwd} = '#';
3674
3675 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3676 $self->{line_prev} = $self->{line};
3677 $self->{column_prev} = $self->{column};
3678 $self->{column}++;
3679 $self->{nc}
3680 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3681 } else {
3682 $self->{set_nc}->($self);
3683 }
3684
3685 redo A;
3686 } elsif ((0x0041 <= $self->{nc} and
3687 $self->{nc} <= 0x005A) or # A..Z
3688 (0x0061 <= $self->{nc} and
3689 $self->{nc} <= 0x007A)) { # a..z
3690
3691 require Whatpm::_NamedEntityList;
3692 $self->{state} = ENTITY_NAME_STATE;
3693 $self->{s_kwd} = chr $self->{nc};
3694 $self->{entity__value} = $self->{s_kwd};
3695 $self->{entity__match} = 0;
3696
3697 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3698 $self->{line_prev} = $self->{line};
3699 $self->{column_prev} = $self->{column};
3700 $self->{column}++;
3701 $self->{nc}
3702 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3703 } else {
3704 $self->{set_nc}->($self);
3705 }
3706
3707 redo A;
3708 } else {
3709
3710 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3711 ## Return nothing.
3712 #
3713 }
3714
3715 ## NOTE: No character is consumed by the "consume a character
3716 ## reference" algorithm. In other word, there is an "&" character
3717 ## that does not introduce a character reference, which would be
3718 ## appended to the parent element or the attribute value in later
3719 ## process of the tokenizer.
3720
3721 if ($self->{prev_state} == DATA_STATE) {
3722
3723 $self->{state} = $self->{prev_state};
3724 ## Reconsume.
3725 return ({type => CHARACTER_TOKEN, data => '&',
3726 line => $self->{line_prev},
3727 column => $self->{column_prev},
3728 });
3729 redo A;
3730 } else {
3731
3732 $self->{ca}->{value} .= '&';
3733 $self->{state} = $self->{prev_state};
3734 ## Reconsume.
3735 redo A;
3736 }
3737 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3738 if ($self->{nc} == 0x0078 or # x
3739 $self->{nc} == 0x0058) { # X
3740
3741 $self->{state} = HEXREF_X_STATE;
3742 $self->{s_kwd} .= chr $self->{nc};
3743
3744 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745 $self->{line_prev} = $self->{line};
3746 $self->{column_prev} = $self->{column};
3747 $self->{column}++;
3748 $self->{nc}
3749 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750 } else {
3751 $self->{set_nc}->($self);
3752 }
3753
3754 redo A;
3755 } elsif (0x0030 <= $self->{nc} and
3756 $self->{nc} <= 0x0039) { # 0..9
3757
3758 $self->{state} = NCR_NUM_STATE;
3759 $self->{s_kwd} = $self->{nc} - 0x0030;
3760
3761 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3762 $self->{line_prev} = $self->{line};
3763 $self->{column_prev} = $self->{column};
3764 $self->{column}++;
3765 $self->{nc}
3766 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3767 } else {
3768 $self->{set_nc}->($self);
3769 }
3770
3771 redo A;
3772 } else {
3773 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3774 line => $self->{line_prev},
3775 column => $self->{column_prev} - 1);
3776
3777 ## NOTE: According to the spec algorithm, nothing is returned,
3778 ## and then "&#" is appended to the parent element or the attribute
3779 ## value in the later processing.
3780
3781 if ($self->{prev_state} == DATA_STATE) {
3782
3783 $self->{state} = $self->{prev_state};
3784 ## Reconsume.
3785 return ({type => CHARACTER_TOKEN,
3786 data => '&#',
3787 line => $self->{line_prev},
3788 column => $self->{column_prev} - 1,
3789 });
3790 redo A;
3791 } else {
3792
3793 $self->{ca}->{value} .= '&#';
3794 $self->{state} = $self->{prev_state};
3795 ## Reconsume.
3796 redo A;
3797 }
3798 }
3799 } elsif ($self->{state} == NCR_NUM_STATE) {
3800 if (0x0030 <= $self->{nc} and
3801 $self->{nc} <= 0x0039) { # 0..9
3802
3803 $self->{s_kwd} *= 10;
3804 $self->{s_kwd} += $self->{nc} - 0x0030;
3805
3806 ## Stay in the state.
3807
3808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3809 $self->{line_prev} = $self->{line};
3810 $self->{column_prev} = $self->{column};
3811 $self->{column}++;
3812 $self->{nc}
3813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3814 } else {
3815 $self->{set_nc}->($self);
3816 }
3817
3818 redo A;
3819 } elsif ($self->{nc} == 0x003B) { # ;
3820
3821
3822 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3823 $self->{line_prev} = $self->{line};
3824 $self->{column_prev} = $self->{column};
3825 $self->{column}++;
3826 $self->{nc}
3827 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3828 } else {
3829 $self->{set_nc}->($self);
3830 }
3831
3832 #
3833 } else {
3834
3835 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3836 ## Reconsume.
3837 #
3838 }
3839
3840 my $code = $self->{s_kwd};
3841 my $l = $self->{line_prev};
3842 my $c = $self->{column_prev};
3843 if ($charref_map->{$code}) {
3844
3845 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3846 text => (sprintf 'U+%04X', $code),
3847 line => $l, column => $c);
3848 $code = $charref_map->{$code};
3849 } elsif ($code > 0x10FFFF) {
3850
3851 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3852 text => (sprintf 'U-%08X', $code),
3853 line => $l, column => $c);
3854 $code = 0xFFFD;
3855 }
3856
3857 if ($self->{prev_state} == DATA_STATE) {
3858
3859 $self->{state} = $self->{prev_state};
3860 ## Reconsume.
3861 return ({type => CHARACTER_TOKEN, data => chr $code,
3862 line => $l, column => $c,
3863 });
3864 redo A;
3865 } else {
3866
3867 $self->{ca}->{value} .= chr $code;
3868 $self->{ca}->{has_reference} = 1;
3869 $self->{state} = $self->{prev_state};
3870 ## Reconsume.
3871 redo A;
3872 }
3873 } elsif ($self->{state} == HEXREF_X_STATE) {
3874 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3875 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3876 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3877 # 0..9, A..F, a..f
3878
3879 $self->{state} = HEXREF_HEX_STATE;
3880 $self->{s_kwd} = 0;
3881 ## Reconsume.
3882 redo A;
3883 } else {
3884 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3885 line => $self->{line_prev},
3886 column => $self->{column_prev} - 2);
3887
3888 ## NOTE: According to the spec algorithm, nothing is returned,
3889 ## and then "&#" followed by "X" or "x" is appended to the parent
3890 ## element or the attribute value in the later processing.
3891
3892 if ($self->{prev_state} == DATA_STATE) {
3893
3894 $self->{state} = $self->{prev_state};
3895 ## Reconsume.
3896 return ({type => CHARACTER_TOKEN,
3897 data => '&' . $self->{s_kwd},
3898 line => $self->{line_prev},
3899 column => $self->{column_prev} - length $self->{s_kwd},
3900 });
3901 redo A;
3902 } else {
3903
3904 $self->{ca}->{value} .= '&' . $self->{s_kwd};
3905 $self->{state} = $self->{prev_state};
3906 ## Reconsume.
3907 redo A;
3908 }
3909 }
3910 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3911 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3912 # 0..9
3913
3914 $self->{s_kwd} *= 0x10;
3915 $self->{s_kwd} += $self->{nc} - 0x0030;
3916 ## Stay in the state.
3917
3918 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3919 $self->{line_prev} = $self->{line};
3920 $self->{column_prev} = $self->{column};
3921 $self->{column}++;
3922 $self->{nc}
3923 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3924 } else {
3925 $self->{set_nc}->($self);
3926 }
3927
3928 redo A;
3929 } elsif (0x0061 <= $self->{nc} and
3930 $self->{nc} <= 0x0066) { # a..f
3931
3932 $self->{s_kwd} *= 0x10;
3933 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
3934 ## Stay in the state.
3935
3936 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3937 $self->{line_prev} = $self->{line};
3938 $self->{column_prev} = $self->{column};
3939 $self->{column}++;
3940 $self->{nc}
3941 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3942 } else {
3943 $self->{set_nc}->($self);
3944 }
3945
3946 redo A;
3947 } elsif (0x0041 <= $self->{nc} and
3948 $self->{nc} <= 0x0046) { # A..F
3949
3950 $self->{s_kwd} *= 0x10;
3951 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
3952 ## Stay in the state.
3953
3954 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3955 $self->{line_prev} = $self->{line};
3956 $self->{column_prev} = $self->{column};
3957 $self->{column}++;
3958 $self->{nc}
3959 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3960 } else {
3961 $self->{set_nc}->($self);
3962 }
3963
3964 redo A;
3965 } elsif ($self->{nc} == 0x003B) { # ;
3966
3967
3968 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3969 $self->{line_prev} = $self->{line};
3970 $self->{column_prev} = $self->{column};
3971 $self->{column}++;
3972 $self->{nc}
3973 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3974 } else {
3975 $self->{set_nc}->($self);
3976 }
3977
3978 #
3979 } else {
3980
3981 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
3982 line => $self->{line},
3983 column => $self->{column});
3984 ## Reconsume.
3985 #
3986 }
3987
3988 my $code = $self->{s_kwd};
3989 my $l = $self->{line_prev};
3990 my $c = $self->{column_prev};
3991 if ($charref_map->{$code}) {
3992
3993 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3994 text => (sprintf 'U+%04X', $code),
3995 line => $l, column => $c);
3996 $code = $charref_map->{$code};
3997 } elsif ($code > 0x10FFFF) {
3998
3999 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4000 text => (sprintf 'U-%08X', $code),
4001 line => $l, column => $c);
4002 $code = 0xFFFD;
4003 }
4004
4005 if ($self->{prev_state} == DATA_STATE) {
4006
4007 $self->{state} = $self->{prev_state};
4008 ## Reconsume.
4009 return ({type => CHARACTER_TOKEN, data => chr $code,
4010 line => $l, column => $c,
4011 });
4012 redo A;
4013 } else {
4014
4015 $self->{ca}->{value} .= chr $code;
4016 $self->{ca}->{has_reference} = 1;
4017 $self->{state} = $self->{prev_state};
4018 ## Reconsume.
4019 redo A;
4020 }
4021 } elsif ($self->{state} == ENTITY_NAME_STATE) {
4022 if (length $self->{s_kwd} < 30 and
4023 ## NOTE: Some number greater than the maximum length of entity name
4024 ((0x0041 <= $self->{nc} and # a
4025 $self->{nc} <= 0x005A) or # x
4026 (0x0061 <= $self->{nc} and # a
4027 $self->{nc} <= 0x007A) or # z
4028 (0x0030 <= $self->{nc} and # 0
4029 $self->{nc} <= 0x0039) or # 9
4030 $self->{nc} == 0x003B)) { # ;
4031 our $EntityChar;
4032 $self->{s_kwd} .= chr $self->{nc};
4033 if (defined $EntityChar->{$self->{s_kwd}}) {
4034 if ($self->{nc} == 0x003B) { # ;
4035
4036 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4037 $self->{entity__match} = 1;
4038
4039 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4040 $self->{line_prev} = $self->{line};
4041 $self->{column_prev} = $self->{column};
4042 $self->{column}++;
4043 $self->{nc}
4044 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4045 } else {
4046 $self->{set_nc}->($self);
4047 }
4048
4049 #
4050 } else {
4051
4052 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4053 $self->{entity__match} = -1;
4054 ## Stay in the state.
4055
4056 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4057 $self->{line_prev} = $self->{line};
4058 $self->{column_prev} = $self->{column};
4059 $self->{column}++;
4060 $self->{nc}
4061 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4062 } else {
4063 $self->{set_nc}->($self);
4064 }
4065
4066 redo A;
4067 }
4068 } else {
4069
4070 $self->{entity__value} .= chr $self->{nc};
4071 $self->{entity__match} *= 2;
4072 ## Stay in the state.
4073
4074 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4075 $self->{line_prev} = $self->{line};
4076 $self->{column_prev} = $self->{column};
4077 $self->{column}++;
4078 $self->{nc}
4079 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4080 } else {
4081 $self->{set_nc}->($self);
4082 }
4083
4084 redo A;
4085 }
4086 }
4087
4088 my $data;
4089 my $has_ref;
4090 if ($self->{entity__match} > 0) {
4091
4092 $data = $self->{entity__value};
4093 $has_ref = 1;
4094 #
4095 } elsif ($self->{entity__match} < 0) {
4096 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4097 if ($self->{prev_state} != DATA_STATE and # in attribute
4098 $self->{entity__match} < -1) {
4099
4100 $data = '&' . $self->{s_kwd};
4101 #
4102 } else {
4103
4104 $data = $self->{entity__value};
4105 $has_ref = 1;
4106 #
4107 }
4108 } else {
4109
4110 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4111 line => $self->{line_prev},
4112 column => $self->{column_prev} - length $self->{s_kwd});
4113 $data = '&' . $self->{s_kwd};
4114 #
4115 }
4116
4117 ## NOTE: In these cases, when a character reference is found,
4118 ## it is consumed and a character token is returned, or, otherwise,
4119 ## nothing is consumed and returned, according to the spec algorithm.
4120 ## In this implementation, anything that has been examined by the
4121 ## tokenizer is appended to the parent element or the attribute value
4122 ## as string, either literal string when no character reference or
4123 ## entity-replaced string otherwise, in this stage, since any characters
4124 ## that would not be consumed are appended in the data state or in an
4125 ## appropriate attribute value state anyway.
4126
4127 if ($self->{prev_state} == DATA_STATE) {
4128
4129 $self->{state} = $self->{prev_state};
4130 ## Reconsume.
4131 return ({type => CHARACTER_TOKEN,
4132 data => $data,
4133 line => $self->{line_prev},
4134 column => $self->{column_prev} + 1 - length $self->{s_kwd},
4135 });
4136 redo A;
4137 } else {
4138
4139 $self->{ca}->{value} .= $data;
4140 $self->{ca}->{has_reference} = 1 if $has_ref;
4141 $self->{state} = $self->{prev_state};
4142 ## Reconsume.
4143 redo A;
4144 }
4145 } else {
4146 die "$0: $self->{state}: Unknown state";
4147 }
4148 } # A
4149
4150 die "$0: _get_next_token: unexpected case";
4151 } # _get_next_token
4152
4153 1;
4154 ## $Date: 2008/10/14 05:34:05 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24