/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (show annotations) (download)
Wed Oct 15 10:50:38 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.10: +99 -8 lines
++ whatpm/t/xml/ChangeLog	15 Oct 2008 10:50:31 -0000
	* attrs-1.dat: Test cases for tokenizing errors are added.

	* elements-1.dat: A test result updated.

	* ns-attrs-1.dat: Test results updated.  New test cases for
	duplicate namespaced attributes are added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 10:48:03 -0000
	* Tokenizer.pm.src: Set index attribute to each attribute token,
	for ignoring namespaced duplicate attribute at the XML namespace
	parser layer.  Raise a parse error if the attribute value is
	omitted, in XML mode.  Raise a parse error if the attribute value
	is not quoted, in XML mode.  Raise a parse error if "<" character
	is found in a quoted attribute value, in XML mode.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 10:49:16 -0000
	* Parser.pm.src: Use source order to determine which attribute is
	duplicate.  Preserve duplicate namespaced attributes as
	non-namespaced attributes.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.10 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 BEGIN {
6 require Exporter;
7 push our @ISA, 'Exporter';
8
9 our @EXPORT_OK = qw(
10 DOCTYPE_TOKEN
11 COMMENT_TOKEN
12 START_TAG_TOKEN
13 END_TAG_TOKEN
14 END_OF_FILE_TOKEN
15 CHARACTER_TOKEN
16 PI_TOKEN
17 ABORT_TOKEN
18 );
19
20 our %EXPORT_TAGS = (
21 token => [qw(
22 DOCTYPE_TOKEN
23 COMMENT_TOKEN
24 START_TAG_TOKEN
25 END_TAG_TOKEN
26 END_OF_FILE_TOKEN
27 CHARACTER_TOKEN
28 PI_TOKEN
29 ABORT_TOKEN
30 )],
31 );
32 }
33
34 ## Token types
35
36 sub DOCTYPE_TOKEN () { 1 }
37 sub COMMENT_TOKEN () { 2 }
38 sub START_TAG_TOKEN () { 3 }
39 sub END_TAG_TOKEN () { 4 }
40 sub END_OF_FILE_TOKEN () { 5 }
41 sub CHARACTER_TOKEN () { 6 }
42 sub PI_TOKEN () { 7 } # XML5
43 sub ABORT_TOKEN () { 8 } # Not a token actually
44
45 package Whatpm::HTML;
46
47 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48
49 ## Content model flags
50
51 sub CM_ENTITY () { 0b001 } # & markup in data
52 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54
55 sub PLAINTEXT_CONTENT_MODEL () { 0 }
56 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59
60 ## Tokenizer states
61
62 sub DATA_STATE () { 0 }
63 #sub ENTITY_DATA_STATE () { 1 }
64 sub TAG_OPEN_STATE () { 2 }
65 sub CLOSE_TAG_OPEN_STATE () { 3 }
66 sub TAG_NAME_STATE () { 4 }
67 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68 sub ATTRIBUTE_NAME_STATE () { 6 }
69 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76 sub COMMENT_START_STATE () { 14 }
77 sub COMMENT_START_DASH_STATE () { 15 }
78 sub COMMENT_STATE () { 16 }
79 sub COMMENT_END_STATE () { 17 }
80 sub COMMENT_END_DASH_STATE () { 18 }
81 sub BOGUS_COMMENT_STATE () { 19 }
82 sub DOCTYPE_STATE () { 20 }
83 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84 sub DOCTYPE_NAME_STATE () { 22 }
85 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94 sub BOGUS_DOCTYPE_STATE () { 32 }
95 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96 sub SELF_CLOSING_START_TAG_STATE () { 34 }
97 sub CDATA_SECTION_STATE () { 35 }
98 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106 ## NOTE: "Entity data state", "entity in attribute value state", and
107 ## "consume a character reference" algorithm are jointly implemented
108 ## using the following six states:
109 sub ENTITY_STATE () { 44 }
110 sub ENTITY_HASH_STATE () { 45 }
111 sub NCR_NUM_STATE () { 46 }
112 sub HEXREF_X_STATE () { 47 }
113 sub HEXREF_HEX_STATE () { 48 }
114 sub ENTITY_NAME_STATE () { 49 }
115 sub PCDATA_STATE () { 50 } # "data state" in the spec
116
117 ## XML states
118 sub PI_STATE () { 51 }
119 sub PI_TARGET_STATE () { 52 }
120 sub PI_TARGET_AFTER_STATE () { 53 }
121 sub PI_DATA_STATE () { 54 }
122 sub PI_AFTER_STATE () { 55 }
123 sub PI_DATA_AFTER_STATE () { 56 }
124
125 ## Tree constructor state constants (see Whatpm::HTML for the full
126 ## list and descriptions)
127
128 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
129 sub FOREIGN_EL () { 0b1_00000000000 }
130
131 ## Character reference mappings
132
133 my $charref_map = {
134 0x0D => 0x000A,
135 0x80 => 0x20AC,
136 0x81 => 0xFFFD,
137 0x82 => 0x201A,
138 0x83 => 0x0192,
139 0x84 => 0x201E,
140 0x85 => 0x2026,
141 0x86 => 0x2020,
142 0x87 => 0x2021,
143 0x88 => 0x02C6,
144 0x89 => 0x2030,
145 0x8A => 0x0160,
146 0x8B => 0x2039,
147 0x8C => 0x0152,
148 0x8D => 0xFFFD,
149 0x8E => 0x017D,
150 0x8F => 0xFFFD,
151 0x90 => 0xFFFD,
152 0x91 => 0x2018,
153 0x92 => 0x2019,
154 0x93 => 0x201C,
155 0x94 => 0x201D,
156 0x95 => 0x2022,
157 0x96 => 0x2013,
158 0x97 => 0x2014,
159 0x98 => 0x02DC,
160 0x99 => 0x2122,
161 0x9A => 0x0161,
162 0x9B => 0x203A,
163 0x9C => 0x0153,
164 0x9D => 0xFFFD,
165 0x9E => 0x017E,
166 0x9F => 0x0178,
167 }; # $charref_map
168 $charref_map->{$_} = 0xFFFD
169 for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
170 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
171 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
172 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
173 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
174 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
175 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
176
177 ## Implementations MUST act as if state machine in the spec
178
179 sub _initialize_tokenizer ($) {
180 my $self = shift;
181
182 ## NOTE: Fields set by |new| constructor:
183 #$self->{level}
184 #$self->{set_nc}
185 #$self->{parse_error}
186 #$self->{is_xml} (if XML)
187
188 $self->{state} = DATA_STATE; # MUST
189 $self->{s_kwd} = ''; # state keyword
190 #$self->{entity__value}; # initialized when used
191 #$self->{entity__match}; # initialized when used
192 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
193 undef $self->{ct}; # current token
194 undef $self->{ca}; # current attribute
195 undef $self->{last_stag_name}; # last emitted start tag name
196 #$self->{prev_state}; # initialized when used
197 delete $self->{self_closing};
198 $self->{char_buffer} = '';
199 $self->{char_buffer_pos} = 0;
200 $self->{nc} = -1; # next input character
201 #$self->{next_nc}
202
203 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
204 $self->{line_prev} = $self->{line};
205 $self->{column_prev} = $self->{column};
206 $self->{column}++;
207 $self->{nc}
208 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
209 } else {
210 $self->{set_nc}->($self);
211 }
212
213 $self->{token} = [];
214 # $self->{escape}
215 } # _initialize_tokenizer
216
217 ## A token has:
218 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
219 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
220 ## ->{name} (DOCTYPE_TOKEN)
221 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
222 ## ->{target} (PI_TOKEN)
223 ## ->{pubid} (DOCTYPE_TOKEN)
224 ## ->{sysid} (DOCTYPE_TOKEN)
225 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
226 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
227 ## ->{name}
228 ## ->{value}
229 ## ->{has_reference} == 1 or 0
230 ## ->{index}: Index of the attribute in a tag.
231 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
232 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
233 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
234 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
235 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
236 ## while the token is pushed back to the stack.
237
238 ## Emitted token MUST immediately be handled by the tree construction state.
239
240 ## Before each step, UA MAY check to see if either one of the scripts in
241 ## "list of scripts that will execute as soon as possible" or the first
242 ## script in the "list of scripts that will execute asynchronously",
243 ## has completed loading. If one has, then it MUST be executed
244 ## and removed from the list.
245
246 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
247 ## (This requirement was dropped from HTML5 spec, unfortunately.)
248
249 my $is_space = {
250 0x0009 => 1, # CHARACTER TABULATION (HT)
251 0x000A => 1, # LINE FEED (LF)
252 #0x000B => 0, # LINE TABULATION (VT)
253 0x000C => 1, # FORM FEED (FF)
254 #0x000D => 1, # CARRIAGE RETURN (CR)
255 0x0020 => 1, # SPACE (SP)
256 };
257
258 sub _get_next_token ($) {
259 my $self = shift;
260
261 if ($self->{self_closing}) {
262 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
263 ## NOTE: The |self_closing| flag is only set by start tag token.
264 ## In addition, when a start tag token is emitted, it is always set to
265 ## |ct|.
266 delete $self->{self_closing};
267 }
268
269 if (@{$self->{token}}) {
270 $self->{self_closing} = $self->{token}->[0]->{self_closing};
271 return shift @{$self->{token}};
272 }
273
274 A: {
275 if ($self->{state} == PCDATA_STATE) {
276 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
277
278 if ($self->{nc} == 0x0026) { # &
279
280 ## NOTE: In the spec, the tokenizer is switched to the
281 ## "entity data state". In this implementation, the tokenizer
282 ## is switched to the |ENTITY_STATE|, which is an implementation
283 ## of the "consume a character reference" algorithm.
284 $self->{entity_add} = -1;
285 $self->{prev_state} = DATA_STATE;
286 $self->{state} = ENTITY_STATE;
287
288 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
289 $self->{line_prev} = $self->{line};
290 $self->{column_prev} = $self->{column};
291 $self->{column}++;
292 $self->{nc}
293 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
294 } else {
295 $self->{set_nc}->($self);
296 }
297
298 redo A;
299 } elsif ($self->{nc} == 0x003C) { # <
300
301 $self->{state} = TAG_OPEN_STATE;
302
303 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
304 $self->{line_prev} = $self->{line};
305 $self->{column_prev} = $self->{column};
306 $self->{column}++;
307 $self->{nc}
308 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
309 } else {
310 $self->{set_nc}->($self);
311 }
312
313 redo A;
314 } elsif ($self->{nc} == -1) {
315
316 return ({type => END_OF_FILE_TOKEN,
317 line => $self->{line}, column => $self->{column}});
318 last A; ## TODO: ok?
319 } else {
320
321 #
322 }
323
324 # Anything else
325 my $token = {type => CHARACTER_TOKEN,
326 data => chr $self->{nc},
327 line => $self->{line}, column => $self->{column},
328 };
329 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
330
331 ## Stay in the state.
332
333 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
334 $self->{line_prev} = $self->{line};
335 $self->{column_prev} = $self->{column};
336 $self->{column}++;
337 $self->{nc}
338 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
339 } else {
340 $self->{set_nc}->($self);
341 }
342
343 return ($token);
344 redo A;
345 } elsif ($self->{state} == DATA_STATE) {
346 $self->{s_kwd} = '' unless defined $self->{s_kwd};
347 if ($self->{nc} == 0x0026) { # &
348 $self->{s_kwd} = '';
349 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
350 not $self->{escape}) {
351
352 ## NOTE: In the spec, the tokenizer is switched to the
353 ## "entity data state". In this implementation, the tokenizer
354 ## is switched to the |ENTITY_STATE|, which is an implementation
355 ## of the "consume a character reference" algorithm.
356 $self->{entity_add} = -1;
357 $self->{prev_state} = DATA_STATE;
358 $self->{state} = ENTITY_STATE;
359
360 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
361 $self->{line_prev} = $self->{line};
362 $self->{column_prev} = $self->{column};
363 $self->{column}++;
364 $self->{nc}
365 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
366 } else {
367 $self->{set_nc}->($self);
368 }
369
370 redo A;
371 } else {
372
373 #
374 }
375 } elsif ($self->{nc} == 0x002D) { # -
376 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
377 if ($self->{s_kwd} eq '<!-') {
378
379 $self->{escape} = 1; # unless $self->{escape};
380 $self->{s_kwd} = '--';
381 #
382 } elsif ($self->{s_kwd} eq '-') {
383
384 $self->{s_kwd} = '--';
385 #
386 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
387
388 $self->{s_kwd} .= '-';
389 #
390 } else {
391
392 $self->{s_kwd} = '-';
393 #
394 }
395 }
396
397 #
398 } elsif ($self->{nc} == 0x0021) { # !
399 if (length $self->{s_kwd}) {
400
401 $self->{s_kwd} .= '!';
402 #
403 } else {
404
405 #$self->{s_kwd} = '';
406 #
407 }
408 #
409 } elsif ($self->{nc} == 0x003C) { # <
410 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
411 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
412 not $self->{escape})) {
413
414 $self->{state} = TAG_OPEN_STATE;
415
416 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
417 $self->{line_prev} = $self->{line};
418 $self->{column_prev} = $self->{column};
419 $self->{column}++;
420 $self->{nc}
421 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
422 } else {
423 $self->{set_nc}->($self);
424 }
425
426 redo A;
427 } else {
428
429 $self->{s_kwd} = '';
430 #
431 }
432 } elsif ($self->{nc} == 0x003E) { # >
433 if ($self->{escape} and
434 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
435 if ($self->{s_kwd} eq '--') {
436
437 delete $self->{escape};
438 #
439 } else {
440
441 #
442 }
443 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
444
445 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
446 line => $self->{line_prev},
447 column => $self->{column_prev} - 1);
448 #
449 } else {
450
451 #
452 }
453
454 $self->{s_kwd} = '';
455 #
456 } elsif ($self->{nc} == 0x005D) { # ]
457 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
458
459 $self->{s_kwd} .= ']';
460 } elsif ($self->{s_kwd} eq ']]') {
461
462 #
463 } else {
464
465 $self->{s_kwd} = '';
466 }
467 #
468 } elsif ($self->{nc} == -1) {
469
470 $self->{s_kwd} = '';
471 return ({type => END_OF_FILE_TOKEN,
472 line => $self->{line}, column => $self->{column}});
473 last A; ## TODO: ok?
474 } else {
475
476 $self->{s_kwd} = '';
477 #
478 }
479
480 # Anything else
481 my $token = {type => CHARACTER_TOKEN,
482 data => chr $self->{nc},
483 line => $self->{line}, column => $self->{column},
484 };
485 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
486 length $token->{data})) {
487 $self->{s_kwd} = '';
488 }
489
490 ## Stay in the data state.
491 if (not $self->{is_xml} and
492 $self->{content_model} == PCDATA_CONTENT_MODEL) {
493
494 $self->{state} = PCDATA_STATE;
495 } else {
496
497 ## Stay in the state.
498 }
499
500 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
501 $self->{line_prev} = $self->{line};
502 $self->{column_prev} = $self->{column};
503 $self->{column}++;
504 $self->{nc}
505 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
506 } else {
507 $self->{set_nc}->($self);
508 }
509
510 return ($token);
511 redo A;
512 } elsif ($self->{state} == TAG_OPEN_STATE) {
513 ## XML5: "tag state".
514
515 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
516 if ($self->{nc} == 0x002F) { # /
517
518
519 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
520 $self->{line_prev} = $self->{line};
521 $self->{column_prev} = $self->{column};
522 $self->{column}++;
523 $self->{nc}
524 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
525 } else {
526 $self->{set_nc}->($self);
527 }
528
529 $self->{state} = CLOSE_TAG_OPEN_STATE;
530 redo A;
531 } elsif ($self->{nc} == 0x0021) { # !
532
533 $self->{s_kwd} = '<' unless $self->{escape};
534 #
535 } else {
536
537 #
538 }
539
540 ## reconsume
541 $self->{state} = DATA_STATE;
542 $self->{s_kwd} = '';
543 return ({type => CHARACTER_TOKEN, data => '<',
544 line => $self->{line_prev},
545 column => $self->{column_prev},
546 });
547 redo A;
548 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
549 if ($self->{nc} == 0x0021) { # !
550
551 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
552
553 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
554 $self->{line_prev} = $self->{line};
555 $self->{column_prev} = $self->{column};
556 $self->{column}++;
557 $self->{nc}
558 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
559 } else {
560 $self->{set_nc}->($self);
561 }
562
563 redo A;
564 } elsif ($self->{nc} == 0x002F) { # /
565
566 $self->{state} = CLOSE_TAG_OPEN_STATE;
567
568 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
569 $self->{line_prev} = $self->{line};
570 $self->{column_prev} = $self->{column};
571 $self->{column}++;
572 $self->{nc}
573 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
574 } else {
575 $self->{set_nc}->($self);
576 }
577
578 redo A;
579 } elsif (0x0041 <= $self->{nc} and
580 $self->{nc} <= 0x005A) { # A..Z
581
582 $self->{ct}
583 = {type => START_TAG_TOKEN,
584 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
585 line => $self->{line_prev},
586 column => $self->{column_prev}};
587 $self->{state} = TAG_NAME_STATE;
588
589 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
590 $self->{line_prev} = $self->{line};
591 $self->{column_prev} = $self->{column};
592 $self->{column}++;
593 $self->{nc}
594 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
595 } else {
596 $self->{set_nc}->($self);
597 }
598
599 redo A;
600 } elsif (0x0061 <= $self->{nc} and
601 $self->{nc} <= 0x007A) { # a..z
602
603 $self->{ct} = {type => START_TAG_TOKEN,
604 tag_name => chr ($self->{nc}),
605 line => $self->{line_prev},
606 column => $self->{column_prev}};
607 $self->{state} = TAG_NAME_STATE;
608
609 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
610 $self->{line_prev} = $self->{line};
611 $self->{column_prev} = $self->{column};
612 $self->{column}++;
613 $self->{nc}
614 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
615 } else {
616 $self->{set_nc}->($self);
617 }
618
619 redo A;
620 } elsif ($self->{nc} == 0x003E) { # >
621
622 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
623 line => $self->{line_prev},
624 column => $self->{column_prev});
625 $self->{state} = DATA_STATE;
626 $self->{s_kwd} = '';
627
628 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
629 $self->{line_prev} = $self->{line};
630 $self->{column_prev} = $self->{column};
631 $self->{column}++;
632 $self->{nc}
633 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
634 } else {
635 $self->{set_nc}->($self);
636 }
637
638
639 return ({type => CHARACTER_TOKEN, data => '<>',
640 line => $self->{line_prev},
641 column => $self->{column_prev},
642 });
643
644 redo A;
645 } elsif ($self->{nc} == 0x003F) { # ?
646 if ($self->{is_xml}) {
647
648 $self->{state} = PI_STATE;
649
650 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
651 $self->{line_prev} = $self->{line};
652 $self->{column_prev} = $self->{column};
653 $self->{column}++;
654 $self->{nc}
655 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
656 } else {
657 $self->{set_nc}->($self);
658 }
659
660 redo A;
661 } else {
662
663 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
664 line => $self->{line_prev},
665 column => $self->{column_prev});
666 $self->{state} = BOGUS_COMMENT_STATE;
667 $self->{ct} = {type => COMMENT_TOKEN, data => '',
668 line => $self->{line_prev},
669 column => $self->{column_prev},
670 };
671 ## $self->{nc} is intentionally left as is
672 redo A;
673 }
674 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
675
676 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
677 line => $self->{line_prev},
678 column => $self->{column_prev});
679 $self->{state} = DATA_STATE;
680 $self->{s_kwd} = '';
681 ## reconsume
682
683 return ({type => CHARACTER_TOKEN, data => '<',
684 line => $self->{line_prev},
685 column => $self->{column_prev},
686 });
687
688 redo A;
689 } else {
690 ## XML5: "<:" is a parse error.
691
692 $self->{ct} = {type => START_TAG_TOKEN,
693 tag_name => chr ($self->{nc}),
694 line => $self->{line_prev},
695 column => $self->{column_prev}};
696 $self->{state} = TAG_NAME_STATE;
697
698 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
699 $self->{line_prev} = $self->{line};
700 $self->{column_prev} = $self->{column};
701 $self->{column}++;
702 $self->{nc}
703 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
704 } else {
705 $self->{set_nc}->($self);
706 }
707
708 redo A;
709 }
710 } else {
711 die "$0: $self->{content_model} in tag open";
712 }
713 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
714 ## NOTE: The "close tag open state" in the spec is implemented as
715 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
716
717 ## XML5: "end tag state".
718
719 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
720 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
721 if (defined $self->{last_stag_name}) {
722 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
723 $self->{s_kwd} = '';
724 ## Reconsume.
725 redo A;
726 } else {
727 ## No start tag token has ever been emitted
728 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
729
730 $self->{state} = DATA_STATE;
731 $self->{s_kwd} = '';
732 ## Reconsume.
733 return ({type => CHARACTER_TOKEN, data => '</',
734 line => $l, column => $c,
735 });
736 redo A;
737 }
738 }
739
740 if (0x0041 <= $self->{nc} and
741 $self->{nc} <= 0x005A) { # A..Z
742
743 $self->{ct}
744 = {type => END_TAG_TOKEN,
745 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
746 line => $l, column => $c};
747 $self->{state} = TAG_NAME_STATE;
748
749 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
750 $self->{line_prev} = $self->{line};
751 $self->{column_prev} = $self->{column};
752 $self->{column}++;
753 $self->{nc}
754 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
755 } else {
756 $self->{set_nc}->($self);
757 }
758
759 redo A;
760 } elsif (0x0061 <= $self->{nc} and
761 $self->{nc} <= 0x007A) { # a..z
762
763 $self->{ct} = {type => END_TAG_TOKEN,
764 tag_name => chr ($self->{nc}),
765 line => $l, column => $c};
766 $self->{state} = TAG_NAME_STATE;
767
768 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
769 $self->{line_prev} = $self->{line};
770 $self->{column_prev} = $self->{column};
771 $self->{column}++;
772 $self->{nc}
773 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
774 } else {
775 $self->{set_nc}->($self);
776 }
777
778 redo A;
779 } elsif ($self->{nc} == 0x003E) { # >
780 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
781 line => $self->{line_prev}, ## "<" in "</>"
782 column => $self->{column_prev} - 1);
783 $self->{state} = DATA_STATE;
784 $self->{s_kwd} = '';
785 if ($self->{is_xml}) {
786
787 ## XML5: No parse error.
788
789 ## NOTE: This parser raises a parse error, since it supports
790 ## XML1, not XML5.
791
792 ## NOTE: A short end tag token.
793 my $ct = {type => END_TAG_TOKEN,
794 tag_name => '',
795 line => $self->{line_prev},
796 column => $self->{column_prev} - 1,
797 };
798
799 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
800 $self->{line_prev} = $self->{line};
801 $self->{column_prev} = $self->{column};
802 $self->{column}++;
803 $self->{nc}
804 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
805 } else {
806 $self->{set_nc}->($self);
807 }
808
809 return ($ct);
810 } else {
811
812
813 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
814 $self->{line_prev} = $self->{line};
815 $self->{column_prev} = $self->{column};
816 $self->{column}++;
817 $self->{nc}
818 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
819 } else {
820 $self->{set_nc}->($self);
821 }
822
823 }
824 redo A;
825 } elsif ($self->{nc} == -1) {
826
827 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
828 $self->{s_kwd} = '';
829 $self->{state} = DATA_STATE;
830 # reconsume
831
832 return ({type => CHARACTER_TOKEN, data => '</',
833 line => $l, column => $c,
834 });
835
836 redo A;
837 } elsif (not $self->{is_xml} or
838 $is_space->{$self->{nc}}) {
839
840 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
841 line => $self->{line_prev}, # "<" of "</"
842 column => $self->{column_prev} - 1);
843 $self->{state} = BOGUS_COMMENT_STATE;
844 $self->{ct} = {type => COMMENT_TOKEN, data => '',
845 line => $self->{line_prev}, # "<" of "</"
846 column => $self->{column_prev} - 1,
847 };
848 ## NOTE: $self->{nc} is intentionally left as is.
849 ## Although the "anything else" case of the spec not explicitly
850 ## states that the next input character is to be reconsumed,
851 ## it will be included to the |data| of the comment token
852 ## generated from the bogus end tag, as defined in the
853 ## "bogus comment state" entry.
854 redo A;
855 } else {
856 ## XML5: "</:" is a parse error.
857
858 $self->{ct} = {type => END_TAG_TOKEN,
859 tag_name => chr ($self->{nc}),
860 line => $l, column => $c};
861 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
862
863 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
864 $self->{line_prev} = $self->{line};
865 $self->{column_prev} = $self->{column};
866 $self->{column}++;
867 $self->{nc}
868 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
869 } else {
870 $self->{set_nc}->($self);
871 }
872
873 redo A;
874 }
875 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
876 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
877 if (length $ch) {
878 my $CH = $ch;
879 $ch =~ tr/a-z/A-Z/;
880 my $nch = chr $self->{nc};
881 if ($nch eq $ch or $nch eq $CH) {
882
883 ## Stay in the state.
884 $self->{s_kwd} .= $nch;
885
886 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
887 $self->{line_prev} = $self->{line};
888 $self->{column_prev} = $self->{column};
889 $self->{column}++;
890 $self->{nc}
891 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
892 } else {
893 $self->{set_nc}->($self);
894 }
895
896 redo A;
897 } else {
898
899 $self->{state} = DATA_STATE;
900 $self->{s_kwd} = '';
901 ## Reconsume.
902 return ({type => CHARACTER_TOKEN,
903 data => '</' . $self->{s_kwd},
904 line => $self->{line_prev},
905 column => $self->{column_prev} - 1 - length $self->{s_kwd},
906 });
907 redo A;
908 }
909 } else { # after "<{tag-name}"
910 unless ($is_space->{$self->{nc}} or
911 {
912 0x003E => 1, # >
913 0x002F => 1, # /
914 -1 => 1, # EOF
915 }->{$self->{nc}}) {
916
917 ## Reconsume.
918 $self->{state} = DATA_STATE;
919 $self->{s_kwd} = '';
920 return ({type => CHARACTER_TOKEN,
921 data => '</' . $self->{s_kwd},
922 line => $self->{line_prev},
923 column => $self->{column_prev} - 1 - length $self->{s_kwd},
924 });
925 redo A;
926 } else {
927
928 $self->{ct}
929 = {type => END_TAG_TOKEN,
930 tag_name => $self->{last_stag_name},
931 line => $self->{line_prev},
932 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
933 $self->{state} = TAG_NAME_STATE;
934 ## Reconsume.
935 redo A;
936 }
937 }
938 } elsif ($self->{state} == TAG_NAME_STATE) {
939 if ($is_space->{$self->{nc}}) {
940
941 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
942
943 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
944 $self->{line_prev} = $self->{line};
945 $self->{column_prev} = $self->{column};
946 $self->{column}++;
947 $self->{nc}
948 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
949 } else {
950 $self->{set_nc}->($self);
951 }
952
953 redo A;
954 } elsif ($self->{nc} == 0x003E) { # >
955 if ($self->{ct}->{type} == START_TAG_TOKEN) {
956
957 $self->{last_stag_name} = $self->{ct}->{tag_name};
958 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
959 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
960 #if ($self->{ct}->{attributes}) {
961 # ## NOTE: This should never be reached.
962 # !!! cp (36);
963 # !!! parse-error (type => 'end tag attribute');
964 #} else {
965
966 #}
967 } else {
968 die "$0: $self->{ct}->{type}: Unknown token type";
969 }
970 $self->{state} = DATA_STATE;
971 $self->{s_kwd} = '';
972
973 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
974 $self->{line_prev} = $self->{line};
975 $self->{column_prev} = $self->{column};
976 $self->{column}++;
977 $self->{nc}
978 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
979 } else {
980 $self->{set_nc}->($self);
981 }
982
983
984 return ($self->{ct}); # start tag or end tag
985
986 redo A;
987 } elsif (0x0041 <= $self->{nc} and
988 $self->{nc} <= 0x005A) { # A..Z
989
990 $self->{ct}->{tag_name}
991 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
992 # start tag or end tag
993 ## Stay in this state
994
995 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
996 $self->{line_prev} = $self->{line};
997 $self->{column_prev} = $self->{column};
998 $self->{column}++;
999 $self->{nc}
1000 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1001 } else {
1002 $self->{set_nc}->($self);
1003 }
1004
1005 redo A;
1006 } elsif ($self->{nc} == -1) {
1007 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1008 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1009
1010 $self->{last_stag_name} = $self->{ct}->{tag_name};
1011 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1012 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1013 #if ($self->{ct}->{attributes}) {
1014 # ## NOTE: This state should never be reached.
1015 # !!! cp (40);
1016 # !!! parse-error (type => 'end tag attribute');
1017 #} else {
1018
1019 #}
1020 } else {
1021 die "$0: $self->{ct}->{type}: Unknown token type";
1022 }
1023 $self->{state} = DATA_STATE;
1024 $self->{s_kwd} = '';
1025 # reconsume
1026
1027 return ($self->{ct}); # start tag or end tag
1028
1029 redo A;
1030 } elsif ($self->{nc} == 0x002F) { # /
1031
1032 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1033
1034 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1035 $self->{line_prev} = $self->{line};
1036 $self->{column_prev} = $self->{column};
1037 $self->{column}++;
1038 $self->{nc}
1039 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1040 } else {
1041 $self->{set_nc}->($self);
1042 }
1043
1044 redo A;
1045 } else {
1046
1047 $self->{ct}->{tag_name} .= chr $self->{nc};
1048 # start tag or end tag
1049 ## Stay in the state
1050
1051 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1052 $self->{line_prev} = $self->{line};
1053 $self->{column_prev} = $self->{column};
1054 $self->{column}++;
1055 $self->{nc}
1056 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1057 } else {
1058 $self->{set_nc}->($self);
1059 }
1060
1061 redo A;
1062 }
1063 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1064 ## XML5: "Tag attribute name before state".
1065
1066 if ($is_space->{$self->{nc}}) {
1067
1068 ## Stay in the state
1069
1070 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1071 $self->{line_prev} = $self->{line};
1072 $self->{column_prev} = $self->{column};
1073 $self->{column}++;
1074 $self->{nc}
1075 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1076 } else {
1077 $self->{set_nc}->($self);
1078 }
1079
1080 redo A;
1081 } elsif ($self->{nc} == 0x003E) { # >
1082 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1083
1084 $self->{last_stag_name} = $self->{ct}->{tag_name};
1085 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1086 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1087 if ($self->{ct}->{attributes}) {
1088
1089 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1090 } else {
1091
1092 }
1093 } else {
1094 die "$0: $self->{ct}->{type}: Unknown token type";
1095 }
1096 $self->{state} = DATA_STATE;
1097 $self->{s_kwd} = '';
1098
1099 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1100 $self->{line_prev} = $self->{line};
1101 $self->{column_prev} = $self->{column};
1102 $self->{column}++;
1103 $self->{nc}
1104 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1105 } else {
1106 $self->{set_nc}->($self);
1107 }
1108
1109
1110 return ($self->{ct}); # start tag or end tag
1111
1112 redo A;
1113 } elsif (0x0041 <= $self->{nc} and
1114 $self->{nc} <= 0x005A) { # A..Z
1115
1116 $self->{ca}
1117 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1118 value => '',
1119 line => $self->{line}, column => $self->{column}};
1120 $self->{state} = ATTRIBUTE_NAME_STATE;
1121
1122 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1123 $self->{line_prev} = $self->{line};
1124 $self->{column_prev} = $self->{column};
1125 $self->{column}++;
1126 $self->{nc}
1127 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1128 } else {
1129 $self->{set_nc}->($self);
1130 }
1131
1132 redo A;
1133 } elsif ($self->{nc} == 0x002F) { # /
1134
1135 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1136
1137 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1138 $self->{line_prev} = $self->{line};
1139 $self->{column_prev} = $self->{column};
1140 $self->{column}++;
1141 $self->{nc}
1142 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1143 } else {
1144 $self->{set_nc}->($self);
1145 }
1146
1147 redo A;
1148 } elsif ($self->{nc} == -1) {
1149 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1150 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1151
1152 $self->{last_stag_name} = $self->{ct}->{tag_name};
1153 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1154 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1155 if ($self->{ct}->{attributes}) {
1156
1157 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1158 } else {
1159
1160 }
1161 } else {
1162 die "$0: $self->{ct}->{type}: Unknown token type";
1163 }
1164 $self->{state} = DATA_STATE;
1165 $self->{s_kwd} = '';
1166 # reconsume
1167
1168 return ($self->{ct}); # start tag or end tag
1169
1170 redo A;
1171 } else {
1172 if ({
1173 0x0022 => 1, # "
1174 0x0027 => 1, # '
1175 0x003D => 1, # =
1176 }->{$self->{nc}}) {
1177
1178 ## XML5: Not a parse error.
1179 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1180 } else {
1181
1182 ## XML5: ":" raises a parse error and is ignored.
1183 }
1184 $self->{ca}
1185 = {name => chr ($self->{nc}),
1186 value => '',
1187 line => $self->{line}, column => $self->{column}};
1188 $self->{state} = ATTRIBUTE_NAME_STATE;
1189
1190 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1191 $self->{line_prev} = $self->{line};
1192 $self->{column_prev} = $self->{column};
1193 $self->{column}++;
1194 $self->{nc}
1195 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1196 } else {
1197 $self->{set_nc}->($self);
1198 }
1199
1200 redo A;
1201 }
1202 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1203 ## XML5: "Tag attribute name state".
1204
1205 my $before_leave = sub {
1206 if (exists $self->{ct}->{attributes} # start tag or end tag
1207 ->{$self->{ca}->{name}}) { # MUST
1208
1209 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1210 ## Discard $self->{ca} # MUST
1211 } else {
1212
1213 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1214 = $self->{ca};
1215 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1216 }
1217 }; # $before_leave
1218
1219 if ($is_space->{$self->{nc}}) {
1220
1221 $before_leave->();
1222 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1223
1224 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1225 $self->{line_prev} = $self->{line};
1226 $self->{column_prev} = $self->{column};
1227 $self->{column}++;
1228 $self->{nc}
1229 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1230 } else {
1231 $self->{set_nc}->($self);
1232 }
1233
1234 redo A;
1235 } elsif ($self->{nc} == 0x003D) { # =
1236
1237 $before_leave->();
1238 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1239
1240 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1241 $self->{line_prev} = $self->{line};
1242 $self->{column_prev} = $self->{column};
1243 $self->{column}++;
1244 $self->{nc}
1245 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1246 } else {
1247 $self->{set_nc}->($self);
1248 }
1249
1250 redo A;
1251 } elsif ($self->{nc} == 0x003E) { # >
1252 if ($self->{is_xml}) {
1253
1254 ## XML5: Not a parse error.
1255 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1256 } else {
1257
1258 }
1259
1260 $before_leave->();
1261 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1262
1263 $self->{last_stag_name} = $self->{ct}->{tag_name};
1264 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1265
1266 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1267 if ($self->{ct}->{attributes}) {
1268 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1269 }
1270 } else {
1271 die "$0: $self->{ct}->{type}: Unknown token type";
1272 }
1273 $self->{state} = DATA_STATE;
1274 $self->{s_kwd} = '';
1275
1276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1277 $self->{line_prev} = $self->{line};
1278 $self->{column_prev} = $self->{column};
1279 $self->{column}++;
1280 $self->{nc}
1281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1282 } else {
1283 $self->{set_nc}->($self);
1284 }
1285
1286
1287 return ($self->{ct}); # start tag or end tag
1288
1289 redo A;
1290 } elsif (0x0041 <= $self->{nc} and
1291 $self->{nc} <= 0x005A) { # A..Z
1292
1293 $self->{ca}->{name}
1294 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1295 ## Stay in the state
1296
1297 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1298 $self->{line_prev} = $self->{line};
1299 $self->{column_prev} = $self->{column};
1300 $self->{column}++;
1301 $self->{nc}
1302 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1303 } else {
1304 $self->{set_nc}->($self);
1305 }
1306
1307 redo A;
1308 } elsif ($self->{nc} == 0x002F) { # /
1309 if ($self->{is_xml}) {
1310
1311 ## XML5: Not a parse error.
1312 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1313 } else {
1314
1315 }
1316
1317 $before_leave->();
1318 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1319
1320 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1321 $self->{line_prev} = $self->{line};
1322 $self->{column_prev} = $self->{column};
1323 $self->{column}++;
1324 $self->{nc}
1325 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1326 } else {
1327 $self->{set_nc}->($self);
1328 }
1329
1330 redo A;
1331 } elsif ($self->{nc} == -1) {
1332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1333 $before_leave->();
1334 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1335
1336 $self->{last_stag_name} = $self->{ct}->{tag_name};
1337 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1338 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1339 if ($self->{ct}->{attributes}) {
1340
1341 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1342 } else {
1343 ## NOTE: This state should never be reached.
1344
1345 }
1346 } else {
1347 die "$0: $self->{ct}->{type}: Unknown token type";
1348 }
1349 $self->{state} = DATA_STATE;
1350 $self->{s_kwd} = '';
1351 # reconsume
1352
1353 return ($self->{ct}); # start tag or end tag
1354
1355 redo A;
1356 } else {
1357 if ($self->{nc} == 0x0022 or # "
1358 $self->{nc} == 0x0027) { # '
1359
1360 ## XML5: Not a parse error.
1361 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1362 } else {
1363
1364 }
1365 $self->{ca}->{name} .= chr ($self->{nc});
1366 ## Stay in the state
1367
1368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1369 $self->{line_prev} = $self->{line};
1370 $self->{column_prev} = $self->{column};
1371 $self->{column}++;
1372 $self->{nc}
1373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1374 } else {
1375 $self->{set_nc}->($self);
1376 }
1377
1378 redo A;
1379 }
1380 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1381 ## XML5: "Tag attribute name after state".
1382
1383 if ($is_space->{$self->{nc}}) {
1384
1385 ## Stay in the state
1386
1387 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1388 $self->{line_prev} = $self->{line};
1389 $self->{column_prev} = $self->{column};
1390 $self->{column}++;
1391 $self->{nc}
1392 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1393 } else {
1394 $self->{set_nc}->($self);
1395 }
1396
1397 redo A;
1398 } elsif ($self->{nc} == 0x003D) { # =
1399
1400 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1401
1402 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1403 $self->{line_prev} = $self->{line};
1404 $self->{column_prev} = $self->{column};
1405 $self->{column}++;
1406 $self->{nc}
1407 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1408 } else {
1409 $self->{set_nc}->($self);
1410 }
1411
1412 redo A;
1413 } elsif ($self->{nc} == 0x003E) { # >
1414 if ($self->{is_xml}) {
1415
1416 ## XML5: Not a parse error.
1417 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1418 } else {
1419
1420 }
1421
1422 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1423
1424 $self->{last_stag_name} = $self->{ct}->{tag_name};
1425 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1426 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1427 if ($self->{ct}->{attributes}) {
1428
1429 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1430 } else {
1431 ## NOTE: This state should never be reached.
1432
1433 }
1434 } else {
1435 die "$0: $self->{ct}->{type}: Unknown token type";
1436 }
1437 $self->{state} = DATA_STATE;
1438 $self->{s_kwd} = '';
1439
1440 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1441 $self->{line_prev} = $self->{line};
1442 $self->{column_prev} = $self->{column};
1443 $self->{column}++;
1444 $self->{nc}
1445 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1446 } else {
1447 $self->{set_nc}->($self);
1448 }
1449
1450
1451 return ($self->{ct}); # start tag or end tag
1452
1453 redo A;
1454 } elsif (0x0041 <= $self->{nc} and
1455 $self->{nc} <= 0x005A) { # A..Z
1456
1457 $self->{ca}
1458 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1459 value => '',
1460 line => $self->{line}, column => $self->{column}};
1461 $self->{state} = ATTRIBUTE_NAME_STATE;
1462
1463 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464 $self->{line_prev} = $self->{line};
1465 $self->{column_prev} = $self->{column};
1466 $self->{column}++;
1467 $self->{nc}
1468 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469 } else {
1470 $self->{set_nc}->($self);
1471 }
1472
1473 redo A;
1474 } elsif ($self->{nc} == 0x002F) { # /
1475 if ($self->{is_xml}) {
1476
1477 ## XML5: Not a parse error.
1478 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1479 } else {
1480
1481 }
1482
1483 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1484
1485 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1486 $self->{line_prev} = $self->{line};
1487 $self->{column_prev} = $self->{column};
1488 $self->{column}++;
1489 $self->{nc}
1490 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1491 } else {
1492 $self->{set_nc}->($self);
1493 }
1494
1495 redo A;
1496 } elsif ($self->{nc} == -1) {
1497 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1498 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499
1500 $self->{last_stag_name} = $self->{ct}->{tag_name};
1501 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503 if ($self->{ct}->{attributes}) {
1504
1505 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506 } else {
1507 ## NOTE: This state should never be reached.
1508
1509 }
1510 } else {
1511 die "$0: $self->{ct}->{type}: Unknown token type";
1512 }
1513 $self->{s_kwd} = '';
1514 $self->{state} = DATA_STATE;
1515 # reconsume
1516
1517 return ($self->{ct}); # start tag or end tag
1518
1519 redo A;
1520 } else {
1521 if ($self->{is_xml}) {
1522
1523 ## XML5: Not a parse error.
1524 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1525 } else {
1526
1527 }
1528
1529 if ($self->{nc} == 0x0022 or # "
1530 $self->{nc} == 0x0027) { # '
1531
1532 ## XML5: Not a parse error.
1533 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1534 } else {
1535
1536 }
1537 $self->{ca}
1538 = {name => chr ($self->{nc}),
1539 value => '',
1540 line => $self->{line}, column => $self->{column}};
1541 $self->{state} = ATTRIBUTE_NAME_STATE;
1542
1543 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1544 $self->{line_prev} = $self->{line};
1545 $self->{column_prev} = $self->{column};
1546 $self->{column}++;
1547 $self->{nc}
1548 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1549 } else {
1550 $self->{set_nc}->($self);
1551 }
1552
1553 redo A;
1554 }
1555 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1556 ## XML5: "Tag attribute value before state".
1557
1558 if ($is_space->{$self->{nc}}) {
1559
1560 ## Stay in the state
1561
1562 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1563 $self->{line_prev} = $self->{line};
1564 $self->{column_prev} = $self->{column};
1565 $self->{column}++;
1566 $self->{nc}
1567 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1568 } else {
1569 $self->{set_nc}->($self);
1570 }
1571
1572 redo A;
1573 } elsif ($self->{nc} == 0x0022) { # "
1574
1575 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1576
1577 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1578 $self->{line_prev} = $self->{line};
1579 $self->{column_prev} = $self->{column};
1580 $self->{column}++;
1581 $self->{nc}
1582 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1583 } else {
1584 $self->{set_nc}->($self);
1585 }
1586
1587 redo A;
1588 } elsif ($self->{nc} == 0x0026) { # &
1589
1590 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1591 ## reconsume
1592 redo A;
1593 } elsif ($self->{nc} == 0x0027) { # '
1594
1595 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1596
1597 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1598 $self->{line_prev} = $self->{line};
1599 $self->{column_prev} = $self->{column};
1600 $self->{column}++;
1601 $self->{nc}
1602 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1603 } else {
1604 $self->{set_nc}->($self);
1605 }
1606
1607 redo A;
1608 } elsif ($self->{nc} == 0x003E) { # >
1609 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1610 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1611
1612 $self->{last_stag_name} = $self->{ct}->{tag_name};
1613 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1614 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1615 if ($self->{ct}->{attributes}) {
1616
1617 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1618 } else {
1619 ## NOTE: This state should never be reached.
1620
1621 }
1622 } else {
1623 die "$0: $self->{ct}->{type}: Unknown token type";
1624 }
1625 $self->{state} = DATA_STATE;
1626 $self->{s_kwd} = '';
1627
1628 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1629 $self->{line_prev} = $self->{line};
1630 $self->{column_prev} = $self->{column};
1631 $self->{column}++;
1632 $self->{nc}
1633 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1634 } else {
1635 $self->{set_nc}->($self);
1636 }
1637
1638
1639 return ($self->{ct}); # start tag or end tag
1640
1641 redo A;
1642 } elsif ($self->{nc} == -1) {
1643 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1644 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1645
1646 $self->{last_stag_name} = $self->{ct}->{tag_name};
1647 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1648 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1649 if ($self->{ct}->{attributes}) {
1650
1651 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1652 } else {
1653 ## NOTE: This state should never be reached.
1654
1655 }
1656 } else {
1657 die "$0: $self->{ct}->{type}: Unknown token type";
1658 }
1659 $self->{state} = DATA_STATE;
1660 $self->{s_kwd} = '';
1661 ## reconsume
1662
1663 return ($self->{ct}); # start tag or end tag
1664
1665 redo A;
1666 } else {
1667 if ($self->{nc} == 0x003D) { # =
1668
1669 ## XML5: Not a parse error.
1670 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1671 } elsif ($self->{is_xml}) {
1672
1673 ## XML5: No parse error.
1674 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1675 } else {
1676
1677 }
1678 $self->{ca}->{value} .= chr ($self->{nc});
1679 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1680
1681 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1682 $self->{line_prev} = $self->{line};
1683 $self->{column_prev} = $self->{column};
1684 $self->{column}++;
1685 $self->{nc}
1686 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1687 } else {
1688 $self->{set_nc}->($self);
1689 }
1690
1691 redo A;
1692 }
1693 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1694 ## XML5: "Tag attribute value double quoted state".
1695
1696 if ($self->{nc} == 0x0022) { # "
1697
1698 ## XML5: "Tag attribute name before state".
1699 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1700
1701 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1702 $self->{line_prev} = $self->{line};
1703 $self->{column_prev} = $self->{column};
1704 $self->{column}++;
1705 $self->{nc}
1706 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1707 } else {
1708 $self->{set_nc}->($self);
1709 }
1710
1711 redo A;
1712 } elsif ($self->{nc} == 0x0026) { # &
1713
1714 ## XML5: Not defined yet.
1715
1716 ## NOTE: In the spec, the tokenizer is switched to the
1717 ## "entity in attribute value state". In this implementation, the
1718 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1719 ## implementation of the "consume a character reference" algorithm.
1720 $self->{prev_state} = $self->{state};
1721 $self->{entity_add} = 0x0022; # "
1722 $self->{state} = ENTITY_STATE;
1723
1724 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1725 $self->{line_prev} = $self->{line};
1726 $self->{column_prev} = $self->{column};
1727 $self->{column}++;
1728 $self->{nc}
1729 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1730 } else {
1731 $self->{set_nc}->($self);
1732 }
1733
1734 redo A;
1735 } elsif ($self->{nc} == -1) {
1736 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1737 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1738
1739 $self->{last_stag_name} = $self->{ct}->{tag_name};
1740 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1741 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1742 if ($self->{ct}->{attributes}) {
1743
1744 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1745 } else {
1746 ## NOTE: This state should never be reached.
1747
1748 }
1749 } else {
1750 die "$0: $self->{ct}->{type}: Unknown token type";
1751 }
1752 $self->{state} = DATA_STATE;
1753 $self->{s_kwd} = '';
1754 ## reconsume
1755
1756 return ($self->{ct}); # start tag or end tag
1757
1758 redo A;
1759 } else {
1760 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1761
1762 ## XML5: Not a parse error.
1763 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1764 } else {
1765
1766 }
1767 $self->{ca}->{value} .= chr ($self->{nc});
1768 $self->{read_until}->($self->{ca}->{value},
1769 q["&<],
1770 length $self->{ca}->{value});
1771
1772 ## Stay in the state
1773
1774 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1775 $self->{line_prev} = $self->{line};
1776 $self->{column_prev} = $self->{column};
1777 $self->{column}++;
1778 $self->{nc}
1779 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1780 } else {
1781 $self->{set_nc}->($self);
1782 }
1783
1784 redo A;
1785 }
1786 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1787 ## XML5: "Tag attribute value single quoted state".
1788
1789 if ($self->{nc} == 0x0027) { # '
1790
1791 ## XML5: "Before attribute name state" (sic).
1792 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1793
1794 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1795 $self->{line_prev} = $self->{line};
1796 $self->{column_prev} = $self->{column};
1797 $self->{column}++;
1798 $self->{nc}
1799 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1800 } else {
1801 $self->{set_nc}->($self);
1802 }
1803
1804 redo A;
1805 } elsif ($self->{nc} == 0x0026) { # &
1806
1807 ## XML5: Not defined yet.
1808
1809 ## NOTE: In the spec, the tokenizer is switched to the
1810 ## "entity in attribute value state". In this implementation, the
1811 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1812 ## implementation of the "consume a character reference" algorithm.
1813 $self->{entity_add} = 0x0027; # '
1814 $self->{prev_state} = $self->{state};
1815 $self->{state} = ENTITY_STATE;
1816
1817 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1818 $self->{line_prev} = $self->{line};
1819 $self->{column_prev} = $self->{column};
1820 $self->{column}++;
1821 $self->{nc}
1822 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1823 } else {
1824 $self->{set_nc}->($self);
1825 }
1826
1827 redo A;
1828 } elsif ($self->{nc} == -1) {
1829 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1830 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1831
1832 $self->{last_stag_name} = $self->{ct}->{tag_name};
1833 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1834 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1835 if ($self->{ct}->{attributes}) {
1836
1837 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1838 } else {
1839 ## NOTE: This state should never be reached.
1840
1841 }
1842 } else {
1843 die "$0: $self->{ct}->{type}: Unknown token type";
1844 }
1845 $self->{state} = DATA_STATE;
1846 $self->{s_kwd} = '';
1847 ## reconsume
1848
1849 return ($self->{ct}); # start tag or end tag
1850
1851 redo A;
1852 } else {
1853 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1854
1855 ## XML5: Not a parse error.
1856 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1857 } else {
1858
1859 }
1860 $self->{ca}->{value} .= chr ($self->{nc});
1861 $self->{read_until}->($self->{ca}->{value},
1862 q['&<],
1863 length $self->{ca}->{value});
1864
1865 ## Stay in the state
1866
1867 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1868 $self->{line_prev} = $self->{line};
1869 $self->{column_prev} = $self->{column};
1870 $self->{column}++;
1871 $self->{nc}
1872 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1873 } else {
1874 $self->{set_nc}->($self);
1875 }
1876
1877 redo A;
1878 }
1879 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1880 ## XML5: "Tag attribute value unquoted state".
1881
1882 if ($is_space->{$self->{nc}}) {
1883
1884 ## XML5: "Tag attribute name before state".
1885 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1886
1887 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1888 $self->{line_prev} = $self->{line};
1889 $self->{column_prev} = $self->{column};
1890 $self->{column}++;
1891 $self->{nc}
1892 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1893 } else {
1894 $self->{set_nc}->($self);
1895 }
1896
1897 redo A;
1898 } elsif ($self->{nc} == 0x0026) { # &
1899
1900
1901 ## XML5: Not defined yet.
1902
1903 ## NOTE: In the spec, the tokenizer is switched to the
1904 ## "entity in attribute value state". In this implementation, the
1905 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1906 ## implementation of the "consume a character reference" algorithm.
1907 $self->{entity_add} = -1;
1908 $self->{prev_state} = $self->{state};
1909 $self->{state} = ENTITY_STATE;
1910
1911 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1912 $self->{line_prev} = $self->{line};
1913 $self->{column_prev} = $self->{column};
1914 $self->{column}++;
1915 $self->{nc}
1916 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1917 } else {
1918 $self->{set_nc}->($self);
1919 }
1920
1921 redo A;
1922 } elsif ($self->{nc} == 0x003E) { # >
1923 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1924
1925 $self->{last_stag_name} = $self->{ct}->{tag_name};
1926 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1927 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1928 if ($self->{ct}->{attributes}) {
1929
1930 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1931 } else {
1932 ## NOTE: This state should never be reached.
1933
1934 }
1935 } else {
1936 die "$0: $self->{ct}->{type}: Unknown token type";
1937 }
1938 $self->{state} = DATA_STATE;
1939 $self->{s_kwd} = '';
1940
1941 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1942 $self->{line_prev} = $self->{line};
1943 $self->{column_prev} = $self->{column};
1944 $self->{column}++;
1945 $self->{nc}
1946 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1947 } else {
1948 $self->{set_nc}->($self);
1949 }
1950
1951
1952 return ($self->{ct}); # start tag or end tag
1953
1954 redo A;
1955 } elsif ($self->{nc} == -1) {
1956 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1957 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1958
1959 $self->{last_stag_name} = $self->{ct}->{tag_name};
1960 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1961 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1962 if ($self->{ct}->{attributes}) {
1963
1964 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1965 } else {
1966 ## NOTE: This state should never be reached.
1967
1968 }
1969 } else {
1970 die "$0: $self->{ct}->{type}: Unknown token type";
1971 }
1972 $self->{state} = DATA_STATE;
1973 $self->{s_kwd} = '';
1974 ## reconsume
1975
1976 return ($self->{ct}); # start tag or end tag
1977
1978 redo A;
1979 } else {
1980 if ({
1981 0x0022 => 1, # "
1982 0x0027 => 1, # '
1983 0x003D => 1, # =
1984 }->{$self->{nc}}) {
1985
1986 ## XML5: Not a parse error.
1987 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1988 } else {
1989
1990 }
1991 $self->{ca}->{value} .= chr ($self->{nc});
1992 $self->{read_until}->($self->{ca}->{value},
1993 q["'=& >],
1994 length $self->{ca}->{value});
1995
1996 ## Stay in the state
1997
1998 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1999 $self->{line_prev} = $self->{line};
2000 $self->{column_prev} = $self->{column};
2001 $self->{column}++;
2002 $self->{nc}
2003 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2004 } else {
2005 $self->{set_nc}->($self);
2006 }
2007
2008 redo A;
2009 }
2010 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2011 if ($is_space->{$self->{nc}}) {
2012
2013 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2014
2015 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2016 $self->{line_prev} = $self->{line};
2017 $self->{column_prev} = $self->{column};
2018 $self->{column}++;
2019 $self->{nc}
2020 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2021 } else {
2022 $self->{set_nc}->($self);
2023 }
2024
2025 redo A;
2026 } elsif ($self->{nc} == 0x003E) { # >
2027 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2028
2029 $self->{last_stag_name} = $self->{ct}->{tag_name};
2030 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2031 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2032 if ($self->{ct}->{attributes}) {
2033
2034 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2035 } else {
2036 ## NOTE: This state should never be reached.
2037
2038 }
2039 } else {
2040 die "$0: $self->{ct}->{type}: Unknown token type";
2041 }
2042 $self->{state} = DATA_STATE;
2043 $self->{s_kwd} = '';
2044
2045 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2046 $self->{line_prev} = $self->{line};
2047 $self->{column_prev} = $self->{column};
2048 $self->{column}++;
2049 $self->{nc}
2050 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2051 } else {
2052 $self->{set_nc}->($self);
2053 }
2054
2055
2056 return ($self->{ct}); # start tag or end tag
2057
2058 redo A;
2059 } elsif ($self->{nc} == 0x002F) { # /
2060
2061 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2062
2063 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2064 $self->{line_prev} = $self->{line};
2065 $self->{column_prev} = $self->{column};
2066 $self->{column}++;
2067 $self->{nc}
2068 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2069 } else {
2070 $self->{set_nc}->($self);
2071 }
2072
2073 redo A;
2074 } elsif ($self->{nc} == -1) {
2075 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2076 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2077
2078 $self->{last_stag_name} = $self->{ct}->{tag_name};
2079 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2080 if ($self->{ct}->{attributes}) {
2081
2082 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2083 } else {
2084 ## NOTE: This state should never be reached.
2085
2086 }
2087 } else {
2088 die "$0: $self->{ct}->{type}: Unknown token type";
2089 }
2090 $self->{state} = DATA_STATE;
2091 $self->{s_kwd} = '';
2092 ## Reconsume.
2093 return ($self->{ct}); # start tag or end tag
2094 redo A;
2095 } else {
2096
2097 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2098 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2099 ## reconsume
2100 redo A;
2101 }
2102 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2103 ## XML5: "Empty tag state".
2104
2105 if ($self->{nc} == 0x003E) { # >
2106 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2107
2108 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2109 ## TODO: Different type than slash in start tag
2110 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2111 if ($self->{ct}->{attributes}) {
2112
2113 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2114 } else {
2115
2116 }
2117 ## TODO: Test |<title></title/>|
2118 } else {
2119
2120 $self->{self_closing} = 1;
2121 }
2122
2123 $self->{state} = DATA_STATE;
2124 $self->{s_kwd} = '';
2125
2126 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2127 $self->{line_prev} = $self->{line};
2128 $self->{column_prev} = $self->{column};
2129 $self->{column}++;
2130 $self->{nc}
2131 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2132 } else {
2133 $self->{set_nc}->($self);
2134 }
2135
2136
2137 return ($self->{ct}); # start tag or end tag
2138
2139 redo A;
2140 } elsif ($self->{nc} == -1) {
2141 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2142 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2143
2144 $self->{last_stag_name} = $self->{ct}->{tag_name};
2145 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2146 if ($self->{ct}->{attributes}) {
2147
2148 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2149 } else {
2150 ## NOTE: This state should never be reached.
2151
2152 }
2153 } else {
2154 die "$0: $self->{ct}->{type}: Unknown token type";
2155 }
2156 ## XML5: "Tag attribute name before state".
2157 $self->{state} = DATA_STATE;
2158 $self->{s_kwd} = '';
2159 ## Reconsume.
2160 return ($self->{ct}); # start tag or end tag
2161 redo A;
2162 } else {
2163
2164 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2165 ## TODO: This error type is wrong.
2166 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2167 ## Reconsume.
2168 redo A;
2169 }
2170 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2171 ## (only happen if PCDATA state)
2172
2173 ## NOTE: Unlike spec's "bogus comment state", this implementation
2174 ## consumes characters one-by-one basis.
2175
2176 if ($self->{nc} == 0x003E) { # >
2177
2178 $self->{state} = DATA_STATE;
2179 $self->{s_kwd} = '';
2180
2181 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2182 $self->{line_prev} = $self->{line};
2183 $self->{column_prev} = $self->{column};
2184 $self->{column}++;
2185 $self->{nc}
2186 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2187 } else {
2188 $self->{set_nc}->($self);
2189 }
2190
2191
2192 return ($self->{ct}); # comment
2193 redo A;
2194 } elsif ($self->{nc} == -1) {
2195
2196 $self->{state} = DATA_STATE;
2197 $self->{s_kwd} = '';
2198 ## reconsume
2199
2200 return ($self->{ct}); # comment
2201 redo A;
2202 } else {
2203
2204 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2205 $self->{read_until}->($self->{ct}->{data},
2206 q[>],
2207 length $self->{ct}->{data});
2208
2209 ## Stay in the state.
2210
2211 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2212 $self->{line_prev} = $self->{line};
2213 $self->{column_prev} = $self->{column};
2214 $self->{column}++;
2215 $self->{nc}
2216 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2217 } else {
2218 $self->{set_nc}->($self);
2219 }
2220
2221 redo A;
2222 }
2223 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2224 ## (only happen if PCDATA state)
2225
2226 if ($self->{nc} == 0x002D) { # -
2227
2228 $self->{state} = MD_HYPHEN_STATE;
2229
2230 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2231 $self->{line_prev} = $self->{line};
2232 $self->{column_prev} = $self->{column};
2233 $self->{column}++;
2234 $self->{nc}
2235 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2236 } else {
2237 $self->{set_nc}->($self);
2238 }
2239
2240 redo A;
2241 } elsif ($self->{nc} == 0x0044 or # D
2242 $self->{nc} == 0x0064) { # d
2243 ## ASCII case-insensitive.
2244
2245 $self->{state} = MD_DOCTYPE_STATE;
2246 $self->{s_kwd} = chr $self->{nc};
2247
2248 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2249 $self->{line_prev} = $self->{line};
2250 $self->{column_prev} = $self->{column};
2251 $self->{column}++;
2252 $self->{nc}
2253 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2254 } else {
2255 $self->{set_nc}->($self);
2256 }
2257
2258 redo A;
2259 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2260 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2261 $self->{is_xml}) and
2262 $self->{nc} == 0x005B) { # [
2263
2264 $self->{state} = MD_CDATA_STATE;
2265 $self->{s_kwd} = '[';
2266
2267 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2268 $self->{line_prev} = $self->{line};
2269 $self->{column_prev} = $self->{column};
2270 $self->{column}++;
2271 $self->{nc}
2272 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2273 } else {
2274 $self->{set_nc}->($self);
2275 }
2276
2277 redo A;
2278 } else {
2279
2280 }
2281
2282 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2283 line => $self->{line_prev},
2284 column => $self->{column_prev} - 1);
2285 ## Reconsume.
2286 $self->{state} = BOGUS_COMMENT_STATE;
2287 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2288 line => $self->{line_prev},
2289 column => $self->{column_prev} - 1,
2290 };
2291 redo A;
2292 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2293 if ($self->{nc} == 0x002D) { # -
2294
2295 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2296 line => $self->{line_prev},
2297 column => $self->{column_prev} - 2,
2298 };
2299 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2300
2301 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2302 $self->{line_prev} = $self->{line};
2303 $self->{column_prev} = $self->{column};
2304 $self->{column}++;
2305 $self->{nc}
2306 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2307 } else {
2308 $self->{set_nc}->($self);
2309 }
2310
2311 redo A;
2312 } else {
2313
2314 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2315 line => $self->{line_prev},
2316 column => $self->{column_prev} - 2);
2317 $self->{state} = BOGUS_COMMENT_STATE;
2318 ## Reconsume.
2319 $self->{ct} = {type => COMMENT_TOKEN,
2320 data => '-',
2321 line => $self->{line_prev},
2322 column => $self->{column_prev} - 2,
2323 };
2324 redo A;
2325 }
2326 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2327 ## ASCII case-insensitive.
2328 if ($self->{nc} == [
2329 undef,
2330 0x004F, # O
2331 0x0043, # C
2332 0x0054, # T
2333 0x0059, # Y
2334 0x0050, # P
2335 ]->[length $self->{s_kwd}] or
2336 $self->{nc} == [
2337 undef,
2338 0x006F, # o
2339 0x0063, # c
2340 0x0074, # t
2341 0x0079, # y
2342 0x0070, # p
2343 ]->[length $self->{s_kwd}]) {
2344
2345 ## Stay in the state.
2346 $self->{s_kwd} .= chr $self->{nc};
2347
2348 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2349 $self->{line_prev} = $self->{line};
2350 $self->{column_prev} = $self->{column};
2351 $self->{column}++;
2352 $self->{nc}
2353 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2354 } else {
2355 $self->{set_nc}->($self);
2356 }
2357
2358 redo A;
2359 } elsif ((length $self->{s_kwd}) == 6 and
2360 ($self->{nc} == 0x0045 or # E
2361 $self->{nc} == 0x0065)) { # e
2362 if ($self->{s_kwd} ne 'DOCTYP') {
2363
2364 ## XML5: case-sensitive.
2365 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2366 text => 'DOCTYPE',
2367 line => $self->{line_prev},
2368 column => $self->{column_prev} - 5);
2369 } else {
2370
2371 }
2372 $self->{state} = DOCTYPE_STATE;
2373 $self->{ct} = {type => DOCTYPE_TOKEN,
2374 quirks => 1,
2375 line => $self->{line_prev},
2376 column => $self->{column_prev} - 7,
2377 };
2378
2379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2380 $self->{line_prev} = $self->{line};
2381 $self->{column_prev} = $self->{column};
2382 $self->{column}++;
2383 $self->{nc}
2384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2385 } else {
2386 $self->{set_nc}->($self);
2387 }
2388
2389 redo A;
2390 } else {
2391
2392 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2393 line => $self->{line_prev},
2394 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2395 $self->{state} = BOGUS_COMMENT_STATE;
2396 ## Reconsume.
2397 $self->{ct} = {type => COMMENT_TOKEN,
2398 data => $self->{s_kwd},
2399 line => $self->{line_prev},
2400 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2401 };
2402 redo A;
2403 }
2404 } elsif ($self->{state} == MD_CDATA_STATE) {
2405 if ($self->{nc} == {
2406 '[' => 0x0043, # C
2407 '[C' => 0x0044, # D
2408 '[CD' => 0x0041, # A
2409 '[CDA' => 0x0054, # T
2410 '[CDAT' => 0x0041, # A
2411 }->{$self->{s_kwd}}) {
2412
2413 ## Stay in the state.
2414 $self->{s_kwd} .= chr $self->{nc};
2415
2416 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2417 $self->{line_prev} = $self->{line};
2418 $self->{column_prev} = $self->{column};
2419 $self->{column}++;
2420 $self->{nc}
2421 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2422 } else {
2423 $self->{set_nc}->($self);
2424 }
2425
2426 redo A;
2427 } elsif ($self->{s_kwd} eq '[CDATA' and
2428 $self->{nc} == 0x005B) { # [
2429 if ($self->{is_xml} and
2430 not $self->{tainted} and
2431 @{$self->{open_elements} or []} == 0) {
2432
2433 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2434 line => $self->{line_prev},
2435 column => $self->{column_prev} - 7);
2436 $self->{tainted} = 1;
2437 } else {
2438
2439 }
2440
2441 $self->{ct} = {type => CHARACTER_TOKEN,
2442 data => '',
2443 line => $self->{line_prev},
2444 column => $self->{column_prev} - 7};
2445 $self->{state} = CDATA_SECTION_STATE;
2446
2447 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2448 $self->{line_prev} = $self->{line};
2449 $self->{column_prev} = $self->{column};
2450 $self->{column}++;
2451 $self->{nc}
2452 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2453 } else {
2454 $self->{set_nc}->($self);
2455 }
2456
2457 redo A;
2458 } else {
2459
2460 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2461 line => $self->{line_prev},
2462 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2463 $self->{state} = BOGUS_COMMENT_STATE;
2464 ## Reconsume.
2465 $self->{ct} = {type => COMMENT_TOKEN,
2466 data => $self->{s_kwd},
2467 line => $self->{line_prev},
2468 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2469 };
2470 redo A;
2471 }
2472 } elsif ($self->{state} == COMMENT_START_STATE) {
2473 if ($self->{nc} == 0x002D) { # -
2474
2475 $self->{state} = COMMENT_START_DASH_STATE;
2476
2477 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2478 $self->{line_prev} = $self->{line};
2479 $self->{column_prev} = $self->{column};
2480 $self->{column}++;
2481 $self->{nc}
2482 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2483 } else {
2484 $self->{set_nc}->($self);
2485 }
2486
2487 redo A;
2488 } elsif ($self->{nc} == 0x003E) { # >
2489
2490 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2491 $self->{state} = DATA_STATE;
2492 $self->{s_kwd} = '';
2493
2494 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2495 $self->{line_prev} = $self->{line};
2496 $self->{column_prev} = $self->{column};
2497 $self->{column}++;
2498 $self->{nc}
2499 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2500 } else {
2501 $self->{set_nc}->($self);
2502 }
2503
2504
2505 return ($self->{ct}); # comment
2506
2507 redo A;
2508 } elsif ($self->{nc} == -1) {
2509
2510 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2511 $self->{state} = DATA_STATE;
2512 $self->{s_kwd} = '';
2513 ## reconsume
2514
2515 return ($self->{ct}); # comment
2516
2517 redo A;
2518 } else {
2519
2520 $self->{ct}->{data} # comment
2521 .= chr ($self->{nc});
2522 $self->{state} = COMMENT_STATE;
2523
2524 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2525 $self->{line_prev} = $self->{line};
2526 $self->{column_prev} = $self->{column};
2527 $self->{column}++;
2528 $self->{nc}
2529 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2530 } else {
2531 $self->{set_nc}->($self);
2532 }
2533
2534 redo A;
2535 }
2536 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2537 if ($self->{nc} == 0x002D) { # -
2538
2539 $self->{state} = COMMENT_END_STATE;
2540
2541 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2542 $self->{line_prev} = $self->{line};
2543 $self->{column_prev} = $self->{column};
2544 $self->{column}++;
2545 $self->{nc}
2546 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2547 } else {
2548 $self->{set_nc}->($self);
2549 }
2550
2551 redo A;
2552 } elsif ($self->{nc} == 0x003E) { # >
2553
2554 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2555 $self->{state} = DATA_STATE;
2556 $self->{s_kwd} = '';
2557
2558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559 $self->{line_prev} = $self->{line};
2560 $self->{column_prev} = $self->{column};
2561 $self->{column}++;
2562 $self->{nc}
2563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564 } else {
2565 $self->{set_nc}->($self);
2566 }
2567
2568
2569 return ($self->{ct}); # comment
2570
2571 redo A;
2572 } elsif ($self->{nc} == -1) {
2573
2574 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2575 $self->{state} = DATA_STATE;
2576 $self->{s_kwd} = '';
2577 ## reconsume
2578
2579 return ($self->{ct}); # comment
2580
2581 redo A;
2582 } else {
2583
2584 $self->{ct}->{data} # comment
2585 .= '-' . chr ($self->{nc});
2586 $self->{state} = COMMENT_STATE;
2587
2588 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2589 $self->{line_prev} = $self->{line};
2590 $self->{column_prev} = $self->{column};
2591 $self->{column}++;
2592 $self->{nc}
2593 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2594 } else {
2595 $self->{set_nc}->($self);
2596 }
2597
2598 redo A;
2599 }
2600 } elsif ($self->{state} == COMMENT_STATE) {
2601 if ($self->{nc} == 0x002D) { # -
2602
2603 $self->{state} = COMMENT_END_DASH_STATE;
2604
2605 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2606 $self->{line_prev} = $self->{line};
2607 $self->{column_prev} = $self->{column};
2608 $self->{column}++;
2609 $self->{nc}
2610 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2611 } else {
2612 $self->{set_nc}->($self);
2613 }
2614
2615 redo A;
2616 } elsif ($self->{nc} == -1) {
2617
2618 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2619 $self->{state} = DATA_STATE;
2620 $self->{s_kwd} = '';
2621 ## reconsume
2622
2623 return ($self->{ct}); # comment
2624
2625 redo A;
2626 } else {
2627
2628 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2629 $self->{read_until}->($self->{ct}->{data},
2630 q[-],
2631 length $self->{ct}->{data});
2632
2633 ## Stay in the state
2634
2635 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2636 $self->{line_prev} = $self->{line};
2637 $self->{column_prev} = $self->{column};
2638 $self->{column}++;
2639 $self->{nc}
2640 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2641 } else {
2642 $self->{set_nc}->($self);
2643 }
2644
2645 redo A;
2646 }
2647 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2648 ## XML5: "comment dash state".
2649
2650 if ($self->{nc} == 0x002D) { # -
2651
2652 $self->{state} = COMMENT_END_STATE;
2653
2654 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2655 $self->{line_prev} = $self->{line};
2656 $self->{column_prev} = $self->{column};
2657 $self->{column}++;
2658 $self->{nc}
2659 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2660 } else {
2661 $self->{set_nc}->($self);
2662 }
2663
2664 redo A;
2665 } elsif ($self->{nc} == -1) {
2666
2667 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2668 $self->{s_kwd} = '';
2669 $self->{state} = DATA_STATE;
2670 $self->{s_kwd} = '';
2671 ## reconsume
2672
2673 return ($self->{ct}); # comment
2674
2675 redo A;
2676 } else {
2677
2678 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2679 $self->{state} = COMMENT_STATE;
2680
2681 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2682 $self->{line_prev} = $self->{line};
2683 $self->{column_prev} = $self->{column};
2684 $self->{column}++;
2685 $self->{nc}
2686 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2687 } else {
2688 $self->{set_nc}->($self);
2689 }
2690
2691 redo A;
2692 }
2693 } elsif ($self->{state} == COMMENT_END_STATE) {
2694 if ($self->{nc} == 0x003E) { # >
2695
2696 $self->{state} = DATA_STATE;
2697 $self->{s_kwd} = '';
2698
2699 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2700 $self->{line_prev} = $self->{line};
2701 $self->{column_prev} = $self->{column};
2702 $self->{column}++;
2703 $self->{nc}
2704 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2705 } else {
2706 $self->{set_nc}->($self);
2707 }
2708
2709
2710 return ($self->{ct}); # comment
2711
2712 redo A;
2713 } elsif ($self->{nc} == 0x002D) { # -
2714
2715 ## XML5: Not a parse error.
2716 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2717 line => $self->{line_prev},
2718 column => $self->{column_prev});
2719 $self->{ct}->{data} .= '-'; # comment
2720 ## Stay in the state
2721
2722 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2723 $self->{line_prev} = $self->{line};
2724 $self->{column_prev} = $self->{column};
2725 $self->{column}++;
2726 $self->{nc}
2727 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2728 } else {
2729 $self->{set_nc}->($self);
2730 }
2731
2732 redo A;
2733 } elsif ($self->{nc} == -1) {
2734
2735 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2736 $self->{state} = DATA_STATE;
2737 $self->{s_kwd} = '';
2738 ## reconsume
2739
2740 return ($self->{ct}); # comment
2741
2742 redo A;
2743 } else {
2744
2745 ## XML5: Not a parse error.
2746 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2747 line => $self->{line_prev},
2748 column => $self->{column_prev});
2749 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2750 $self->{state} = COMMENT_STATE;
2751
2752 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2753 $self->{line_prev} = $self->{line};
2754 $self->{column_prev} = $self->{column};
2755 $self->{column}++;
2756 $self->{nc}
2757 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2758 } else {
2759 $self->{set_nc}->($self);
2760 }
2761
2762 redo A;
2763 }
2764 } elsif ($self->{state} == DOCTYPE_STATE) {
2765 if ($is_space->{$self->{nc}}) {
2766
2767 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2768
2769 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2770 $self->{line_prev} = $self->{line};
2771 $self->{column_prev} = $self->{column};
2772 $self->{column}++;
2773 $self->{nc}
2774 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2775 } else {
2776 $self->{set_nc}->($self);
2777 }
2778
2779 redo A;
2780 } else {
2781
2782 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2783 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2784 ## reconsume
2785 redo A;
2786 }
2787 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2788 if ($is_space->{$self->{nc}}) {
2789
2790 ## Stay in the state
2791
2792 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2793 $self->{line_prev} = $self->{line};
2794 $self->{column_prev} = $self->{column};
2795 $self->{column}++;
2796 $self->{nc}
2797 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2798 } else {
2799 $self->{set_nc}->($self);
2800 }
2801
2802 redo A;
2803 } elsif ($self->{nc} == 0x003E) { # >
2804
2805 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2806 $self->{state} = DATA_STATE;
2807 $self->{s_kwd} = '';
2808
2809 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2810 $self->{line_prev} = $self->{line};
2811 $self->{column_prev} = $self->{column};
2812 $self->{column}++;
2813 $self->{nc}
2814 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2815 } else {
2816 $self->{set_nc}->($self);
2817 }
2818
2819
2820 return ($self->{ct}); # DOCTYPE (quirks)
2821
2822 redo A;
2823 } elsif ($self->{nc} == -1) {
2824
2825 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2826 $self->{state} = DATA_STATE;
2827 $self->{s_kwd} = '';
2828 ## reconsume
2829
2830 return ($self->{ct}); # DOCTYPE (quirks)
2831
2832 redo A;
2833 } else {
2834
2835 $self->{ct}->{name} = chr $self->{nc};
2836 delete $self->{ct}->{quirks};
2837 $self->{state} = DOCTYPE_NAME_STATE;
2838
2839 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2840 $self->{line_prev} = $self->{line};
2841 $self->{column_prev} = $self->{column};
2842 $self->{column}++;
2843 $self->{nc}
2844 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2845 } else {
2846 $self->{set_nc}->($self);
2847 }
2848
2849 redo A;
2850 }
2851 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2852 ## ISSUE: Redundant "First," in the spec.
2853 if ($is_space->{$self->{nc}}) {
2854
2855 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2856
2857 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2858 $self->{line_prev} = $self->{line};
2859 $self->{column_prev} = $self->{column};
2860 $self->{column}++;
2861 $self->{nc}
2862 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2863 } else {
2864 $self->{set_nc}->($self);
2865 }
2866
2867 redo A;
2868 } elsif ($self->{nc} == 0x003E) { # >
2869
2870 $self->{state} = DATA_STATE;
2871 $self->{s_kwd} = '';
2872
2873 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2874 $self->{line_prev} = $self->{line};
2875 $self->{column_prev} = $self->{column};
2876 $self->{column}++;
2877 $self->{nc}
2878 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2879 } else {
2880 $self->{set_nc}->($self);
2881 }
2882
2883
2884 return ($self->{ct}); # DOCTYPE
2885
2886 redo A;
2887 } elsif ($self->{nc} == -1) {
2888
2889 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2890 $self->{state} = DATA_STATE;
2891 $self->{s_kwd} = '';
2892 ## reconsume
2893
2894 $self->{ct}->{quirks} = 1;
2895 return ($self->{ct}); # DOCTYPE
2896
2897 redo A;
2898 } else {
2899
2900 $self->{ct}->{name}
2901 .= chr ($self->{nc}); # DOCTYPE
2902 ## Stay in the state
2903
2904 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2905 $self->{line_prev} = $self->{line};
2906 $self->{column_prev} = $self->{column};
2907 $self->{column}++;
2908 $self->{nc}
2909 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2910 } else {
2911 $self->{set_nc}->($self);
2912 }
2913
2914 redo A;
2915 }
2916 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2917 if ($is_space->{$self->{nc}}) {
2918
2919 ## Stay in the state
2920
2921 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2922 $self->{line_prev} = $self->{line};
2923 $self->{column_prev} = $self->{column};
2924 $self->{column}++;
2925 $self->{nc}
2926 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2927 } else {
2928 $self->{set_nc}->($self);
2929 }
2930
2931 redo A;
2932 } elsif ($self->{nc} == 0x003E) { # >
2933
2934 $self->{state} = DATA_STATE;
2935 $self->{s_kwd} = '';
2936
2937 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2938 $self->{line_prev} = $self->{line};
2939 $self->{column_prev} = $self->{column};
2940 $self->{column}++;
2941 $self->{nc}
2942 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2943 } else {
2944 $self->{set_nc}->($self);
2945 }
2946
2947
2948 return ($self->{ct}); # DOCTYPE
2949
2950 redo A;
2951 } elsif ($self->{nc} == -1) {
2952
2953 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2954 $self->{state} = DATA_STATE;
2955 $self->{s_kwd} = '';
2956 ## reconsume
2957
2958 $self->{ct}->{quirks} = 1;
2959 return ($self->{ct}); # DOCTYPE
2960
2961 redo A;
2962 } elsif ($self->{nc} == 0x0050 or # P
2963 $self->{nc} == 0x0070) { # p
2964 $self->{state} = PUBLIC_STATE;
2965 $self->{s_kwd} = chr $self->{nc};
2966
2967 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2968 $self->{line_prev} = $self->{line};
2969 $self->{column_prev} = $self->{column};
2970 $self->{column}++;
2971 $self->{nc}
2972 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2973 } else {
2974 $self->{set_nc}->($self);
2975 }
2976
2977 redo A;
2978 } elsif ($self->{nc} == 0x0053 or # S
2979 $self->{nc} == 0x0073) { # s
2980 $self->{state} = SYSTEM_STATE;
2981 $self->{s_kwd} = chr $self->{nc};
2982
2983 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2984 $self->{line_prev} = $self->{line};
2985 $self->{column_prev} = $self->{column};
2986 $self->{column}++;
2987 $self->{nc}
2988 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2989 } else {
2990 $self->{set_nc}->($self);
2991 }
2992
2993 redo A;
2994 } else {
2995
2996 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2997 $self->{ct}->{quirks} = 1;
2998
2999 $self->{state} = BOGUS_DOCTYPE_STATE;
3000
3001 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3002 $self->{line_prev} = $self->{line};
3003 $self->{column_prev} = $self->{column};
3004 $self->{column}++;
3005 $self->{nc}
3006 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3007 } else {
3008 $self->{set_nc}->($self);
3009 }
3010
3011 redo A;
3012 }
3013 } elsif ($self->{state} == PUBLIC_STATE) {
3014 ## ASCII case-insensitive
3015 if ($self->{nc} == [
3016 undef,
3017 0x0055, # U
3018 0x0042, # B
3019 0x004C, # L
3020 0x0049, # I
3021 ]->[length $self->{s_kwd}] or
3022 $self->{nc} == [
3023 undef,
3024 0x0075, # u
3025 0x0062, # b
3026 0x006C, # l
3027 0x0069, # i
3028 ]->[length $self->{s_kwd}]) {
3029
3030 ## Stay in the state.
3031 $self->{s_kwd} .= chr $self->{nc};
3032
3033 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3034 $self->{line_prev} = $self->{line};
3035 $self->{column_prev} = $self->{column};
3036 $self->{column}++;
3037 $self->{nc}
3038 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3039 } else {
3040 $self->{set_nc}->($self);
3041 }
3042
3043 redo A;
3044 } elsif ((length $self->{s_kwd}) == 5 and
3045 ($self->{nc} == 0x0043 or # C
3046 $self->{nc} == 0x0063)) { # c
3047
3048 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3049
3050 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3051 $self->{line_prev} = $self->{line};
3052 $self->{column_prev} = $self->{column};
3053 $self->{column}++;
3054 $self->{nc}
3055 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3056 } else {
3057 $self->{set_nc}->($self);
3058 }
3059
3060 redo A;
3061 } else {
3062
3063 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3064 line => $self->{line_prev},
3065 column => $self->{column_prev} + 1 - length $self->{s_kwd});
3066 $self->{ct}->{quirks} = 1;
3067
3068 $self->{state} = BOGUS_DOCTYPE_STATE;
3069 ## Reconsume.
3070 redo A;
3071 }
3072 } elsif ($self->{state} == SYSTEM_STATE) {
3073 ## ASCII case-insensitive
3074 if ($self->{nc} == [
3075 undef,
3076 0x0059, # Y
3077 0x0053, # S
3078 0x0054, # T
3079 0x0045, # E
3080 ]->[length $self->{s_kwd}] or
3081 $self->{nc} == [
3082 undef,
3083 0x0079, # y
3084 0x0073, # s
3085 0x0074, # t
3086 0x0065, # e
3087 ]->[length $self->{s_kwd}]) {
3088
3089 ## Stay in the state.
3090 $self->{s_kwd} .= chr $self->{nc};
3091
3092 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3093 $self->{line_prev} = $self->{line};
3094 $self->{column_prev} = $self->{column};
3095 $self->{column}++;
3096 $self->{nc}
3097 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3098 } else {
3099 $self->{set_nc}->($self);
3100 }
3101
3102 redo A;
3103 } elsif ((length $self->{s_kwd}) == 5 and
3104 ($self->{nc} == 0x004D or # M
3105 $self->{nc} == 0x006D)) { # m
3106
3107 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3108
3109 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3110 $self->{line_prev} = $self->{line};
3111 $self->{column_prev} = $self->{column};
3112 $self->{column}++;
3113 $self->{nc}
3114 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3115 } else {
3116 $self->{set_nc}->($self);
3117 }
3118
3119 redo A;
3120 } else {
3121
3122 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3123 line => $self->{line_prev},
3124 column => $self->{column_prev} + 1 - length $self->{s_kwd});
3125 $self->{ct}->{quirks} = 1;
3126
3127 $self->{state} = BOGUS_DOCTYPE_STATE;
3128 ## Reconsume.
3129 redo A;
3130 }
3131 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3132 if ($is_space->{$self->{nc}}) {
3133
3134 ## Stay in the state
3135
3136 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137 $self->{line_prev} = $self->{line};
3138 $self->{column_prev} = $self->{column};
3139 $self->{column}++;
3140 $self->{nc}
3141 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142 } else {
3143 $self->{set_nc}->($self);
3144 }
3145
3146 redo A;
3147 } elsif ($self->{nc} eq 0x0022) { # "
3148
3149 $self->{ct}->{pubid} = ''; # DOCTYPE
3150 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3151
3152 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3153 $self->{line_prev} = $self->{line};
3154 $self->{column_prev} = $self->{column};
3155 $self->{column}++;
3156 $self->{nc}
3157 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3158 } else {
3159 $self->{set_nc}->($self);
3160 }
3161
3162 redo A;
3163 } elsif ($self->{nc} eq 0x0027) { # '
3164
3165 $self->{ct}->{pubid} = ''; # DOCTYPE
3166 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3167
3168 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3169 $self->{line_prev} = $self->{line};
3170 $self->{column_prev} = $self->{column};
3171 $self->{column}++;
3172 $self->{nc}
3173 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3174 } else {
3175 $self->{set_nc}->($self);
3176 }
3177
3178 redo A;
3179 } elsif ($self->{nc} eq 0x003E) { # >
3180
3181 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3182
3183 $self->{state} = DATA_STATE;
3184 $self->{s_kwd} = '';
3185
3186 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3187 $self->{line_prev} = $self->{line};
3188 $self->{column_prev} = $self->{column};
3189 $self->{column}++;
3190 $self->{nc}
3191 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3192 } else {
3193 $self->{set_nc}->($self);
3194 }
3195
3196
3197 $self->{ct}->{quirks} = 1;
3198 return ($self->{ct}); # DOCTYPE
3199
3200 redo A;
3201 } elsif ($self->{nc} == -1) {
3202
3203 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3204
3205 $self->{state} = DATA_STATE;
3206 $self->{s_kwd} = '';
3207 ## reconsume
3208
3209 $self->{ct}->{quirks} = 1;
3210 return ($self->{ct}); # DOCTYPE
3211
3212 redo A;
3213 } else {
3214
3215 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3216 $self->{ct}->{quirks} = 1;
3217
3218 $self->{state} = BOGUS_DOCTYPE_STATE;
3219
3220 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3221 $self->{line_prev} = $self->{line};
3222 $self->{column_prev} = $self->{column};
3223 $self->{column}++;
3224 $self->{nc}
3225 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3226 } else {
3227 $self->{set_nc}->($self);
3228 }
3229
3230 redo A;
3231 }
3232 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3233 if ($self->{nc} == 0x0022) { # "
3234
3235 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3236
3237 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238 $self->{line_prev} = $self->{line};
3239 $self->{column_prev} = $self->{column};
3240 $self->{column}++;
3241 $self->{nc}
3242 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3243 } else {
3244 $self->{set_nc}->($self);
3245 }
3246
3247 redo A;
3248 } elsif ($self->{nc} == 0x003E) { # >
3249
3250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3251
3252 $self->{state} = DATA_STATE;
3253 $self->{s_kwd} = '';
3254
3255 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3256 $self->{line_prev} = $self->{line};
3257 $self->{column_prev} = $self->{column};
3258 $self->{column}++;
3259 $self->{nc}
3260 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3261 } else {
3262 $self->{set_nc}->($self);
3263 }
3264
3265
3266 $self->{ct}->{quirks} = 1;
3267 return ($self->{ct}); # DOCTYPE
3268
3269 redo A;
3270 } elsif ($self->{nc} == -1) {
3271
3272 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3273
3274 $self->{state} = DATA_STATE;
3275 $self->{s_kwd} = '';
3276 ## reconsume
3277
3278 $self->{ct}->{quirks} = 1;
3279 return ($self->{ct}); # DOCTYPE
3280
3281 redo A;
3282 } else {
3283
3284 $self->{ct}->{pubid} # DOCTYPE
3285 .= chr $self->{nc};
3286 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3287 length $self->{ct}->{pubid});
3288
3289 ## Stay in the state
3290
3291 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3292 $self->{line_prev} = $self->{line};
3293 $self->{column_prev} = $self->{column};
3294 $self->{column}++;
3295 $self->{nc}
3296 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3297 } else {
3298 $self->{set_nc}->($self);
3299 }
3300
3301 redo A;
3302 }
3303 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3304 if ($self->{nc} == 0x0027) { # '
3305
3306 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3307
3308 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3309 $self->{line_prev} = $self->{line};
3310 $self->{column_prev} = $self->{column};
3311 $self->{column}++;
3312 $self->{nc}
3313 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3314 } else {
3315 $self->{set_nc}->($self);
3316 }
3317
3318 redo A;
3319 } elsif ($self->{nc} == 0x003E) { # >
3320
3321 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3322
3323 $self->{state} = DATA_STATE;
3324 $self->{s_kwd} = '';
3325
3326 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3327 $self->{line_prev} = $self->{line};
3328 $self->{column_prev} = $self->{column};
3329 $self->{column}++;
3330 $self->{nc}
3331 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3332 } else {
3333 $self->{set_nc}->($self);
3334 }
3335
3336
3337 $self->{ct}->{quirks} = 1;
3338 return ($self->{ct}); # DOCTYPE
3339
3340 redo A;
3341 } elsif ($self->{nc} == -1) {
3342
3343 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3344
3345 $self->{state} = DATA_STATE;
3346 $self->{s_kwd} = '';
3347 ## reconsume
3348
3349 $self->{ct}->{quirks} = 1;
3350 return ($self->{ct}); # DOCTYPE
3351
3352 redo A;
3353 } else {
3354
3355 $self->{ct}->{pubid} # DOCTYPE
3356 .= chr $self->{nc};
3357 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3358 length $self->{ct}->{pubid});
3359
3360 ## Stay in the state
3361
3362 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3363 $self->{line_prev} = $self->{line};
3364 $self->{column_prev} = $self->{column};
3365 $self->{column}++;
3366 $self->{nc}
3367 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3368 } else {
3369 $self->{set_nc}->($self);
3370 }
3371
3372 redo A;
3373 }
3374 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3375 if ($is_space->{$self->{nc}}) {
3376
3377 ## Stay in the state
3378
3379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3380 $self->{line_prev} = $self->{line};
3381 $self->{column_prev} = $self->{column};
3382 $self->{column}++;
3383 $self->{nc}
3384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3385 } else {
3386 $self->{set_nc}->($self);
3387 }
3388
3389 redo A;
3390 } elsif ($self->{nc} == 0x0022) { # "
3391
3392 $self->{ct}->{sysid} = ''; # DOCTYPE
3393 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3394
3395 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3396 $self->{line_prev} = $self->{line};
3397 $self->{column_prev} = $self->{column};
3398 $self->{column}++;
3399 $self->{nc}
3400 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3401 } else {
3402 $self->{set_nc}->($self);
3403 }
3404
3405 redo A;
3406 } elsif ($self->{nc} == 0x0027) { # '
3407
3408 $self->{ct}->{sysid} = ''; # DOCTYPE
3409 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3410
3411 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3412 $self->{line_prev} = $self->{line};
3413 $self->{column_prev} = $self->{column};
3414 $self->{column}++;
3415 $self->{nc}
3416 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3417 } else {
3418 $self->{set_nc}->($self);
3419 }
3420
3421 redo A;
3422 } elsif ($self->{nc} == 0x003E) { # >
3423
3424 $self->{state} = DATA_STATE;
3425 $self->{s_kwd} = '';
3426
3427 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3428 $self->{line_prev} = $self->{line};
3429 $self->{column_prev} = $self->{column};
3430 $self->{column}++;
3431 $self->{nc}
3432 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3433 } else {
3434 $self->{set_nc}->($self);
3435 }
3436
3437
3438 return ($self->{ct}); # DOCTYPE
3439
3440 redo A;
3441 } elsif ($self->{nc} == -1) {
3442
3443 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3444
3445 $self->{state} = DATA_STATE;
3446 $self->{s_kwd} = '';
3447 ## reconsume
3448
3449 $self->{ct}->{quirks} = 1;
3450 return ($self->{ct}); # DOCTYPE
3451
3452 redo A;
3453 } else {
3454
3455 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3456 $self->{ct}->{quirks} = 1;
3457
3458 $self->{state} = BOGUS_DOCTYPE_STATE;
3459
3460 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3461 $self->{line_prev} = $self->{line};
3462 $self->{column_prev} = $self->{column};
3463 $self->{column}++;
3464 $self->{nc}
3465 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3466 } else {
3467 $self->{set_nc}->($self);
3468 }
3469
3470 redo A;
3471 }
3472 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3473 if ($is_space->{$self->{nc}}) {
3474
3475 ## Stay in the state
3476
3477 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3478 $self->{line_prev} = $self->{line};
3479 $self->{column_prev} = $self->{column};
3480 $self->{column}++;
3481 $self->{nc}
3482 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3483 } else {
3484 $self->{set_nc}->($self);
3485 }
3486
3487 redo A;
3488 } elsif ($self->{nc} == 0x0022) { # "
3489
3490 $self->{ct}->{sysid} = ''; # DOCTYPE
3491 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3492
3493 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3494 $self->{line_prev} = $self->{line};
3495 $self->{column_prev} = $self->{column};
3496 $self->{column}++;
3497 $self->{nc}
3498 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3499 } else {
3500 $self->{set_nc}->($self);
3501 }
3502
3503 redo A;
3504 } elsif ($self->{nc} == 0x0027) { # '
3505
3506 $self->{ct}->{sysid} = ''; # DOCTYPE
3507 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3508
3509 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3510 $self->{line_prev} = $self->{line};
3511 $self->{column_prev} = $self->{column};
3512 $self->{column}++;
3513 $self->{nc}
3514 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3515 } else {
3516 $self->{set_nc}->($self);
3517 }
3518
3519 redo A;
3520 } elsif ($self->{nc} == 0x003E) { # >
3521
3522 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3523 $self->{state} = DATA_STATE;
3524 $self->{s_kwd} = '';
3525
3526 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3527 $self->{line_prev} = $self->{line};
3528 $self->{column_prev} = $self->{column};
3529 $self->{column}++;
3530 $self->{nc}
3531 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3532 } else {
3533 $self->{set_nc}->($self);
3534 }
3535
3536
3537 $self->{ct}->{quirks} = 1;
3538 return ($self->{ct}); # DOCTYPE
3539
3540 redo A;
3541 } elsif ($self->{nc} == -1) {
3542
3543 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3544
3545 $self->{state} = DATA_STATE;
3546 $self->{s_kwd} = '';
3547 ## reconsume
3548
3549 $self->{ct}->{quirks} = 1;
3550 return ($self->{ct}); # DOCTYPE
3551
3552 redo A;
3553 } else {
3554
3555 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3556 $self->{ct}->{quirks} = 1;
3557
3558 $self->{state} = BOGUS_DOCTYPE_STATE;
3559
3560 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3561 $self->{line_prev} = $self->{line};
3562 $self->{column_prev} = $self->{column};
3563 $self->{column}++;
3564 $self->{nc}
3565 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3566 } else {
3567 $self->{set_nc}->($self);
3568 }
3569
3570 redo A;
3571 }
3572 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3573 if ($self->{nc} == 0x0022) { # "
3574
3575 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3576
3577 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3578 $self->{line_prev} = $self->{line};
3579 $self->{column_prev} = $self->{column};
3580 $self->{column}++;
3581 $self->{nc}
3582 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3583 } else {
3584 $self->{set_nc}->($self);
3585 }
3586
3587 redo A;
3588 } elsif ($self->{nc} == 0x003E) { # >
3589
3590 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3591
3592 $self->{state} = DATA_STATE;
3593 $self->{s_kwd} = '';
3594
3595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3596 $self->{line_prev} = $self->{line};
3597 $self->{column_prev} = $self->{column};
3598 $self->{column}++;
3599 $self->{nc}
3600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3601 } else {
3602 $self->{set_nc}->($self);
3603 }
3604
3605
3606 $self->{ct}->{quirks} = 1;
3607 return ($self->{ct}); # DOCTYPE
3608
3609 redo A;
3610 } elsif ($self->{nc} == -1) {
3611
3612 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3613
3614 $self->{state} = DATA_STATE;
3615 $self->{s_kwd} = '';
3616 ## reconsume
3617
3618 $self->{ct}->{quirks} = 1;
3619 return ($self->{ct}); # DOCTYPE
3620
3621 redo A;
3622 } else {
3623
3624 $self->{ct}->{sysid} # DOCTYPE
3625 .= chr $self->{nc};
3626 $self->{read_until}->($self->{ct}->{sysid}, q[">],
3627 length $self->{ct}->{sysid});
3628
3629 ## Stay in the state
3630
3631 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3632 $self->{line_prev} = $self->{line};
3633 $self->{column_prev} = $self->{column};
3634 $self->{column}++;
3635 $self->{nc}
3636 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3637 } else {
3638 $self->{set_nc}->($self);
3639 }
3640
3641 redo A;
3642 }
3643 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3644 if ($self->{nc} == 0x0027) { # '
3645
3646 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3647
3648 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3649 $self->{line_prev} = $self->{line};
3650 $self->{column_prev} = $self->{column};
3651 $self->{column}++;
3652 $self->{nc}
3653 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3654 } else {
3655 $self->{set_nc}->($self);
3656 }
3657
3658 redo A;
3659 } elsif ($self->{nc} == 0x003E) { # >
3660
3661 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3662
3663 $self->{state} = DATA_STATE;
3664 $self->{s_kwd} = '';
3665
3666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3667 $self->{line_prev} = $self->{line};
3668 $self->{column_prev} = $self->{column};
3669 $self->{column}++;
3670 $self->{nc}
3671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3672 } else {
3673 $self->{set_nc}->($self);
3674 }
3675
3676
3677 $self->{ct}->{quirks} = 1;
3678 return ($self->{ct}); # DOCTYPE
3679
3680 redo A;
3681 } elsif ($self->{nc} == -1) {
3682
3683 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3684
3685 $self->{state} = DATA_STATE;
3686 $self->{s_kwd} = '';
3687 ## reconsume
3688
3689 $self->{ct}->{quirks} = 1;
3690 return ($self->{ct}); # DOCTYPE
3691
3692 redo A;
3693 } else {
3694
3695 $self->{ct}->{sysid} # DOCTYPE
3696 .= chr $self->{nc};
3697 $self->{read_until}->($self->{ct}->{sysid}, q['>],
3698 length $self->{ct}->{sysid});
3699
3700 ## Stay in the state
3701
3702 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3703 $self->{line_prev} = $self->{line};
3704 $self->{column_prev} = $self->{column};
3705 $self->{column}++;
3706 $self->{nc}
3707 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3708 } else {
3709 $self->{set_nc}->($self);
3710 }
3711
3712 redo A;
3713 }
3714 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3715 if ($is_space->{$self->{nc}}) {
3716
3717 ## Stay in the state
3718
3719 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3720 $self->{line_prev} = $self->{line};
3721 $self->{column_prev} = $self->{column};
3722 $self->{column}++;
3723 $self->{nc}
3724 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3725 } else {
3726 $self->{set_nc}->($self);
3727 }
3728
3729 redo A;
3730 } elsif ($self->{nc} == 0x003E) { # >
3731
3732 $self->{state} = DATA_STATE;
3733 $self->{s_kwd} = '';
3734
3735 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3736 $self->{line_prev} = $self->{line};
3737 $self->{column_prev} = $self->{column};
3738 $self->{column}++;
3739 $self->{nc}
3740 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3741 } else {
3742 $self->{set_nc}->($self);
3743 }
3744
3745
3746 return ($self->{ct}); # DOCTYPE
3747
3748 redo A;
3749 } elsif ($self->{nc} == -1) {
3750
3751 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3752 $self->{state} = DATA_STATE;
3753 $self->{s_kwd} = '';
3754 ## reconsume
3755
3756 $self->{ct}->{quirks} = 1;
3757 return ($self->{ct}); # DOCTYPE
3758
3759 redo A;
3760 } else {
3761
3762 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3763 #$self->{ct}->{quirks} = 1;
3764
3765 $self->{state} = BOGUS_DOCTYPE_STATE;
3766
3767 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3768 $self->{line_prev} = $self->{line};
3769 $self->{column_prev} = $self->{column};
3770 $self->{column}++;
3771 $self->{nc}
3772 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3773 } else {
3774 $self->{set_nc}->($self);
3775 }
3776
3777 redo A;
3778 }
3779 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3780 if ($self->{nc} == 0x003E) { # >
3781
3782 $self->{state} = DATA_STATE;
3783 $self->{s_kwd} = '';
3784
3785 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3786 $self->{line_prev} = $self->{line};
3787 $self->{column_prev} = $self->{column};
3788 $self->{column}++;
3789 $self->{nc}
3790 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3791 } else {
3792 $self->{set_nc}->($self);
3793 }
3794
3795
3796 return ($self->{ct}); # DOCTYPE
3797
3798 redo A;
3799 } elsif ($self->{nc} == -1) {
3800
3801 $self->{state} = DATA_STATE;
3802 $self->{s_kwd} = '';
3803 ## reconsume
3804
3805 return ($self->{ct}); # DOCTYPE
3806
3807 redo A;
3808 } else {
3809
3810 my $s = '';
3811 $self->{read_until}->($s, q[>], 0);
3812
3813 ## Stay in the state
3814
3815 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3816 $self->{line_prev} = $self->{line};
3817 $self->{column_prev} = $self->{column};
3818 $self->{column}++;
3819 $self->{nc}
3820 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3821 } else {
3822 $self->{set_nc}->($self);
3823 }
3824
3825 redo A;
3826 }
3827 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3828 ## NOTE: "CDATA section state" in the state is jointly implemented
3829 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3830 ## and |CDATA_SECTION_MSE2_STATE|.
3831
3832 ## XML5: "CDATA state".
3833
3834 if ($self->{nc} == 0x005D) { # ]
3835
3836 $self->{state} = CDATA_SECTION_MSE1_STATE;
3837
3838 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3839 $self->{line_prev} = $self->{line};
3840 $self->{column_prev} = $self->{column};
3841 $self->{column}++;
3842 $self->{nc}
3843 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3844 } else {
3845 $self->{set_nc}->($self);
3846 }
3847
3848 redo A;
3849 } elsif ($self->{nc} == -1) {
3850 if ($self->{is_xml}) {
3851
3852 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3853 } else {
3854
3855 }
3856
3857 $self->{state} = DATA_STATE;
3858 $self->{s_kwd} = '';
3859 ## Reconsume.
3860 if (length $self->{ct}->{data}) { # character
3861
3862 return ($self->{ct}); # character
3863 } else {
3864
3865 ## No token to emit. $self->{ct} is discarded.
3866 }
3867 redo A;
3868 } else {
3869
3870 $self->{ct}->{data} .= chr $self->{nc};
3871 $self->{read_until}->($self->{ct}->{data},
3872 q<]>,
3873 length $self->{ct}->{data});
3874
3875 ## Stay in the state.
3876
3877 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3878 $self->{line_prev} = $self->{line};
3879 $self->{column_prev} = $self->{column};
3880 $self->{column}++;
3881 $self->{nc}
3882 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3883 } else {
3884 $self->{set_nc}->($self);
3885 }
3886
3887 redo A;
3888 }
3889
3890 ## ISSUE: "text tokens" in spec.
3891 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3892 ## XML5: "CDATA bracket state".
3893
3894 if ($self->{nc} == 0x005D) { # ]
3895
3896 $self->{state} = CDATA_SECTION_MSE2_STATE;
3897
3898 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3899 $self->{line_prev} = $self->{line};
3900 $self->{column_prev} = $self->{column};
3901 $self->{column}++;
3902 $self->{nc}
3903 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3904 } else {
3905 $self->{set_nc}->($self);
3906 }
3907
3908 redo A;
3909 } else {
3910
3911 ## XML5: If EOF, "]" is not appended and changed to the data state.
3912 $self->{ct}->{data} .= ']';
3913 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3914 ## Reconsume.
3915 redo A;
3916 }
3917 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3918 ## XML5: "CDATA end state".
3919
3920 if ($self->{nc} == 0x003E) { # >
3921 $self->{state} = DATA_STATE;
3922 $self->{s_kwd} = '';
3923
3924 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3925 $self->{line_prev} = $self->{line};
3926 $self->{column_prev} = $self->{column};
3927 $self->{column}++;
3928 $self->{nc}
3929 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3930 } else {
3931 $self->{set_nc}->($self);
3932 }
3933
3934 if (length $self->{ct}->{data}) { # character
3935
3936 return ($self->{ct}); # character
3937 } else {
3938
3939 ## No token to emit. $self->{ct} is discarded.
3940 }
3941 redo A;
3942 } elsif ($self->{nc} == 0x005D) { # ]
3943 # character
3944 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3945 ## Stay in the state.
3946
3947 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3948 $self->{line_prev} = $self->{line};
3949 $self->{column_prev} = $self->{column};
3950 $self->{column}++;
3951 $self->{nc}
3952 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3953 } else {
3954 $self->{set_nc}->($self);
3955 }
3956
3957 redo A;
3958 } else {
3959
3960 $self->{ct}->{data} .= ']]'; # character
3961 $self->{state} = CDATA_SECTION_STATE;
3962 ## Reconsume. ## XML5: Emit.
3963 redo A;
3964 }
3965 } elsif ($self->{state} == ENTITY_STATE) {
3966 if ($is_space->{$self->{nc}} or
3967 {
3968 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3969 $self->{entity_add} => 1,
3970 }->{$self->{nc}}) {
3971
3972 ## Don't consume
3973 ## No error
3974 ## Return nothing.
3975 #
3976 } elsif ($self->{nc} == 0x0023) { # #
3977
3978 $self->{state} = ENTITY_HASH_STATE;
3979 $self->{s_kwd} = '#';
3980
3981 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3982 $self->{line_prev} = $self->{line};
3983 $self->{column_prev} = $self->{column};
3984 $self->{column}++;
3985 $self->{nc}
3986 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3987 } else {
3988 $self->{set_nc}->($self);
3989 }
3990
3991 redo A;
3992 } elsif ((0x0041 <= $self->{nc} and
3993 $self->{nc} <= 0x005A) or # A..Z
3994 (0x0061 <= $self->{nc} and
3995 $self->{nc} <= 0x007A)) { # a..z
3996
3997 require Whatpm::_NamedEntityList;
3998 $self->{state} = ENTITY_NAME_STATE;
3999 $self->{s_kwd} = chr $self->{nc};
4000 $self->{entity__value} = $self->{s_kwd};
4001 $self->{entity__match} = 0;
4002
4003 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4004 $self->{line_prev} = $self->{line};
4005 $self->{column_prev} = $self->{column};
4006 $self->{column}++;
4007 $self->{nc}
4008 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4009 } else {
4010 $self->{set_nc}->($self);
4011 }
4012
4013 redo A;
4014 } else {
4015
4016 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4017 ## Return nothing.
4018 #
4019 }
4020
4021 ## NOTE: No character is consumed by the "consume a character
4022 ## reference" algorithm. In other word, there is an "&" character
4023 ## that does not introduce a character reference, which would be
4024 ## appended to the parent element or the attribute value in later
4025 ## process of the tokenizer.
4026
4027 if ($self->{prev_state} == DATA_STATE) {
4028
4029 $self->{state} = $self->{prev_state};
4030 $self->{s_kwd} = '';
4031 ## Reconsume.
4032 return ({type => CHARACTER_TOKEN, data => '&',
4033 line => $self->{line_prev},
4034 column => $self->{column_prev},
4035 });
4036 redo A;
4037 } else {
4038
4039 $self->{ca}->{value} .= '&';
4040 $self->{state} = $self->{prev_state};
4041 $self->{s_kwd} = '';
4042 ## Reconsume.
4043 redo A;
4044 }
4045 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4046 if ($self->{nc} == 0x0078 or # x
4047 $self->{nc} == 0x0058) { # X
4048
4049 $self->{state} = HEXREF_X_STATE;
4050 $self->{s_kwd} .= chr $self->{nc};
4051
4052 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4053 $self->{line_prev} = $self->{line};
4054 $self->{column_prev} = $self->{column};
4055 $self->{column}++;
4056 $self->{nc}
4057 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4058 } else {
4059 $self->{set_nc}->($self);
4060 }
4061
4062 redo A;
4063 } elsif (0x0030 <= $self->{nc} and
4064 $self->{nc} <= 0x0039) { # 0..9
4065
4066 $self->{state} = NCR_NUM_STATE;
4067 $self->{s_kwd} = $self->{nc} - 0x0030;
4068
4069 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4070 $self->{line_prev} = $self->{line};
4071 $self->{column_prev} = $self->{column};
4072 $self->{column}++;
4073 $self->{nc}
4074 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4075 } else {
4076 $self->{set_nc}->($self);
4077 }
4078
4079 redo A;
4080 } else {
4081 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4082 line => $self->{line_prev},
4083 column => $self->{column_prev} - 1);
4084
4085 ## NOTE: According to the spec algorithm, nothing is returned,
4086 ## and then "&#" is appended to the parent element or the attribute
4087 ## value in the later processing.
4088
4089 if ($self->{prev_state} == DATA_STATE) {
4090
4091 $self->{state} = $self->{prev_state};
4092 $self->{s_kwd} = '';
4093 ## Reconsume.
4094 return ({type => CHARACTER_TOKEN,
4095 data => '&#',
4096 line => $self->{line_prev},
4097 column => $self->{column_prev} - 1,
4098 });
4099 redo A;
4100 } else {
4101
4102 $self->{ca}->{value} .= '&#';
4103 $self->{state} = $self->{prev_state};
4104 $self->{s_kwd} = '';
4105 ## Reconsume.
4106 redo A;
4107 }
4108 }
4109 } elsif ($self->{state} == NCR_NUM_STATE) {
4110 if (0x0030 <= $self->{nc} and
4111 $self->{nc} <= 0x0039) { # 0..9
4112
4113 $self->{s_kwd} *= 10;
4114 $self->{s_kwd} += $self->{nc} - 0x0030;
4115
4116 ## Stay in the state.
4117
4118 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4119 $self->{line_prev} = $self->{line};
4120 $self->{column_prev} = $self->{column};
4121 $self->{column}++;
4122 $self->{nc}
4123 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4124 } else {
4125 $self->{set_nc}->($self);
4126 }
4127
4128 redo A;
4129 } elsif ($self->{nc} == 0x003B) { # ;
4130
4131
4132 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4133 $self->{line_prev} = $self->{line};
4134 $self->{column_prev} = $self->{column};
4135 $self->{column}++;
4136 $self->{nc}
4137 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4138 } else {
4139 $self->{set_nc}->($self);
4140 }
4141
4142 #
4143 } else {
4144
4145 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4146 ## Reconsume.
4147 #
4148 }
4149
4150 my $code = $self->{s_kwd};
4151 my $l = $self->{line_prev};
4152 my $c = $self->{column_prev};
4153 if ($charref_map->{$code}) {
4154
4155 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4156 text => (sprintf 'U+%04X', $code),
4157 line => $l, column => $c);
4158 $code = $charref_map->{$code};
4159 } elsif ($code > 0x10FFFF) {
4160
4161 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4162 text => (sprintf 'U-%08X', $code),
4163 line => $l, column => $c);
4164 $code = 0xFFFD;
4165 }
4166
4167 if ($self->{prev_state} == DATA_STATE) {
4168
4169 $self->{state} = $self->{prev_state};
4170 $self->{s_kwd} = '';
4171 ## Reconsume.
4172 return ({type => CHARACTER_TOKEN, data => chr $code,
4173 has_reference => 1,
4174 line => $l, column => $c,
4175 });
4176 redo A;
4177 } else {
4178
4179 $self->{ca}->{value} .= chr $code;
4180 $self->{ca}->{has_reference} = 1;
4181 $self->{state} = $self->{prev_state};
4182 $self->{s_kwd} = '';
4183 ## Reconsume.
4184 redo A;
4185 }
4186 } elsif ($self->{state} == HEXREF_X_STATE) {
4187 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4188 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4189 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4190 # 0..9, A..F, a..f
4191
4192 $self->{state} = HEXREF_HEX_STATE;
4193 $self->{s_kwd} = 0;
4194 ## Reconsume.
4195 redo A;
4196 } else {
4197 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4198 line => $self->{line_prev},
4199 column => $self->{column_prev} - 2);
4200
4201 ## NOTE: According to the spec algorithm, nothing is returned,
4202 ## and then "&#" followed by "X" or "x" is appended to the parent
4203 ## element or the attribute value in the later processing.
4204
4205 if ($self->{prev_state} == DATA_STATE) {
4206
4207 $self->{state} = $self->{prev_state};
4208 $self->{s_kwd} = '';
4209 ## Reconsume.
4210 return ({type => CHARACTER_TOKEN,
4211 data => '&' . $self->{s_kwd},
4212 line => $self->{line_prev},
4213 column => $self->{column_prev} - length $self->{s_kwd},
4214 });
4215 redo A;
4216 } else {
4217
4218 $self->{ca}->{value} .= '&' . $self->{s_kwd};
4219 $self->{state} = $self->{prev_state};
4220 $self->{s_kwd} = '';
4221 ## Reconsume.
4222 redo A;
4223 }
4224 }
4225 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4226 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4227 # 0..9
4228
4229 $self->{s_kwd} *= 0x10;
4230 $self->{s_kwd} += $self->{nc} - 0x0030;
4231 ## Stay in the state.
4232
4233 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4234 $self->{line_prev} = $self->{line};
4235 $self->{column_prev} = $self->{column};
4236 $self->{column}++;
4237 $self->{nc}
4238 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4239 } else {
4240 $self->{set_nc}->($self);
4241 }
4242
4243 redo A;
4244 } elsif (0x0061 <= $self->{nc} and
4245 $self->{nc} <= 0x0066) { # a..f
4246
4247 $self->{s_kwd} *= 0x10;
4248 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4249 ## Stay in the state.
4250
4251 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4252 $self->{line_prev} = $self->{line};
4253 $self->{column_prev} = $self->{column};
4254 $self->{column}++;
4255 $self->{nc}
4256 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4257 } else {
4258 $self->{set_nc}->($self);
4259 }
4260
4261 redo A;
4262 } elsif (0x0041 <= $self->{nc} and
4263 $self->{nc} <= 0x0046) { # A..F
4264
4265 $self->{s_kwd} *= 0x10;
4266 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4267 ## Stay in the state.
4268
4269 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4270 $self->{line_prev} = $self->{line};
4271 $self->{column_prev} = $self->{column};
4272 $self->{column}++;
4273 $self->{nc}
4274 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4275 } else {
4276 $self->{set_nc}->($self);
4277 }
4278
4279 redo A;
4280 } elsif ($self->{nc} == 0x003B) { # ;
4281
4282
4283 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4284 $self->{line_prev} = $self->{line};
4285 $self->{column_prev} = $self->{column};
4286 $self->{column}++;
4287 $self->{nc}
4288 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4289 } else {
4290 $self->{set_nc}->($self);
4291 }
4292
4293 #
4294 } else {
4295
4296 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4297 line => $self->{line},
4298 column => $self->{column});
4299 ## Reconsume.
4300 #
4301 }
4302
4303 my $code = $self->{s_kwd};
4304 my $l = $self->{line_prev};
4305 my $c = $self->{column_prev};
4306 if ($charref_map->{$code}) {
4307
4308 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4309 text => (sprintf 'U+%04X', $code),
4310 line => $l, column => $c);
4311 $code = $charref_map->{$code};
4312 } elsif ($code > 0x10FFFF) {
4313
4314 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4315 text => (sprintf 'U-%08X', $code),
4316 line => $l, column => $c);
4317 $code = 0xFFFD;
4318 }
4319
4320 if ($self->{prev_state} == DATA_STATE) {
4321
4322 $self->{state} = $self->{prev_state};
4323 $self->{s_kwd} = '';
4324 ## Reconsume.
4325 return ({type => CHARACTER_TOKEN, data => chr $code,
4326 has_reference => 1,
4327 line => $l, column => $c,
4328 });
4329 redo A;
4330 } else {
4331
4332 $self->{ca}->{value} .= chr $code;
4333 $self->{ca}->{has_reference} = 1;
4334 $self->{state} = $self->{prev_state};
4335 $self->{s_kwd} = '';
4336 ## Reconsume.
4337 redo A;
4338 }
4339 } elsif ($self->{state} == ENTITY_NAME_STATE) {
4340 if (length $self->{s_kwd} < 30 and
4341 ## NOTE: Some number greater than the maximum length of entity name
4342 ((0x0041 <= $self->{nc} and # a
4343 $self->{nc} <= 0x005A) or # x
4344 (0x0061 <= $self->{nc} and # a
4345 $self->{nc} <= 0x007A) or # z
4346 (0x0030 <= $self->{nc} and # 0
4347 $self->{nc} <= 0x0039) or # 9
4348 $self->{nc} == 0x003B)) { # ;
4349 our $EntityChar;
4350 $self->{s_kwd} .= chr $self->{nc};
4351 if (defined $EntityChar->{$self->{s_kwd}}) {
4352 if ($self->{nc} == 0x003B) { # ;
4353
4354 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4355 $self->{entity__match} = 1;
4356
4357 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4358 $self->{line_prev} = $self->{line};
4359 $self->{column_prev} = $self->{column};
4360 $self->{column}++;
4361 $self->{nc}
4362 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4363 } else {
4364 $self->{set_nc}->($self);
4365 }
4366
4367 #
4368 } else {
4369
4370 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4371 $self->{entity__match} = -1;
4372 ## Stay in the state.
4373
4374 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4375 $self->{line_prev} = $self->{line};
4376 $self->{column_prev} = $self->{column};
4377 $self->{column}++;
4378 $self->{nc}
4379 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4380 } else {
4381 $self->{set_nc}->($self);
4382 }
4383
4384 redo A;
4385 }
4386 } else {
4387
4388 $self->{entity__value} .= chr $self->{nc};
4389 $self->{entity__match} *= 2;
4390 ## Stay in the state.
4391
4392 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4393 $self->{line_prev} = $self->{line};
4394 $self->{column_prev} = $self->{column};
4395 $self->{column}++;
4396 $self->{nc}
4397 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4398 } else {
4399 $self->{set_nc}->($self);
4400 }
4401
4402 redo A;
4403 }
4404 }
4405
4406 my $data;
4407 my $has_ref;
4408 if ($self->{entity__match} > 0) {
4409
4410 $data = $self->{entity__value};
4411 $has_ref = 1;
4412 #
4413 } elsif ($self->{entity__match} < 0) {
4414 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4415 if ($self->{prev_state} != DATA_STATE and # in attribute
4416 $self->{entity__match} < -1) {
4417
4418 $data = '&' . $self->{s_kwd};
4419 #
4420 } else {
4421
4422 $data = $self->{entity__value};
4423 $has_ref = 1;
4424 #
4425 }
4426 } else {
4427
4428 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4429 line => $self->{line_prev},
4430 column => $self->{column_prev} - length $self->{s_kwd});
4431 $data = '&' . $self->{s_kwd};
4432 #
4433 }
4434
4435 ## NOTE: In these cases, when a character reference is found,
4436 ## it is consumed and a character token is returned, or, otherwise,
4437 ## nothing is consumed and returned, according to the spec algorithm.
4438 ## In this implementation, anything that has been examined by the
4439 ## tokenizer is appended to the parent element or the attribute value
4440 ## as string, either literal string when no character reference or
4441 ## entity-replaced string otherwise, in this stage, since any characters
4442 ## that would not be consumed are appended in the data state or in an
4443 ## appropriate attribute value state anyway.
4444
4445 if ($self->{prev_state} == DATA_STATE) {
4446
4447 $self->{state} = $self->{prev_state};
4448 $self->{s_kwd} = '';
4449 ## Reconsume.
4450 return ({type => CHARACTER_TOKEN,
4451 data => $data,
4452 has_reference => $has_ref,
4453 line => $self->{line_prev},
4454 column => $self->{column_prev} + 1 - length $self->{s_kwd},
4455 });
4456 redo A;
4457 } else {
4458
4459 $self->{ca}->{value} .= $data;
4460 $self->{ca}->{has_reference} = 1 if $has_ref;
4461 $self->{state} = $self->{prev_state};
4462 $self->{s_kwd} = '';
4463 ## Reconsume.
4464 redo A;
4465 }
4466
4467 ## XML-only states
4468
4469 } elsif ($self->{state} == PI_STATE) {
4470 if ($is_space->{$self->{nc}} or
4471 $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4472 $self->{nc} == -1) {
4473 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4474 line => $self->{line_prev},
4475 column => $self->{column_prev}
4476 - 1 * ($self->{nc} != -1));
4477 $self->{state} = BOGUS_COMMENT_STATE;
4478 ## Reconsume.
4479 $self->{ct} = {type => COMMENT_TOKEN,
4480 data => '?',
4481 line => $self->{line_prev},
4482 column => $self->{column_prev}
4483 - 1 * ($self->{nc} != -1),
4484 };
4485 redo A;
4486 } else {
4487 $self->{ct} = {type => PI_TOKEN,
4488 target => chr $self->{nc},
4489 data => '',
4490 line => $self->{line_prev},
4491 column => $self->{column_prev} - 1,
4492 };
4493 $self->{state} = PI_TARGET_STATE;
4494
4495 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4496 $self->{line_prev} = $self->{line};
4497 $self->{column_prev} = $self->{column};
4498 $self->{column}++;
4499 $self->{nc}
4500 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4501 } else {
4502 $self->{set_nc}->($self);
4503 }
4504
4505 redo A;
4506 }
4507 } elsif ($self->{state} == PI_TARGET_STATE) {
4508 if ($is_space->{$self->{nc}}) {
4509 $self->{state} = PI_TARGET_AFTER_STATE;
4510
4511 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4512 $self->{line_prev} = $self->{line};
4513 $self->{column_prev} = $self->{column};
4514 $self->{column}++;
4515 $self->{nc}
4516 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4517 } else {
4518 $self->{set_nc}->($self);
4519 }
4520
4521 redo A;
4522 } elsif ($self->{nc} == -1) {
4523 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4524 $self->{state} = DATA_STATE;
4525 $self->{s_kwd} = '';
4526 ## Reconsume.
4527 return ($self->{ct}); # pi
4528 redo A;
4529 } elsif ($self->{nc} == 0x003F) { # ?
4530 $self->{state} = PI_AFTER_STATE;
4531
4532 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4533 $self->{line_prev} = $self->{line};
4534 $self->{column_prev} = $self->{column};
4535 $self->{column}++;
4536 $self->{nc}
4537 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4538 } else {
4539 $self->{set_nc}->($self);
4540 }
4541
4542 redo A;
4543 } else {
4544 ## XML5: typo ("tag name" -> "target")
4545 $self->{ct}->{target} .= chr $self->{nc}; # pi
4546
4547 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4548 $self->{line_prev} = $self->{line};
4549 $self->{column_prev} = $self->{column};
4550 $self->{column}++;
4551 $self->{nc}
4552 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4553 } else {
4554 $self->{set_nc}->($self);
4555 }
4556
4557 redo A;
4558 }
4559 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4560 if ($is_space->{$self->{nc}}) {
4561 ## Stay in the state.
4562
4563 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4564 $self->{line_prev} = $self->{line};
4565 $self->{column_prev} = $self->{column};
4566 $self->{column}++;
4567 $self->{nc}
4568 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4569 } else {
4570 $self->{set_nc}->($self);
4571 }
4572
4573 redo A;
4574 } else {
4575 $self->{state} = PI_DATA_STATE;
4576 ## Reprocess.
4577 redo A;
4578 }
4579 } elsif ($self->{state} == PI_DATA_STATE) {
4580 if ($self->{nc} == 0x003F) { # ?
4581 $self->{state} = PI_DATA_AFTER_STATE;
4582
4583 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4584 $self->{line_prev} = $self->{line};
4585 $self->{column_prev} = $self->{column};
4586 $self->{column}++;
4587 $self->{nc}
4588 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4589 } else {
4590 $self->{set_nc}->($self);
4591 }
4592
4593 redo A;
4594 } elsif ($self->{nc} == -1) {
4595 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4596 $self->{state} = DATA_STATE;
4597 $self->{s_kwd} = '';
4598 ## Reprocess.
4599 return ($self->{ct}); # pi
4600 redo A;
4601 } else {
4602 $self->{ct}->{data} .= chr $self->{nc}; # pi
4603 $self->{read_until}->($self->{ct}->{data}, q[?],
4604 length $self->{ct}->{data});
4605 ## Stay in the state.
4606
4607 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4608 $self->{line_prev} = $self->{line};
4609 $self->{column_prev} = $self->{column};
4610 $self->{column}++;
4611 $self->{nc}
4612 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4613 } else {
4614 $self->{set_nc}->($self);
4615 }
4616
4617 ## Reprocess.
4618 redo A;
4619 }
4620 } elsif ($self->{state} == PI_AFTER_STATE) {
4621 if ($self->{nc} == 0x003E) { # >
4622 $self->{state} = DATA_STATE;
4623 $self->{s_kwd} = '';
4624
4625 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4626 $self->{line_prev} = $self->{line};
4627 $self->{column_prev} = $self->{column};
4628 $self->{column}++;
4629 $self->{nc}
4630 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4631 } else {
4632 $self->{set_nc}->($self);
4633 }
4634
4635 return ($self->{ct}); # pi
4636 redo A;
4637 } elsif ($self->{nc} == 0x003F) { # ?
4638 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4639 line => $self->{line_prev},
4640 column => $self->{column_prev}); ## XML5: no error
4641 $self->{ct}->{data} .= '?';
4642 $self->{state} = PI_DATA_AFTER_STATE;
4643
4644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4645 $self->{line_prev} = $self->{line};
4646 $self->{column_prev} = $self->{column};
4647 $self->{column}++;
4648 $self->{nc}
4649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4650 } else {
4651 $self->{set_nc}->($self);
4652 }
4653
4654 redo A;
4655 } else {
4656 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4657 line => $self->{line_prev},
4658 column => $self->{column_prev}
4659 + 1 * ($self->{nc} == -1)); ## XML5: no error
4660 $self->{ct}->{data} .= '?'; ## XML5: not appended
4661 $self->{state} = PI_DATA_STATE;
4662 ## Reprocess.
4663 redo A;
4664 }
4665 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4666 ## XML5: Same as "pi after state" in XML5
4667 if ($self->{nc} == 0x003E) { # >
4668 $self->{state} = DATA_STATE;
4669 $self->{s_kwd} = '';
4670
4671 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4672 $self->{line_prev} = $self->{line};
4673 $self->{column_prev} = $self->{column};
4674 $self->{column}++;
4675 $self->{nc}
4676 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4677 } else {
4678 $self->{set_nc}->($self);
4679 }
4680
4681 return ($self->{ct}); # pi
4682 redo A;
4683 } elsif ($self->{nc} == 0x003F) { # ?
4684 $self->{ct}->{data} .= '?';
4685 ## Stay in the state.
4686
4687 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4688 $self->{line_prev} = $self->{line};
4689 $self->{column_prev} = $self->{column};
4690 $self->{column}++;
4691 $self->{nc}
4692 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4693 } else {
4694 $self->{set_nc}->($self);
4695 }
4696
4697 redo A;
4698 } else {
4699 $self->{ct}->{data} .= '?'; ## XML5: not appended
4700 $self->{state} = PI_DATA_STATE;
4701 ## Reprocess.
4702 redo A;
4703 }
4704
4705 } else {
4706 die "$0: $self->{state}: Unknown state";
4707 }
4708 } # A
4709
4710 die "$0: _get_next_token: unexpected case";
4711 } # _get_next_token
4712
4713 1;
4714 ## $Date: 2008/10/15 08:51:02 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24