/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (show annotations) (download) (as text)
Tue Oct 14 15:25:50 2008 UTC (17 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.8: +3 -0 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	14 Oct 2008 15:23:30 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/charref-1.dat" added.

++ whatpm/t/xml/ChangeLog	14 Oct 2008 15:23:49 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* charref-1.dat: New test data file.

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 15:24:42 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: Mark CHARACTER_TOKEN with character reference
	as such, for the support of XML parse error.

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 15:25:35 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Raise a parse error for white space character
	generated by a character reference outside of the root element.

1 package Whatpm::XML::Parser;
2 use strict;
3
4 push our @ISA, 'Whatpm::HTML';
5 use Whatpm::HTML::Tokenizer qw/:token/;
6
7 sub parse_char_string ($$$;$$) {
8 #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9 my $self = shift;
10 my $s = ref $_[0] ? $_[0] : \($_[0]);
11 require Whatpm::Charset::DecodeHandle;
12 my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13 return $self->parse_char_stream ($input, @_[1..$#_]);
14 } # parse_char_string
15
16 sub parse_char_stream ($$$;$$) {
17 my $self = ref $_[0] ? shift : shift->new;
18 my $input = $_[0];
19 $self->{document} = $_[1];
20 @{$self->{document}->child_nodes} = ();
21
22 ## NOTE: |set_inner_html| copies most of this method's code
23
24 $self->{confident} = 1 unless exists $self->{confident};
25 $self->{document}->input_encoding ($self->{input_encoding})
26 if defined $self->{input_encoding};
27 ## TODO: |{input_encoding}| is needless?
28
29 $self->{line_prev} = $self->{line} = 1;
30 $self->{column_prev} = -1;
31 $self->{column} = 0;
32 $self->{set_nc} = sub {
33 my $self = shift;
34
35 my $char = '';
36 if (defined $self->{next_nc}) {
37 $char = $self->{next_nc};
38 delete $self->{next_nc};
39 $self->{nc} = ord $char;
40 } else {
41 $self->{char_buffer} = '';
42 $self->{char_buffer_pos} = 0;
43
44 my $count = $input->manakai_read_until
45 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46 if ($count) {
47 $self->{line_prev} = $self->{line};
48 $self->{column_prev} = $self->{column};
49 $self->{column}++;
50 $self->{nc}
51 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52 return;
53 }
54
55 if ($input->read ($char, 1)) {
56 $self->{nc} = ord $char;
57 } else {
58 $self->{nc} = -1;
59 return;
60 }
61 }
62
63 ($self->{line_prev}, $self->{column_prev})
64 = ($self->{line}, $self->{column});
65 $self->{column}++;
66
67 if ($self->{nc} == 0x000A) { # LF
68 !!!cp ('j1');
69 $self->{line}++;
70 $self->{column} = 0;
71 } elsif ($self->{nc} == 0x000D) { # CR
72 !!!cp ('j2');
73 ## TODO: support for abort/streaming
74 my $next = '';
75 if ($input->read ($next, 1) and $next ne "\x0A") {
76 $self->{next_nc} = $next;
77 }
78 $self->{nc} = 0x000A; # LF # MUST
79 $self->{line}++;
80 $self->{column} = 0;
81 } elsif ($self->{nc} == 0x0000) { # NULL
82 !!!cp ('j4');
83 !!!parse-error (type => 'NULL');
84 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85 }
86 };
87
88 $self->{read_until} = sub {
89 #my ($scalar, $specials_range, $offset) = @_;
90 return 0 if defined $self->{next_nc};
91
92 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93 my $offset = $_[2] || 0;
94
95 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96 pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97 if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98 substr ($_[0], $offset)
99 = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100 my $count = $+[0] - $-[0];
101 if ($count) {
102 $self->{column} += $count;
103 $self->{char_buffer_pos} += $count;
104 $self->{line_prev} = $self->{line};
105 $self->{column_prev} = $self->{column} - 1;
106 $self->{nc} = -1;
107 }
108 return $count;
109 } else {
110 return 0;
111 }
112 } else {
113 my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114 if ($count) {
115 $self->{column} += $count;
116 $self->{line_prev} = $self->{line};
117 $self->{column_prev} = $self->{column} - 1;
118 $self->{nc} = -1;
119 }
120 return $count;
121 }
122 }; # $self->{read_until}
123
124 my $onerror = $_[2] || sub {
125 my (%opt) = @_;
126 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128 warn "Parse error ($opt{type}) at line $line column $column\n";
129 };
130 $self->{parse_error} = sub {
131 $onerror->(line => $self->{line}, column => $self->{column}, @_);
132 };
133
134 my $char_onerror = sub {
135 my (undef, $type, %opt) = @_;
136 !!!parse-error (layer => 'encode',
137 line => $self->{line}, column => $self->{column} + 1,
138 %opt, type => $type);
139 }; # $char_onerror
140
141 if ($_[3]) {
142 $input = $_[3]->($input);
143 $input->onerror ($char_onerror);
144 } else {
145 $input->onerror ($char_onerror) unless defined $input->onerror;
146 }
147
148 $self->_initialize_tokenizer;
149 $self->_initialize_tree_constructor;
150 $self->_construct_tree;
151 $self->_terminate_tree_constructor;
152
153 delete $self->{parse_error}; # remove loop
154
155 return $self->{document};
156 } # parse_char_stream
157
158 sub new ($) {
159 my $class = shift;
160 my $self = bless {
161 level => {must => 'm',
162 should => 's',
163 warn => 'w',
164 info => 'i',
165 uncertain => 'u'},
166 }, $class;
167 $self->{set_nc} = sub {
168 $self->{nc} = -1;
169 };
170 $self->{parse_error} = sub {
171 #
172 };
173 $self->{change_encoding} = sub {
174 # if ($_[0] is a supported encoding) {
175 # run "change the encoding" algorithm;
176 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177 # }
178 };
179 $self->{application_cache_selection} = sub {
180 #
181 };
182
183 $self->{is_xml} = 1;
184
185 return $self;
186 } # new
187
188 sub _initialize_tree_constructor ($) {
189 my $self = shift;
190 ## NOTE: $self->{document} MUST be specified before this method is called
191 $self->{document}->strict_error_checking (0);
192 ## TODO: Turn mutation events off # MUST
193 $self->{document}->dom_config
194 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195 = 0;
196 $self->{document}->manakai_is_html (0);
197 $self->{document}->set_user_data (manakai_source_line => 1);
198 $self->{document}->set_user_data (manakai_source_column => 1);
199 } # _initialize_tree_constructor
200
201 sub _terminate_tree_constructor ($) {
202 my $self = shift;
203 $self->{document}->strict_error_checking (1);
204 $self->{document}->dom_config
205 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
206 = 1;
207 ## TODO: Turn mutation events on
208 } # _terminate_tree_constructor
209
210 ## Tree construction stage
211
212
213 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
214
215 ## XML5: No namespace support.
216
217 ## XML5: XML5 has "empty tag token". In this implementation, it is
218 ## represented as a start tag token with $self->{self_closing} flag
219 ## set to true.
220
221 ## XML5: XML5 has "short end tag token". In this implementation, it
222 ## is represented as an end tag token with $token->{tag_name} flag set
223 ## to an empty string.
224
225 ## XML5: Start, main, end phases. In this implementation, they are
226 ## represented by insertion modes.
227
228 ## Insertion modes
229 sub INITIAL_IM () { 0 }
230 sub BEFORE_ROOT_ELEMENT_IM () { 1 }
231 sub IN_ELEMENT_IM () { 2 }
232 sub AFTER_ROOT_ELEMENT_IM () { 3 }
233
234 {
235 my $token; ## TODO: change to $self->{t}
236
237 sub _construct_tree ($) {
238 my ($self) = @_;
239
240 delete $self->{tainted};
241 $self->{open_elements} = [];
242 $self->{insertion_mode} = INITIAL_IM;
243
244 !!!next-token;
245
246 while (1) {
247 if ($self->{insertion_mode} == IN_ELEMENT_IM) {
248 $self->_tree_in_element;
249 } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
250 $self->_tree_after_root_element;
251 } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
252 $self->_tree_before_root_element;
253 } elsif ($self->{insertion_mode} == INITIAL_IM) {
254 $self->_tree_initial;
255 } else {
256 die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
257 }
258
259 last if $token->{type} == ABORT_TOKEN;
260 }
261 } # _construct_tree
262
263 sub _tree_initial ($) {
264 my $self = shift;
265
266 B: while (1) {
267 if ($token->{type} == DOCTYPE_TOKEN) {
268 ## XML5: No "DOCTYPE" token.
269
270 my $doctype = $self->{document}->create_document_type_definition
271 (defined $token->{name} ? $token->{name} : '');
272
273 ## NOTE: Default value for both |public_id| and |system_id| attributes
274 ## are empty strings, so that we don't set any value in missing cases.
275 $doctype->public_id ($token->{public_identifier})
276 if defined $token->{public_identifier};
277 $doctype->system_id ($token->{system_identifier})
278 if defined $token->{system_identifier};
279
280 ## TODO: internal_subset
281
282 $self->{document}->append_child ($doctype);
283
284 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
285 !!!next-token;
286 return;
287 } elsif ($token->{type} == START_TAG_TOKEN or
288 $token->{type} == END_OF_FILE_TOKEN) {
289 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
290 ## Reprocess.
291 return;
292 } elsif ($token->{type} == COMMENT_TOKEN) {
293 my $comment = $self->{document}->create_comment ($token->{data});
294 $self->{document}->append_child ($comment);
295
296 ## Stay in the mode.
297 !!!next-token;
298 next B;
299 } elsif ($token->{type} == PI_TOKEN) {
300 my $pi = $self->{document}->create_processing_instruction
301 ($token->{target}, $token->{data});
302 $self->{document}->append_child ($pi);
303
304 ## Stay in the mode.
305 !!!next-token;
306 next B;
307 } elsif ($token->{type} == CHARACTER_TOKEN) {
308 if (not $self->{tainted} and
309 not $token->{has_reference} and
310 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
311 #
312 }
313
314 if (length $token->{data}) {
315 ## XML5: Ignore the token.
316
317 unless ($self->{tainted}) {
318 !!!parse-error (type => 'text outside of root element',
319 token => $token);
320 $self->{tainted} = 1;
321 }
322
323 $self->{document}->manakai_append_text ($token->{data});
324 }
325
326 ## Stay in the mode.
327 !!!next-token;
328 next B;
329 } elsif ($token->{type} == END_TAG_TOKEN) {
330 !!!parse-error (type => 'unmatched end tag',
331 text => $token->{tag_name},
332 token => $token);
333 ## Ignore the token.
334
335 ## Stay in the mode.
336 !!!next-token;
337 next B;
338 } elsif ($token->{type} == ABORT_TOKEN) {
339 return;
340 } else {
341 die "$0: XML parser initial: Unknown token type $token->{type}";
342 }
343 } # B
344 } # _tree_initial
345
346 sub _tree_before_root_element ($) {
347 my $self = shift;
348
349 B: while (1) {
350 if ($token->{type} == START_TAG_TOKEN) {
351 my $nsmap = {
352 xml => q<http://www.w3.org/XML/1998/namespace>,
353 xmlns => q<http://www.w3.org/2000/xmlns/>,
354 };
355
356 for (keys %{$token->{attributes}}) {
357 if (/^xmlns:./s) {
358 my $prefix = substr $_, 6;
359 my $value = $token->{attributes}->{$_}->{value};
360 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
361 $value eq q<http://www.w3.org/XML/1998/namespace> or
362 $value eq q<http://www.w3.org/2000/xmlns/>) {
363 ## NOTE: Error should be detected at the DOM layer.
364 #
365 } elsif (length $value) {
366 $nsmap->{$prefix} = $value;
367 } else {
368 delete $nsmap->{$prefix};
369 ## TODO: Error unless XML1.1
370 }
371 } elsif ($_ eq 'xmlns') {
372 my $value = $token->{attributes}->{$_}->{value};
373 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
374 $value eq q<http://www.w3.org/2000/xmlns/>) {
375 ## NOTE: Error should be detected at the DOM layer.
376 #
377 } elsif (length $value) {
378 $nsmap->{''} = $value;
379 } else {
380 delete $nsmap->{''};
381 }
382 }
383 }
384
385 my $ns;
386 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
387
388 if (defined $ln) { # prefixed
389 if (defined $nsmap->{$prefix}) {
390 $ns = $nsmap->{$prefix};
391 } else {
392 ## NOTE: Error should be detected at the DOM layer.
393 ($prefix, $ln) = (undef, $token->{tag_name});
394 }
395 } else {
396 ($prefix, $ln) = (undef, $prefix);
397 $ns = $nsmap->{''};
398 }
399
400 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
401 $el->set_user_data (manakai_source_line => $token->{line});
402 $el->set_user_data (manakai_source_column => $token->{column});
403
404 my $has_attr;
405 for my $attr_name (sort {$a cmp $b} keys %{$token->{attributes}}) {
406 my $ns;
407 my ($p, $l) = split /:/, $attr_name, 2;
408
409 if ($attr_name eq 'xmlns:xmlns') {
410 ($p, $l) = (undef, $attr_name);
411 } elsif (defined $l) { # prefixed
412 if (defined $nsmap->{$p}) {
413 $ns = $nsmap->{$p};
414 } else {
415 ## NOTE: Error should be detected at the DOM-layer.
416 ($p, $l) = (undef, $attr_name);
417 }
418 } else {
419 if ($p eq 'xmlns') {
420 $ns = $nsmap->{xmlns};
421 }
422 ($p, $l) = (undef, $p);
423 }
424
425 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
426 ## NOTE: Attributes are sorted as Unicode characters (not
427 ## code units) of their names, for stable output.
428
429 ## TODO: Should be sorted by source order?
430 !!!parse-error (type => 'duplicate ns attr',
431 token => $token,
432 value => $attr_name);
433 next;
434 } else {
435 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
436 }
437
438 my $attr_t = $token->{attributes}->{$attr_name};
439 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
440 $attr->value ($attr_t->{value});
441 $attr->set_user_data (manakai_source_line => $attr_t->{line});
442 $attr->set_user_data (manakai_source_column => $attr_t->{column});
443 $el->set_attribute_node_ns ($attr);
444 }
445
446 $self->{document}->append_child ($el);
447
448 if ($self->{self_closing}) {
449 !!!ack ('ack');
450 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
451 } else {
452 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
453 $self->{insertion_mode} = IN_ELEMENT_IM;
454 }
455
456 #delete $self->{tainted};
457
458 !!!next-token;
459 return;
460 } elsif ($token->{type} == COMMENT_TOKEN) {
461 my $comment = $self->{document}->create_comment ($token->{data});
462 $self->{document}->append_child ($comment);
463
464 ## Stay in the mode.
465 !!!next-token;
466 next B;
467 } elsif ($token->{type} == PI_TOKEN) {
468 my $pi = $self->{document}->create_processing_instruction
469 ($token->{target}, $token->{data});
470 $self->{document}->append_child ($pi);
471
472 ## Stay in the mode.
473 !!!next-token;
474 next B;
475 } elsif ($token->{type} == CHARACTER_TOKEN) {
476 if (not $self->{tainted} and
477 not $token->{has_reference} and
478 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
479 #
480 }
481
482 if (length $token->{data}) {
483 ## XML5: Ignore the token.
484
485 unless ($self->{tainted}) {
486 !!!parse-error (type => 'text outside of root element',
487 token => $token);
488 $self->{tainted} = 1;
489 }
490
491 $self->{document}->manakai_append_text ($token->{data});
492 }
493
494 ## Stay in the mode.
495 !!!next-token;
496 next B;
497 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
498 !!!parse-error (type => 'no root element',
499 token => $token);
500
501 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
502 ## Reprocess.
503 return;
504 } elsif ($token->{type} == END_TAG_TOKEN) {
505 !!!parse-error (type => 'unmatched end tag',
506 text => $token->{tag_name},
507 token => $token);
508 ## Ignore the token.
509
510 ## Stay in the mode.
511 !!!next-token;
512 next B;
513 } elsif ($token->{type} == DOCTYPE_TOKEN) {
514 !!!parse-error (type => 'in html:#doctype',
515 token => $token);
516 ## Ignore the token.
517
518 ## Stay in the mode.
519 !!!next-token;
520 next B;
521 } elsif ($token->{type} == ABORT_TOKEN) {
522 return;
523 } else {
524 die "$0: XML parser initial: Unknown token type $token->{type}";
525 }
526 } # B
527 } # _tree_before_root_element
528
529 sub _tree_in_element ($) {
530 my $self = shift;
531
532 B: while (1) {
533 if ($token->{type} == CHARACTER_TOKEN) {
534 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
535
536 ## Stay in the mode.
537 !!!next-token;
538 next B;
539 } elsif ($token->{type} == START_TAG_TOKEN) {
540 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
541
542 for (keys %{$token->{attributes}}) {
543 if (/^xmlns:./s) {
544 my $prefix = substr $_, 6;
545 my $value = $token->{attributes}->{$_}->{value};
546 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
547 $value eq q<http://www.w3.org/XML/1998/namespace> or
548 $value eq q<http://www.w3.org/2000/xmlns/>) {
549 ## NOTE: Error should be detected at the DOM layer.
550 #
551 } elsif (length $value) {
552 $nsmap->{$prefix} = $value;
553 } else {
554 delete $nsmap->{$prefix};
555 ## TODO: Error unless XML1.1
556 }
557 } elsif ($_ eq 'xmlns') {
558 my $value = $token->{attributes}->{$_}->{value};
559 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
560 $value eq q<http://www.w3.org/2000/xmlns/>) {
561 ## NOTE: Error should be detected at the DOM layer.
562 #
563 } elsif (length $value) {
564 $nsmap->{''} = $value;
565 } else {
566 delete $nsmap->{''};
567 }
568 }
569 }
570
571 my $ns;
572 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
573
574 if (defined $ln) { # prefixed
575 if (defined $nsmap->{$prefix}) {
576 $ns = $nsmap->{$prefix};
577 } else {
578 ## NOTE: Error should be detected at the DOM layer.
579 ($prefix, $ln) = (undef, $token->{tag_name});
580 }
581 } else {
582 ($prefix, $ln) = (undef, $prefix);
583 $ns = $nsmap->{''};
584 }
585
586 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
587 $el->set_user_data (manakai_source_line => $token->{line});
588 $el->set_user_data (manakai_source_column => $token->{column});
589
590 my $has_attr;
591 for my $attr_name (sort {$a cmp $b} keys %{$token->{attributes}}) {
592 my $ns;
593 my ($p, $l) = split /:/, $attr_name, 2;
594
595 if ($attr_name eq 'xmlns:xmlns') {
596 ($p, $l) = (undef, $attr_name);
597 } elsif (defined $l) { # prefixed
598 if (defined $nsmap->{$p}) {
599 $ns = $nsmap->{$p};
600 } else {
601 ## NOTE: Error should be detected at the DOM-layer.
602 ($p, $l) = (undef, $attr_name);
603 }
604 } else {
605 if ($p eq 'xmlns') {
606 $ns = $nsmap->{xmlns};
607 }
608 ($p, $l) = (undef, $p);
609 }
610
611 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
612 ## NOTE: Attributes are sorted as Unicode characters (not
613 ## code units) of their names, for stable output.
614
615 ## TODO: Should be sorted by source order?
616 !!!parse-error (type => 'duplicate ns attr',
617 token => $token,
618 value => $attr_name);
619 next;
620 } else {
621 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
622 }
623
624 my $attr_t = $token->{attributes}->{$attr_name};
625 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
626 $attr->value ($attr_t->{value});
627 $attr->set_user_data (manakai_source_line => $attr_t->{line});
628 $attr->set_user_data (manakai_source_column => $attr_t->{column});
629 $el->set_attribute_node_ns ($attr);
630 }
631
632 $self->{open_elements}->[-1]->[0]->append_child ($el);
633
634 if ($self->{self_closing}) {
635 !!!ack ('ack');
636 } else {
637 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
638 }
639
640 ## Stay in the mode.
641 !!!next-token;
642 next B;
643 } elsif ($token->{type} == END_TAG_TOKEN) {
644 if ($token->{tag_name} eq '') {
645 ## Short end tag token.
646 pop @{$self->{open_elements}};
647 } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
648 pop @{$self->{open_elements}};
649 } else {
650 !!!parse-error (type => 'unmatched end tag',
651 text => $token->{tag_name},
652 token => $token);
653
654 ## Has an element in scope
655 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
656 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
657 splice @{$self->{open_elements}}, $i;
658 last INSCOPE;
659 }
660 } # INSCOPE
661 }
662
663 unless (@{$self->{open_elements}}) {
664 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
665 !!!next-token;
666 return;
667 } else {
668 ## Stay in the state.
669 !!!next-token;
670 redo B;
671 }
672 } elsif ($token->{type} == COMMENT_TOKEN) {
673 my $comment = $self->{document}->create_comment ($token->{data});
674 $self->{open_elements}->[-1]->[0]->append_child ($comment);
675
676 ## Stay in the mode.
677 !!!next-token;
678 next B;
679 } elsif ($token->{type} == PI_TOKEN) {
680 my $pi = $self->{document}->create_processing_instruction
681 ($token->{target}, $token->{data});
682 $self->{open_elements}->[-1]->[0]->append_child ($pi);
683
684 ## Stay in the mode.
685 !!!next-token;
686 next B;
687 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
688 !!!parse-error (type => 'in body:#eof',
689 token => $token);
690
691 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
692 !!!next-token;
693 return;
694 } elsif ($token->{type} == DOCTYPE_TOKEN) {
695 !!!parse-error (type => 'in html:#doctype',
696 token => $token);
697 ## Ignore the token.
698
699 ## Stay in the mode.
700 !!!next-token;
701 next B;
702 } elsif ($token->{type} == ABORT_TOKEN) {
703 return;
704 } else {
705 die "$0: XML parser initial: Unknown token type $token->{type}";
706 }
707 } # B
708 } # _tree_in_element
709
710 sub _tree_after_root_element ($) {
711 my $self = shift;
712
713 B: while (1) {
714 if ($token->{type} == START_TAG_TOKEN) {
715 !!!parse-error (type => 'second root element',
716 token => $token);
717
718 ## XML5: Ignore the token.
719
720 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
721 ## Reprocess.
722 return;
723 } elsif ($token->{type} == COMMENT_TOKEN) {
724 my $comment = $self->{document}->create_comment ($token->{data});
725 $self->{document}->append_child ($comment);
726
727 ## Stay in the mode.
728 !!!next-token;
729 next B;
730 } elsif ($token->{type} == PI_TOKEN) {
731 my $pi = $self->{document}->create_processing_instruction
732 ($token->{target}, $token->{data});
733 $self->{document}->append_child ($pi);
734
735 ## Stay in the mode.
736 !!!next-token;
737 next B;
738 } elsif ($token->{type} == CHARACTER_TOKEN) {
739 if (not $self->{tainted} and
740 not $token->{has_reference} and
741 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
742 #
743 }
744
745 if (length $token->{data}) {
746 ## XML5: Ignore the token.
747
748 unless ($self->{tainted}) {
749 !!!parse-error (type => 'text outside of root element',
750 token => $token);
751 $self->{tainted} = 1;
752 }
753
754 $self->{document}->manakai_append_text ($token->{data});
755 }
756
757 ## Stay in the mode.
758 !!!next-token;
759 next B;
760 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
761 ## Stop parsing.
762
763 ## TODO: implement "stop parsing".
764
765 $token = {type => ABORT_TOKEN};
766 return;
767 } elsif ($token->{type} == END_TAG_TOKEN) {
768 !!!parse-error (type => 'unmatched end tag',
769 text => $token->{tag_name},
770 token => $token);
771 ## Ignore the token.
772
773 ## Stay in the mode.
774 !!!next-token;
775 next B;
776 } elsif ($token->{type} == DOCTYPE_TOKEN) {
777 !!!parse-error (type => 'in html:#doctype',
778 token => $token);
779 ## Ignore the token.
780
781 ## Stay in the mode.
782 !!!next-token;
783 next B;
784 } elsif ($token->{type} == ABORT_TOKEN) {
785 return;
786 } else {
787 die "$0: XML parser initial: Unknown token type $token->{type}";
788 }
789 } # B
790 } # _tree_after_root_element
791
792 }
793
794 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24