/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12 - (show annotations) (download)
Wed Oct 15 10:50:38 2008 UTC (17 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.11: +10 -18 lines
++ whatpm/t/xml/ChangeLog	15 Oct 2008 10:50:31 -0000
	* attrs-1.dat: Test cases for tokenizing errors are added.

	* elements-1.dat: A test result updated.

	* ns-attrs-1.dat: Test results updated.  New test cases for
	duplicate namespaced attributes are added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 10:48:03 -0000
	* Tokenizer.pm.src: Set index attribute to each attribute token,
	for ignoring namespaced duplicate attribute at the XML namespace
	parser layer.  Raise a parse error if the attribute value is
	omitted, in XML mode.  Raise a parse error if the attribute value
	is not quoted, in XML mode.  Raise a parse error if "<" character
	is found in a quoted attribute value, in XML mode.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 10:49:16 -0000
	* Parser.pm.src: Use source order to determine which attribute is
	duplicate.  Preserve duplicate namespaced attributes as
	non-namespaced attributes.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::XML::Parser;
2 use strict;
3
4 push our @ISA, 'Whatpm::HTML';
5 use Whatpm::HTML::Tokenizer qw/:token/;
6
7 sub parse_char_string ($$$;$$) {
8 #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9 my $self = shift;
10 my $s = ref $_[0] ? $_[0] : \($_[0]);
11 require Whatpm::Charset::DecodeHandle;
12 my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13 return $self->parse_char_stream ($input, @_[1..$#_]);
14 } # parse_char_string
15
16 sub parse_char_stream ($$$;$$) {
17 my $self = ref $_[0] ? shift : shift->new;
18 my $input = $_[0];
19 $self->{document} = $_[1];
20 @{$self->{document}->child_nodes} = ();
21
22 ## NOTE: |set_inner_html| copies most of this method's code
23
24 $self->{confident} = 1 unless exists $self->{confident};
25 $self->{document}->input_encoding ($self->{input_encoding})
26 if defined $self->{input_encoding};
27 ## TODO: |{input_encoding}| is needless?
28
29 $self->{line_prev} = $self->{line} = 1;
30 $self->{column_prev} = -1;
31 $self->{column} = 0;
32 $self->{set_nc} = sub {
33 my $self = shift;
34
35 my $char = '';
36 if (defined $self->{next_nc}) {
37 $char = $self->{next_nc};
38 delete $self->{next_nc};
39 $self->{nc} = ord $char;
40 } else {
41 $self->{char_buffer} = '';
42 $self->{char_buffer_pos} = 0;
43
44 my $count = $input->manakai_read_until
45 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46 if ($count) {
47 $self->{line_prev} = $self->{line};
48 $self->{column_prev} = $self->{column};
49 $self->{column}++;
50 $self->{nc}
51 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52 return;
53 }
54
55 if ($input->read ($char, 1)) {
56 $self->{nc} = ord $char;
57 } else {
58 $self->{nc} = -1;
59 return;
60 }
61 }
62
63 ($self->{line_prev}, $self->{column_prev})
64 = ($self->{line}, $self->{column});
65 $self->{column}++;
66
67 if ($self->{nc} == 0x000A) { # LF
68
69 $self->{line}++;
70 $self->{column} = 0;
71 } elsif ($self->{nc} == 0x000D) { # CR
72
73 ## TODO: support for abort/streaming
74 my $next = '';
75 if ($input->read ($next, 1) and $next ne "\x0A") {
76 $self->{next_nc} = $next;
77 }
78 $self->{nc} = 0x000A; # LF # MUST
79 $self->{line}++;
80 $self->{column} = 0;
81 } elsif ($self->{nc} == 0x0000) { # NULL
82
83 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
84 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85 }
86 };
87
88 $self->{read_until} = sub {
89 #my ($scalar, $specials_range, $offset) = @_;
90 return 0 if defined $self->{next_nc};
91
92 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93 my $offset = $_[2] || 0;
94
95 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96 pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97 if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98 substr ($_[0], $offset)
99 = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100 my $count = $+[0] - $-[0];
101 if ($count) {
102 $self->{column} += $count;
103 $self->{char_buffer_pos} += $count;
104 $self->{line_prev} = $self->{line};
105 $self->{column_prev} = $self->{column} - 1;
106 $self->{nc} = -1;
107 }
108 return $count;
109 } else {
110 return 0;
111 }
112 } else {
113 my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114 if ($count) {
115 $self->{column} += $count;
116 $self->{line_prev} = $self->{line};
117 $self->{column_prev} = $self->{column} - 1;
118 $self->{nc} = -1;
119 }
120 return $count;
121 }
122 }; # $self->{read_until}
123
124 my $onerror = $_[2] || sub {
125 my (%opt) = @_;
126 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128 warn "Parse error ($opt{type}) at line $line column $column\n";
129 };
130 $self->{parse_error} = sub {
131 $onerror->(line => $self->{line}, column => $self->{column}, @_);
132 };
133
134 my $char_onerror = sub {
135 my (undef, $type, %opt) = @_;
136 $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode',
137 line => $self->{line}, column => $self->{column} + 1,
138 %opt, type => $type);
139 }; # $char_onerror
140
141 if ($_[3]) {
142 $input = $_[3]->($input);
143 $input->onerror ($char_onerror);
144 } else {
145 $input->onerror ($char_onerror) unless defined $input->onerror;
146 }
147
148 $self->_initialize_tokenizer;
149 $self->_initialize_tree_constructor;
150 $self->_construct_tree;
151 $self->_terminate_tree_constructor;
152
153 delete $self->{parse_error}; # remove loop
154
155 return $self->{document};
156 } # parse_char_stream
157
158 sub new ($) {
159 my $class = shift;
160 my $self = bless {
161 level => {must => 'm',
162 should => 's',
163 warn => 'w',
164 info => 'i',
165 uncertain => 'u'},
166 }, $class;
167 $self->{set_nc} = sub {
168 $self->{nc} = -1;
169 };
170 $self->{parse_error} = sub {
171 #
172 };
173 $self->{change_encoding} = sub {
174 # if ($_[0] is a supported encoding) {
175 # run "change the encoding" algorithm;
176 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177 # }
178 };
179 $self->{application_cache_selection} = sub {
180 #
181 };
182
183 $self->{is_xml} = 1;
184
185 return $self;
186 } # new
187
188 sub _initialize_tree_constructor ($) {
189 my $self = shift;
190 ## NOTE: $self->{document} MUST be specified before this method is called
191 $self->{document}->strict_error_checking (0);
192 ## TODO: Turn mutation events off # MUST
193 $self->{document}->dom_config
194 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195 = 0;
196 $self->{document}->manakai_is_html (0);
197 $self->{document}->set_user_data (manakai_source_line => 1);
198 $self->{document}->set_user_data (manakai_source_column => 1);
199 } # _initialize_tree_constructor
200
201 sub _terminate_tree_constructor ($) {
202 my $self = shift;
203 $self->{document}->strict_error_checking (1);
204 $self->{document}->dom_config
205 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
206 = 1;
207 ## TODO: Turn mutation events on
208 } # _terminate_tree_constructor
209
210 ## Tree construction stage
211
212
213 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
214
215 ## XML5: No namespace support.
216
217 ## XML5: XML5 has "empty tag token". In this implementation, it is
218 ## represented as a start tag token with $self->{self_closing} flag
219 ## set to true.
220
221 ## XML5: XML5 has "short end tag token". In this implementation, it
222 ## is represented as an end tag token with $token->{tag_name} flag set
223 ## to an empty string.
224
225 ## XML5: Start, main, end phases. In this implementation, they are
226 ## represented by insertion modes.
227
228 ## Insertion modes
229 sub INITIAL_IM () { 0 }
230 sub BEFORE_ROOT_ELEMENT_IM () { 1 }
231 sub IN_ELEMENT_IM () { 2 }
232 sub AFTER_ROOT_ELEMENT_IM () { 3 }
233
234 {
235 my $token; ## TODO: change to $self->{t}
236
237 sub _construct_tree ($) {
238 my ($self) = @_;
239
240 delete $self->{tainted};
241 $self->{open_elements} = [];
242 $self->{insertion_mode} = INITIAL_IM;
243
244 $token = $self->_get_next_token;
245
246 ## XML5: No support for the XML declaration
247 if ($token->{type} == PI_TOKEN and
248 $token->{target} eq 'xml' and
249 $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
250 (?>"([^"]*)"|'([^']*)')
251 (?:[\x09\x0A\x20]+
252 encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
253 (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
254 (?:[\x09\x0A\x20]+
255 standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
256 (?>"(yes|no)"|'(yes|no)'))?
257 [\x09\x0A\x20]*\z/x) {
258 $self->{document}->xml_version (defined $1 ? $1 : $2);
259 $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
260 $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
261
262 $token = $self->_get_next_token;
263 } else {
264 $self->{document}->xml_version ('1.0');
265 $self->{document}->xml_encoding (undef);
266 $self->{document}->xml_standalone (0);
267 }
268
269 while (1) {
270 if ($self->{insertion_mode} == IN_ELEMENT_IM) {
271 $self->_tree_in_element;
272 } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
273 $self->_tree_after_root_element;
274 } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
275 $self->_tree_before_root_element;
276 } elsif ($self->{insertion_mode} == INITIAL_IM) {
277 $self->_tree_initial;
278 } else {
279 die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
280 }
281
282 last if $token->{type} == ABORT_TOKEN;
283 }
284 } # _construct_tree
285
286 sub _tree_initial ($) {
287 my $self = shift;
288
289 B: while (1) {
290 if ($token->{type} == DOCTYPE_TOKEN) {
291 ## XML5: No "DOCTYPE" token.
292
293 my $doctype = $self->{document}->create_document_type_definition
294 (defined $token->{name} ? $token->{name} : '');
295
296 ## NOTE: Default value for both |public_id| and |system_id| attributes
297 ## are empty strings, so that we don't set any value in missing cases.
298 $doctype->public_id ($token->{public_identifier})
299 if defined $token->{public_identifier};
300 $doctype->system_id ($token->{system_identifier})
301 if defined $token->{system_identifier};
302
303 ## TODO: internal_subset
304
305 $self->{document}->append_child ($doctype);
306
307 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
308 $token = $self->_get_next_token;
309 return;
310 } elsif ($token->{type} == START_TAG_TOKEN or
311 $token->{type} == END_OF_FILE_TOKEN) {
312 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
313 ## Reprocess.
314 return;
315 } elsif ($token->{type} == COMMENT_TOKEN) {
316 my $comment = $self->{document}->create_comment ($token->{data});
317 $self->{document}->append_child ($comment);
318
319 ## Stay in the mode.
320 $token = $self->_get_next_token;
321 next B;
322 } elsif ($token->{type} == PI_TOKEN) {
323 my $pi = $self->{document}->create_processing_instruction
324 ($token->{target}, $token->{data});
325 $self->{document}->append_child ($pi);
326
327 ## Stay in the mode.
328 $token = $self->_get_next_token;
329 next B;
330 } elsif ($token->{type} == CHARACTER_TOKEN) {
331 if (not $self->{tainted} and
332 not $token->{has_reference} and
333 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
334 #
335 }
336
337 if (length $token->{data}) {
338 ## XML5: Ignore the token.
339
340 unless ($self->{tainted}) {
341 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
342 token => $token);
343 $self->{tainted} = 1;
344 }
345
346 $self->{document}->manakai_append_text ($token->{data});
347 }
348
349 ## Stay in the mode.
350 $token = $self->_get_next_token;
351 next B;
352 } elsif ($token->{type} == END_TAG_TOKEN) {
353 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
354 text => $token->{tag_name},
355 token => $token);
356 ## Ignore the token.
357
358 ## Stay in the mode.
359 $token = $self->_get_next_token;
360 next B;
361 } elsif ($token->{type} == ABORT_TOKEN) {
362 return;
363 } else {
364 die "$0: XML parser initial: Unknown token type $token->{type}";
365 }
366 } # B
367 } # _tree_initial
368
369 sub _tree_before_root_element ($) {
370 my $self = shift;
371
372 B: while (1) {
373 if ($token->{type} == START_TAG_TOKEN) {
374 my $nsmap = {
375 xml => q<http://www.w3.org/XML/1998/namespace>,
376 xmlns => q<http://www.w3.org/2000/xmlns/>,
377 };
378
379 for (keys %{$token->{attributes}}) {
380 if (/^xmlns:./s) {
381 my $prefix = substr $_, 6;
382 my $value = $token->{attributes}->{$_}->{value};
383 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
384 $value eq q<http://www.w3.org/XML/1998/namespace> or
385 $value eq q<http://www.w3.org/2000/xmlns/>) {
386 ## NOTE: Error should be detected at the DOM layer.
387 #
388 } elsif (length $value) {
389 $nsmap->{$prefix} = $value;
390 } else {
391 delete $nsmap->{$prefix};
392 }
393 } elsif ($_ eq 'xmlns') {
394 my $value = $token->{attributes}->{$_}->{value};
395 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
396 $value eq q<http://www.w3.org/2000/xmlns/>) {
397 ## NOTE: Error should be detected at the DOM layer.
398 #
399 } elsif (length $value) {
400 $nsmap->{''} = $value;
401 } else {
402 delete $nsmap->{''};
403 }
404 }
405 }
406
407 my $ns;
408 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
409
410 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
411 if (defined $nsmap->{$prefix}) {
412 $ns = $nsmap->{$prefix};
413 } else {
414 ($prefix, $ln) = (undef, $token->{tag_name});
415 }
416 } else {
417 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
418 ($prefix, $ln) = (undef, $token->{tag_name});
419 }
420
421 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
422 $el->set_user_data (manakai_source_line => $token->{line});
423 $el->set_user_data (manakai_source_column => $token->{column});
424
425 my $has_attr;
426 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
427 $token->{attributes}->{$b}->{index}}
428 keys %{$token->{attributes}}) {
429 my $ns;
430 my ($p, $l) = split /:/, $attr_name, 2;
431
432 if ($attr_name eq 'xmlns:xmlns') {
433 ($p, $l) = (undef, $attr_name);
434 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
435 if (defined $nsmap->{$p}) {
436 $ns = $nsmap->{$p};
437 } else {
438 ## NOTE: Error should be detected at the DOM-layer.
439 ($p, $l) = (undef, $attr_name);
440 }
441 } else {
442 if ($attr_name eq 'xmlns') {
443 $ns = $nsmap->{xmlns};
444 }
445 ($p, $l) = (undef, $attr_name);
446 }
447
448 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
449 $ns = undef;
450 ($p, $l) = (undef, $attr_name);
451 } else {
452 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
453 }
454
455 my $attr_t = $token->{attributes}->{$attr_name};
456 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
457 $attr->value ($attr_t->{value});
458 $attr->set_user_data (manakai_source_line => $attr_t->{line});
459 $attr->set_user_data (manakai_source_column => $attr_t->{column});
460 $el->set_attribute_node_ns ($attr);
461 }
462
463 $self->{document}->append_child ($el);
464
465 if ($self->{self_closing}) {
466 delete $self->{self_closing};
467 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
468 } else {
469 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
470 $self->{insertion_mode} = IN_ELEMENT_IM;
471 }
472
473 #delete $self->{tainted};
474
475 $token = $self->_get_next_token;
476 return;
477 } elsif ($token->{type} == COMMENT_TOKEN) {
478 my $comment = $self->{document}->create_comment ($token->{data});
479 $self->{document}->append_child ($comment);
480
481 ## Stay in the mode.
482 $token = $self->_get_next_token;
483 next B;
484 } elsif ($token->{type} == PI_TOKEN) {
485 my $pi = $self->{document}->create_processing_instruction
486 ($token->{target}, $token->{data});
487 $self->{document}->append_child ($pi);
488
489 ## Stay in the mode.
490 $token = $self->_get_next_token;
491 next B;
492 } elsif ($token->{type} == CHARACTER_TOKEN) {
493 if (not $self->{tainted} and
494 not $token->{has_reference} and
495 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
496 #
497 }
498
499 if (length $token->{data}) {
500 ## XML5: Ignore the token.
501
502 unless ($self->{tainted}) {
503 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
504 token => $token);
505 $self->{tainted} = 1;
506 }
507
508 $self->{document}->manakai_append_text ($token->{data});
509 }
510
511 ## Stay in the mode.
512 $token = $self->_get_next_token;
513 next B;
514 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
515 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no root element',
516 token => $token);
517
518 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
519 ## Reprocess.
520 return;
521 } elsif ($token->{type} == END_TAG_TOKEN) {
522 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
523 text => $token->{tag_name},
524 token => $token);
525 ## Ignore the token.
526
527 ## Stay in the mode.
528 $token = $self->_get_next_token;
529 next B;
530 } elsif ($token->{type} == DOCTYPE_TOKEN) {
531 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
532 token => $token);
533 ## Ignore the token.
534
535 ## Stay in the mode.
536 $token = $self->_get_next_token;
537 next B;
538 } elsif ($token->{type} == ABORT_TOKEN) {
539 return;
540 } else {
541 die "$0: XML parser initial: Unknown token type $token->{type}";
542 }
543 } # B
544 } # _tree_before_root_element
545
546 sub _tree_in_element ($) {
547 my $self = shift;
548
549 B: while (1) {
550 if ($token->{type} == CHARACTER_TOKEN) {
551 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
552
553 ## Stay in the mode.
554 $token = $self->_get_next_token;
555 next B;
556 } elsif ($token->{type} == START_TAG_TOKEN) {
557 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
558
559 for (keys %{$token->{attributes}}) {
560 if (/^xmlns:./s) {
561 my $prefix = substr $_, 6;
562 my $value = $token->{attributes}->{$_}->{value};
563 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
564 $value eq q<http://www.w3.org/XML/1998/namespace> or
565 $value eq q<http://www.w3.org/2000/xmlns/>) {
566 ## NOTE: Error should be detected at the DOM layer.
567 #
568 } elsif (length $value) {
569 $nsmap->{$prefix} = $value;
570 } else {
571 delete $nsmap->{$prefix};
572 }
573 } elsif ($_ eq 'xmlns') {
574 my $value = $token->{attributes}->{$_}->{value};
575 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
576 $value eq q<http://www.w3.org/2000/xmlns/>) {
577 ## NOTE: Error should be detected at the DOM layer.
578 #
579 } elsif (length $value) {
580 $nsmap->{''} = $value;
581 } else {
582 delete $nsmap->{''};
583 }
584 }
585 }
586
587 my $ns;
588 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
589
590 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
591 if (defined $nsmap->{$prefix}) {
592 $ns = $nsmap->{$prefix};
593 } else {
594 ## NOTE: Error should be detected at the DOM layer.
595 ($prefix, $ln) = (undef, $token->{tag_name});
596 }
597 } else {
598 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
599 ($prefix, $ln) = (undef, $token->{tag_name});
600 }
601
602 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
603 $el->set_user_data (manakai_source_line => $token->{line});
604 $el->set_user_data (manakai_source_column => $token->{column});
605
606 my $has_attr;
607 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
608 $token->{attributes}->{$b}->{index}}
609 keys %{$token->{attributes}}) {
610 my $ns;
611 my ($p, $l) = split /:/, $attr_name, 2;
612
613 if ($attr_name eq 'xmlns:xmlns') {
614 ($p, $l) = (undef, $attr_name);
615 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
616 if (defined $nsmap->{$p}) {
617 $ns = $nsmap->{$p};
618 } else {
619 ## NOTE: Error should be detected at the DOM-layer.
620 ($p, $l) = (undef, $attr_name);
621 }
622 } else {
623 if ($attr_name eq 'xmlns') {
624 $ns = $nsmap->{xmlns};
625 }
626 ($p, $l) = (undef, $attr_name);
627 }
628
629 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
630 $ns = undef;
631 ($p, $l) = (undef, $attr_name);
632 } else {
633 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
634 }
635
636 my $attr_t = $token->{attributes}->{$attr_name};
637 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
638 $attr->value ($attr_t->{value});
639 $attr->set_user_data (manakai_source_line => $attr_t->{line});
640 $attr->set_user_data (manakai_source_column => $attr_t->{column});
641 $el->set_attribute_node_ns ($attr);
642 }
643
644 $self->{open_elements}->[-1]->[0]->append_child ($el);
645
646 if ($self->{self_closing}) {
647 delete $self->{self_closing};
648 } else {
649 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
650 }
651
652 ## Stay in the mode.
653 $token = $self->_get_next_token;
654 next B;
655 } elsif ($token->{type} == END_TAG_TOKEN) {
656 if ($token->{tag_name} eq '') {
657 ## Short end tag token.
658 pop @{$self->{open_elements}};
659 } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
660 pop @{$self->{open_elements}};
661 } else {
662 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
663 text => $token->{tag_name},
664 token => $token);
665
666 ## Has an element in scope
667 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
668 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
669 splice @{$self->{open_elements}}, $i;
670 last INSCOPE;
671 }
672 } # INSCOPE
673 }
674
675 unless (@{$self->{open_elements}}) {
676 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
677 $token = $self->_get_next_token;
678 return;
679 } else {
680 ## Stay in the state.
681 $token = $self->_get_next_token;
682 redo B;
683 }
684 } elsif ($token->{type} == COMMENT_TOKEN) {
685 my $comment = $self->{document}->create_comment ($token->{data});
686 $self->{open_elements}->[-1]->[0]->append_child ($comment);
687
688 ## Stay in the mode.
689 $token = $self->_get_next_token;
690 next B;
691 } elsif ($token->{type} == PI_TOKEN) {
692 my $pi = $self->{document}->create_processing_instruction
693 ($token->{target}, $token->{data});
694 $self->{open_elements}->[-1]->[0]->append_child ($pi);
695
696 ## Stay in the mode.
697 $token = $self->_get_next_token;
698 next B;
699 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
700 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof',
701 token => $token);
702
703 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
704 $token = $self->_get_next_token;
705 return;
706 } elsif ($token->{type} == DOCTYPE_TOKEN) {
707 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
708 token => $token);
709 ## Ignore the token.
710
711 ## Stay in the mode.
712 $token = $self->_get_next_token;
713 next B;
714 } elsif ($token->{type} == ABORT_TOKEN) {
715 return;
716 } else {
717 die "$0: XML parser initial: Unknown token type $token->{type}";
718 }
719 } # B
720 } # _tree_in_element
721
722 sub _tree_after_root_element ($) {
723 my $self = shift;
724
725 B: while (1) {
726 if ($token->{type} == START_TAG_TOKEN) {
727 $self->{parse_error}->(level => $self->{level}->{must}, type => 'second root element',
728 token => $token);
729
730 ## XML5: Ignore the token.
731
732 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
733 ## Reprocess.
734 return;
735 } elsif ($token->{type} == COMMENT_TOKEN) {
736 my $comment = $self->{document}->create_comment ($token->{data});
737 $self->{document}->append_child ($comment);
738
739 ## Stay in the mode.
740 $token = $self->_get_next_token;
741 next B;
742 } elsif ($token->{type} == PI_TOKEN) {
743 my $pi = $self->{document}->create_processing_instruction
744 ($token->{target}, $token->{data});
745 $self->{document}->append_child ($pi);
746
747 ## Stay in the mode.
748 $token = $self->_get_next_token;
749 next B;
750 } elsif ($token->{type} == CHARACTER_TOKEN) {
751 if (not $self->{tainted} and
752 not $token->{has_reference} and
753 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
754 #
755 }
756
757 if (length $token->{data}) {
758 ## XML5: Ignore the token.
759
760 unless ($self->{tainted}) {
761 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
762 token => $token);
763 $self->{tainted} = 1;
764 }
765
766 $self->{document}->manakai_append_text ($token->{data});
767 }
768
769 ## Stay in the mode.
770 $token = $self->_get_next_token;
771 next B;
772 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
773 ## Stop parsing.
774
775 ## TODO: implement "stop parsing".
776
777 $token = {type => ABORT_TOKEN};
778 return;
779 } elsif ($token->{type} == END_TAG_TOKEN) {
780 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
781 text => $token->{tag_name},
782 token => $token);
783 ## Ignore the token.
784
785 ## Stay in the mode.
786 $token = $self->_get_next_token;
787 next B;
788 } elsif ($token->{type} == DOCTYPE_TOKEN) {
789 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
790 token => $token);
791 ## Ignore the token.
792
793 ## Stay in the mode.
794 $token = $self->_get_next_token;
795 next B;
796 } elsif ($token->{type} == ABORT_TOKEN) {
797 return;
798 } else {
799 die "$0: XML parser initial: Unknown token type $token->{type}";
800 }
801 } # B
802 } # _tree_after_root_element
803
804 }
805
806 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24