/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12 - (hide annotations) (download) (as text)
Wed Oct 15 10:50:38 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.11: +10 -18 lines
File MIME type: application/x-wais-source
++ whatpm/t/xml/ChangeLog	15 Oct 2008 10:50:31 -0000
	* attrs-1.dat: Test cases for tokenizing errors are added.

	* elements-1.dat: A test result updated.

	* ns-attrs-1.dat: Test results updated.  New test cases for
	duplicate namespaced attributes are added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 10:48:03 -0000
	* Tokenizer.pm.src: Set index attribute to each attribute token,
	for ignoring namespaced duplicate attribute at the XML namespace
	parser layer.  Raise a parse error if the attribute value is
	omitted, in XML mode.  Raise a parse error if the attribute value
	is not quoted, in XML mode.  Raise a parse error if "<" character
	is found in a quoted attribute value, in XML mode.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 10:49:16 -0000
	* Parser.pm.src: Use source order to determine which attribute is
	duplicate.  Preserve duplicate namespaced attributes as
	non-namespaced attributes.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::XML::Parser;
2     use strict;
3    
4     push our @ISA, 'Whatpm::HTML';
5     use Whatpm::HTML::Tokenizer qw/:token/;
6    
7     sub parse_char_string ($$$;$$) {
8     #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9     my $self = shift;
10     my $s = ref $_[0] ? $_[0] : \($_[0]);
11     require Whatpm::Charset::DecodeHandle;
12     my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13     return $self->parse_char_stream ($input, @_[1..$#_]);
14     } # parse_char_string
15    
16     sub parse_char_stream ($$$;$$) {
17     my $self = ref $_[0] ? shift : shift->new;
18     my $input = $_[0];
19     $self->{document} = $_[1];
20     @{$self->{document}->child_nodes} = ();
21    
22     ## NOTE: |set_inner_html| copies most of this method's code
23    
24     $self->{confident} = 1 unless exists $self->{confident};
25     $self->{document}->input_encoding ($self->{input_encoding})
26     if defined $self->{input_encoding};
27     ## TODO: |{input_encoding}| is needless?
28    
29     $self->{line_prev} = $self->{line} = 1;
30     $self->{column_prev} = -1;
31     $self->{column} = 0;
32     $self->{set_nc} = sub {
33     my $self = shift;
34    
35     my $char = '';
36     if (defined $self->{next_nc}) {
37     $char = $self->{next_nc};
38     delete $self->{next_nc};
39     $self->{nc} = ord $char;
40     } else {
41     $self->{char_buffer} = '';
42     $self->{char_buffer_pos} = 0;
43    
44     my $count = $input->manakai_read_until
45     ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46     if ($count) {
47     $self->{line_prev} = $self->{line};
48     $self->{column_prev} = $self->{column};
49     $self->{column}++;
50     $self->{nc}
51     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52     return;
53     }
54    
55     if ($input->read ($char, 1)) {
56     $self->{nc} = ord $char;
57     } else {
58     $self->{nc} = -1;
59     return;
60     }
61     }
62    
63     ($self->{line_prev}, $self->{column_prev})
64     = ($self->{line}, $self->{column});
65     $self->{column}++;
66    
67     if ($self->{nc} == 0x000A) { # LF
68     !!!cp ('j1');
69     $self->{line}++;
70     $self->{column} = 0;
71     } elsif ($self->{nc} == 0x000D) { # CR
72     !!!cp ('j2');
73     ## TODO: support for abort/streaming
74     my $next = '';
75     if ($input->read ($next, 1) and $next ne "\x0A") {
76     $self->{next_nc} = $next;
77     }
78     $self->{nc} = 0x000A; # LF # MUST
79     $self->{line}++;
80     $self->{column} = 0;
81     } elsif ($self->{nc} == 0x0000) { # NULL
82     !!!cp ('j4');
83     !!!parse-error (type => 'NULL');
84     $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85     }
86     };
87    
88     $self->{read_until} = sub {
89     #my ($scalar, $specials_range, $offset) = @_;
90     return 0 if defined $self->{next_nc};
91    
92     my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93     my $offset = $_[2] || 0;
94    
95     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96     pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97     if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98     substr ($_[0], $offset)
99     = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100     my $count = $+[0] - $-[0];
101     if ($count) {
102     $self->{column} += $count;
103     $self->{char_buffer_pos} += $count;
104     $self->{line_prev} = $self->{line};
105     $self->{column_prev} = $self->{column} - 1;
106     $self->{nc} = -1;
107     }
108     return $count;
109     } else {
110     return 0;
111     }
112     } else {
113     my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114     if ($count) {
115     $self->{column} += $count;
116     $self->{line_prev} = $self->{line};
117     $self->{column_prev} = $self->{column} - 1;
118     $self->{nc} = -1;
119     }
120     return $count;
121     }
122     }; # $self->{read_until}
123    
124     my $onerror = $_[2] || sub {
125     my (%opt) = @_;
126     my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127     my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128     warn "Parse error ($opt{type}) at line $line column $column\n";
129     };
130     $self->{parse_error} = sub {
131     $onerror->(line => $self->{line}, column => $self->{column}, @_);
132     };
133    
134     my $char_onerror = sub {
135     my (undef, $type, %opt) = @_;
136     !!!parse-error (layer => 'encode',
137     line => $self->{line}, column => $self->{column} + 1,
138     %opt, type => $type);
139     }; # $char_onerror
140    
141     if ($_[3]) {
142     $input = $_[3]->($input);
143     $input->onerror ($char_onerror);
144     } else {
145     $input->onerror ($char_onerror) unless defined $input->onerror;
146     }
147    
148     $self->_initialize_tokenizer;
149     $self->_initialize_tree_constructor;
150     $self->_construct_tree;
151     $self->_terminate_tree_constructor;
152    
153     delete $self->{parse_error}; # remove loop
154    
155     return $self->{document};
156     } # parse_char_stream
157    
158     sub new ($) {
159     my $class = shift;
160     my $self = bless {
161     level => {must => 'm',
162     should => 's',
163     warn => 'w',
164     info => 'i',
165     uncertain => 'u'},
166     }, $class;
167     $self->{set_nc} = sub {
168     $self->{nc} = -1;
169     };
170     $self->{parse_error} = sub {
171     #
172     };
173     $self->{change_encoding} = sub {
174     # if ($_[0] is a supported encoding) {
175     # run "change the encoding" algorithm;
176     # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177     # }
178     };
179     $self->{application_cache_selection} = sub {
180     #
181     };
182 wakaba 1.2
183     $self->{is_xml} = 1;
184    
185 wakaba 1.1 return $self;
186     } # new
187    
188     sub _initialize_tree_constructor ($) {
189     my $self = shift;
190     ## NOTE: $self->{document} MUST be specified before this method is called
191     $self->{document}->strict_error_checking (0);
192     ## TODO: Turn mutation events off # MUST
193     $self->{document}->dom_config
194     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195     = 0;
196     $self->{document}->manakai_is_html (0);
197     $self->{document}->set_user_data (manakai_source_line => 1);
198     $self->{document}->set_user_data (manakai_source_column => 1);
199     } # _initialize_tree_constructor
200    
201     sub _terminate_tree_constructor ($) {
202     my $self = shift;
203     $self->{document}->strict_error_checking (1);
204     $self->{document}->dom_config
205     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
206     = 1;
207     ## TODO: Turn mutation events on
208     } # _terminate_tree_constructor
209    
210     ## Tree construction stage
211    
212    
213     ## NOTE: Differences from the XML5 draft are marked as "XML5:".
214    
215     ## XML5: No namespace support.
216    
217     ## XML5: XML5 has "empty tag token". In this implementation, it is
218 wakaba 1.3 ## represented as a start tag token with $self->{self_closing} flag
219 wakaba 1.1 ## set to true.
220    
221     ## XML5: XML5 has "short end tag token". In this implementation, it
222     ## is represented as an end tag token with $token->{tag_name} flag set
223     ## to an empty string.
224    
225     ## XML5: Start, main, end phases. In this implementation, they are
226     ## represented by insertion modes.
227    
228     ## Insertion modes
229     sub INITIAL_IM () { 0 }
230     sub BEFORE_ROOT_ELEMENT_IM () { 1 }
231     sub IN_ELEMENT_IM () { 2 }
232     sub AFTER_ROOT_ELEMENT_IM () { 3 }
233    
234     {
235     my $token; ## TODO: change to $self->{t}
236    
237     sub _construct_tree ($) {
238     my ($self) = @_;
239    
240     delete $self->{tainted};
241     $self->{open_elements} = [];
242     $self->{insertion_mode} = INITIAL_IM;
243 wakaba 1.8
244     !!!next-token;
245 wakaba 1.10
246     ## XML5: No support for the XML declaration
247     if ($token->{type} == PI_TOKEN and
248     $token->{target} eq 'xml' and
249     $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
250     (?>"([^"]*)"|'([^']*)')
251     (?:[\x09\x0A\x20]+
252     encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
253     (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
254     (?:[\x09\x0A\x20]+
255     standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
256     (?>"(yes|no)"|'(yes|no)'))?
257     [\x09\x0A\x20]*\z/x) {
258     $self->{document}->xml_version (defined $1 ? $1 : $2);
259     $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
260     $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
261    
262     !!!next-token;
263     } else {
264     $self->{document}->xml_version ('1.0');
265     $self->{document}->xml_encoding (undef);
266     $self->{document}->xml_standalone (0);
267     }
268 wakaba 1.1
269     while (1) {
270     if ($self->{insertion_mode} == IN_ELEMENT_IM) {
271     $self->_tree_in_element;
272     } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
273     $self->_tree_after_root_element;
274     } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
275     $self->_tree_before_root_element;
276     } elsif ($self->{insertion_mode} == INITIAL_IM) {
277     $self->_tree_initial;
278     } else {
279     die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
280     }
281    
282     last if $token->{type} == ABORT_TOKEN;
283     }
284     } # _construct_tree
285    
286     sub _tree_initial ($) {
287     my $self = shift;
288    
289     B: while (1) {
290     if ($token->{type} == DOCTYPE_TOKEN) {
291     ## XML5: No "DOCTYPE" token.
292    
293     my $doctype = $self->{document}->create_document_type_definition
294     (defined $token->{name} ? $token->{name} : '');
295    
296     ## NOTE: Default value for both |public_id| and |system_id| attributes
297     ## are empty strings, so that we don't set any value in missing cases.
298     $doctype->public_id ($token->{public_identifier})
299     if defined $token->{public_identifier};
300     $doctype->system_id ($token->{system_identifier})
301     if defined $token->{system_identifier};
302    
303     ## TODO: internal_subset
304    
305     $self->{document}->append_child ($doctype);
306    
307     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
308     !!!next-token;
309     return;
310     } elsif ($token->{type} == START_TAG_TOKEN or
311     $token->{type} == END_OF_FILE_TOKEN) {
312     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
313     ## Reprocess.
314     return;
315     } elsif ($token->{type} == COMMENT_TOKEN) {
316     my $comment = $self->{document}->create_comment ($token->{data});
317     $self->{document}->append_child ($comment);
318    
319     ## Stay in the mode.
320     !!!next-token;
321     next B;
322     } elsif ($token->{type} == PI_TOKEN) {
323     my $pi = $self->{document}->create_processing_instruction
324     ($token->{target}, $token->{data});
325     $self->{document}->append_child ($pi);
326    
327     ## Stay in the mode.
328     !!!next-token;
329     next B;
330     } elsif ($token->{type} == CHARACTER_TOKEN) {
331     if (not $self->{tainted} and
332 wakaba 1.9 not $token->{has_reference} and
333 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
334     #
335     }
336    
337     if (length $token->{data}) {
338     ## XML5: Ignore the token.
339    
340     unless ($self->{tainted}) {
341     !!!parse-error (type => 'text outside of root element',
342     token => $token);
343     $self->{tainted} = 1;
344     }
345    
346     $self->{document}->manakai_append_text ($token->{data});
347     }
348    
349     ## Stay in the mode.
350     !!!next-token;
351     next B;
352     } elsif ($token->{type} == END_TAG_TOKEN) {
353     !!!parse-error (type => 'unmatched end tag',
354     text => $token->{tag_name},
355     token => $token);
356     ## Ignore the token.
357    
358     ## Stay in the mode.
359     !!!next-token;
360     next B;
361     } elsif ($token->{type} == ABORT_TOKEN) {
362     return;
363     } else {
364     die "$0: XML parser initial: Unknown token type $token->{type}";
365     }
366     } # B
367     } # _tree_initial
368    
369     sub _tree_before_root_element ($) {
370     my $self = shift;
371    
372     B: while (1) {
373     if ($token->{type} == START_TAG_TOKEN) {
374 wakaba 1.6 my $nsmap = {
375     xml => q<http://www.w3.org/XML/1998/namespace>,
376     xmlns => q<http://www.w3.org/2000/xmlns/>,
377     };
378    
379     for (keys %{$token->{attributes}}) {
380     if (/^xmlns:./s) {
381     my $prefix = substr $_, 6;
382     my $value = $token->{attributes}->{$_}->{value};
383     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
384     $value eq q<http://www.w3.org/XML/1998/namespace> or
385     $value eq q<http://www.w3.org/2000/xmlns/>) {
386     ## NOTE: Error should be detected at the DOM layer.
387     #
388     } elsif (length $value) {
389     $nsmap->{$prefix} = $value;
390     } else {
391     delete $nsmap->{$prefix};
392     }
393     } elsif ($_ eq 'xmlns') {
394     my $value = $token->{attributes}->{$_}->{value};
395     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
396     $value eq q<http://www.w3.org/2000/xmlns/>) {
397     ## NOTE: Error should be detected at the DOM layer.
398     #
399     } elsif (length $value) {
400     $nsmap->{''} = $value;
401     } else {
402     delete $nsmap->{''};
403     }
404     }
405     }
406    
407     my $ns;
408 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
409 wakaba 1.6
410 wakaba 1.11 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
411 wakaba 1.6 if (defined $nsmap->{$prefix}) {
412     $ns = $nsmap->{$prefix};
413     } else {
414     ($prefix, $ln) = (undef, $token->{tag_name});
415     }
416     } else {
417 wakaba 1.11 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
418     ($prefix, $ln) = (undef, $token->{tag_name});
419 wakaba 1.6 }
420    
421 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
422 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
423     $el->set_user_data (manakai_source_column => $token->{column});
424    
425 wakaba 1.6 my $has_attr;
426 wakaba 1.12 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
427     $token->{attributes}->{$b}->{index}}
428     keys %{$token->{attributes}}) {
429 wakaba 1.6 my $ns;
430 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
431 wakaba 1.6
432     if ($attr_name eq 'xmlns:xmlns') {
433     ($p, $l) = (undef, $attr_name);
434 wakaba 1.11 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
435 wakaba 1.6 if (defined $nsmap->{$p}) {
436     $ns = $nsmap->{$p};
437     } else {
438     ## NOTE: Error should be detected at the DOM-layer.
439     ($p, $l) = (undef, $attr_name);
440     }
441     } else {
442 wakaba 1.11 if ($attr_name eq 'xmlns') {
443 wakaba 1.6 $ns = $nsmap->{xmlns};
444     }
445 wakaba 1.11 ($p, $l) = (undef, $attr_name);
446 wakaba 1.6 }
447    
448     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
449 wakaba 1.12 $ns = undef;
450     ($p, $l) = (undef, $attr_name);
451 wakaba 1.6 } else {
452     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
453     }
454    
455 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
456     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
457     $attr->value ($attr_t->{value});
458     $attr->set_user_data (manakai_source_line => $attr_t->{line});
459     $attr->set_user_data (manakai_source_column => $attr_t->{column});
460     $el->set_attribute_node_ns ($attr);
461     }
462    
463 wakaba 1.1 $self->{document}->append_child ($el);
464    
465 wakaba 1.3 if ($self->{self_closing}) {
466 wakaba 1.1 !!!ack ('ack');
467     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
468     } else {
469 wakaba 1.6 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
470 wakaba 1.1 $self->{insertion_mode} = IN_ELEMENT_IM;
471     }
472    
473     #delete $self->{tainted};
474    
475     !!!next-token;
476     return;
477     } elsif ($token->{type} == COMMENT_TOKEN) {
478     my $comment = $self->{document}->create_comment ($token->{data});
479     $self->{document}->append_child ($comment);
480    
481     ## Stay in the mode.
482     !!!next-token;
483     next B;
484     } elsif ($token->{type} == PI_TOKEN) {
485     my $pi = $self->{document}->create_processing_instruction
486     ($token->{target}, $token->{data});
487     $self->{document}->append_child ($pi);
488    
489     ## Stay in the mode.
490     !!!next-token;
491     next B;
492     } elsif ($token->{type} == CHARACTER_TOKEN) {
493     if (not $self->{tainted} and
494 wakaba 1.9 not $token->{has_reference} and
495 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
496     #
497     }
498    
499     if (length $token->{data}) {
500     ## XML5: Ignore the token.
501    
502     unless ($self->{tainted}) {
503     !!!parse-error (type => 'text outside of root element',
504     token => $token);
505     $self->{tainted} = 1;
506     }
507    
508     $self->{document}->manakai_append_text ($token->{data});
509     }
510    
511     ## Stay in the mode.
512     !!!next-token;
513     next B;
514     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
515     !!!parse-error (type => 'no root element',
516     token => $token);
517    
518     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
519     ## Reprocess.
520     return;
521     } elsif ($token->{type} == END_TAG_TOKEN) {
522     !!!parse-error (type => 'unmatched end tag',
523     text => $token->{tag_name},
524     token => $token);
525     ## Ignore the token.
526    
527     ## Stay in the mode.
528     !!!next-token;
529     next B;
530     } elsif ($token->{type} == DOCTYPE_TOKEN) {
531     !!!parse-error (type => 'in html:#doctype',
532     token => $token);
533     ## Ignore the token.
534    
535     ## Stay in the mode.
536     !!!next-token;
537     next B;
538     } elsif ($token->{type} == ABORT_TOKEN) {
539     return;
540     } else {
541     die "$0: XML parser initial: Unknown token type $token->{type}";
542     }
543     } # B
544     } # _tree_before_root_element
545    
546     sub _tree_in_element ($) {
547     my $self = shift;
548    
549     B: while (1) {
550     if ($token->{type} == CHARACTER_TOKEN) {
551     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
552    
553     ## Stay in the mode.
554     !!!next-token;
555     next B;
556     } elsif ($token->{type} == START_TAG_TOKEN) {
557 wakaba 1.7 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
558    
559     for (keys %{$token->{attributes}}) {
560     if (/^xmlns:./s) {
561     my $prefix = substr $_, 6;
562     my $value = $token->{attributes}->{$_}->{value};
563     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
564     $value eq q<http://www.w3.org/XML/1998/namespace> or
565     $value eq q<http://www.w3.org/2000/xmlns/>) {
566     ## NOTE: Error should be detected at the DOM layer.
567     #
568     } elsif (length $value) {
569     $nsmap->{$prefix} = $value;
570     } else {
571     delete $nsmap->{$prefix};
572     }
573     } elsif ($_ eq 'xmlns') {
574     my $value = $token->{attributes}->{$_}->{value};
575     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
576     $value eq q<http://www.w3.org/2000/xmlns/>) {
577     ## NOTE: Error should be detected at the DOM layer.
578     #
579     } elsif (length $value) {
580     $nsmap->{''} = $value;
581     } else {
582     delete $nsmap->{''};
583     }
584     }
585     }
586    
587     my $ns;
588 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
589 wakaba 1.7
590 wakaba 1.11 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
591 wakaba 1.7 if (defined $nsmap->{$prefix}) {
592     $ns = $nsmap->{$prefix};
593     } else {
594     ## NOTE: Error should be detected at the DOM layer.
595     ($prefix, $ln) = (undef, $token->{tag_name});
596     }
597     } else {
598 wakaba 1.11 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
599     ($prefix, $ln) = (undef, $token->{tag_name});
600 wakaba 1.7 }
601    
602 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
603 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
604     $el->set_user_data (manakai_source_column => $token->{column});
605    
606 wakaba 1.7 my $has_attr;
607 wakaba 1.12 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
608     $token->{attributes}->{$b}->{index}}
609     keys %{$token->{attributes}}) {
610 wakaba 1.7 my $ns;
611 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
612 wakaba 1.7
613     if ($attr_name eq 'xmlns:xmlns') {
614     ($p, $l) = (undef, $attr_name);
615 wakaba 1.11 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
616 wakaba 1.7 if (defined $nsmap->{$p}) {
617     $ns = $nsmap->{$p};
618     } else {
619     ## NOTE: Error should be detected at the DOM-layer.
620     ($p, $l) = (undef, $attr_name);
621     }
622     } else {
623 wakaba 1.11 if ($attr_name eq 'xmlns') {
624 wakaba 1.7 $ns = $nsmap->{xmlns};
625     }
626 wakaba 1.11 ($p, $l) = (undef, $attr_name);
627 wakaba 1.7 }
628    
629     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
630 wakaba 1.12 $ns = undef;
631     ($p, $l) = (undef, $attr_name);
632 wakaba 1.7 } else {
633     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
634     }
635    
636 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
637     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
638     $attr->value ($attr_t->{value});
639     $attr->set_user_data (manakai_source_line => $attr_t->{line});
640     $attr->set_user_data (manakai_source_column => $attr_t->{column});
641     $el->set_attribute_node_ns ($attr);
642     }
643    
644 wakaba 1.1 $self->{open_elements}->[-1]->[0]->append_child ($el);
645    
646 wakaba 1.3 if ($self->{self_closing}) {
647 wakaba 1.1 !!!ack ('ack');
648     } else {
649 wakaba 1.7 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
650 wakaba 1.1 }
651    
652     ## Stay in the mode.
653     !!!next-token;
654     next B;
655     } elsif ($token->{type} == END_TAG_TOKEN) {
656 wakaba 1.2 if ($token->{tag_name} eq '') {
657 wakaba 1.1 ## Short end tag token.
658     pop @{$self->{open_elements}};
659     } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
660     pop @{$self->{open_elements}};
661     } else {
662     !!!parse-error (type => 'unmatched end tag',
663     text => $token->{tag_name},
664     token => $token);
665    
666     ## Has an element in scope
667 wakaba 1.2 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
668 wakaba 1.1 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
669     splice @{$self->{open_elements}}, $i;
670     last INSCOPE;
671     }
672     } # INSCOPE
673     }
674    
675     unless (@{$self->{open_elements}}) {
676     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
677     !!!next-token;
678     return;
679     } else {
680     ## Stay in the state.
681     !!!next-token;
682     redo B;
683     }
684     } elsif ($token->{type} == COMMENT_TOKEN) {
685     my $comment = $self->{document}->create_comment ($token->{data});
686     $self->{open_elements}->[-1]->[0]->append_child ($comment);
687    
688     ## Stay in the mode.
689     !!!next-token;
690     next B;
691     } elsif ($token->{type} == PI_TOKEN) {
692     my $pi = $self->{document}->create_processing_instruction
693     ($token->{target}, $token->{data});
694     $self->{open_elements}->[-1]->[0]->append_child ($pi);
695    
696     ## Stay in the mode.
697     !!!next-token;
698     next B;
699     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
700     !!!parse-error (type => 'in body:#eof',
701     token => $token);
702    
703     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
704     !!!next-token;
705     return;
706     } elsif ($token->{type} == DOCTYPE_TOKEN) {
707     !!!parse-error (type => 'in html:#doctype',
708     token => $token);
709     ## Ignore the token.
710    
711     ## Stay in the mode.
712     !!!next-token;
713     next B;
714     } elsif ($token->{type} == ABORT_TOKEN) {
715     return;
716     } else {
717     die "$0: XML parser initial: Unknown token type $token->{type}";
718     }
719     } # B
720     } # _tree_in_element
721    
722     sub _tree_after_root_element ($) {
723     my $self = shift;
724    
725     B: while (1) {
726     if ($token->{type} == START_TAG_TOKEN) {
727     !!!parse-error (type => 'second root element',
728     token => $token);
729    
730     ## XML5: Ignore the token.
731 wakaba 1.4
732 wakaba 1.5 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
733     ## Reprocess.
734 wakaba 1.1 return;
735     } elsif ($token->{type} == COMMENT_TOKEN) {
736     my $comment = $self->{document}->create_comment ($token->{data});
737     $self->{document}->append_child ($comment);
738    
739     ## Stay in the mode.
740     !!!next-token;
741     next B;
742     } elsif ($token->{type} == PI_TOKEN) {
743     my $pi = $self->{document}->create_processing_instruction
744     ($token->{target}, $token->{data});
745     $self->{document}->append_child ($pi);
746    
747     ## Stay in the mode.
748     !!!next-token;
749     next B;
750     } elsif ($token->{type} == CHARACTER_TOKEN) {
751     if (not $self->{tainted} and
752 wakaba 1.9 not $token->{has_reference} and
753 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
754     #
755     }
756    
757     if (length $token->{data}) {
758     ## XML5: Ignore the token.
759    
760     unless ($self->{tainted}) {
761     !!!parse-error (type => 'text outside of root element',
762     token => $token);
763     $self->{tainted} = 1;
764     }
765    
766     $self->{document}->manakai_append_text ($token->{data});
767     }
768    
769     ## Stay in the mode.
770     !!!next-token;
771     next B;
772     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
773     ## Stop parsing.
774    
775     ## TODO: implement "stop parsing".
776    
777     $token = {type => ABORT_TOKEN};
778     return;
779     } elsif ($token->{type} == END_TAG_TOKEN) {
780     !!!parse-error (type => 'unmatched end tag',
781     text => $token->{tag_name},
782     token => $token);
783     ## Ignore the token.
784    
785     ## Stay in the mode.
786     !!!next-token;
787     next B;
788     } elsif ($token->{type} == DOCTYPE_TOKEN) {
789     !!!parse-error (type => 'in html:#doctype',
790     token => $token);
791     ## Ignore the token.
792    
793     ## Stay in the mode.
794     !!!next-token;
795     next B;
796     } elsif ($token->{type} == ABORT_TOKEN) {
797     return;
798     } else {
799     die "$0: XML parser initial: Unknown token type $token->{type}";
800     }
801     } # B
802     } # _tree_after_root_element
803    
804     }
805    
806     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24