/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (hide annotations) (download)
Wed Oct 15 04:38:22 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.9: +23 -0 lines
++ whatpm/t/ChangeLog	15 Oct 2008 04:37:36 -0000
	* XML-Parser.t: "xml/pis-1.dat" and "xml/xmldecls-1.dat" added.
	Test directifes "#xml-version", "#xml-encoding", and
	"#xml-standalone" are added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	15 Oct 2008 04:37:54 -0000
	* pis-1.dat, xmldecls-1.dat: New test data files.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	15 Oct 2008 04:33:34 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (create_processing_instruction): New method.
	(xml_version, xml_encoding, xml_standalone): New attributes.
	(ProcessingInstruction): New class.

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 04:34:03 -0000
	* Tokenizer.pm.src: Support for XML processing instructions.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 04:34:57 -0000
	* Parser.pm.src: Support for XML declarations.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::XML::Parser;
2     use strict;
3    
4     push our @ISA, 'Whatpm::HTML';
5     use Whatpm::HTML::Tokenizer qw/:token/;
6    
7     sub parse_char_string ($$$;$$) {
8     #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9     my $self = shift;
10     my $s = ref $_[0] ? $_[0] : \($_[0]);
11     require Whatpm::Charset::DecodeHandle;
12     my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13     return $self->parse_char_stream ($input, @_[1..$#_]);
14     } # parse_char_string
15    
16     sub parse_char_stream ($$$;$$) {
17     my $self = ref $_[0] ? shift : shift->new;
18     my $input = $_[0];
19     $self->{document} = $_[1];
20     @{$self->{document}->child_nodes} = ();
21    
22     ## NOTE: |set_inner_html| copies most of this method's code
23    
24     $self->{confident} = 1 unless exists $self->{confident};
25     $self->{document}->input_encoding ($self->{input_encoding})
26     if defined $self->{input_encoding};
27     ## TODO: |{input_encoding}| is needless?
28    
29     $self->{line_prev} = $self->{line} = 1;
30     $self->{column_prev} = -1;
31     $self->{column} = 0;
32     $self->{set_nc} = sub {
33     my $self = shift;
34    
35     my $char = '';
36     if (defined $self->{next_nc}) {
37     $char = $self->{next_nc};
38     delete $self->{next_nc};
39     $self->{nc} = ord $char;
40     } else {
41     $self->{char_buffer} = '';
42     $self->{char_buffer_pos} = 0;
43    
44     my $count = $input->manakai_read_until
45     ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46     if ($count) {
47     $self->{line_prev} = $self->{line};
48     $self->{column_prev} = $self->{column};
49     $self->{column}++;
50     $self->{nc}
51     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52     return;
53     }
54    
55     if ($input->read ($char, 1)) {
56     $self->{nc} = ord $char;
57     } else {
58     $self->{nc} = -1;
59     return;
60     }
61     }
62    
63     ($self->{line_prev}, $self->{column_prev})
64     = ($self->{line}, $self->{column});
65     $self->{column}++;
66    
67     if ($self->{nc} == 0x000A) { # LF
68    
69     $self->{line}++;
70     $self->{column} = 0;
71     } elsif ($self->{nc} == 0x000D) { # CR
72    
73     ## TODO: support for abort/streaming
74     my $next = '';
75     if ($input->read ($next, 1) and $next ne "\x0A") {
76     $self->{next_nc} = $next;
77     }
78     $self->{nc} = 0x000A; # LF # MUST
79     $self->{line}++;
80     $self->{column} = 0;
81     } elsif ($self->{nc} == 0x0000) { # NULL
82    
83     $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
84     $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85     }
86     };
87    
88     $self->{read_until} = sub {
89     #my ($scalar, $specials_range, $offset) = @_;
90     return 0 if defined $self->{next_nc};
91    
92     my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93     my $offset = $_[2] || 0;
94    
95     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96     pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97     if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98     substr ($_[0], $offset)
99     = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100     my $count = $+[0] - $-[0];
101     if ($count) {
102     $self->{column} += $count;
103     $self->{char_buffer_pos} += $count;
104     $self->{line_prev} = $self->{line};
105     $self->{column_prev} = $self->{column} - 1;
106     $self->{nc} = -1;
107     }
108     return $count;
109     } else {
110     return 0;
111     }
112     } else {
113     my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114     if ($count) {
115     $self->{column} += $count;
116     $self->{line_prev} = $self->{line};
117     $self->{column_prev} = $self->{column} - 1;
118     $self->{nc} = -1;
119     }
120     return $count;
121     }
122     }; # $self->{read_until}
123    
124     my $onerror = $_[2] || sub {
125     my (%opt) = @_;
126     my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127     my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128     warn "Parse error ($opt{type}) at line $line column $column\n";
129     };
130     $self->{parse_error} = sub {
131     $onerror->(line => $self->{line}, column => $self->{column}, @_);
132     };
133    
134     my $char_onerror = sub {
135     my (undef, $type, %opt) = @_;
136     $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode',
137     line => $self->{line}, column => $self->{column} + 1,
138     %opt, type => $type);
139     }; # $char_onerror
140    
141     if ($_[3]) {
142     $input = $_[3]->($input);
143     $input->onerror ($char_onerror);
144     } else {
145     $input->onerror ($char_onerror) unless defined $input->onerror;
146     }
147    
148     $self->_initialize_tokenizer;
149     $self->_initialize_tree_constructor;
150     $self->_construct_tree;
151     $self->_terminate_tree_constructor;
152    
153     delete $self->{parse_error}; # remove loop
154    
155     return $self->{document};
156     } # parse_char_stream
157    
158     sub new ($) {
159     my $class = shift;
160     my $self = bless {
161     level => {must => 'm',
162     should => 's',
163     warn => 'w',
164     info => 'i',
165     uncertain => 'u'},
166     }, $class;
167     $self->{set_nc} = sub {
168     $self->{nc} = -1;
169     };
170     $self->{parse_error} = sub {
171     #
172     };
173     $self->{change_encoding} = sub {
174     # if ($_[0] is a supported encoding) {
175     # run "change the encoding" algorithm;
176     # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177     # }
178     };
179     $self->{application_cache_selection} = sub {
180     #
181     };
182 wakaba 1.2
183     $self->{is_xml} = 1;
184    
185 wakaba 1.1 return $self;
186     } # new
187    
188     sub _initialize_tree_constructor ($) {
189     my $self = shift;
190     ## NOTE: $self->{document} MUST be specified before this method is called
191     $self->{document}->strict_error_checking (0);
192     ## TODO: Turn mutation events off # MUST
193     $self->{document}->dom_config
194     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195     = 0;
196     $self->{document}->manakai_is_html (0);
197     $self->{document}->set_user_data (manakai_source_line => 1);
198     $self->{document}->set_user_data (manakai_source_column => 1);
199     } # _initialize_tree_constructor
200    
201     sub _terminate_tree_constructor ($) {
202     my $self = shift;
203     $self->{document}->strict_error_checking (1);
204     $self->{document}->dom_config
205     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
206     = 1;
207     ## TODO: Turn mutation events on
208     } # _terminate_tree_constructor
209    
210     ## Tree construction stage
211    
212    
213     ## NOTE: Differences from the XML5 draft are marked as "XML5:".
214    
215     ## XML5: No namespace support.
216    
217     ## XML5: XML5 has "empty tag token". In this implementation, it is
218 wakaba 1.3 ## represented as a start tag token with $self->{self_closing} flag
219 wakaba 1.1 ## set to true.
220    
221     ## XML5: XML5 has "short end tag token". In this implementation, it
222     ## is represented as an end tag token with $token->{tag_name} flag set
223     ## to an empty string.
224    
225     ## XML5: Start, main, end phases. In this implementation, they are
226     ## represented by insertion modes.
227    
228     ## Insertion modes
229     sub INITIAL_IM () { 0 }
230     sub BEFORE_ROOT_ELEMENT_IM () { 1 }
231     sub IN_ELEMENT_IM () { 2 }
232     sub AFTER_ROOT_ELEMENT_IM () { 3 }
233    
234     {
235     my $token; ## TODO: change to $self->{t}
236    
237     sub _construct_tree ($) {
238     my ($self) = @_;
239    
240     delete $self->{tainted};
241     $self->{open_elements} = [];
242     $self->{insertion_mode} = INITIAL_IM;
243 wakaba 1.8
244     $token = $self->_get_next_token;
245 wakaba 1.10
246     ## XML5: No support for the XML declaration
247     if ($token->{type} == PI_TOKEN and
248     $token->{target} eq 'xml' and
249     $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
250     (?>"([^"]*)"|'([^']*)')
251     (?:[\x09\x0A\x20]+
252     encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
253     (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
254     (?:[\x09\x0A\x20]+
255     standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
256     (?>"(yes|no)"|'(yes|no)'))?
257     [\x09\x0A\x20]*\z/x) {
258     $self->{document}->xml_version (defined $1 ? $1 : $2);
259     $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
260     $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
261    
262     $token = $self->_get_next_token;
263     } else {
264     $self->{document}->xml_version ('1.0');
265     $self->{document}->xml_encoding (undef);
266     $self->{document}->xml_standalone (0);
267     }
268 wakaba 1.1
269     while (1) {
270     if ($self->{insertion_mode} == IN_ELEMENT_IM) {
271     $self->_tree_in_element;
272     } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
273     $self->_tree_after_root_element;
274     } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
275     $self->_tree_before_root_element;
276     } elsif ($self->{insertion_mode} == INITIAL_IM) {
277     $self->_tree_initial;
278     } else {
279     die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
280     }
281    
282     last if $token->{type} == ABORT_TOKEN;
283     }
284     } # _construct_tree
285    
286     sub _tree_initial ($) {
287     my $self = shift;
288    
289     B: while (1) {
290     if ($token->{type} == DOCTYPE_TOKEN) {
291     ## XML5: No "DOCTYPE" token.
292    
293     my $doctype = $self->{document}->create_document_type_definition
294     (defined $token->{name} ? $token->{name} : '');
295    
296     ## NOTE: Default value for both |public_id| and |system_id| attributes
297     ## are empty strings, so that we don't set any value in missing cases.
298     $doctype->public_id ($token->{public_identifier})
299     if defined $token->{public_identifier};
300     $doctype->system_id ($token->{system_identifier})
301     if defined $token->{system_identifier};
302    
303     ## TODO: internal_subset
304    
305     $self->{document}->append_child ($doctype);
306    
307     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
308     $token = $self->_get_next_token;
309     return;
310     } elsif ($token->{type} == START_TAG_TOKEN or
311     $token->{type} == END_OF_FILE_TOKEN) {
312     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
313     ## Reprocess.
314     return;
315     } elsif ($token->{type} == COMMENT_TOKEN) {
316     my $comment = $self->{document}->create_comment ($token->{data});
317     $self->{document}->append_child ($comment);
318    
319     ## Stay in the mode.
320     $token = $self->_get_next_token;
321     next B;
322     } elsif ($token->{type} == PI_TOKEN) {
323     my $pi = $self->{document}->create_processing_instruction
324     ($token->{target}, $token->{data});
325     $self->{document}->append_child ($pi);
326    
327     ## Stay in the mode.
328     $token = $self->_get_next_token;
329     next B;
330     } elsif ($token->{type} == CHARACTER_TOKEN) {
331     if (not $self->{tainted} and
332 wakaba 1.9 not $token->{has_reference} and
333 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
334     #
335     }
336    
337     if (length $token->{data}) {
338     ## XML5: Ignore the token.
339    
340     unless ($self->{tainted}) {
341     $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
342     token => $token);
343     $self->{tainted} = 1;
344     }
345    
346     $self->{document}->manakai_append_text ($token->{data});
347     }
348    
349     ## Stay in the mode.
350     $token = $self->_get_next_token;
351     next B;
352     } elsif ($token->{type} == END_TAG_TOKEN) {
353     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
354     text => $token->{tag_name},
355     token => $token);
356     ## Ignore the token.
357    
358     ## Stay in the mode.
359     $token = $self->_get_next_token;
360     next B;
361     } elsif ($token->{type} == ABORT_TOKEN) {
362     return;
363     } else {
364     die "$0: XML parser initial: Unknown token type $token->{type}";
365     }
366     } # B
367     } # _tree_initial
368    
369     sub _tree_before_root_element ($) {
370     my $self = shift;
371    
372     B: while (1) {
373     if ($token->{type} == START_TAG_TOKEN) {
374 wakaba 1.6 my $nsmap = {
375     xml => q<http://www.w3.org/XML/1998/namespace>,
376     xmlns => q<http://www.w3.org/2000/xmlns/>,
377     };
378    
379     for (keys %{$token->{attributes}}) {
380     if (/^xmlns:./s) {
381     my $prefix = substr $_, 6;
382     my $value = $token->{attributes}->{$_}->{value};
383     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
384     $value eq q<http://www.w3.org/XML/1998/namespace> or
385     $value eq q<http://www.w3.org/2000/xmlns/>) {
386     ## NOTE: Error should be detected at the DOM layer.
387     #
388     } elsif (length $value) {
389     $nsmap->{$prefix} = $value;
390     } else {
391     delete $nsmap->{$prefix};
392     ## TODO: Error unless XML1.1
393     }
394     } elsif ($_ eq 'xmlns') {
395     my $value = $token->{attributes}->{$_}->{value};
396     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
397     $value eq q<http://www.w3.org/2000/xmlns/>) {
398     ## NOTE: Error should be detected at the DOM layer.
399     #
400     } elsif (length $value) {
401     $nsmap->{''} = $value;
402     } else {
403     delete $nsmap->{''};
404     }
405     }
406     }
407    
408     my $ns;
409 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
410 wakaba 1.6
411     if (defined $ln) { # prefixed
412     if (defined $nsmap->{$prefix}) {
413     $ns = $nsmap->{$prefix};
414     } else {
415     ## NOTE: Error should be detected at the DOM layer.
416     ($prefix, $ln) = (undef, $token->{tag_name});
417     }
418     } else {
419     ($prefix, $ln) = (undef, $prefix);
420     $ns = $nsmap->{''};
421     }
422    
423 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
424 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
425     $el->set_user_data (manakai_source_column => $token->{column});
426    
427 wakaba 1.6 my $has_attr;
428     for my $attr_name (sort {$a cmp $b} keys %{$token->{attributes}}) {
429     my $ns;
430 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
431 wakaba 1.6
432     if ($attr_name eq 'xmlns:xmlns') {
433     ($p, $l) = (undef, $attr_name);
434     } elsif (defined $l) { # prefixed
435     if (defined $nsmap->{$p}) {
436     $ns = $nsmap->{$p};
437     } else {
438     ## NOTE: Error should be detected at the DOM-layer.
439     ($p, $l) = (undef, $attr_name);
440     }
441     } else {
442     if ($p eq 'xmlns') {
443     $ns = $nsmap->{xmlns};
444     }
445     ($p, $l) = (undef, $p);
446     }
447    
448     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
449     ## NOTE: Attributes are sorted as Unicode characters (not
450     ## code units) of their names, for stable output.
451    
452     ## TODO: Should be sorted by source order?
453     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate ns attr',
454     token => $token,
455     value => $attr_name);
456     next;
457     } else {
458     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
459     }
460    
461 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
462     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
463     $attr->value ($attr_t->{value});
464     $attr->set_user_data (manakai_source_line => $attr_t->{line});
465     $attr->set_user_data (manakai_source_column => $attr_t->{column});
466     $el->set_attribute_node_ns ($attr);
467     }
468    
469 wakaba 1.1 $self->{document}->append_child ($el);
470    
471 wakaba 1.3 if ($self->{self_closing}) {
472 wakaba 1.1 delete $self->{self_closing};
473     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
474     } else {
475 wakaba 1.6 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
476 wakaba 1.1 $self->{insertion_mode} = IN_ELEMENT_IM;
477     }
478    
479     #delete $self->{tainted};
480    
481     $token = $self->_get_next_token;
482     return;
483     } elsif ($token->{type} == COMMENT_TOKEN) {
484     my $comment = $self->{document}->create_comment ($token->{data});
485     $self->{document}->append_child ($comment);
486    
487     ## Stay in the mode.
488     $token = $self->_get_next_token;
489     next B;
490     } elsif ($token->{type} == PI_TOKEN) {
491     my $pi = $self->{document}->create_processing_instruction
492     ($token->{target}, $token->{data});
493     $self->{document}->append_child ($pi);
494    
495     ## Stay in the mode.
496     $token = $self->_get_next_token;
497     next B;
498     } elsif ($token->{type} == CHARACTER_TOKEN) {
499     if (not $self->{tainted} and
500 wakaba 1.9 not $token->{has_reference} and
501 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
502     #
503     }
504    
505     if (length $token->{data}) {
506     ## XML5: Ignore the token.
507    
508     unless ($self->{tainted}) {
509     $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
510     token => $token);
511     $self->{tainted} = 1;
512     }
513    
514     $self->{document}->manakai_append_text ($token->{data});
515     }
516    
517     ## Stay in the mode.
518     $token = $self->_get_next_token;
519     next B;
520     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no root element',
522     token => $token);
523    
524     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
525     ## Reprocess.
526     return;
527     } elsif ($token->{type} == END_TAG_TOKEN) {
528     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
529     text => $token->{tag_name},
530     token => $token);
531     ## Ignore the token.
532    
533     ## Stay in the mode.
534     $token = $self->_get_next_token;
535     next B;
536     } elsif ($token->{type} == DOCTYPE_TOKEN) {
537     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
538     token => $token);
539     ## Ignore the token.
540    
541     ## Stay in the mode.
542     $token = $self->_get_next_token;
543     next B;
544     } elsif ($token->{type} == ABORT_TOKEN) {
545     return;
546     } else {
547     die "$0: XML parser initial: Unknown token type $token->{type}";
548     }
549     } # B
550     } # _tree_before_root_element
551    
552     sub _tree_in_element ($) {
553     my $self = shift;
554    
555     B: while (1) {
556     if ($token->{type} == CHARACTER_TOKEN) {
557     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
558    
559     ## Stay in the mode.
560     $token = $self->_get_next_token;
561     next B;
562     } elsif ($token->{type} == START_TAG_TOKEN) {
563 wakaba 1.7 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
564    
565     for (keys %{$token->{attributes}}) {
566     if (/^xmlns:./s) {
567     my $prefix = substr $_, 6;
568     my $value = $token->{attributes}->{$_}->{value};
569     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
570     $value eq q<http://www.w3.org/XML/1998/namespace> or
571     $value eq q<http://www.w3.org/2000/xmlns/>) {
572     ## NOTE: Error should be detected at the DOM layer.
573     #
574     } elsif (length $value) {
575     $nsmap->{$prefix} = $value;
576     } else {
577     delete $nsmap->{$prefix};
578     ## TODO: Error unless XML1.1
579     }
580     } elsif ($_ eq 'xmlns') {
581     my $value = $token->{attributes}->{$_}->{value};
582     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
583     $value eq q<http://www.w3.org/2000/xmlns/>) {
584     ## NOTE: Error should be detected at the DOM layer.
585     #
586     } elsif (length $value) {
587     $nsmap->{''} = $value;
588     } else {
589     delete $nsmap->{''};
590     }
591     }
592     }
593    
594     my $ns;
595 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
596 wakaba 1.7
597     if (defined $ln) { # prefixed
598     if (defined $nsmap->{$prefix}) {
599     $ns = $nsmap->{$prefix};
600     } else {
601     ## NOTE: Error should be detected at the DOM layer.
602     ($prefix, $ln) = (undef, $token->{tag_name});
603     }
604     } else {
605     ($prefix, $ln) = (undef, $prefix);
606     $ns = $nsmap->{''};
607     }
608    
609 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
610 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
611     $el->set_user_data (manakai_source_column => $token->{column});
612    
613 wakaba 1.7 my $has_attr;
614     for my $attr_name (sort {$a cmp $b} keys %{$token->{attributes}}) {
615     my $ns;
616 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
617 wakaba 1.7
618     if ($attr_name eq 'xmlns:xmlns') {
619     ($p, $l) = (undef, $attr_name);
620     } elsif (defined $l) { # prefixed
621     if (defined $nsmap->{$p}) {
622     $ns = $nsmap->{$p};
623     } else {
624     ## NOTE: Error should be detected at the DOM-layer.
625     ($p, $l) = (undef, $attr_name);
626     }
627     } else {
628     if ($p eq 'xmlns') {
629     $ns = $nsmap->{xmlns};
630     }
631     ($p, $l) = (undef, $p);
632     }
633    
634     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
635     ## NOTE: Attributes are sorted as Unicode characters (not
636     ## code units) of their names, for stable output.
637    
638     ## TODO: Should be sorted by source order?
639     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate ns attr',
640     token => $token,
641     value => $attr_name);
642     next;
643     } else {
644     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
645     }
646    
647 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
648     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
649     $attr->value ($attr_t->{value});
650     $attr->set_user_data (manakai_source_line => $attr_t->{line});
651     $attr->set_user_data (manakai_source_column => $attr_t->{column});
652     $el->set_attribute_node_ns ($attr);
653     }
654    
655 wakaba 1.1 $self->{open_elements}->[-1]->[0]->append_child ($el);
656    
657 wakaba 1.3 if ($self->{self_closing}) {
658 wakaba 1.1 delete $self->{self_closing};
659     } else {
660 wakaba 1.7 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
661 wakaba 1.1 }
662    
663     ## Stay in the mode.
664     $token = $self->_get_next_token;
665     next B;
666     } elsif ($token->{type} == END_TAG_TOKEN) {
667 wakaba 1.2 if ($token->{tag_name} eq '') {
668 wakaba 1.1 ## Short end tag token.
669     pop @{$self->{open_elements}};
670     } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
671     pop @{$self->{open_elements}};
672     } else {
673     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
674     text => $token->{tag_name},
675     token => $token);
676    
677     ## Has an element in scope
678 wakaba 1.2 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
679 wakaba 1.1 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
680     splice @{$self->{open_elements}}, $i;
681     last INSCOPE;
682     }
683     } # INSCOPE
684     }
685    
686     unless (@{$self->{open_elements}}) {
687     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
688     $token = $self->_get_next_token;
689     return;
690     } else {
691     ## Stay in the state.
692     $token = $self->_get_next_token;
693     redo B;
694     }
695     } elsif ($token->{type} == COMMENT_TOKEN) {
696     my $comment = $self->{document}->create_comment ($token->{data});
697     $self->{open_elements}->[-1]->[0]->append_child ($comment);
698    
699     ## Stay in the mode.
700     $token = $self->_get_next_token;
701     next B;
702     } elsif ($token->{type} == PI_TOKEN) {
703     my $pi = $self->{document}->create_processing_instruction
704     ($token->{target}, $token->{data});
705     $self->{open_elements}->[-1]->[0]->append_child ($pi);
706    
707     ## Stay in the mode.
708     $token = $self->_get_next_token;
709     next B;
710     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
711     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof',
712     token => $token);
713    
714     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
715     $token = $self->_get_next_token;
716     return;
717     } elsif ($token->{type} == DOCTYPE_TOKEN) {
718     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
719     token => $token);
720     ## Ignore the token.
721    
722     ## Stay in the mode.
723     $token = $self->_get_next_token;
724     next B;
725     } elsif ($token->{type} == ABORT_TOKEN) {
726     return;
727     } else {
728     die "$0: XML parser initial: Unknown token type $token->{type}";
729     }
730     } # B
731     } # _tree_in_element
732    
733     sub _tree_after_root_element ($) {
734     my $self = shift;
735    
736     B: while (1) {
737     if ($token->{type} == START_TAG_TOKEN) {
738     $self->{parse_error}->(level => $self->{level}->{must}, type => 'second root element',
739     token => $token);
740    
741     ## XML5: Ignore the token.
742 wakaba 1.4
743 wakaba 1.5 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
744     ## Reprocess.
745 wakaba 1.1 return;
746     } elsif ($token->{type} == COMMENT_TOKEN) {
747     my $comment = $self->{document}->create_comment ($token->{data});
748     $self->{document}->append_child ($comment);
749    
750     ## Stay in the mode.
751     $token = $self->_get_next_token;
752     next B;
753     } elsif ($token->{type} == PI_TOKEN) {
754     my $pi = $self->{document}->create_processing_instruction
755     ($token->{target}, $token->{data});
756     $self->{document}->append_child ($pi);
757    
758     ## Stay in the mode.
759     $token = $self->_get_next_token;
760     next B;
761     } elsif ($token->{type} == CHARACTER_TOKEN) {
762     if (not $self->{tainted} and
763 wakaba 1.9 not $token->{has_reference} and
764 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
765     #
766     }
767    
768     if (length $token->{data}) {
769     ## XML5: Ignore the token.
770    
771     unless ($self->{tainted}) {
772     $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
773     token => $token);
774     $self->{tainted} = 1;
775     }
776    
777     $self->{document}->manakai_append_text ($token->{data});
778     }
779    
780     ## Stay in the mode.
781     $token = $self->_get_next_token;
782     next B;
783     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
784     ## Stop parsing.
785    
786     ## TODO: implement "stop parsing".
787    
788     $token = {type => ABORT_TOKEN};
789     return;
790     } elsif ($token->{type} == END_TAG_TOKEN) {
791     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
792     text => $token->{tag_name},
793     token => $token);
794     ## Ignore the token.
795    
796     ## Stay in the mode.
797     $token = $self->_get_next_token;
798     next B;
799     } elsif ($token->{type} == DOCTYPE_TOKEN) {
800     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
801     token => $token);
802     ## Ignore the token.
803    
804     ## Stay in the mode.
805     $token = $self->_get_next_token;
806     next B;
807     } elsif ($token->{type} == ABORT_TOKEN) {
808     return;
809     } else {
810     die "$0: XML parser initial: Unknown token type $token->{type}";
811     }
812     } # B
813     } # _tree_after_root_element
814    
815     }
816    
817     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24