/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (hide annotations) (download)
Tue Oct 14 15:25:50 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.8: +3 -0 lines
++ whatpm/t/ChangeLog	14 Oct 2008 15:23:30 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/charref-1.dat" added.

++ whatpm/t/xml/ChangeLog	14 Oct 2008 15:23:49 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* charref-1.dat: New test data file.

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 15:24:42 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: Mark CHARACTER_TOKEN with character reference
	as such, for the support of XML parse error.

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 15:25:35 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Raise a parse error for white space character
	generated by a character reference outside of the root element.

1 wakaba 1.1 package Whatpm::XML::Parser;
2     use strict;
3    
4     push our @ISA, 'Whatpm::HTML';
5     use Whatpm::HTML::Tokenizer qw/:token/;
6    
7     sub parse_char_string ($$$;$$) {
8     #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9     my $self = shift;
10     my $s = ref $_[0] ? $_[0] : \($_[0]);
11     require Whatpm::Charset::DecodeHandle;
12     my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13     return $self->parse_char_stream ($input, @_[1..$#_]);
14     } # parse_char_string
15    
16     sub parse_char_stream ($$$;$$) {
17     my $self = ref $_[0] ? shift : shift->new;
18     my $input = $_[0];
19     $self->{document} = $_[1];
20     @{$self->{document}->child_nodes} = ();
21    
22     ## NOTE: |set_inner_html| copies most of this method's code
23    
24     $self->{confident} = 1 unless exists $self->{confident};
25     $self->{document}->input_encoding ($self->{input_encoding})
26     if defined $self->{input_encoding};
27     ## TODO: |{input_encoding}| is needless?
28    
29     $self->{line_prev} = $self->{line} = 1;
30     $self->{column_prev} = -1;
31     $self->{column} = 0;
32     $self->{set_nc} = sub {
33     my $self = shift;
34    
35     my $char = '';
36     if (defined $self->{next_nc}) {
37     $char = $self->{next_nc};
38     delete $self->{next_nc};
39     $self->{nc} = ord $char;
40     } else {
41     $self->{char_buffer} = '';
42     $self->{char_buffer_pos} = 0;
43    
44     my $count = $input->manakai_read_until
45     ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46     if ($count) {
47     $self->{line_prev} = $self->{line};
48     $self->{column_prev} = $self->{column};
49     $self->{column}++;
50     $self->{nc}
51     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52     return;
53     }
54    
55     if ($input->read ($char, 1)) {
56     $self->{nc} = ord $char;
57     } else {
58     $self->{nc} = -1;
59     return;
60     }
61     }
62    
63     ($self->{line_prev}, $self->{column_prev})
64     = ($self->{line}, $self->{column});
65     $self->{column}++;
66    
67     if ($self->{nc} == 0x000A) { # LF
68    
69     $self->{line}++;
70     $self->{column} = 0;
71     } elsif ($self->{nc} == 0x000D) { # CR
72    
73     ## TODO: support for abort/streaming
74     my $next = '';
75     if ($input->read ($next, 1) and $next ne "\x0A") {
76     $self->{next_nc} = $next;
77     }
78     $self->{nc} = 0x000A; # LF # MUST
79     $self->{line}++;
80     $self->{column} = 0;
81     } elsif ($self->{nc} == 0x0000) { # NULL
82    
83     $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
84     $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85     }
86     };
87    
88     $self->{read_until} = sub {
89     #my ($scalar, $specials_range, $offset) = @_;
90     return 0 if defined $self->{next_nc};
91    
92     my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93     my $offset = $_[2] || 0;
94    
95     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96     pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97     if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98     substr ($_[0], $offset)
99     = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100     my $count = $+[0] - $-[0];
101     if ($count) {
102     $self->{column} += $count;
103     $self->{char_buffer_pos} += $count;
104     $self->{line_prev} = $self->{line};
105     $self->{column_prev} = $self->{column} - 1;
106     $self->{nc} = -1;
107     }
108     return $count;
109     } else {
110     return 0;
111     }
112     } else {
113     my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114     if ($count) {
115     $self->{column} += $count;
116     $self->{line_prev} = $self->{line};
117     $self->{column_prev} = $self->{column} - 1;
118     $self->{nc} = -1;
119     }
120     return $count;
121     }
122     }; # $self->{read_until}
123    
124     my $onerror = $_[2] || sub {
125     my (%opt) = @_;
126     my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127     my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128     warn "Parse error ($opt{type}) at line $line column $column\n";
129     };
130     $self->{parse_error} = sub {
131     $onerror->(line => $self->{line}, column => $self->{column}, @_);
132     };
133    
134     my $char_onerror = sub {
135     my (undef, $type, %opt) = @_;
136     $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode',
137     line => $self->{line}, column => $self->{column} + 1,
138     %opt, type => $type);
139     }; # $char_onerror
140    
141     if ($_[3]) {
142     $input = $_[3]->($input);
143     $input->onerror ($char_onerror);
144     } else {
145     $input->onerror ($char_onerror) unless defined $input->onerror;
146     }
147    
148     $self->_initialize_tokenizer;
149     $self->_initialize_tree_constructor;
150     $self->_construct_tree;
151     $self->_terminate_tree_constructor;
152    
153     delete $self->{parse_error}; # remove loop
154    
155     return $self->{document};
156     } # parse_char_stream
157    
158     sub new ($) {
159     my $class = shift;
160     my $self = bless {
161     level => {must => 'm',
162     should => 's',
163     warn => 'w',
164     info => 'i',
165     uncertain => 'u'},
166     }, $class;
167     $self->{set_nc} = sub {
168     $self->{nc} = -1;
169     };
170     $self->{parse_error} = sub {
171     #
172     };
173     $self->{change_encoding} = sub {
174     # if ($_[0] is a supported encoding) {
175     # run "change the encoding" algorithm;
176     # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177     # }
178     };
179     $self->{application_cache_selection} = sub {
180     #
181     };
182 wakaba 1.2
183     $self->{is_xml} = 1;
184    
185 wakaba 1.1 return $self;
186     } # new
187    
188     sub _initialize_tree_constructor ($) {
189     my $self = shift;
190     ## NOTE: $self->{document} MUST be specified before this method is called
191     $self->{document}->strict_error_checking (0);
192     ## TODO: Turn mutation events off # MUST
193     $self->{document}->dom_config
194     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195     = 0;
196     $self->{document}->manakai_is_html (0);
197     $self->{document}->set_user_data (manakai_source_line => 1);
198     $self->{document}->set_user_data (manakai_source_column => 1);
199     } # _initialize_tree_constructor
200    
201     sub _terminate_tree_constructor ($) {
202     my $self = shift;
203     $self->{document}->strict_error_checking (1);
204     $self->{document}->dom_config
205     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
206     = 1;
207     ## TODO: Turn mutation events on
208     } # _terminate_tree_constructor
209    
210     ## Tree construction stage
211    
212    
213     ## NOTE: Differences from the XML5 draft are marked as "XML5:".
214    
215     ## XML5: No namespace support.
216    
217     ## XML5: XML5 has "empty tag token". In this implementation, it is
218 wakaba 1.3 ## represented as a start tag token with $self->{self_closing} flag
219 wakaba 1.1 ## set to true.
220    
221     ## XML5: XML5 has "short end tag token". In this implementation, it
222     ## is represented as an end tag token with $token->{tag_name} flag set
223     ## to an empty string.
224    
225     ## XML5: Start, main, end phases. In this implementation, they are
226     ## represented by insertion modes.
227    
228     ## Insertion modes
229     sub INITIAL_IM () { 0 }
230     sub BEFORE_ROOT_ELEMENT_IM () { 1 }
231     sub IN_ELEMENT_IM () { 2 }
232     sub AFTER_ROOT_ELEMENT_IM () { 3 }
233    
234     {
235     my $token; ## TODO: change to $self->{t}
236    
237     sub _construct_tree ($) {
238     my ($self) = @_;
239    
240     delete $self->{tainted};
241     $self->{open_elements} = [];
242     $self->{insertion_mode} = INITIAL_IM;
243 wakaba 1.8
244     $token = $self->_get_next_token;
245 wakaba 1.1
246     while (1) {
247     if ($self->{insertion_mode} == IN_ELEMENT_IM) {
248     $self->_tree_in_element;
249     } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
250     $self->_tree_after_root_element;
251     } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
252     $self->_tree_before_root_element;
253     } elsif ($self->{insertion_mode} == INITIAL_IM) {
254     $self->_tree_initial;
255     } else {
256     die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
257     }
258    
259     last if $token->{type} == ABORT_TOKEN;
260     }
261     } # _construct_tree
262    
263     sub _tree_initial ($) {
264     my $self = shift;
265    
266     B: while (1) {
267     if ($token->{type} == DOCTYPE_TOKEN) {
268     ## XML5: No "DOCTYPE" token.
269    
270     my $doctype = $self->{document}->create_document_type_definition
271     (defined $token->{name} ? $token->{name} : '');
272    
273     ## NOTE: Default value for both |public_id| and |system_id| attributes
274     ## are empty strings, so that we don't set any value in missing cases.
275     $doctype->public_id ($token->{public_identifier})
276     if defined $token->{public_identifier};
277     $doctype->system_id ($token->{system_identifier})
278     if defined $token->{system_identifier};
279    
280     ## TODO: internal_subset
281    
282     $self->{document}->append_child ($doctype);
283    
284     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
285     $token = $self->_get_next_token;
286     return;
287     } elsif ($token->{type} == START_TAG_TOKEN or
288     $token->{type} == END_OF_FILE_TOKEN) {
289     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
290     ## Reprocess.
291     return;
292     } elsif ($token->{type} == COMMENT_TOKEN) {
293     my $comment = $self->{document}->create_comment ($token->{data});
294     $self->{document}->append_child ($comment);
295    
296     ## Stay in the mode.
297     $token = $self->_get_next_token;
298     next B;
299     } elsif ($token->{type} == PI_TOKEN) {
300     my $pi = $self->{document}->create_processing_instruction
301     ($token->{target}, $token->{data});
302     $self->{document}->append_child ($pi);
303    
304     ## Stay in the mode.
305     $token = $self->_get_next_token;
306     next B;
307     } elsif ($token->{type} == CHARACTER_TOKEN) {
308     if (not $self->{tainted} and
309 wakaba 1.9 not $token->{has_reference} and
310 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
311     #
312     }
313    
314     if (length $token->{data}) {
315     ## XML5: Ignore the token.
316    
317     unless ($self->{tainted}) {
318     $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
319     token => $token);
320     $self->{tainted} = 1;
321     }
322    
323     $self->{document}->manakai_append_text ($token->{data});
324     }
325    
326     ## Stay in the mode.
327     $token = $self->_get_next_token;
328     next B;
329     } elsif ($token->{type} == END_TAG_TOKEN) {
330     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
331     text => $token->{tag_name},
332     token => $token);
333     ## Ignore the token.
334    
335     ## Stay in the mode.
336     $token = $self->_get_next_token;
337     next B;
338     } elsif ($token->{type} == ABORT_TOKEN) {
339     return;
340     } else {
341     die "$0: XML parser initial: Unknown token type $token->{type}";
342     }
343     } # B
344     } # _tree_initial
345    
346     sub _tree_before_root_element ($) {
347     my $self = shift;
348    
349     B: while (1) {
350     if ($token->{type} == START_TAG_TOKEN) {
351 wakaba 1.6 my $nsmap = {
352     xml => q<http://www.w3.org/XML/1998/namespace>,
353     xmlns => q<http://www.w3.org/2000/xmlns/>,
354     };
355    
356     for (keys %{$token->{attributes}}) {
357     if (/^xmlns:./s) {
358     my $prefix = substr $_, 6;
359     my $value = $token->{attributes}->{$_}->{value};
360     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
361     $value eq q<http://www.w3.org/XML/1998/namespace> or
362     $value eq q<http://www.w3.org/2000/xmlns/>) {
363     ## NOTE: Error should be detected at the DOM layer.
364     #
365     } elsif (length $value) {
366     $nsmap->{$prefix} = $value;
367     } else {
368     delete $nsmap->{$prefix};
369     ## TODO: Error unless XML1.1
370     }
371     } elsif ($_ eq 'xmlns') {
372     my $value = $token->{attributes}->{$_}->{value};
373     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
374     $value eq q<http://www.w3.org/2000/xmlns/>) {
375     ## NOTE: Error should be detected at the DOM layer.
376     #
377     } elsif (length $value) {
378     $nsmap->{''} = $value;
379     } else {
380     delete $nsmap->{''};
381     }
382     }
383     }
384    
385     my $ns;
386 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
387 wakaba 1.6
388     if (defined $ln) { # prefixed
389     if (defined $nsmap->{$prefix}) {
390     $ns = $nsmap->{$prefix};
391     } else {
392     ## NOTE: Error should be detected at the DOM layer.
393     ($prefix, $ln) = (undef, $token->{tag_name});
394     }
395     } else {
396     ($prefix, $ln) = (undef, $prefix);
397     $ns = $nsmap->{''};
398     }
399    
400 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
401 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
402     $el->set_user_data (manakai_source_column => $token->{column});
403    
404 wakaba 1.6 my $has_attr;
405     for my $attr_name (sort {$a cmp $b} keys %{$token->{attributes}}) {
406     my $ns;
407 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
408 wakaba 1.6
409     if ($attr_name eq 'xmlns:xmlns') {
410     ($p, $l) = (undef, $attr_name);
411     } elsif (defined $l) { # prefixed
412     if (defined $nsmap->{$p}) {
413     $ns = $nsmap->{$p};
414     } else {
415     ## NOTE: Error should be detected at the DOM-layer.
416     ($p, $l) = (undef, $attr_name);
417     }
418     } else {
419     if ($p eq 'xmlns') {
420     $ns = $nsmap->{xmlns};
421     }
422     ($p, $l) = (undef, $p);
423     }
424    
425     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
426     ## NOTE: Attributes are sorted as Unicode characters (not
427     ## code units) of their names, for stable output.
428    
429     ## TODO: Should be sorted by source order?
430     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate ns attr',
431     token => $token,
432     value => $attr_name);
433     next;
434     } else {
435     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
436     }
437    
438 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
439     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
440     $attr->value ($attr_t->{value});
441     $attr->set_user_data (manakai_source_line => $attr_t->{line});
442     $attr->set_user_data (manakai_source_column => $attr_t->{column});
443     $el->set_attribute_node_ns ($attr);
444     }
445    
446 wakaba 1.1 $self->{document}->append_child ($el);
447    
448 wakaba 1.3 if ($self->{self_closing}) {
449 wakaba 1.1 delete $self->{self_closing};
450     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
451     } else {
452 wakaba 1.6 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
453 wakaba 1.1 $self->{insertion_mode} = IN_ELEMENT_IM;
454     }
455    
456     #delete $self->{tainted};
457    
458     $token = $self->_get_next_token;
459     return;
460     } elsif ($token->{type} == COMMENT_TOKEN) {
461     my $comment = $self->{document}->create_comment ($token->{data});
462     $self->{document}->append_child ($comment);
463    
464     ## Stay in the mode.
465     $token = $self->_get_next_token;
466     next B;
467     } elsif ($token->{type} == PI_TOKEN) {
468     my $pi = $self->{document}->create_processing_instruction
469     ($token->{target}, $token->{data});
470     $self->{document}->append_child ($pi);
471    
472     ## Stay in the mode.
473     $token = $self->_get_next_token;
474     next B;
475     } elsif ($token->{type} == CHARACTER_TOKEN) {
476     if (not $self->{tainted} and
477 wakaba 1.9 not $token->{has_reference} and
478 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
479     #
480     }
481    
482     if (length $token->{data}) {
483     ## XML5: Ignore the token.
484    
485     unless ($self->{tainted}) {
486     $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
487     token => $token);
488     $self->{tainted} = 1;
489     }
490    
491     $self->{document}->manakai_append_text ($token->{data});
492     }
493    
494     ## Stay in the mode.
495     $token = $self->_get_next_token;
496     next B;
497     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
498     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no root element',
499     token => $token);
500    
501     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
502     ## Reprocess.
503     return;
504     } elsif ($token->{type} == END_TAG_TOKEN) {
505     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
506     text => $token->{tag_name},
507     token => $token);
508     ## Ignore the token.
509    
510     ## Stay in the mode.
511     $token = $self->_get_next_token;
512     next B;
513     } elsif ($token->{type} == DOCTYPE_TOKEN) {
514     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
515     token => $token);
516     ## Ignore the token.
517    
518     ## Stay in the mode.
519     $token = $self->_get_next_token;
520     next B;
521     } elsif ($token->{type} == ABORT_TOKEN) {
522     return;
523     } else {
524     die "$0: XML parser initial: Unknown token type $token->{type}";
525     }
526     } # B
527     } # _tree_before_root_element
528    
529     sub _tree_in_element ($) {
530     my $self = shift;
531    
532     B: while (1) {
533     if ($token->{type} == CHARACTER_TOKEN) {
534     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
535    
536     ## Stay in the mode.
537     $token = $self->_get_next_token;
538     next B;
539     } elsif ($token->{type} == START_TAG_TOKEN) {
540 wakaba 1.7 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
541    
542     for (keys %{$token->{attributes}}) {
543     if (/^xmlns:./s) {
544     my $prefix = substr $_, 6;
545     my $value = $token->{attributes}->{$_}->{value};
546     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
547     $value eq q<http://www.w3.org/XML/1998/namespace> or
548     $value eq q<http://www.w3.org/2000/xmlns/>) {
549     ## NOTE: Error should be detected at the DOM layer.
550     #
551     } elsif (length $value) {
552     $nsmap->{$prefix} = $value;
553     } else {
554     delete $nsmap->{$prefix};
555     ## TODO: Error unless XML1.1
556     }
557     } elsif ($_ eq 'xmlns') {
558     my $value = $token->{attributes}->{$_}->{value};
559     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
560     $value eq q<http://www.w3.org/2000/xmlns/>) {
561     ## NOTE: Error should be detected at the DOM layer.
562     #
563     } elsif (length $value) {
564     $nsmap->{''} = $value;
565     } else {
566     delete $nsmap->{''};
567     }
568     }
569     }
570    
571     my $ns;
572 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
573 wakaba 1.7
574     if (defined $ln) { # prefixed
575     if (defined $nsmap->{$prefix}) {
576     $ns = $nsmap->{$prefix};
577     } else {
578     ## NOTE: Error should be detected at the DOM layer.
579     ($prefix, $ln) = (undef, $token->{tag_name});
580     }
581     } else {
582     ($prefix, $ln) = (undef, $prefix);
583     $ns = $nsmap->{''};
584     }
585    
586 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
587 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
588     $el->set_user_data (manakai_source_column => $token->{column});
589    
590 wakaba 1.7 my $has_attr;
591     for my $attr_name (sort {$a cmp $b} keys %{$token->{attributes}}) {
592     my $ns;
593 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
594 wakaba 1.7
595     if ($attr_name eq 'xmlns:xmlns') {
596     ($p, $l) = (undef, $attr_name);
597     } elsif (defined $l) { # prefixed
598     if (defined $nsmap->{$p}) {
599     $ns = $nsmap->{$p};
600     } else {
601     ## NOTE: Error should be detected at the DOM-layer.
602     ($p, $l) = (undef, $attr_name);
603     }
604     } else {
605     if ($p eq 'xmlns') {
606     $ns = $nsmap->{xmlns};
607     }
608     ($p, $l) = (undef, $p);
609     }
610    
611     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
612     ## NOTE: Attributes are sorted as Unicode characters (not
613     ## code units) of their names, for stable output.
614    
615     ## TODO: Should be sorted by source order?
616     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate ns attr',
617     token => $token,
618     value => $attr_name);
619     next;
620     } else {
621     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
622     }
623    
624 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
625     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
626     $attr->value ($attr_t->{value});
627     $attr->set_user_data (manakai_source_line => $attr_t->{line});
628     $attr->set_user_data (manakai_source_column => $attr_t->{column});
629     $el->set_attribute_node_ns ($attr);
630     }
631    
632 wakaba 1.1 $self->{open_elements}->[-1]->[0]->append_child ($el);
633    
634 wakaba 1.3 if ($self->{self_closing}) {
635 wakaba 1.1 delete $self->{self_closing};
636     } else {
637 wakaba 1.7 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
638 wakaba 1.1 }
639    
640     ## Stay in the mode.
641     $token = $self->_get_next_token;
642     next B;
643     } elsif ($token->{type} == END_TAG_TOKEN) {
644 wakaba 1.2 if ($token->{tag_name} eq '') {
645 wakaba 1.1 ## Short end tag token.
646     pop @{$self->{open_elements}};
647     } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
648     pop @{$self->{open_elements}};
649     } else {
650     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
651     text => $token->{tag_name},
652     token => $token);
653    
654     ## Has an element in scope
655 wakaba 1.2 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
656 wakaba 1.1 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
657     splice @{$self->{open_elements}}, $i;
658     last INSCOPE;
659     }
660     } # INSCOPE
661     }
662    
663     unless (@{$self->{open_elements}}) {
664     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
665     $token = $self->_get_next_token;
666     return;
667     } else {
668     ## Stay in the state.
669     $token = $self->_get_next_token;
670     redo B;
671     }
672     } elsif ($token->{type} == COMMENT_TOKEN) {
673     my $comment = $self->{document}->create_comment ($token->{data});
674     $self->{open_elements}->[-1]->[0]->append_child ($comment);
675    
676     ## Stay in the mode.
677     $token = $self->_get_next_token;
678     next B;
679     } elsif ($token->{type} == PI_TOKEN) {
680     my $pi = $self->{document}->create_processing_instruction
681     ($token->{target}, $token->{data});
682     $self->{open_elements}->[-1]->[0]->append_child ($pi);
683    
684     ## Stay in the mode.
685     $token = $self->_get_next_token;
686     next B;
687     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
688     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof',
689     token => $token);
690    
691     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
692     $token = $self->_get_next_token;
693     return;
694     } elsif ($token->{type} == DOCTYPE_TOKEN) {
695     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
696     token => $token);
697     ## Ignore the token.
698    
699     ## Stay in the mode.
700     $token = $self->_get_next_token;
701     next B;
702     } elsif ($token->{type} == ABORT_TOKEN) {
703     return;
704     } else {
705     die "$0: XML parser initial: Unknown token type $token->{type}";
706     }
707     } # B
708     } # _tree_in_element
709    
710     sub _tree_after_root_element ($) {
711     my $self = shift;
712    
713     B: while (1) {
714     if ($token->{type} == START_TAG_TOKEN) {
715     $self->{parse_error}->(level => $self->{level}->{must}, type => 'second root element',
716     token => $token);
717    
718     ## XML5: Ignore the token.
719 wakaba 1.4
720 wakaba 1.5 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
721     ## Reprocess.
722 wakaba 1.1 return;
723     } elsif ($token->{type} == COMMENT_TOKEN) {
724     my $comment = $self->{document}->create_comment ($token->{data});
725     $self->{document}->append_child ($comment);
726    
727     ## Stay in the mode.
728     $token = $self->_get_next_token;
729     next B;
730     } elsif ($token->{type} == PI_TOKEN) {
731     my $pi = $self->{document}->create_processing_instruction
732     ($token->{target}, $token->{data});
733     $self->{document}->append_child ($pi);
734    
735     ## Stay in the mode.
736     $token = $self->_get_next_token;
737     next B;
738     } elsif ($token->{type} == CHARACTER_TOKEN) {
739     if (not $self->{tainted} and
740 wakaba 1.9 not $token->{has_reference} and
741 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
742     #
743     }
744    
745     if (length $token->{data}) {
746     ## XML5: Ignore the token.
747    
748     unless ($self->{tainted}) {
749     $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
750     token => $token);
751     $self->{tainted} = 1;
752     }
753    
754     $self->{document}->manakai_append_text ($token->{data});
755     }
756    
757     ## Stay in the mode.
758     $token = $self->_get_next_token;
759     next B;
760     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
761     ## Stop parsing.
762    
763     ## TODO: implement "stop parsing".
764    
765     $token = {type => ABORT_TOKEN};
766     return;
767     } elsif ($token->{type} == END_TAG_TOKEN) {
768     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
769     text => $token->{tag_name},
770     token => $token);
771     ## Ignore the token.
772    
773     ## Stay in the mode.
774     $token = $self->_get_next_token;
775     next B;
776     } elsif ($token->{type} == DOCTYPE_TOKEN) {
777     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
778     token => $token);
779     ## Ignore the token.
780    
781     ## Stay in the mode.
782     $token = $self->_get_next_token;
783     next B;
784     } elsif ($token->{type} == ABORT_TOKEN) {
785     return;
786     } else {
787     die "$0: XML parser initial: Unknown token type $token->{type}";
788     }
789     } # B
790     } # _tree_after_root_element
791    
792     }
793    
794     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24