/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (hide annotations) (download)
Thu Oct 16 03:39:57 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.13: +43 -2 lines
++ whatpm/t/ChangeLog	16 Oct 2008 03:39:39 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/pis-2.dat" and "xml/comments-2.dat" are added.

++ whatpm/t/xml/ChangeLog	16 Oct 2008 03:39:53 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* doctypes-2.dat: New test added.

	* comments-2.dat, pis-2.dat: New test data files.

++ whatpm/Whatpm/HTML/ChangeLog	16 Oct 2008 03:36:51 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New token type END_OF_DOCTYPE_TOKEN added.
	New states DOCTYPE_TAG_STATE and
	BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE are added.  (Bogus
	string after the internal subset, which was handled by the state
	BOGUS_DOCTYPE_STATE, are now handled by the new state.)  Support
	for comments, bogus comments, and processing instructions in the
	internal subset.  If there is the internal subset, then emit the
	doctype token before the internal subset (with its
	$token->{has_internal_subset} flag set) and an
	END_OF_DOCTYPE_TOKEN after the internal subset.

++ whatpm/Whatpm/XML/ChangeLog	16 Oct 2008 03:39:19 -0000
2008-10-16  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Insertion mode IN_SUBSET_IM added.  In the
	"initial" insertion mode, if the DOCTYPE token's "has internal
	subset" flag is set, then switch to the "in subset" insertion
	mode.

1 wakaba 1.1 package Whatpm::XML::Parser;
2     use strict;
3    
4     push our @ISA, 'Whatpm::HTML';
5     use Whatpm::HTML::Tokenizer qw/:token/;
6    
7     sub parse_char_string ($$$;$$) {
8     #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9     my $self = shift;
10     my $s = ref $_[0] ? $_[0] : \($_[0]);
11     require Whatpm::Charset::DecodeHandle;
12     my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13     return $self->parse_char_stream ($input, @_[1..$#_]);
14     } # parse_char_string
15    
16     sub parse_char_stream ($$$;$$) {
17     my $self = ref $_[0] ? shift : shift->new;
18     my $input = $_[0];
19     $self->{document} = $_[1];
20     @{$self->{document}->child_nodes} = ();
21    
22     ## NOTE: |set_inner_html| copies most of this method's code
23    
24     $self->{confident} = 1 unless exists $self->{confident};
25     $self->{document}->input_encoding ($self->{input_encoding})
26     if defined $self->{input_encoding};
27     ## TODO: |{input_encoding}| is needless?
28    
29     $self->{line_prev} = $self->{line} = 1;
30     $self->{column_prev} = -1;
31     $self->{column} = 0;
32     $self->{set_nc} = sub {
33     my $self = shift;
34    
35     my $char = '';
36     if (defined $self->{next_nc}) {
37     $char = $self->{next_nc};
38     delete $self->{next_nc};
39     $self->{nc} = ord $char;
40     } else {
41     $self->{char_buffer} = '';
42     $self->{char_buffer_pos} = 0;
43    
44     my $count = $input->manakai_read_until
45     ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46     if ($count) {
47     $self->{line_prev} = $self->{line};
48     $self->{column_prev} = $self->{column};
49     $self->{column}++;
50     $self->{nc}
51     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52     return;
53     }
54    
55     if ($input->read ($char, 1)) {
56     $self->{nc} = ord $char;
57     } else {
58     $self->{nc} = -1;
59     return;
60     }
61     }
62    
63     ($self->{line_prev}, $self->{column_prev})
64     = ($self->{line}, $self->{column});
65     $self->{column}++;
66    
67     if ($self->{nc} == 0x000A) { # LF
68    
69     $self->{line}++;
70     $self->{column} = 0;
71     } elsif ($self->{nc} == 0x000D) { # CR
72    
73     ## TODO: support for abort/streaming
74     my $next = '';
75     if ($input->read ($next, 1) and $next ne "\x0A") {
76     $self->{next_nc} = $next;
77     }
78     $self->{nc} = 0x000A; # LF # MUST
79     $self->{line}++;
80     $self->{column} = 0;
81     } elsif ($self->{nc} == 0x0000) { # NULL
82    
83     $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
84     $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85     }
86     };
87    
88     $self->{read_until} = sub {
89     #my ($scalar, $specials_range, $offset) = @_;
90     return 0 if defined $self->{next_nc};
91    
92     my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93     my $offset = $_[2] || 0;
94    
95     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96     pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97     if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98     substr ($_[0], $offset)
99     = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100     my $count = $+[0] - $-[0];
101     if ($count) {
102     $self->{column} += $count;
103     $self->{char_buffer_pos} += $count;
104     $self->{line_prev} = $self->{line};
105     $self->{column_prev} = $self->{column} - 1;
106     $self->{nc} = -1;
107     }
108     return $count;
109     } else {
110     return 0;
111     }
112     } else {
113     my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114     if ($count) {
115     $self->{column} += $count;
116     $self->{line_prev} = $self->{line};
117     $self->{column_prev} = $self->{column} - 1;
118     $self->{nc} = -1;
119     }
120     return $count;
121     }
122     }; # $self->{read_until}
123    
124     my $onerror = $_[2] || sub {
125     my (%opt) = @_;
126     my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127     my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128     warn "Parse error ($opt{type}) at line $line column $column\n";
129     };
130     $self->{parse_error} = sub {
131     $onerror->(line => $self->{line}, column => $self->{column}, @_);
132     };
133    
134     my $char_onerror = sub {
135     my (undef, $type, %opt) = @_;
136     $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode',
137     line => $self->{line}, column => $self->{column} + 1,
138     %opt, type => $type);
139     }; # $char_onerror
140    
141     if ($_[3]) {
142     $input = $_[3]->($input);
143     $input->onerror ($char_onerror);
144     } else {
145     $input->onerror ($char_onerror) unless defined $input->onerror;
146     }
147    
148     $self->_initialize_tokenizer;
149     $self->_initialize_tree_constructor;
150     $self->_construct_tree;
151     $self->_terminate_tree_constructor;
152    
153     delete $self->{parse_error}; # remove loop
154    
155     return $self->{document};
156     } # parse_char_stream
157    
158     sub new ($) {
159     my $class = shift;
160     my $self = bless {
161     level => {must => 'm',
162     should => 's',
163     warn => 'w',
164     info => 'i',
165     uncertain => 'u'},
166     }, $class;
167     $self->{set_nc} = sub {
168     $self->{nc} = -1;
169     };
170     $self->{parse_error} = sub {
171     #
172     };
173     $self->{change_encoding} = sub {
174     # if ($_[0] is a supported encoding) {
175     # run "change the encoding" algorithm;
176     # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177     # }
178     };
179     $self->{application_cache_selection} = sub {
180     #
181     };
182 wakaba 1.2
183     $self->{is_xml} = 1;
184    
185 wakaba 1.1 return $self;
186     } # new
187    
188     sub _initialize_tree_constructor ($) {
189     my $self = shift;
190     ## NOTE: $self->{document} MUST be specified before this method is called
191     $self->{document}->strict_error_checking (0);
192     ## TODO: Turn mutation events off # MUST
193     $self->{document}->dom_config
194     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195     = 0;
196     $self->{document}->manakai_is_html (0);
197     $self->{document}->set_user_data (manakai_source_line => 1);
198     $self->{document}->set_user_data (manakai_source_column => 1);
199     } # _initialize_tree_constructor
200    
201     sub _terminate_tree_constructor ($) {
202     my $self = shift;
203     $self->{document}->strict_error_checking (1);
204     $self->{document}->dom_config
205     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
206     = 1;
207     ## TODO: Turn mutation events on
208     } # _terminate_tree_constructor
209    
210     ## Tree construction stage
211    
212    
213     ## NOTE: Differences from the XML5 draft are marked as "XML5:".
214    
215     ## XML5: No namespace support.
216    
217     ## XML5: Start, main, end phases. In this implementation, they are
218     ## represented by insertion modes.
219    
220     ## Insertion modes
221     sub INITIAL_IM () { 0 }
222     sub BEFORE_ROOT_ELEMENT_IM () { 1 }
223     sub IN_ELEMENT_IM () { 2 }
224     sub AFTER_ROOT_ELEMENT_IM () { 3 }
225 wakaba 1.14 sub IN_SUBSET_IM () { 4 }
226 wakaba 1.1
227     {
228     my $token; ## TODO: change to $self->{t}
229    
230     sub _construct_tree ($) {
231     my ($self) = @_;
232    
233     delete $self->{tainted};
234     $self->{open_elements} = [];
235     $self->{insertion_mode} = INITIAL_IM;
236 wakaba 1.8
237     $token = $self->_get_next_token;
238 wakaba 1.10
239     ## XML5: No support for the XML declaration
240     if ($token->{type} == PI_TOKEN and
241     $token->{target} eq 'xml' and
242     $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
243     (?>"([^"]*)"|'([^']*)')
244     (?:[\x09\x0A\x20]+
245     encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
246     (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
247     (?:[\x09\x0A\x20]+
248     standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
249     (?>"(yes|no)"|'(yes|no)'))?
250     [\x09\x0A\x20]*\z/x) {
251     $self->{document}->xml_version (defined $1 ? $1 : $2);
252     $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
253     $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
254    
255     $token = $self->_get_next_token;
256     } else {
257     $self->{document}->xml_version ('1.0');
258     $self->{document}->xml_encoding (undef);
259     $self->{document}->xml_standalone (0);
260     }
261 wakaba 1.1
262     while (1) {
263     if ($self->{insertion_mode} == IN_ELEMENT_IM) {
264     $self->_tree_in_element;
265 wakaba 1.14 } elsif ($self->{insertion_mode} == IN_SUBSET_IM) {
266     $self->_tree_in_subset;
267 wakaba 1.1 } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
268     $self->_tree_after_root_element;
269     } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
270     $self->_tree_before_root_element;
271     } elsif ($self->{insertion_mode} == INITIAL_IM) {
272     $self->_tree_initial;
273     } else {
274     die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
275     }
276    
277     last if $token->{type} == ABORT_TOKEN;
278     }
279     } # _construct_tree
280    
281     sub _tree_initial ($) {
282     my $self = shift;
283    
284     B: while (1) {
285     if ($token->{type} == DOCTYPE_TOKEN) {
286     ## XML5: No "DOCTYPE" token.
287    
288     my $doctype = $self->{document}->create_document_type_definition
289     (defined $token->{name} ? $token->{name} : '');
290    
291     ## NOTE: Default value for both |public_id| and |system_id| attributes
292     ## are empty strings, so that we don't set any value in missing cases.
293 wakaba 1.13 $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
294     $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
295 wakaba 1.1
296     ## TODO: internal_subset
297    
298     $self->{document}->append_child ($doctype);
299 wakaba 1.14
300     ## XML5: No "has internal subset" flag.
301     if ($token->{has_internal_subset}) {
302     $self->{doctype} = $doctype;
303     $self->{insertion_mode} = IN_SUBSET_IM;
304     } else {
305     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
306     }
307 wakaba 1.1 $token = $self->_get_next_token;
308     return;
309     } elsif ($token->{type} == START_TAG_TOKEN or
310     $token->{type} == END_OF_FILE_TOKEN) {
311     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
312     ## Reprocess.
313     return;
314     } elsif ($token->{type} == COMMENT_TOKEN) {
315     my $comment = $self->{document}->create_comment ($token->{data});
316     $self->{document}->append_child ($comment);
317    
318     ## Stay in the mode.
319     $token = $self->_get_next_token;
320     next B;
321     } elsif ($token->{type} == PI_TOKEN) {
322     my $pi = $self->{document}->create_processing_instruction
323     ($token->{target}, $token->{data});
324     $self->{document}->append_child ($pi);
325    
326     ## Stay in the mode.
327     $token = $self->_get_next_token;
328     next B;
329     } elsif ($token->{type} == CHARACTER_TOKEN) {
330     if (not $self->{tainted} and
331 wakaba 1.9 not $token->{has_reference} and
332 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
333     #
334     }
335    
336     if (length $token->{data}) {
337     ## XML5: Ignore the token.
338    
339     unless ($self->{tainted}) {
340     $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
341     token => $token);
342     $self->{tainted} = 1;
343     }
344    
345     $self->{document}->manakai_append_text ($token->{data});
346     }
347    
348     ## Stay in the mode.
349     $token = $self->_get_next_token;
350     next B;
351     } elsif ($token->{type} == END_TAG_TOKEN) {
352     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
353     text => $token->{tag_name},
354     token => $token);
355     ## Ignore the token.
356    
357     ## Stay in the mode.
358     $token = $self->_get_next_token;
359     next B;
360     } elsif ($token->{type} == ABORT_TOKEN) {
361     return;
362     } else {
363     die "$0: XML parser initial: Unknown token type $token->{type}";
364     }
365     } # B
366     } # _tree_initial
367    
368     sub _tree_before_root_element ($) {
369     my $self = shift;
370    
371     B: while (1) {
372     if ($token->{type} == START_TAG_TOKEN) {
373 wakaba 1.6 my $nsmap = {
374     xml => q<http://www.w3.org/XML/1998/namespace>,
375     xmlns => q<http://www.w3.org/2000/xmlns/>,
376     };
377    
378     for (keys %{$token->{attributes}}) {
379     if (/^xmlns:./s) {
380     my $prefix = substr $_, 6;
381     my $value = $token->{attributes}->{$_}->{value};
382     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
383     $value eq q<http://www.w3.org/XML/1998/namespace> or
384     $value eq q<http://www.w3.org/2000/xmlns/>) {
385     ## NOTE: Error should be detected at the DOM layer.
386     #
387     } elsif (length $value) {
388     $nsmap->{$prefix} = $value;
389     } else {
390     delete $nsmap->{$prefix};
391     }
392     } elsif ($_ eq 'xmlns') {
393     my $value = $token->{attributes}->{$_}->{value};
394     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
395     $value eq q<http://www.w3.org/2000/xmlns/>) {
396     ## NOTE: Error should be detected at the DOM layer.
397     #
398     } elsif (length $value) {
399     $nsmap->{''} = $value;
400     } else {
401     delete $nsmap->{''};
402     }
403     }
404     }
405    
406     my $ns;
407 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
408 wakaba 1.6
409 wakaba 1.11 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
410 wakaba 1.6 if (defined $nsmap->{$prefix}) {
411     $ns = $nsmap->{$prefix};
412     } else {
413     ($prefix, $ln) = (undef, $token->{tag_name});
414     }
415     } else {
416 wakaba 1.11 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
417     ($prefix, $ln) = (undef, $token->{tag_name});
418 wakaba 1.6 }
419    
420 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
421 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
422     $el->set_user_data (manakai_source_column => $token->{column});
423    
424 wakaba 1.6 my $has_attr;
425 wakaba 1.12 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
426     $token->{attributes}->{$b}->{index}}
427     keys %{$token->{attributes}}) {
428 wakaba 1.6 my $ns;
429 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
430 wakaba 1.6
431     if ($attr_name eq 'xmlns:xmlns') {
432     ($p, $l) = (undef, $attr_name);
433 wakaba 1.11 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
434 wakaba 1.6 if (defined $nsmap->{$p}) {
435     $ns = $nsmap->{$p};
436     } else {
437     ## NOTE: Error should be detected at the DOM-layer.
438     ($p, $l) = (undef, $attr_name);
439     }
440     } else {
441 wakaba 1.11 if ($attr_name eq 'xmlns') {
442 wakaba 1.6 $ns = $nsmap->{xmlns};
443     }
444 wakaba 1.11 ($p, $l) = (undef, $attr_name);
445 wakaba 1.6 }
446    
447     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
448 wakaba 1.12 $ns = undef;
449     ($p, $l) = (undef, $attr_name);
450 wakaba 1.6 } else {
451     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
452     }
453    
454 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
455     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
456     $attr->value ($attr_t->{value});
457     $attr->set_user_data (manakai_source_line => $attr_t->{line});
458     $attr->set_user_data (manakai_source_column => $attr_t->{column});
459     $el->set_attribute_node_ns ($attr);
460     }
461    
462 wakaba 1.1 $self->{document}->append_child ($el);
463    
464 wakaba 1.3 if ($self->{self_closing}) {
465 wakaba 1.1 delete $self->{self_closing};
466     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
467     } else {
468 wakaba 1.6 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
469 wakaba 1.1 $self->{insertion_mode} = IN_ELEMENT_IM;
470     }
471    
472     #delete $self->{tainted};
473    
474     $token = $self->_get_next_token;
475     return;
476     } elsif ($token->{type} == COMMENT_TOKEN) {
477     my $comment = $self->{document}->create_comment ($token->{data});
478     $self->{document}->append_child ($comment);
479    
480     ## Stay in the mode.
481     $token = $self->_get_next_token;
482     next B;
483     } elsif ($token->{type} == PI_TOKEN) {
484     my $pi = $self->{document}->create_processing_instruction
485     ($token->{target}, $token->{data});
486     $self->{document}->append_child ($pi);
487    
488     ## Stay in the mode.
489     $token = $self->_get_next_token;
490     next B;
491     } elsif ($token->{type} == CHARACTER_TOKEN) {
492     if (not $self->{tainted} and
493 wakaba 1.9 not $token->{has_reference} and
494 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
495     #
496     }
497    
498     if (length $token->{data}) {
499     ## XML5: Ignore the token.
500    
501     unless ($self->{tainted}) {
502     $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
503     token => $token);
504     $self->{tainted} = 1;
505     }
506    
507     $self->{document}->manakai_append_text ($token->{data});
508     }
509    
510     ## Stay in the mode.
511     $token = $self->_get_next_token;
512     next B;
513     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
514     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no root element',
515     token => $token);
516    
517     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
518     ## Reprocess.
519     return;
520     } elsif ($token->{type} == END_TAG_TOKEN) {
521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
522     text => $token->{tag_name},
523     token => $token);
524     ## Ignore the token.
525    
526     ## Stay in the mode.
527     $token = $self->_get_next_token;
528     next B;
529     } elsif ($token->{type} == DOCTYPE_TOKEN) {
530     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
531     token => $token);
532     ## Ignore the token.
533    
534     ## Stay in the mode.
535     $token = $self->_get_next_token;
536     next B;
537     } elsif ($token->{type} == ABORT_TOKEN) {
538     return;
539     } else {
540     die "$0: XML parser initial: Unknown token type $token->{type}";
541     }
542     } # B
543     } # _tree_before_root_element
544    
545     sub _tree_in_element ($) {
546     my $self = shift;
547    
548     B: while (1) {
549     if ($token->{type} == CHARACTER_TOKEN) {
550     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
551    
552     ## Stay in the mode.
553     $token = $self->_get_next_token;
554     next B;
555     } elsif ($token->{type} == START_TAG_TOKEN) {
556 wakaba 1.7 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
557    
558     for (keys %{$token->{attributes}}) {
559     if (/^xmlns:./s) {
560     my $prefix = substr $_, 6;
561     my $value = $token->{attributes}->{$_}->{value};
562     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
563     $value eq q<http://www.w3.org/XML/1998/namespace> or
564     $value eq q<http://www.w3.org/2000/xmlns/>) {
565     ## NOTE: Error should be detected at the DOM layer.
566     #
567     } elsif (length $value) {
568     $nsmap->{$prefix} = $value;
569     } else {
570     delete $nsmap->{$prefix};
571     }
572     } elsif ($_ eq 'xmlns') {
573     my $value = $token->{attributes}->{$_}->{value};
574     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
575     $value eq q<http://www.w3.org/2000/xmlns/>) {
576     ## NOTE: Error should be detected at the DOM layer.
577     #
578     } elsif (length $value) {
579     $nsmap->{''} = $value;
580     } else {
581     delete $nsmap->{''};
582     }
583     }
584     }
585    
586     my $ns;
587 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
588 wakaba 1.7
589 wakaba 1.11 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
590 wakaba 1.7 if (defined $nsmap->{$prefix}) {
591     $ns = $nsmap->{$prefix};
592     } else {
593     ## NOTE: Error should be detected at the DOM layer.
594     ($prefix, $ln) = (undef, $token->{tag_name});
595     }
596     } else {
597 wakaba 1.11 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
598     ($prefix, $ln) = (undef, $token->{tag_name});
599 wakaba 1.7 }
600    
601 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
602 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
603     $el->set_user_data (manakai_source_column => $token->{column});
604    
605 wakaba 1.7 my $has_attr;
606 wakaba 1.12 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
607     $token->{attributes}->{$b}->{index}}
608     keys %{$token->{attributes}}) {
609 wakaba 1.7 my $ns;
610 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
611 wakaba 1.7
612     if ($attr_name eq 'xmlns:xmlns') {
613     ($p, $l) = (undef, $attr_name);
614 wakaba 1.11 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
615 wakaba 1.7 if (defined $nsmap->{$p}) {
616     $ns = $nsmap->{$p};
617     } else {
618     ## NOTE: Error should be detected at the DOM-layer.
619     ($p, $l) = (undef, $attr_name);
620     }
621     } else {
622 wakaba 1.11 if ($attr_name eq 'xmlns') {
623 wakaba 1.7 $ns = $nsmap->{xmlns};
624     }
625 wakaba 1.11 ($p, $l) = (undef, $attr_name);
626 wakaba 1.7 }
627    
628     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
629 wakaba 1.12 $ns = undef;
630     ($p, $l) = (undef, $attr_name);
631 wakaba 1.7 } else {
632     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
633     }
634    
635 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
636     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
637     $attr->value ($attr_t->{value});
638     $attr->set_user_data (manakai_source_line => $attr_t->{line});
639     $attr->set_user_data (manakai_source_column => $attr_t->{column});
640     $el->set_attribute_node_ns ($attr);
641     }
642    
643 wakaba 1.1 $self->{open_elements}->[-1]->[0]->append_child ($el);
644    
645 wakaba 1.3 if ($self->{self_closing}) {
646 wakaba 1.1 delete $self->{self_closing};
647     } else {
648 wakaba 1.7 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
649 wakaba 1.1 }
650    
651     ## Stay in the mode.
652     $token = $self->_get_next_token;
653     next B;
654     } elsif ($token->{type} == END_TAG_TOKEN) {
655 wakaba 1.2 if ($token->{tag_name} eq '') {
656 wakaba 1.1 ## Short end tag token.
657     pop @{$self->{open_elements}};
658     } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
659     pop @{$self->{open_elements}};
660     } else {
661     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
662     text => $token->{tag_name},
663     token => $token);
664    
665     ## Has an element in scope
666 wakaba 1.2 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
667 wakaba 1.1 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
668     splice @{$self->{open_elements}}, $i;
669     last INSCOPE;
670     }
671     } # INSCOPE
672     }
673    
674     unless (@{$self->{open_elements}}) {
675     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
676     $token = $self->_get_next_token;
677     return;
678     } else {
679     ## Stay in the state.
680     $token = $self->_get_next_token;
681     redo B;
682     }
683     } elsif ($token->{type} == COMMENT_TOKEN) {
684     my $comment = $self->{document}->create_comment ($token->{data});
685     $self->{open_elements}->[-1]->[0]->append_child ($comment);
686    
687     ## Stay in the mode.
688     $token = $self->_get_next_token;
689     next B;
690     } elsif ($token->{type} == PI_TOKEN) {
691     my $pi = $self->{document}->create_processing_instruction
692     ($token->{target}, $token->{data});
693     $self->{open_elements}->[-1]->[0]->append_child ($pi);
694    
695     ## Stay in the mode.
696     $token = $self->_get_next_token;
697     next B;
698     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
699     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof',
700     token => $token);
701    
702     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
703     $token = $self->_get_next_token;
704     return;
705     } elsif ($token->{type} == DOCTYPE_TOKEN) {
706     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
707     token => $token);
708     ## Ignore the token.
709    
710     ## Stay in the mode.
711     $token = $self->_get_next_token;
712     next B;
713     } elsif ($token->{type} == ABORT_TOKEN) {
714     return;
715     } else {
716     die "$0: XML parser initial: Unknown token type $token->{type}";
717     }
718     } # B
719     } # _tree_in_element
720    
721     sub _tree_after_root_element ($) {
722     my $self = shift;
723    
724     B: while (1) {
725     if ($token->{type} == START_TAG_TOKEN) {
726     $self->{parse_error}->(level => $self->{level}->{must}, type => 'second root element',
727     token => $token);
728    
729     ## XML5: Ignore the token.
730 wakaba 1.4
731 wakaba 1.5 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
732     ## Reprocess.
733 wakaba 1.1 return;
734     } elsif ($token->{type} == COMMENT_TOKEN) {
735     my $comment = $self->{document}->create_comment ($token->{data});
736     $self->{document}->append_child ($comment);
737    
738     ## Stay in the mode.
739     $token = $self->_get_next_token;
740     next B;
741     } elsif ($token->{type} == PI_TOKEN) {
742     my $pi = $self->{document}->create_processing_instruction
743     ($token->{target}, $token->{data});
744     $self->{document}->append_child ($pi);
745    
746     ## Stay in the mode.
747     $token = $self->_get_next_token;
748     next B;
749     } elsif ($token->{type} == CHARACTER_TOKEN) {
750     if (not $self->{tainted} and
751 wakaba 1.9 not $token->{has_reference} and
752 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
753     #
754     }
755    
756     if (length $token->{data}) {
757     ## XML5: Ignore the token.
758    
759     unless ($self->{tainted}) {
760     $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
761     token => $token);
762     $self->{tainted} = 1;
763     }
764    
765     $self->{document}->manakai_append_text ($token->{data});
766     }
767    
768     ## Stay in the mode.
769     $token = $self->_get_next_token;
770     next B;
771     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
772     ## Stop parsing.
773    
774     ## TODO: implement "stop parsing".
775    
776     $token = {type => ABORT_TOKEN};
777     return;
778     } elsif ($token->{type} == END_TAG_TOKEN) {
779     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
780     text => $token->{tag_name},
781     token => $token);
782     ## Ignore the token.
783    
784     ## Stay in the mode.
785     $token = $self->_get_next_token;
786     next B;
787     } elsif ($token->{type} == DOCTYPE_TOKEN) {
788     $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
789     token => $token);
790     ## Ignore the token.
791    
792     ## Stay in the mode.
793     $token = $self->_get_next_token;
794     next B;
795     } elsif ($token->{type} == ABORT_TOKEN) {
796     return;
797     } else {
798     die "$0: XML parser initial: Unknown token type $token->{type}";
799     }
800     } # B
801     } # _tree_after_root_element
802 wakaba 1.14
803     sub _tree_in_subset ($) {
804     my $self = shift;
805    
806     B: while (1) {
807     if ($token->{type} == COMMENT_TOKEN) {
808     ## Ignore the token.
809    
810     ## Stay in the state.
811     $token = $self->_get_next_token;
812     next B;
813     } elsif ($token->{type} == PI_TOKEN) {
814     my $pi = $self->{document}->create_processing_instruction
815     ($token->{target}, $token->{data});
816     $self->{doctype}->append_child ($pi);
817     ## TODO: line/col
818    
819     ## Stay in the mode.
820     $token = $self->_get_next_token;
821     next B;
822     } elsif ($token->{type} == END_OF_DOCTYPE_TOKEN) {
823     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
824     $token = $self->_get_next_token;
825     return;
826     } elsif ($token->{type} == ABORT_TOKEN) {
827     return;
828     } else {
829     die "$0: XML parser subset im: Unknown token type $token->{type}";
830     }
831     } # B
832    
833     } # _tree_in_subset
834 wakaba 1.1
835     }
836    
837     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24