/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (hide annotations) (download) (as text)
Wed Oct 15 12:49:49 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.12: +2 -12 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	15 Oct 2008 12:49:07 -0000
	* XML-Parser.t: "xml/doctypes-2.dat" added.

	* tokenizer-test-1.test: Keyword case-sensitivility tests added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	15 Oct 2008 12:49:41 -0000
	* doctypes-1.dat: A keyword case-sensitivility test added.

	* doctypes-2.dat: New test data file.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 12:46:53 -0000
	* Tokenizer.pm.src: $self->{s_kwd} for non-DATA_STATE states are
	renamed as $self->{kwd} to avoid confliction.  Don't raise
	case-sensitivity error for the keyword "DOCTYPE" in HTML mode.
	Support for internal subsets (internal subset itself only; no
	declaration in them is supported yet).  Raise a parse error for
	non-uppercase keywords "PUBLIC" and "SYSTEM" in XML mode.  Raise a
	parse error if no system identifier is specified for a DOCTYPE
	declaration with a public identifier.  Don't close the DOCTYPE
	declaration by a ">" character in the system declaration in XML
	mode.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 12:48:30 -0000
	* Parser.pm.src: Typo fixed.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::XML::Parser;
2     use strict;
3    
4     push our @ISA, 'Whatpm::HTML';
5     use Whatpm::HTML::Tokenizer qw/:token/;
6    
7     sub parse_char_string ($$$;$$) {
8     #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9     my $self = shift;
10     my $s = ref $_[0] ? $_[0] : \($_[0]);
11     require Whatpm::Charset::DecodeHandle;
12     my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13     return $self->parse_char_stream ($input, @_[1..$#_]);
14     } # parse_char_string
15    
16     sub parse_char_stream ($$$;$$) {
17     my $self = ref $_[0] ? shift : shift->new;
18     my $input = $_[0];
19     $self->{document} = $_[1];
20     @{$self->{document}->child_nodes} = ();
21    
22     ## NOTE: |set_inner_html| copies most of this method's code
23    
24     $self->{confident} = 1 unless exists $self->{confident};
25     $self->{document}->input_encoding ($self->{input_encoding})
26     if defined $self->{input_encoding};
27     ## TODO: |{input_encoding}| is needless?
28    
29     $self->{line_prev} = $self->{line} = 1;
30     $self->{column_prev} = -1;
31     $self->{column} = 0;
32     $self->{set_nc} = sub {
33     my $self = shift;
34    
35     my $char = '';
36     if (defined $self->{next_nc}) {
37     $char = $self->{next_nc};
38     delete $self->{next_nc};
39     $self->{nc} = ord $char;
40     } else {
41     $self->{char_buffer} = '';
42     $self->{char_buffer_pos} = 0;
43    
44     my $count = $input->manakai_read_until
45     ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46     if ($count) {
47     $self->{line_prev} = $self->{line};
48     $self->{column_prev} = $self->{column};
49     $self->{column}++;
50     $self->{nc}
51     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52     return;
53     }
54    
55     if ($input->read ($char, 1)) {
56     $self->{nc} = ord $char;
57     } else {
58     $self->{nc} = -1;
59     return;
60     }
61     }
62    
63     ($self->{line_prev}, $self->{column_prev})
64     = ($self->{line}, $self->{column});
65     $self->{column}++;
66    
67     if ($self->{nc} == 0x000A) { # LF
68     !!!cp ('j1');
69     $self->{line}++;
70     $self->{column} = 0;
71     } elsif ($self->{nc} == 0x000D) { # CR
72     !!!cp ('j2');
73     ## TODO: support for abort/streaming
74     my $next = '';
75     if ($input->read ($next, 1) and $next ne "\x0A") {
76     $self->{next_nc} = $next;
77     }
78     $self->{nc} = 0x000A; # LF # MUST
79     $self->{line}++;
80     $self->{column} = 0;
81     } elsif ($self->{nc} == 0x0000) { # NULL
82     !!!cp ('j4');
83     !!!parse-error (type => 'NULL');
84     $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85     }
86     };
87    
88     $self->{read_until} = sub {
89     #my ($scalar, $specials_range, $offset) = @_;
90     return 0 if defined $self->{next_nc};
91    
92     my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93     my $offset = $_[2] || 0;
94    
95     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96     pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97     if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98     substr ($_[0], $offset)
99     = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100     my $count = $+[0] - $-[0];
101     if ($count) {
102     $self->{column} += $count;
103     $self->{char_buffer_pos} += $count;
104     $self->{line_prev} = $self->{line};
105     $self->{column_prev} = $self->{column} - 1;
106     $self->{nc} = -1;
107     }
108     return $count;
109     } else {
110     return 0;
111     }
112     } else {
113     my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114     if ($count) {
115     $self->{column} += $count;
116     $self->{line_prev} = $self->{line};
117     $self->{column_prev} = $self->{column} - 1;
118     $self->{nc} = -1;
119     }
120     return $count;
121     }
122     }; # $self->{read_until}
123    
124     my $onerror = $_[2] || sub {
125     my (%opt) = @_;
126     my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127     my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128     warn "Parse error ($opt{type}) at line $line column $column\n";
129     };
130     $self->{parse_error} = sub {
131     $onerror->(line => $self->{line}, column => $self->{column}, @_);
132     };
133    
134     my $char_onerror = sub {
135     my (undef, $type, %opt) = @_;
136     !!!parse-error (layer => 'encode',
137     line => $self->{line}, column => $self->{column} + 1,
138     %opt, type => $type);
139     }; # $char_onerror
140    
141     if ($_[3]) {
142     $input = $_[3]->($input);
143     $input->onerror ($char_onerror);
144     } else {
145     $input->onerror ($char_onerror) unless defined $input->onerror;
146     }
147    
148     $self->_initialize_tokenizer;
149     $self->_initialize_tree_constructor;
150     $self->_construct_tree;
151     $self->_terminate_tree_constructor;
152    
153     delete $self->{parse_error}; # remove loop
154    
155     return $self->{document};
156     } # parse_char_stream
157    
158     sub new ($) {
159     my $class = shift;
160     my $self = bless {
161     level => {must => 'm',
162     should => 's',
163     warn => 'w',
164     info => 'i',
165     uncertain => 'u'},
166     }, $class;
167     $self->{set_nc} = sub {
168     $self->{nc} = -1;
169     };
170     $self->{parse_error} = sub {
171     #
172     };
173     $self->{change_encoding} = sub {
174     # if ($_[0] is a supported encoding) {
175     # run "change the encoding" algorithm;
176     # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177     # }
178     };
179     $self->{application_cache_selection} = sub {
180     #
181     };
182 wakaba 1.2
183     $self->{is_xml} = 1;
184    
185 wakaba 1.1 return $self;
186     } # new
187    
188     sub _initialize_tree_constructor ($) {
189     my $self = shift;
190     ## NOTE: $self->{document} MUST be specified before this method is called
191     $self->{document}->strict_error_checking (0);
192     ## TODO: Turn mutation events off # MUST
193     $self->{document}->dom_config
194     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195     = 0;
196     $self->{document}->manakai_is_html (0);
197     $self->{document}->set_user_data (manakai_source_line => 1);
198     $self->{document}->set_user_data (manakai_source_column => 1);
199     } # _initialize_tree_constructor
200    
201     sub _terminate_tree_constructor ($) {
202     my $self = shift;
203     $self->{document}->strict_error_checking (1);
204     $self->{document}->dom_config
205     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
206     = 1;
207     ## TODO: Turn mutation events on
208     } # _terminate_tree_constructor
209    
210     ## Tree construction stage
211    
212    
213     ## NOTE: Differences from the XML5 draft are marked as "XML5:".
214    
215     ## XML5: No namespace support.
216    
217     ## XML5: Start, main, end phases. In this implementation, they are
218     ## represented by insertion modes.
219    
220     ## Insertion modes
221     sub INITIAL_IM () { 0 }
222     sub BEFORE_ROOT_ELEMENT_IM () { 1 }
223     sub IN_ELEMENT_IM () { 2 }
224     sub AFTER_ROOT_ELEMENT_IM () { 3 }
225    
226     {
227     my $token; ## TODO: change to $self->{t}
228    
229     sub _construct_tree ($) {
230     my ($self) = @_;
231    
232     delete $self->{tainted};
233     $self->{open_elements} = [];
234     $self->{insertion_mode} = INITIAL_IM;
235 wakaba 1.8
236     !!!next-token;
237 wakaba 1.10
238     ## XML5: No support for the XML declaration
239     if ($token->{type} == PI_TOKEN and
240     $token->{target} eq 'xml' and
241     $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
242     (?>"([^"]*)"|'([^']*)')
243     (?:[\x09\x0A\x20]+
244     encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
245     (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
246     (?:[\x09\x0A\x20]+
247     standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
248     (?>"(yes|no)"|'(yes|no)'))?
249     [\x09\x0A\x20]*\z/x) {
250     $self->{document}->xml_version (defined $1 ? $1 : $2);
251     $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
252     $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
253    
254     !!!next-token;
255     } else {
256     $self->{document}->xml_version ('1.0');
257     $self->{document}->xml_encoding (undef);
258     $self->{document}->xml_standalone (0);
259     }
260 wakaba 1.1
261     while (1) {
262     if ($self->{insertion_mode} == IN_ELEMENT_IM) {
263     $self->_tree_in_element;
264     } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
265     $self->_tree_after_root_element;
266     } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
267     $self->_tree_before_root_element;
268     } elsif ($self->{insertion_mode} == INITIAL_IM) {
269     $self->_tree_initial;
270     } else {
271     die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
272     }
273    
274     last if $token->{type} == ABORT_TOKEN;
275     }
276     } # _construct_tree
277    
278     sub _tree_initial ($) {
279     my $self = shift;
280    
281     B: while (1) {
282     if ($token->{type} == DOCTYPE_TOKEN) {
283     ## XML5: No "DOCTYPE" token.
284    
285     my $doctype = $self->{document}->create_document_type_definition
286     (defined $token->{name} ? $token->{name} : '');
287    
288     ## NOTE: Default value for both |public_id| and |system_id| attributes
289     ## are empty strings, so that we don't set any value in missing cases.
290 wakaba 1.13 $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
291     $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
292 wakaba 1.1
293     ## TODO: internal_subset
294    
295     $self->{document}->append_child ($doctype);
296    
297     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
298     !!!next-token;
299     return;
300     } elsif ($token->{type} == START_TAG_TOKEN or
301     $token->{type} == END_OF_FILE_TOKEN) {
302     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
303     ## Reprocess.
304     return;
305     } elsif ($token->{type} == COMMENT_TOKEN) {
306     my $comment = $self->{document}->create_comment ($token->{data});
307     $self->{document}->append_child ($comment);
308    
309     ## Stay in the mode.
310     !!!next-token;
311     next B;
312     } elsif ($token->{type} == PI_TOKEN) {
313     my $pi = $self->{document}->create_processing_instruction
314     ($token->{target}, $token->{data});
315     $self->{document}->append_child ($pi);
316    
317     ## Stay in the mode.
318     !!!next-token;
319     next B;
320     } elsif ($token->{type} == CHARACTER_TOKEN) {
321     if (not $self->{tainted} and
322 wakaba 1.9 not $token->{has_reference} and
323 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
324     #
325     }
326    
327     if (length $token->{data}) {
328     ## XML5: Ignore the token.
329    
330     unless ($self->{tainted}) {
331     !!!parse-error (type => 'text outside of root element',
332     token => $token);
333     $self->{tainted} = 1;
334     }
335    
336     $self->{document}->manakai_append_text ($token->{data});
337     }
338    
339     ## Stay in the mode.
340     !!!next-token;
341     next B;
342     } elsif ($token->{type} == END_TAG_TOKEN) {
343     !!!parse-error (type => 'unmatched end tag',
344     text => $token->{tag_name},
345     token => $token);
346     ## Ignore the token.
347    
348     ## Stay in the mode.
349     !!!next-token;
350     next B;
351     } elsif ($token->{type} == ABORT_TOKEN) {
352     return;
353     } else {
354     die "$0: XML parser initial: Unknown token type $token->{type}";
355     }
356     } # B
357     } # _tree_initial
358    
359     sub _tree_before_root_element ($) {
360     my $self = shift;
361    
362     B: while (1) {
363     if ($token->{type} == START_TAG_TOKEN) {
364 wakaba 1.6 my $nsmap = {
365     xml => q<http://www.w3.org/XML/1998/namespace>,
366     xmlns => q<http://www.w3.org/2000/xmlns/>,
367     };
368    
369     for (keys %{$token->{attributes}}) {
370     if (/^xmlns:./s) {
371     my $prefix = substr $_, 6;
372     my $value = $token->{attributes}->{$_}->{value};
373     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
374     $value eq q<http://www.w3.org/XML/1998/namespace> or
375     $value eq q<http://www.w3.org/2000/xmlns/>) {
376     ## NOTE: Error should be detected at the DOM layer.
377     #
378     } elsif (length $value) {
379     $nsmap->{$prefix} = $value;
380     } else {
381     delete $nsmap->{$prefix};
382     }
383     } elsif ($_ eq 'xmlns') {
384     my $value = $token->{attributes}->{$_}->{value};
385     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
386     $value eq q<http://www.w3.org/2000/xmlns/>) {
387     ## NOTE: Error should be detected at the DOM layer.
388     #
389     } elsif (length $value) {
390     $nsmap->{''} = $value;
391     } else {
392     delete $nsmap->{''};
393     }
394     }
395     }
396    
397     my $ns;
398 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
399 wakaba 1.6
400 wakaba 1.11 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
401 wakaba 1.6 if (defined $nsmap->{$prefix}) {
402     $ns = $nsmap->{$prefix};
403     } else {
404     ($prefix, $ln) = (undef, $token->{tag_name});
405     }
406     } else {
407 wakaba 1.11 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
408     ($prefix, $ln) = (undef, $token->{tag_name});
409 wakaba 1.6 }
410    
411 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
412 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
413     $el->set_user_data (manakai_source_column => $token->{column});
414    
415 wakaba 1.6 my $has_attr;
416 wakaba 1.12 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
417     $token->{attributes}->{$b}->{index}}
418     keys %{$token->{attributes}}) {
419 wakaba 1.6 my $ns;
420 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
421 wakaba 1.6
422     if ($attr_name eq 'xmlns:xmlns') {
423     ($p, $l) = (undef, $attr_name);
424 wakaba 1.11 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
425 wakaba 1.6 if (defined $nsmap->{$p}) {
426     $ns = $nsmap->{$p};
427     } else {
428     ## NOTE: Error should be detected at the DOM-layer.
429     ($p, $l) = (undef, $attr_name);
430     }
431     } else {
432 wakaba 1.11 if ($attr_name eq 'xmlns') {
433 wakaba 1.6 $ns = $nsmap->{xmlns};
434     }
435 wakaba 1.11 ($p, $l) = (undef, $attr_name);
436 wakaba 1.6 }
437    
438     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
439 wakaba 1.12 $ns = undef;
440     ($p, $l) = (undef, $attr_name);
441 wakaba 1.6 } else {
442     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
443     }
444    
445 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
446     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
447     $attr->value ($attr_t->{value});
448     $attr->set_user_data (manakai_source_line => $attr_t->{line});
449     $attr->set_user_data (manakai_source_column => $attr_t->{column});
450     $el->set_attribute_node_ns ($attr);
451     }
452    
453 wakaba 1.1 $self->{document}->append_child ($el);
454    
455 wakaba 1.3 if ($self->{self_closing}) {
456 wakaba 1.1 !!!ack ('ack');
457     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
458     } else {
459 wakaba 1.6 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
460 wakaba 1.1 $self->{insertion_mode} = IN_ELEMENT_IM;
461     }
462    
463     #delete $self->{tainted};
464    
465     !!!next-token;
466     return;
467     } elsif ($token->{type} == COMMENT_TOKEN) {
468     my $comment = $self->{document}->create_comment ($token->{data});
469     $self->{document}->append_child ($comment);
470    
471     ## Stay in the mode.
472     !!!next-token;
473     next B;
474     } elsif ($token->{type} == PI_TOKEN) {
475     my $pi = $self->{document}->create_processing_instruction
476     ($token->{target}, $token->{data});
477     $self->{document}->append_child ($pi);
478    
479     ## Stay in the mode.
480     !!!next-token;
481     next B;
482     } elsif ($token->{type} == CHARACTER_TOKEN) {
483     if (not $self->{tainted} and
484 wakaba 1.9 not $token->{has_reference} and
485 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
486     #
487     }
488    
489     if (length $token->{data}) {
490     ## XML5: Ignore the token.
491    
492     unless ($self->{tainted}) {
493     !!!parse-error (type => 'text outside of root element',
494     token => $token);
495     $self->{tainted} = 1;
496     }
497    
498     $self->{document}->manakai_append_text ($token->{data});
499     }
500    
501     ## Stay in the mode.
502     !!!next-token;
503     next B;
504     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
505     !!!parse-error (type => 'no root element',
506     token => $token);
507    
508     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
509     ## Reprocess.
510     return;
511     } elsif ($token->{type} == END_TAG_TOKEN) {
512     !!!parse-error (type => 'unmatched end tag',
513     text => $token->{tag_name},
514     token => $token);
515     ## Ignore the token.
516    
517     ## Stay in the mode.
518     !!!next-token;
519     next B;
520     } elsif ($token->{type} == DOCTYPE_TOKEN) {
521     !!!parse-error (type => 'in html:#doctype',
522     token => $token);
523     ## Ignore the token.
524    
525     ## Stay in the mode.
526     !!!next-token;
527     next B;
528     } elsif ($token->{type} == ABORT_TOKEN) {
529     return;
530     } else {
531     die "$0: XML parser initial: Unknown token type $token->{type}";
532     }
533     } # B
534     } # _tree_before_root_element
535    
536     sub _tree_in_element ($) {
537     my $self = shift;
538    
539     B: while (1) {
540     if ($token->{type} == CHARACTER_TOKEN) {
541     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
542    
543     ## Stay in the mode.
544     !!!next-token;
545     next B;
546     } elsif ($token->{type} == START_TAG_TOKEN) {
547 wakaba 1.7 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
548    
549     for (keys %{$token->{attributes}}) {
550     if (/^xmlns:./s) {
551     my $prefix = substr $_, 6;
552     my $value = $token->{attributes}->{$_}->{value};
553     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
554     $value eq q<http://www.w3.org/XML/1998/namespace> or
555     $value eq q<http://www.w3.org/2000/xmlns/>) {
556     ## NOTE: Error should be detected at the DOM layer.
557     #
558     } elsif (length $value) {
559     $nsmap->{$prefix} = $value;
560     } else {
561     delete $nsmap->{$prefix};
562     }
563     } elsif ($_ eq 'xmlns') {
564     my $value = $token->{attributes}->{$_}->{value};
565     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
566     $value eq q<http://www.w3.org/2000/xmlns/>) {
567     ## NOTE: Error should be detected at the DOM layer.
568     #
569     } elsif (length $value) {
570     $nsmap->{''} = $value;
571     } else {
572     delete $nsmap->{''};
573     }
574     }
575     }
576    
577     my $ns;
578 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
579 wakaba 1.7
580 wakaba 1.11 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
581 wakaba 1.7 if (defined $nsmap->{$prefix}) {
582     $ns = $nsmap->{$prefix};
583     } else {
584     ## NOTE: Error should be detected at the DOM layer.
585     ($prefix, $ln) = (undef, $token->{tag_name});
586     }
587     } else {
588 wakaba 1.11 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
589     ($prefix, $ln) = (undef, $token->{tag_name});
590 wakaba 1.7 }
591    
592 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
593 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
594     $el->set_user_data (manakai_source_column => $token->{column});
595    
596 wakaba 1.7 my $has_attr;
597 wakaba 1.12 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
598     $token->{attributes}->{$b}->{index}}
599     keys %{$token->{attributes}}) {
600 wakaba 1.7 my $ns;
601 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
602 wakaba 1.7
603     if ($attr_name eq 'xmlns:xmlns') {
604     ($p, $l) = (undef, $attr_name);
605 wakaba 1.11 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
606 wakaba 1.7 if (defined $nsmap->{$p}) {
607     $ns = $nsmap->{$p};
608     } else {
609     ## NOTE: Error should be detected at the DOM-layer.
610     ($p, $l) = (undef, $attr_name);
611     }
612     } else {
613 wakaba 1.11 if ($attr_name eq 'xmlns') {
614 wakaba 1.7 $ns = $nsmap->{xmlns};
615     }
616 wakaba 1.11 ($p, $l) = (undef, $attr_name);
617 wakaba 1.7 }
618    
619     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
620 wakaba 1.12 $ns = undef;
621     ($p, $l) = (undef, $attr_name);
622 wakaba 1.7 } else {
623     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
624     }
625    
626 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
627     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
628     $attr->value ($attr_t->{value});
629     $attr->set_user_data (manakai_source_line => $attr_t->{line});
630     $attr->set_user_data (manakai_source_column => $attr_t->{column});
631     $el->set_attribute_node_ns ($attr);
632     }
633    
634 wakaba 1.1 $self->{open_elements}->[-1]->[0]->append_child ($el);
635    
636 wakaba 1.3 if ($self->{self_closing}) {
637 wakaba 1.1 !!!ack ('ack');
638     } else {
639 wakaba 1.7 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
640 wakaba 1.1 }
641    
642     ## Stay in the mode.
643     !!!next-token;
644     next B;
645     } elsif ($token->{type} == END_TAG_TOKEN) {
646 wakaba 1.2 if ($token->{tag_name} eq '') {
647 wakaba 1.1 ## Short end tag token.
648     pop @{$self->{open_elements}};
649     } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
650     pop @{$self->{open_elements}};
651     } else {
652     !!!parse-error (type => 'unmatched end tag',
653     text => $token->{tag_name},
654     token => $token);
655    
656     ## Has an element in scope
657 wakaba 1.2 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
658 wakaba 1.1 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
659     splice @{$self->{open_elements}}, $i;
660     last INSCOPE;
661     }
662     } # INSCOPE
663     }
664    
665     unless (@{$self->{open_elements}}) {
666     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
667     !!!next-token;
668     return;
669     } else {
670     ## Stay in the state.
671     !!!next-token;
672     redo B;
673     }
674     } elsif ($token->{type} == COMMENT_TOKEN) {
675     my $comment = $self->{document}->create_comment ($token->{data});
676     $self->{open_elements}->[-1]->[0]->append_child ($comment);
677    
678     ## Stay in the mode.
679     !!!next-token;
680     next B;
681     } elsif ($token->{type} == PI_TOKEN) {
682     my $pi = $self->{document}->create_processing_instruction
683     ($token->{target}, $token->{data});
684     $self->{open_elements}->[-1]->[0]->append_child ($pi);
685    
686     ## Stay in the mode.
687     !!!next-token;
688     next B;
689     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
690     !!!parse-error (type => 'in body:#eof',
691     token => $token);
692    
693     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
694     !!!next-token;
695     return;
696     } elsif ($token->{type} == DOCTYPE_TOKEN) {
697     !!!parse-error (type => 'in html:#doctype',
698     token => $token);
699     ## Ignore the token.
700    
701     ## Stay in the mode.
702     !!!next-token;
703     next B;
704     } elsif ($token->{type} == ABORT_TOKEN) {
705     return;
706     } else {
707     die "$0: XML parser initial: Unknown token type $token->{type}";
708     }
709     } # B
710     } # _tree_in_element
711    
712     sub _tree_after_root_element ($) {
713     my $self = shift;
714    
715     B: while (1) {
716     if ($token->{type} == START_TAG_TOKEN) {
717     !!!parse-error (type => 'second root element',
718     token => $token);
719    
720     ## XML5: Ignore the token.
721 wakaba 1.4
722 wakaba 1.5 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
723     ## Reprocess.
724 wakaba 1.1 return;
725     } elsif ($token->{type} == COMMENT_TOKEN) {
726     my $comment = $self->{document}->create_comment ($token->{data});
727     $self->{document}->append_child ($comment);
728    
729     ## Stay in the mode.
730     !!!next-token;
731     next B;
732     } elsif ($token->{type} == PI_TOKEN) {
733     my $pi = $self->{document}->create_processing_instruction
734     ($token->{target}, $token->{data});
735     $self->{document}->append_child ($pi);
736    
737     ## Stay in the mode.
738     !!!next-token;
739     next B;
740     } elsif ($token->{type} == CHARACTER_TOKEN) {
741     if (not $self->{tainted} and
742 wakaba 1.9 not $token->{has_reference} and
743 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
744     #
745     }
746    
747     if (length $token->{data}) {
748     ## XML5: Ignore the token.
749    
750     unless ($self->{tainted}) {
751     !!!parse-error (type => 'text outside of root element',
752     token => $token);
753     $self->{tainted} = 1;
754     }
755    
756     $self->{document}->manakai_append_text ($token->{data});
757     }
758    
759     ## Stay in the mode.
760     !!!next-token;
761     next B;
762     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
763     ## Stop parsing.
764    
765     ## TODO: implement "stop parsing".
766    
767     $token = {type => ABORT_TOKEN};
768     return;
769     } elsif ($token->{type} == END_TAG_TOKEN) {
770     !!!parse-error (type => 'unmatched end tag',
771     text => $token->{tag_name},
772     token => $token);
773     ## Ignore the token.
774    
775     ## Stay in the mode.
776     !!!next-token;
777     next B;
778     } elsif ($token->{type} == DOCTYPE_TOKEN) {
779     !!!parse-error (type => 'in html:#doctype',
780     token => $token);
781     ## Ignore the token.
782    
783     ## Stay in the mode.
784     !!!next-token;
785     next B;
786     } elsif ($token->{type} == ABORT_TOKEN) {
787     return;
788     } else {
789     die "$0: XML parser initial: Unknown token type $token->{type}";
790     }
791     } # B
792     } # _tree_after_root_element
793    
794     }
795    
796     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24