/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.23 - (hide annotations) (download) (as text)
Mon Oct 20 04:21:18 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.22: +94 -16 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	20 Oct 2008 04:21:10 -0000
2008-10-20  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/attrs-2.dat" added.

++ whatpm/t/xml/ChangeLog	20 Oct 2008 04:17:22 -0000
	* attrs-2.dat: New test data file.

2008-10-20  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	20 Oct 2008 04:19:50 -0000
2008-10-20  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (specified, all_declarations_processed,
	manakai_attribute_type): New attributes.

++ whatpm/Whatpm/XML/ChangeLog	20 Oct 2008 04:20:35 -0000
2008-10-20  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Support for attribute type assignments, attribute
	value tokenization, and default value assignments.

1 wakaba 1.1 package Whatpm::XML::Parser;
2     use strict;
3    
4     push our @ISA, 'Whatpm::HTML';
5     use Whatpm::HTML::Tokenizer qw/:token/;
6    
7     sub parse_char_string ($$$;$$) {
8     #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9     my $self = shift;
10     my $s = ref $_[0] ? $_[0] : \($_[0]);
11     require Whatpm::Charset::DecodeHandle;
12     my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13     return $self->parse_char_stream ($input, @_[1..$#_]);
14     } # parse_char_string
15    
16     sub parse_char_stream ($$$;$$) {
17     my $self = ref $_[0] ? shift : shift->new;
18     my $input = $_[0];
19     $self->{document} = $_[1];
20     @{$self->{document}->child_nodes} = ();
21    
22     ## NOTE: |set_inner_html| copies most of this method's code
23    
24     $self->{confident} = 1 unless exists $self->{confident};
25     $self->{document}->input_encoding ($self->{input_encoding})
26     if defined $self->{input_encoding};
27     ## TODO: |{input_encoding}| is needless?
28    
29     $self->{line_prev} = $self->{line} = 1;
30     $self->{column_prev} = -1;
31     $self->{column} = 0;
32     $self->{set_nc} = sub {
33     my $self = shift;
34    
35     my $char = '';
36     if (defined $self->{next_nc}) {
37     $char = $self->{next_nc};
38     delete $self->{next_nc};
39     $self->{nc} = ord $char;
40     } else {
41     $self->{char_buffer} = '';
42     $self->{char_buffer_pos} = 0;
43    
44     my $count = $input->manakai_read_until
45     ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46     if ($count) {
47     $self->{line_prev} = $self->{line};
48     $self->{column_prev} = $self->{column};
49     $self->{column}++;
50     $self->{nc}
51     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52     return;
53     }
54    
55     if ($input->read ($char, 1)) {
56     $self->{nc} = ord $char;
57     } else {
58     $self->{nc} = -1;
59     return;
60     }
61     }
62    
63     ($self->{line_prev}, $self->{column_prev})
64     = ($self->{line}, $self->{column});
65     $self->{column}++;
66    
67     if ($self->{nc} == 0x000A) { # LF
68     !!!cp ('j1');
69     $self->{line}++;
70     $self->{column} = 0;
71     } elsif ($self->{nc} == 0x000D) { # CR
72     !!!cp ('j2');
73     ## TODO: support for abort/streaming
74     my $next = '';
75     if ($input->read ($next, 1) and $next ne "\x0A") {
76     $self->{next_nc} = $next;
77     }
78     $self->{nc} = 0x000A; # LF # MUST
79     $self->{line}++;
80     $self->{column} = 0;
81     } elsif ($self->{nc} == 0x0000) { # NULL
82     !!!cp ('j4');
83     !!!parse-error (type => 'NULL');
84     $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85     }
86     };
87    
88     $self->{read_until} = sub {
89     #my ($scalar, $specials_range, $offset) = @_;
90     return 0 if defined $self->{next_nc};
91    
92     my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93     my $offset = $_[2] || 0;
94    
95     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96     pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97     if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98     substr ($_[0], $offset)
99     = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100     my $count = $+[0] - $-[0];
101     if ($count) {
102     $self->{column} += $count;
103     $self->{char_buffer_pos} += $count;
104     $self->{line_prev} = $self->{line};
105     $self->{column_prev} = $self->{column} - 1;
106     $self->{nc} = -1;
107     }
108     return $count;
109     } else {
110     return 0;
111     }
112     } else {
113     my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114     if ($count) {
115     $self->{column} += $count;
116     $self->{line_prev} = $self->{line};
117     $self->{column_prev} = $self->{column} - 1;
118     $self->{nc} = -1;
119     }
120     return $count;
121     }
122     }; # $self->{read_until}
123    
124     my $onerror = $_[2] || sub {
125     my (%opt) = @_;
126     my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127     my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128     warn "Parse error ($opt{type}) at line $line column $column\n";
129     };
130     $self->{parse_error} = sub {
131     $onerror->(line => $self->{line}, column => $self->{column}, @_);
132     };
133    
134     my $char_onerror = sub {
135     my (undef, $type, %opt) = @_;
136     !!!parse-error (layer => 'encode',
137     line => $self->{line}, column => $self->{column} + 1,
138     %opt, type => $type);
139     }; # $char_onerror
140    
141     if ($_[3]) {
142     $input = $_[3]->($input);
143     $input->onerror ($char_onerror);
144     } else {
145     $input->onerror ($char_onerror) unless defined $input->onerror;
146     }
147    
148     $self->_initialize_tokenizer;
149     $self->_initialize_tree_constructor;
150     $self->_construct_tree;
151     $self->_terminate_tree_constructor;
152    
153     delete $self->{parse_error}; # remove loop
154    
155     return $self->{document};
156     } # parse_char_stream
157    
158     sub new ($) {
159     my $class = shift;
160     my $self = bless {
161     level => {must => 'm',
162     should => 's',
163     warn => 'w',
164     info => 'i',
165     uncertain => 'u'},
166     }, $class;
167     $self->{set_nc} = sub {
168     $self->{nc} = -1;
169     };
170     $self->{parse_error} = sub {
171     #
172     };
173     $self->{change_encoding} = sub {
174     # if ($_[0] is a supported encoding) {
175     # run "change the encoding" algorithm;
176     # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177     # }
178     };
179     $self->{application_cache_selection} = sub {
180     #
181     };
182 wakaba 1.2
183     $self->{is_xml} = 1;
184    
185 wakaba 1.1 return $self;
186     } # new
187    
188     sub _initialize_tree_constructor ($) {
189     my $self = shift;
190     ## NOTE: $self->{document} MUST be specified before this method is called
191     $self->{document}->strict_error_checking (0);
192     ## TODO: Turn mutation events off # MUST
193     $self->{document}->dom_config
194     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195     = 0;
196     $self->{document}->manakai_is_html (0);
197     $self->{document}->set_user_data (manakai_source_line => 1);
198     $self->{document}->set_user_data (manakai_source_column => 1);
199 wakaba 1.20
200     $self->{ge}->{'amp;'} = {value => '&', only_text => 1};
201     $self->{ge}->{'apos;'} = {value => "'", only_text => 1};
202     $self->{ge}->{'gt;'} = {value => '>', only_text => 1};
203     $self->{ge}->{'lt;'} = {value => '<', only_text => 1};
204     $self->{ge}->{'quot;'} = {value => '"', only_text => 1};
205 wakaba 1.1 } # _initialize_tree_constructor
206    
207     sub _terminate_tree_constructor ($) {
208     my $self = shift;
209     $self->{document}->strict_error_checking (1);
210     $self->{document}->dom_config
211     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
212     = 1;
213     ## TODO: Turn mutation events on
214     } # _terminate_tree_constructor
215    
216     ## Tree construction stage
217    
218    
219     ## NOTE: Differences from the XML5 draft are marked as "XML5:".
220    
221     ## XML5: No namespace support.
222    
223     ## XML5: Start, main, end phases. In this implementation, they are
224     ## represented by insertion modes.
225    
226     ## Insertion modes
227     sub INITIAL_IM () { 0 }
228     sub BEFORE_ROOT_ELEMENT_IM () { 1 }
229     sub IN_ELEMENT_IM () { 2 }
230     sub AFTER_ROOT_ELEMENT_IM () { 3 }
231 wakaba 1.14 sub IN_SUBSET_IM () { 4 }
232 wakaba 1.1
233     {
234     my $token; ## TODO: change to $self->{t}
235    
236     sub _construct_tree ($) {
237     my ($self) = @_;
238    
239     delete $self->{tainted};
240     $self->{open_elements} = [];
241     $self->{insertion_mode} = INITIAL_IM;
242 wakaba 1.8
243     !!!next-token;
244 wakaba 1.10
245     ## XML5: No support for the XML declaration
246     if ($token->{type} == PI_TOKEN and
247     $token->{target} eq 'xml' and
248     $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
249     (?>"([^"]*)"|'([^']*)')
250     (?:[\x09\x0A\x20]+
251     encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
252     (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
253     (?:[\x09\x0A\x20]+
254     standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
255     (?>"(yes|no)"|'(yes|no)'))?
256     [\x09\x0A\x20]*\z/x) {
257     $self->{document}->xml_version (defined $1 ? $1 : $2);
258     $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
259     $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
260    
261     !!!next-token;
262     } else {
263     $self->{document}->xml_version ('1.0');
264     $self->{document}->xml_encoding (undef);
265     $self->{document}->xml_standalone (0);
266     }
267 wakaba 1.1
268     while (1) {
269     if ($self->{insertion_mode} == IN_ELEMENT_IM) {
270     $self->_tree_in_element;
271 wakaba 1.14 } elsif ($self->{insertion_mode} == IN_SUBSET_IM) {
272     $self->_tree_in_subset;
273 wakaba 1.1 } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
274     $self->_tree_after_root_element;
275     } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
276     $self->_tree_before_root_element;
277     } elsif ($self->{insertion_mode} == INITIAL_IM) {
278     $self->_tree_initial;
279     } else {
280     die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
281     }
282    
283     last if $token->{type} == ABORT_TOKEN;
284     }
285     } # _construct_tree
286    
287     sub _tree_initial ($) {
288     my $self = shift;
289    
290     B: while (1) {
291     if ($token->{type} == DOCTYPE_TOKEN) {
292     ## XML5: No "DOCTYPE" token.
293    
294     my $doctype = $self->{document}->create_document_type_definition
295     (defined $token->{name} ? $token->{name} : '');
296    
297     ## NOTE: Default value for both |public_id| and |system_id| attributes
298     ## are empty strings, so that we don't set any value in missing cases.
299 wakaba 1.13 $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
300     $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
301 wakaba 1.1
302     ## TODO: internal_subset
303    
304     $self->{document}->append_child ($doctype);
305 wakaba 1.14
306 wakaba 1.20 $self->{ge} = {};
307    
308 wakaba 1.14 ## XML5: No "has internal subset" flag.
309     if ($token->{has_internal_subset}) {
310     $self->{doctype} = $doctype;
311     $self->{insertion_mode} = IN_SUBSET_IM;
312     } else {
313     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
314     }
315 wakaba 1.1 !!!next-token;
316     return;
317     } elsif ($token->{type} == START_TAG_TOKEN or
318     $token->{type} == END_OF_FILE_TOKEN) {
319     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
320     ## Reprocess.
321     return;
322     } elsif ($token->{type} == COMMENT_TOKEN) {
323     my $comment = $self->{document}->create_comment ($token->{data});
324     $self->{document}->append_child ($comment);
325    
326     ## Stay in the mode.
327     !!!next-token;
328     next B;
329     } elsif ($token->{type} == PI_TOKEN) {
330     my $pi = $self->{document}->create_processing_instruction
331     ($token->{target}, $token->{data});
332     $self->{document}->append_child ($pi);
333    
334     ## Stay in the mode.
335     !!!next-token;
336     next B;
337     } elsif ($token->{type} == CHARACTER_TOKEN) {
338     if (not $self->{tainted} and
339 wakaba 1.9 not $token->{has_reference} and
340 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
341     #
342     }
343    
344     if (length $token->{data}) {
345     ## XML5: Ignore the token.
346    
347     unless ($self->{tainted}) {
348     !!!parse-error (type => 'text outside of root element',
349     token => $token);
350     $self->{tainted} = 1;
351     }
352    
353     $self->{document}->manakai_append_text ($token->{data});
354     }
355    
356     ## Stay in the mode.
357     !!!next-token;
358     next B;
359     } elsif ($token->{type} == END_TAG_TOKEN) {
360     !!!parse-error (type => 'unmatched end tag',
361     text => $token->{tag_name},
362     token => $token);
363     ## Ignore the token.
364    
365     ## Stay in the mode.
366     !!!next-token;
367     next B;
368     } elsif ($token->{type} == ABORT_TOKEN) {
369     return;
370     } else {
371     die "$0: XML parser initial: Unknown token type $token->{type}";
372     }
373     } # B
374     } # _tree_initial
375    
376     sub _tree_before_root_element ($) {
377     my $self = shift;
378    
379     B: while (1) {
380     if ($token->{type} == START_TAG_TOKEN) {
381 wakaba 1.6 my $nsmap = {
382     xml => q<http://www.w3.org/XML/1998/namespace>,
383     xmlns => q<http://www.w3.org/2000/xmlns/>,
384     };
385 wakaba 1.23
386     my $attrs = $token->{attributes};
387     my $attrdefs = $self->{attrdef}->{$token->{tag_name}};
388     for my $attr_name (keys %{$attrdefs}) {
389     if ($attrs->{$attr_name}) {
390     $attrs->{$attr_name}->{type} = $attrdefs->{$attr_name}->{type} || 0;
391     if ($attrdefs->{$attr_name}->{tokenize}) {
392     $attrs->{$attr_name}->{value} =~ s/ +/ /g;
393     $attrs->{$attr_name}->{value} =~ s/\A //;
394     $attrs->{$attr_name}->{value} =~ s/ \z//;
395     }
396     } elsif (defined $attrdefs->{$attr_name}->{default}) {
397     $attrs->{$attr_name} = {
398     value => $attrdefs->{$attr_name}->{default},
399     type => $attrdefs->{$attr_name}->{type} || 0,
400     not_specified => 1,
401     line => $attrdefs->{$attr_name}->{line},
402     column => $attrdefs->{$attr_name}->{column},
403     index => 1 + keys %{$attrs},
404     };
405     }
406     }
407 wakaba 1.6
408 wakaba 1.23 for (keys %{$attrs}) {
409 wakaba 1.6 if (/^xmlns:./s) {
410     my $prefix = substr $_, 6;
411 wakaba 1.23 my $value = $attrs->{$_}->{value};
412 wakaba 1.6 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
413     $value eq q<http://www.w3.org/XML/1998/namespace> or
414     $value eq q<http://www.w3.org/2000/xmlns/>) {
415     ## NOTE: Error should be detected at the DOM layer.
416     #
417     } elsif (length $value) {
418     $nsmap->{$prefix} = $value;
419     } else {
420     delete $nsmap->{$prefix};
421     }
422     } elsif ($_ eq 'xmlns') {
423 wakaba 1.23 my $value = $attrs->{$_}->{value};
424 wakaba 1.6 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
425     $value eq q<http://www.w3.org/2000/xmlns/>) {
426     ## NOTE: Error should be detected at the DOM layer.
427     #
428     } elsif (length $value) {
429     $nsmap->{''} = $value;
430     } else {
431     delete $nsmap->{''};
432     }
433     }
434     }
435    
436     my $ns;
437 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
438 wakaba 1.6
439 wakaba 1.11 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
440 wakaba 1.6 if (defined $nsmap->{$prefix}) {
441     $ns = $nsmap->{$prefix};
442     } else {
443     ($prefix, $ln) = (undef, $token->{tag_name});
444     }
445     } else {
446 wakaba 1.11 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
447     ($prefix, $ln) = (undef, $token->{tag_name});
448 wakaba 1.6 }
449    
450 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
451 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
452     $el->set_user_data (manakai_source_column => $token->{column});
453    
454 wakaba 1.6 my $has_attr;
455 wakaba 1.23 for my $attr_name (sort {$attrs->{$a}->{index} <=> $attrs->{$b}->{index}}
456     keys %{$attrs}) {
457 wakaba 1.6 my $ns;
458 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
459 wakaba 1.6
460     if ($attr_name eq 'xmlns:xmlns') {
461     ($p, $l) = (undef, $attr_name);
462 wakaba 1.11 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
463 wakaba 1.6 if (defined $nsmap->{$p}) {
464     $ns = $nsmap->{$p};
465     } else {
466     ## NOTE: Error should be detected at the DOM-layer.
467     ($p, $l) = (undef, $attr_name);
468     }
469     } else {
470 wakaba 1.11 if ($attr_name eq 'xmlns') {
471 wakaba 1.6 $ns = $nsmap->{xmlns};
472     }
473 wakaba 1.11 ($p, $l) = (undef, $attr_name);
474 wakaba 1.6 }
475    
476     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
477 wakaba 1.12 $ns = undef;
478     ($p, $l) = (undef, $attr_name);
479 wakaba 1.6 } else {
480     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
481     }
482    
483 wakaba 1.23 my $attr_t = $attrs->{$attr_name};
484 wakaba 1.4 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
485     $attr->value ($attr_t->{value});
486 wakaba 1.23 if (defined $attr_t->{type}) {
487     $attr->manakai_attribute_type ($attr_t->{type});
488     } elsif ($self->{document}->all_declarations_processed) {
489     $attr->manakai_attribute_type (0); # no value
490     } else {
491     $attr->manakai_attribute_type (11); # unknown
492     }
493 wakaba 1.4 $attr->set_user_data (manakai_source_line => $attr_t->{line});
494     $attr->set_user_data (manakai_source_column => $attr_t->{column});
495     $el->set_attribute_node_ns ($attr);
496 wakaba 1.23 $attr->specified (0) if $attr_t->{not_specified};
497 wakaba 1.4 }
498    
499 wakaba 1.1 $self->{document}->append_child ($el);
500    
501 wakaba 1.3 if ($self->{self_closing}) {
502 wakaba 1.1 !!!ack ('ack');
503     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
504     } else {
505 wakaba 1.6 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
506 wakaba 1.1 $self->{insertion_mode} = IN_ELEMENT_IM;
507     }
508    
509     #delete $self->{tainted};
510    
511     !!!next-token;
512     return;
513     } elsif ($token->{type} == COMMENT_TOKEN) {
514     my $comment = $self->{document}->create_comment ($token->{data});
515     $self->{document}->append_child ($comment);
516    
517     ## Stay in the mode.
518     !!!next-token;
519     next B;
520     } elsif ($token->{type} == PI_TOKEN) {
521     my $pi = $self->{document}->create_processing_instruction
522     ($token->{target}, $token->{data});
523     $self->{document}->append_child ($pi);
524    
525     ## Stay in the mode.
526     !!!next-token;
527     next B;
528     } elsif ($token->{type} == CHARACTER_TOKEN) {
529     if (not $self->{tainted} and
530 wakaba 1.9 not $token->{has_reference} and
531 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
532     #
533     }
534    
535     if (length $token->{data}) {
536     ## XML5: Ignore the token.
537    
538     unless ($self->{tainted}) {
539     !!!parse-error (type => 'text outside of root element',
540     token => $token);
541     $self->{tainted} = 1;
542     }
543    
544     $self->{document}->manakai_append_text ($token->{data});
545     }
546    
547     ## Stay in the mode.
548     !!!next-token;
549     next B;
550     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
551     !!!parse-error (type => 'no root element',
552     token => $token);
553    
554     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
555     ## Reprocess.
556     return;
557     } elsif ($token->{type} == END_TAG_TOKEN) {
558     !!!parse-error (type => 'unmatched end tag',
559     text => $token->{tag_name},
560     token => $token);
561     ## Ignore the token.
562    
563     ## Stay in the mode.
564     !!!next-token;
565     next B;
566     } elsif ($token->{type} == DOCTYPE_TOKEN) {
567     !!!parse-error (type => 'in html:#doctype',
568     token => $token);
569     ## Ignore the token.
570    
571     ## Stay in the mode.
572     !!!next-token;
573     next B;
574     } elsif ($token->{type} == ABORT_TOKEN) {
575     return;
576     } else {
577     die "$0: XML parser initial: Unknown token type $token->{type}";
578     }
579     } # B
580     } # _tree_before_root_element
581    
582     sub _tree_in_element ($) {
583     my $self = shift;
584    
585     B: while (1) {
586     if ($token->{type} == CHARACTER_TOKEN) {
587     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
588    
589     ## Stay in the mode.
590     !!!next-token;
591     next B;
592     } elsif ($token->{type} == START_TAG_TOKEN) {
593 wakaba 1.7 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
594 wakaba 1.23
595     my $attrs = $token->{attributes};
596     my $attrdefs = $self->{attrdef}->{$token->{tag_name}};
597     for my $attr_name (keys %{$attrdefs}) {
598     if ($attrs->{$attr_name}) {
599     $attrs->{$attr_name}->{type} = $attrdefs->{$attr_name}->{type} || 0;
600     if ($attrdefs->{$attr_name}->{tokenize}) {
601     $attrs->{$attr_name}->{value} =~ s/ +/ /g;
602     $attrs->{$attr_name}->{value} =~ s/\A //;
603     $attrs->{$attr_name}->{value} =~ s/ \z//;
604     }
605     } elsif (defined $attrdefs->{$attr_name}->{default}) {
606     $attrs->{$attr_name} = {
607     value => $attrdefs->{$attr_name}->{default},
608     type => $attrdefs->{$attr_name}->{type} || 0,
609     not_specified => 1,
610     line => $attrdefs->{$attr_name}->{line},
611     column => $attrdefs->{$attr_name}->{column},
612     index => 1 + keys %{$attrs},
613     };
614     }
615     }
616 wakaba 1.7
617 wakaba 1.23 for (keys %{$attrs}) {
618 wakaba 1.7 if (/^xmlns:./s) {
619     my $prefix = substr $_, 6;
620 wakaba 1.23 my $value = $attrs->{$_}->{value};
621 wakaba 1.7 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
622     $value eq q<http://www.w3.org/XML/1998/namespace> or
623     $value eq q<http://www.w3.org/2000/xmlns/>) {
624     ## NOTE: Error should be detected at the DOM layer.
625     #
626     } elsif (length $value) {
627     $nsmap->{$prefix} = $value;
628     } else {
629     delete $nsmap->{$prefix};
630     }
631     } elsif ($_ eq 'xmlns') {
632 wakaba 1.23 my $value = $attrs->{$_}->{value};
633 wakaba 1.7 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
634     $value eq q<http://www.w3.org/2000/xmlns/>) {
635     ## NOTE: Error should be detected at the DOM layer.
636     #
637     } elsif (length $value) {
638     $nsmap->{''} = $value;
639     } else {
640     delete $nsmap->{''};
641     }
642     }
643     }
644    
645     my $ns;
646 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
647 wakaba 1.7
648 wakaba 1.11 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
649 wakaba 1.7 if (defined $nsmap->{$prefix}) {
650     $ns = $nsmap->{$prefix};
651     } else {
652     ## NOTE: Error should be detected at the DOM layer.
653     ($prefix, $ln) = (undef, $token->{tag_name});
654     }
655     } else {
656 wakaba 1.11 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
657     ($prefix, $ln) = (undef, $token->{tag_name});
658 wakaba 1.7 }
659    
660 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
661 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
662     $el->set_user_data (manakai_source_column => $token->{column});
663    
664 wakaba 1.7 my $has_attr;
665 wakaba 1.23 for my $attr_name (sort {$attrs->{$a}->{index} <=> $attrs->{$b}->{index}}
666     keys %{$attrs}) {
667 wakaba 1.7 my $ns;
668 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
669 wakaba 1.7
670     if ($attr_name eq 'xmlns:xmlns') {
671     ($p, $l) = (undef, $attr_name);
672 wakaba 1.11 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
673 wakaba 1.7 if (defined $nsmap->{$p}) {
674     $ns = $nsmap->{$p};
675     } else {
676     ## NOTE: Error should be detected at the DOM-layer.
677     ($p, $l) = (undef, $attr_name);
678     }
679     } else {
680 wakaba 1.11 if ($attr_name eq 'xmlns') {
681 wakaba 1.7 $ns = $nsmap->{xmlns};
682     }
683 wakaba 1.11 ($p, $l) = (undef, $attr_name);
684 wakaba 1.7 }
685    
686     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
687 wakaba 1.12 $ns = undef;
688     ($p, $l) = (undef, $attr_name);
689 wakaba 1.7 } else {
690     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
691     }
692    
693 wakaba 1.23 my $attr_t = $attrs->{$attr_name};
694 wakaba 1.4 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
695     $attr->value ($attr_t->{value});
696 wakaba 1.23 if (defined $attr_t->{type}) {
697     $attr->manakai_attribute_type ($attr_t->{type});
698     } elsif ($self->{document}->all_declarations_processed) {
699     $attr->manakai_attribute_type (0); # no value
700     } else {
701     $attr->manakai_attribute_type (11); # unknown
702     }
703 wakaba 1.4 $attr->set_user_data (manakai_source_line => $attr_t->{line});
704     $attr->set_user_data (manakai_source_column => $attr_t->{column});
705     $el->set_attribute_node_ns ($attr);
706 wakaba 1.23 $attr->specified (0) if $attr_t->{not_specified};
707 wakaba 1.4 }
708    
709 wakaba 1.1 $self->{open_elements}->[-1]->[0]->append_child ($el);
710    
711 wakaba 1.3 if ($self->{self_closing}) {
712 wakaba 1.1 !!!ack ('ack');
713     } else {
714 wakaba 1.7 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
715 wakaba 1.1 }
716    
717     ## Stay in the mode.
718     !!!next-token;
719     next B;
720     } elsif ($token->{type} == END_TAG_TOKEN) {
721 wakaba 1.2 if ($token->{tag_name} eq '') {
722 wakaba 1.1 ## Short end tag token.
723     pop @{$self->{open_elements}};
724     } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
725     pop @{$self->{open_elements}};
726     } else {
727     !!!parse-error (type => 'unmatched end tag',
728     text => $token->{tag_name},
729     token => $token);
730    
731     ## Has an element in scope
732 wakaba 1.2 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
733 wakaba 1.1 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
734     splice @{$self->{open_elements}}, $i;
735     last INSCOPE;
736     }
737     } # INSCOPE
738     }
739    
740     unless (@{$self->{open_elements}}) {
741     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
742     !!!next-token;
743     return;
744     } else {
745     ## Stay in the state.
746     !!!next-token;
747     redo B;
748     }
749     } elsif ($token->{type} == COMMENT_TOKEN) {
750     my $comment = $self->{document}->create_comment ($token->{data});
751     $self->{open_elements}->[-1]->[0]->append_child ($comment);
752    
753     ## Stay in the mode.
754     !!!next-token;
755     next B;
756     } elsif ($token->{type} == PI_TOKEN) {
757     my $pi = $self->{document}->create_processing_instruction
758     ($token->{target}, $token->{data});
759     $self->{open_elements}->[-1]->[0]->append_child ($pi);
760    
761     ## Stay in the mode.
762     !!!next-token;
763     next B;
764     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
765     !!!parse-error (type => 'in body:#eof',
766     token => $token);
767    
768     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
769     !!!next-token;
770     return;
771     } elsif ($token->{type} == DOCTYPE_TOKEN) {
772     !!!parse-error (type => 'in html:#doctype',
773     token => $token);
774     ## Ignore the token.
775    
776     ## Stay in the mode.
777     !!!next-token;
778     next B;
779     } elsif ($token->{type} == ABORT_TOKEN) {
780     return;
781     } else {
782     die "$0: XML parser initial: Unknown token type $token->{type}";
783     }
784     } # B
785     } # _tree_in_element
786    
787     sub _tree_after_root_element ($) {
788     my $self = shift;
789    
790     B: while (1) {
791     if ($token->{type} == START_TAG_TOKEN) {
792     !!!parse-error (type => 'second root element',
793     token => $token);
794    
795     ## XML5: Ignore the token.
796 wakaba 1.4
797 wakaba 1.5 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
798     ## Reprocess.
799 wakaba 1.1 return;
800     } elsif ($token->{type} == COMMENT_TOKEN) {
801     my $comment = $self->{document}->create_comment ($token->{data});
802     $self->{document}->append_child ($comment);
803    
804     ## Stay in the mode.
805     !!!next-token;
806     next B;
807     } elsif ($token->{type} == PI_TOKEN) {
808     my $pi = $self->{document}->create_processing_instruction
809     ($token->{target}, $token->{data});
810     $self->{document}->append_child ($pi);
811    
812     ## Stay in the mode.
813     !!!next-token;
814     next B;
815     } elsif ($token->{type} == CHARACTER_TOKEN) {
816     if (not $self->{tainted} and
817 wakaba 1.9 not $token->{has_reference} and
818 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
819     #
820     }
821    
822     if (length $token->{data}) {
823     ## XML5: Ignore the token.
824    
825     unless ($self->{tainted}) {
826     !!!parse-error (type => 'text outside of root element',
827     token => $token);
828     $self->{tainted} = 1;
829     }
830    
831     $self->{document}->manakai_append_text ($token->{data});
832     }
833    
834     ## Stay in the mode.
835     !!!next-token;
836     next B;
837     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
838     ## Stop parsing.
839    
840     ## TODO: implement "stop parsing".
841    
842     $token = {type => ABORT_TOKEN};
843     return;
844     } elsif ($token->{type} == END_TAG_TOKEN) {
845     !!!parse-error (type => 'unmatched end tag',
846     text => $token->{tag_name},
847     token => $token);
848     ## Ignore the token.
849    
850     ## Stay in the mode.
851     !!!next-token;
852     next B;
853     } elsif ($token->{type} == DOCTYPE_TOKEN) {
854     !!!parse-error (type => 'in html:#doctype',
855     token => $token);
856     ## Ignore the token.
857    
858     ## Stay in the mode.
859     !!!next-token;
860     next B;
861     } elsif ($token->{type} == ABORT_TOKEN) {
862     return;
863     } else {
864     die "$0: XML parser initial: Unknown token type $token->{type}";
865     }
866     } # B
867     } # _tree_after_root_element
868 wakaba 1.14
869     sub _tree_in_subset ($) {
870     my $self = shift;
871    
872     B: while (1) {
873     if ($token->{type} == COMMENT_TOKEN) {
874     ## Ignore the token.
875    
876     ## Stay in the state.
877     !!!next-token;
878     next B;
879 wakaba 1.15 } elsif ($token->{type} == ELEMENT_TOKEN) {
880 wakaba 1.21 unless ($self->{has_element_decl}->{$token->{name}}) {
881     my $node = $self->{doctype}->get_element_type_definition_node
882 wakaba 1.15 ($token->{name});
883 wakaba 1.21 unless ($node) {
884     $node = $self->{document}->create_element_type_definition
885     ($token->{name});
886     $self->{doctype}->set_element_type_definition_node ($node);
887     }
888    
889 wakaba 1.15 $node->set_user_data (manakai_source_line => $token->{line});
890     $node->set_user_data (manakai_source_column => $token->{column});
891    
892 wakaba 1.19 $node->content_model_text (join '', @{$token->{content}})
893     if $token->{content};
894 wakaba 1.15 } else {
895 wakaba 1.21 !!!parse-error (type => 'duplicate element decl', ## TODO: type
896     value => $token->{name},
897     token => $token);
898 wakaba 1.15
899 wakaba 1.21 ## TODO: $token->{content} syntax check.
900 wakaba 1.15 }
901 wakaba 1.21 $self->{has_element_decl}->{$token->{name}} = 1;
902 wakaba 1.15
903     ## Stay in the mode.
904     !!!next-token;
905     next B;
906     } elsif ($token->{type} == ATTLIST_TOKEN) {
907 wakaba 1.22 if ($self->{stop_processing}) {
908     ## TODO: syntax validation
909     } else {
910     my $ed = $self->{doctype}->get_element_type_definition_node
911 wakaba 1.15 ($token->{name});
912 wakaba 1.22 unless ($ed) {
913     $ed = $self->{document}->create_element_type_definition
914     ($token->{name});
915     $ed->set_user_data (manakai_source_line => $token->{line});
916     $ed->set_user_data (manakai_source_column => $token->{column});
917     $self->{doctype}->set_element_type_definition_node ($ed);
918     } elsif ($self->{has_attlist}->{$token->{name}}) {
919     !!!parse-error (type => 'duplicate attlist decl', ## TODO: type
920     value => $token->{name},
921     token => $token,
922     level => $self->{level}->{warn});
923     }
924     $self->{has_attlist}->{$token->{name}} = 1;
925    
926     unless (@{$token->{attrdefs}}) {
927     !!!parse-error (type => 'empty attlist decl', ## TODO: type
928     value => $token->{name},
929     token => $token,
930     level => $self->{level}->{warn});
931     }
932    
933     for my $at (@{$token->{attrdefs}}) {
934     unless ($ed->get_attribute_definition_node ($at->{name})) {
935     my $node = $self->{document}->create_attribute_definition
936     ($at->{name});
937     $node->set_user_data (manakai_source_line => $at->{line});
938     $node->set_user_data (manakai_source_column => $at->{column});
939    
940     my $type = defined $at->{type} ? {
941     CDATA => 1, ID => 2, IDREF => 3, IDREFS => 4, ENTITY => 5,
942     ENTITIES => 6, NMTOKEN => 7, NMTOKENS => 8, NOTATION => 9,
943     }->{$at->{type}} : 10;
944     if (defined $type) {
945     $node->declared_type ($type);
946     } else {
947     !!!parse-error (type => 'unknown declared type', ## TODO: type
948     value => $at->{type},
949     token => $at);
950     }
951    
952     push @{$node->allowed_tokens}, @{$at->{tokens}};
953    
954     my $default = defined $at->{default} ? {
955     FIXED => 1, REQUIRED => 2, IMPLIED => 3,
956     }->{$at->{default}} : 4;
957     if (defined $default) {
958     $node->default_type ($default);
959     if (defined $at->{value}) {
960     if ($default == 1 or $default == 4) {
961     #
962     } elsif (length $at->{value}) {
963     !!!parse-error (type => 'default value not allowed', ## TODO: type
964     token => $at);
965     }
966     } else {
967     if ($default == 1 or $default == 4) {
968     !!!parse-error (type => 'default value not provided', ## TODO: type
969     token => $at);
970     }
971 wakaba 1.16 }
972     } else {
973 wakaba 1.22 !!!parse-error (type => 'unknown default type', ## TODO: type
974     value => $at->{default},
975     token => $at);
976 wakaba 1.16 }
977 wakaba 1.23
978     $type ||= 0;
979     my $tokenize = (2 <= $type and $type <= 10);
980    
981     if (defined $at->{value}) {
982     if ($tokenize) {
983     $at->{value} =~ s/ +/ /g;
984     $at->{value} =~ s/\A //;
985     $at->{value} =~ s/ \z//;
986     }
987     $node->text_content ($at->{value});
988     }
989 wakaba 1.22
990     $ed->set_attribute_definition_node ($node);
991 wakaba 1.23
992     ## For tree construction
993     $self->{attrdef}->{$token->{name}}->{$at->{name}}
994     = {
995     type => $type,
996     tokenize => $tokenize,
997     default => (($default == 1 or $default == 4)
998     ? defined $at->{value} ? $at->{value} : ''
999     : undef),
1000     };
1001 wakaba 1.16 } else {
1002 wakaba 1.22 !!!parse-error (type => 'duplicate attrdef', ## TODO: type
1003     value => $at->{name},
1004     token => $at,
1005     level => $self->{level}->{warn});
1006    
1007     ## TODO: syntax validation
1008 wakaba 1.16 }
1009 wakaba 1.22 } # $at
1010     }
1011 wakaba 1.15
1012     ## Stay in the mode.
1013     !!!next-token;
1014     next B;
1015     } elsif ($token->{type} == GENERAL_ENTITY_TOKEN) {
1016 wakaba 1.22 if ($self->{stop_processing}) {
1017     ## TODO: syntax validation
1018     } elsif ({
1019     amp => 1, apos => 1, quot => 1, lt => 1, gt => 1,
1020     }->{$token->{name}}) {
1021 wakaba 1.21 if (not defined $token->{value} or
1022     $token->{value} !~
1023     {
1024     amp => qr/\A&#(?:x0*26|0*38);\z/,
1025     lt => qr/\A&#(?:x0*3[Cc]|0*60);\z/,
1026     gt => qr/\A(?>&#(?:x0*3[Ee]|0*62);|>)\z/,
1027     quot => qr/\A(?>&#(?:x0*22|0*34);|")\z/,
1028     apos => qr/\A(?>&#(?:x0*27|0*39);|')\z/,
1029     }->{$token->{name}}) {
1030     !!!parse-error (type => 'bad predefined entity decl', ## TODO: type
1031     value => $token->{name},
1032     token => $token);
1033     }
1034 wakaba 1.15
1035 wakaba 1.21 $self->{ge}->{$token->{name}.';'} = {name => $token->{name},
1036     value => {
1037     amp => '&',
1038     lt => '<',
1039     gt => '>',
1040     quot => '"',
1041     apos => "'",
1042     }->{$token->{name}},
1043     only_text => 1};
1044     } elsif (not $self->{ge}->{$token->{name}.';'}) {
1045 wakaba 1.18 ## For parser.
1046 wakaba 1.20 $self->{ge}->{$token->{name}.';'} = $token;
1047     if (defined $token->{value} and
1048     $token->{value} !~ /[&<]/) {
1049     $token->{only_text} = 1;
1050     }
1051    
1052 wakaba 1.18 ## For DOM.
1053     if (defined $token->{notation}) {
1054     my $node = $self->{document}->create_general_entity ($token->{name});
1055     $node->set_user_data (manakai_source_line => $token->{line});
1056     $node->set_user_data (manakai_source_column => $token->{column});
1057    
1058     $node->public_id ($token->{pubid}); # may be undef
1059     $node->system_id ($token->{sysid}); # may be undef
1060     $node->notation_name ($token->{notation});
1061    
1062     $self->{doctype}->set_general_entity_node ($node);
1063 wakaba 1.21 } else {
1064     ## TODO: syntax validation
1065 wakaba 1.18 }
1066 wakaba 1.15 } else {
1067 wakaba 1.21 !!!parse-error (type => 'duplicate general entity decl', ## TODO: type
1068     value => $token->{name},
1069     token => $token,
1070     level => $self->{level}->{warn});
1071    
1072     ## TODO: syntax validation
1073 wakaba 1.15 }
1074    
1075     ## Stay in the mode.
1076     !!!next-token;
1077     next B;
1078     } elsif ($token->{type} == PARAMETER_ENTITY_TOKEN) {
1079 wakaba 1.22 if ($self->{stop_processing}) {
1080     ## TODO: syntax validation
1081     } elsif (not $self->{pe}->{$token->{name}}) {
1082 wakaba 1.18 ## For parser.
1083     $self->{pe}->{$token->{name}} = $token;
1084 wakaba 1.21
1085     ## TODO: syntax validation
1086 wakaba 1.18 } else {
1087 wakaba 1.21 !!!parse-error (type => 'duplicate para entity decl', ## TODO: type
1088     value => $token->{name},
1089     token => $token,
1090     level => $self->{level}->{warn});
1091    
1092     ## TODO: syntax validation
1093 wakaba 1.18 }
1094    
1095 wakaba 1.15 ## Stay in the mode.
1096     !!!next-token;
1097     next B;
1098     } elsif ($token->{type} == NOTATION_TOKEN) {
1099     unless ($self->{doctype}->get_notation_node
1100     ($token->{name})) {
1101     my $node = $self->{document}->create_notation ($token->{name});
1102     $node->set_user_data (manakai_source_line => $token->{line});
1103     $node->set_user_data (manakai_source_column => $token->{column});
1104    
1105 wakaba 1.17 $node->public_id ($token->{pubid}); # may be undef
1106     $node->system_id ($token->{sysid}); # may be undef
1107 wakaba 1.15
1108     $self->{doctype}->set_notation_node ($node);
1109     } else {
1110 wakaba 1.21 !!!parse-error (type => 'duplicate notation decl', ## TODO: type
1111     value => $token->{name},
1112     token => $token);
1113    
1114     ## TODO: syntax validation
1115 wakaba 1.15 }
1116    
1117     ## Stay in the mode.
1118     !!!next-token;
1119     next B;
1120 wakaba 1.14 } elsif ($token->{type} == PI_TOKEN) {
1121     my $pi = $self->{document}->create_processing_instruction
1122     ($token->{target}, $token->{data});
1123     $self->{doctype}->append_child ($pi);
1124     ## TODO: line/col
1125    
1126     ## Stay in the mode.
1127     !!!next-token;
1128     next B;
1129     } elsif ($token->{type} == END_OF_DOCTYPE_TOKEN) {
1130     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
1131     !!!next-token;
1132     return;
1133     } elsif ($token->{type} == ABORT_TOKEN) {
1134     return;
1135     } else {
1136     die "$0: XML parser subset im: Unknown token type $token->{type}";
1137     }
1138     } # B
1139    
1140     } # _tree_in_subset
1141 wakaba 1.1
1142     }
1143    
1144     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24